package com.mandarintools;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.lang.Character;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Vector;

/* loaded from: input_file:com/mandarintools/Segmenter.class */
/**
 * Dictionary-driven word segmenter for Chinese text.
 *
 * <p>The segmenter keeps a lexicon ({@code zhwords}) that maps every known word to
 * {@code "1"} and every proper prefix (two characters or longer) of a known word to
 * {@code "2"}. Text is scanned left to right and the current word is extended greedily
 * for as long as the lexicon, the number-character set or the foreign-name character set
 * allows it.
 *
 * <p>Only characters in the {@code CJK_UNIFIED_IDEOGRAPHS} Unicode block are treated as
 * Chinese; everything else is copied through unchanged unless it is listed in the
 * number-character set.
 *
 * <p>Instances are not synchronized.
 */
public class Segmenter {
    /** Mode: load the traditional-character data files. */
    public static final int TRAD = 0;
    /** Mode: load the simplified-character data files. */
    public static final int SIMP = 1;
    /** Mode: load both the simplified and the traditional data files. */
    public static final int BOTH = 2;

    /**
     * Leftover of the decompiled class-literal cache. No longer read or written
     * (the code uses {@code Segmenter.class} directly); retained only so the set of
     * fields of this class is unchanged.
     */
    static Class class$0;

    /** Lexicon value marking a complete word. */
    private static final String WORD = "1";
    /** Lexicon value marking a string that is only a prefix of some longer word. */
    private static final String PREFIX = "2";

    /*
     * Affixes removed by stemWord, written as Unicode escapes so that the source
     * compiles identically under any javac -encoding setting.
     */
    /** Prefixes U+7B2C (di), U+526F (fu), U+4E0D (bu). */
    private static final String[] STEM_PREFIXES = {"\u7b2c", "\u526f", "\u4e0d"};
    /**
     * Suffixes U+4E86 (le), U+7684 (de), U+5730 (di), U+4E0B (xia), U+4E0A (shang),
     * U+4E2D (zhong), U+91CC (li), U+5230 (dao), U+5185 (nei), U+5916 (wai), U+4EEC (men).
     */
    private static final String[] STEM_SUFFIXES = {
        "\u4e86", "\u7684", "\u5730", "\u4e0b", "\u4e0a", "\u4e2d",
        "\u91cc", "\u5230", "\u5185", "\u5916", "\u4eec"
    };
    /** Infixes U+5F97 (de), U+4E0D (bu). */
    private static final String[] STEM_INFIXES = {"\u5f97", "\u4e0d"};

    /** Lexicon: word or prefix mapped to {@link #WORD} or {@link #PREFIX}. */
    private TreeMap<String, String> zhwords;
    /** Enables diagnostic tracing; there is no setter, so it is off unless edited here. */
    private boolean debug = false;
    /** Encoding used when re-encoding text for the debug trace. */
    private String debugencoding = "UTF-8";
    private TreeSet<String> csurname = new TreeSet<>();
    private TreeSet<String> cforeign = new TreeSet<>();
    private TreeSet<String> cnumbers = new TreeSet<>();
    private TreeSet<String> cnotname = new TreeSet<>();

    /**
     * Creates a segmenter and loads its data files from the class path.
     *
     * <p>Missing or unreadable data files are reported on {@code System.err} and
     * otherwise ignored, leaving the corresponding set or the lexicon empty.
     *
     * @param i {@link #SIMP}, {@link #TRAD} or {@link #BOTH}; any other value is treated
     *          as {@link #BOTH}
     * @param z whether to load the word lexicon; when {@code false} the lexicon starts
     *          empty and can be filled through {@link #addword(String)}
     */
    public Segmenter(int i, boolean z) {
        if (i == SIMP) {
            loadCharacterSets("s");
        } else if (i == TRAD) {
            loadCharacterSets("t");
        } else {
            loadCharacterSets("s");
            loadCharacterSets("t");
        }
        this.zhwords = new TreeMap<>();
        if (z) {
            loadLexicon(lexiconResource(i));
        }
    }

    /** Loads the four character-set files of one variant ("s" or "t"). */
    private void loadCharacterSets(String variant) {
        loadset(this.cnumbers, "data/" + variant + "numbers_u8.txt");
        loadset(this.cforeign, "data/" + variant + "foreign_u8.txt");
        loadset(this.csurname, "data/" + variant + "surname_u8.txt");
        loadset(this.cnotname, "data/" + variant + "notname_u8.txt");
    }

    /** Returns the lexicon resource name for a mode; unknown modes use the combined lexicon. */
    private static String lexiconResource(int mode) {
        if (mode == SIMP) {
            return "simplexu8.txt";
        }
        if (mode == TRAD) {
            return "tradlexu8.txt";
        }
        return "bothlexu8.txt";
    }

    /**
     * Reads the lexicon resource, registering every non-empty line of at most four
     * characters that contains no {@code '#'}.
     */
    private void loadLexicon(String resource) {
        InputStream in = Segmenter.class.getClassLoader().getResourceAsStream(resource);
        if (in == null) {
            // getResourceAsStream signals "not found" with null, not with an exception.
            System.err.println("IOException: lexicon resource " + resource + " not found");
            return;
        }
        int count = 0;
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Blank lines are skipped so that "" never becomes a lexicon word.
                if (line.length() == 0 || line.length() >= 5 || line.indexOf('#') != -1) {
                    continue;
                }
                registerWord(line);
                // Progress indicator for the (large) lexicon files.
                if (count++ % 20000 == 0) {
                    System.err.println(count);
                }
            }
        } catch (IOException e) {
            System.err.println("IOException: " + e);
        }
    }

    /**
     * Adds every non-empty line without a {@code '#'} from a class-path resource to a set.
     * Failures are reported on {@code System.err} and leave the set as far as it was filled.
     */
    private void loadset(TreeSet<String> treeSet, String str) {
        InputStream in = Segmenter.class.getClassLoader().getResourceAsStream(str);
        if (in == null) {
            System.err.println("Exception loading data file " + str + " resource not found");
            return;
        }
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.length() != 0 && line.indexOf('#') == -1) {
                    treeSet.add(line);
                }
            }
        } catch (Exception e) {
            System.err.println("Exception loading data file " + str + " " + e);
        }
    }

    /** Stores a word as {@link #WORD} and each prefix of length 2..n-1 as {@link #PREFIX}. */
    private void registerWord(String word) {
        this.zhwords.put(word, WORD);
        for (int end = 2; end < word.length(); end++) {
            String prefix = word.substring(0, end);
            // Never downgrade an entry that is already a word (or already a prefix).
            if (!this.zhwords.containsKey(prefix)) {
                this.zhwords.put(prefix, PREFIX);
            }
        }
    }

    /** Writes a trace line to {@code System.out} when {@link #debug} is set. */
    private void debugPrint(String text) {
        if (!this.debug) {
            return;
        }
        try {
            System.out.println(new String(text.getBytes(this.debugencoding)));
        } catch (IOException e) {
            // Unsupported debug encoding: fall back to printing the text as is.
            System.out.println(text);
        }
    }

    /**
     * Returns the live lexicon map (not a copy). Keys are words and word prefixes,
     * values are {@code "1"} for words and {@code "2"} for prefixes.
     *
     * @return the lexicon backing this segmenter
     */
    public Map getZhwords() {
        return this.zhwords;
    }

    /**
     * Tests whether every character of the string is in the number-character set.
     *
     * @param str the text to test
     * @return {@code true} if all characters are number characters (also for {@code ""})
     */
    public boolean isNumber(String str) {
        boolean allNumbers = true;
        for (int i = 0; i < str.length(); i++) {
            if (!this.cnumbers.contains(String.valueOf(str.charAt(i)))) {
                allNumbers = false;
                break;
            }
        }
        debugPrint(str + " " + allNumbers);
        return allNumbers;
    }

    /**
     * Tests whether every character of the string is in the foreign-name character set.
     *
     * @param str the text to test
     * @return {@code true} if all characters are foreign-name characters (also for {@code ""})
     */
    public boolean isAllForeign(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (!this.cforeign.contains(String.valueOf(str.charAt(i)))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tests whether the string is free of characters from the
     * {@code CJK_UNIFIED_IDEOGRAPHS} block.
     *
     * @param str the text to test
     * @return {@code true} if no character is a CJK unified ideograph
     */
    public boolean isNotCJK(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (isCjkIdeograph(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /** Returns whether the character lies in the {@code CJK_UNIFIED_IDEOGRAPHS} block. */
    private static boolean isCjkIdeograph(char c) {
        return Character.UnicodeBlock.of(c) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS;
    }

    /**
     * Strips one affix from a word. The first rule that applies wins, in this order:
     * a leading prefix, a trailing suffix, then (for three-character words) a middle
     * infix. A prefix or suffix is removed when the remainder is in the lexicon or the
     * word is exactly two characters long; an infix is removed when the two outer
     * characters together are in the lexicon.
     *
     * @param str the word to stem
     * @return the stemmed word, or {@code str} unchanged if no rule applies or it is empty
     */
    public String stemWord(String str) {
        int length = str.length();
        if (length == 0) {
            // Nothing to strip; the rules below index the first and last character.
            return str;
        }
        for (String prefix : STEM_PREFIXES) {
            if (str.startsWith(prefix)) {
                String rest = str.substring(1);
                if (this.zhwords.get(rest) != null || length == 2) {
                    if (this.debug) {
                        System.out.println("Stemmed prefix");
                        debugPrint(str);
                    }
                    return rest;
                }
            }
        }
        for (String suffix : STEM_SUFFIXES) {
            if (str.endsWith(suffix)) {
                String rest = str.substring(0, length - 1);
                if (this.zhwords.get(rest) != null || length == 2) {
                    if (this.debug) {
                        System.out.println("Stemmed suffix");
                        debugPrint(str);
                    }
                    return rest;
                }
            }
        }
        if (length == 3) {
            String middle = str.substring(1, 2);
            String outer = str.substring(0, 1) + str.substring(2, 3);
            for (String infix : STEM_INFIXES) {
                if (middle.equals(infix) && this.zhwords.get(outer) != null) {
                    if (this.debug) {
                        System.out.println("Stemmed infix");
                    }
                    return outer;
                }
            }
        }
        return str;
    }

    /**
     * Decides whether {@code next} (found at {@code index} in {@code line}) continues the
     * word collected so far. It does when any of these holds:
     * the extended string is a lexicon word; the word so far and {@code next} are all
     * foreign-name characters and the two characters starting at {@code index} are not a
     * lexicon entry; the word so far and {@code next} are all number characters; or the
     * extended string is a lexicon prefix and adding one more character of the line gives
     * a lexicon entry.
     */
    private boolean extendsCurrentWord(StringBuilder current, char next, String line, int index) {
        String word = current.toString();
        String candidate = word + next;
        String tag = this.zhwords.get(candidate);
        if (WORD.equals(tag)) {
            return true;
        }
        String nextAsString = String.valueOf(next);
        if (isAllForeign(word)
                && this.cforeign.contains(nextAsString)
                && index + 2 < line.length()
                && !this.zhwords.containsKey(line.substring(index, index + 2))) {
            return true;
        }
        if (isNumber(word) && this.cnumbers.contains(nextAsString)) {
            return true;
        }
        return PREFIX.equals(tag)
                && index + 1 < line.length()
                && this.zhwords.containsKey(candidate + line.charAt(index + 1));
    }

    /**
     * Shared implementation of {@link #segmentLine(String, String)} and
     * {@link #tokenizeLine(String, String)}.
     *
     * @param broadWhitespace {@code true} to decide "is whitespace" with
     *        {@link #isWhitespace(char)}, {@code false} to use
     *        {@link Character#isWhitespace(char)}
     */
    private String insertSeparators(String line, String separator, boolean broadWhitespace) {
        StringBuilder word = new StringBuilder();
        StringBuilder out = new StringBuilder();
        int length = line.length();
        for (int i = 0; i < length; i++) {
            char c = line.charAt(i);
            if (this.debug) {
                System.err.println((int) c + " " + isWhitespace(c));
            }
            if (!isCjkIdeograph(c) && !isNumber(String.valueOf(c))) {
                // Pass-through character: flush the pending word, then copy it verbatim.
                if (word.length() > 0) {
                    out.append(word);
                    if (!isSpace(c, broadWhitespace)) {
                        appendSeparator(out, separator, i);
                    }
                    word.setLength(0);
                }
                out.append(c);
            } else if (word.length() == 0) {
                // First character of a new word; separate it from preceding non-space text.
                if (i > 0 && !isSpace(line.charAt(i - 1), broadWhitespace)) {
                    appendSeparator(out, separator, i);
                }
                word.append(c);
                debugPrint(word.toString());
            } else if (extendsCurrentWord(word, c, line, i)) {
                word.append(c);
                debugPrint(word.toString());
            } else {
                // The pending word ends here and c starts the next one.
                debugPrint(word.toString());
                out.append(word);
                if (!isSpace(c, broadWhitespace)) {
                    appendSeparator(out, separator, i);
                }
                word.setLength(0);
                word.append(c);
            }
        }
        out.append(word);
        return out.toString();
    }

    /** Applies the whitespace test selected by {@code broad}. */
    private boolean isSpace(char c, boolean broad) {
        return broad ? isWhitespace(c) : Character.isWhitespace(c);
    }

    /** Appends the separator and, in debug mode, traces the break position. */
    private void appendSeparator(StringBuilder out, String separator, int index) {
        out.append(separator);
        if (this.debug) {
            System.err.println("Break: " + index);
        }
    }

    /**
     * Segments a line, inserting {@code str2} between adjacent words. No separator is
     * inserted next to a character for which {@link #isWhitespace(char)} is true, which
     * includes CJK punctuation.
     *
     * @param str  the line to segment
     * @param str2 the separator to insert at word boundaries
     * @return the line with separators inserted
     */
    public String segmentLine(String str, String str2) {
        return insertSeparators(str, str2, true);
    }

    /**
     * Whitespace test used by {@link #segmentLine(String, String)}: true for Java
     * whitespace, Unicode space characters and anything in the
     * {@code CJK_SYMBOLS_AND_PUNCTUATION} block.
     *
     * @param c the character to test
     * @return whether the character counts as whitespace for segmentation
     */
    protected boolean isWhitespace(char c) {
        return Character.isWhitespace(c)
                || Character.isSpaceChar(c)
                || Character.UnicodeBlock.of(c) == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION;
    }

    /**
     * Like {@link #segmentLine(String, String)}, but only
     * {@link Character#isWhitespace(char)} suppresses a separator, so CJK punctuation is
     * split off as its own token.
     *
     * @param str  the line to tokenize
     * @param str2 the separator to insert at token boundaries
     * @return the line with separators inserted
     */
    public String tokenizeLine(String str, String str2) {
        return insertSeparators(str, str2, false);
    }

    /**
     * Segments a line and returns the boundary offsets instead of a new string.
     * The list always starts with 0; each later entry is the end offset of one segment.
     * A run of consecutive characters outside the {@code CJK_UNIFIED_IDEOGRAPHS} block is
     * reported as a single segment.
     *
     * @param str the line to segment
     * @return a list of {@code Integer} offsets into {@code str}
     */
    public LinkedList segmentLine(String str) {
        StringBuilder word = new StringBuilder();
        LinkedList<Integer> offsets = new LinkedList<>();
        int offset = 0;
        offsets.add(Integer.valueOf(0));
        int length = str.length();
        int i = 0;
        while (i < length) {
            char c = str.charAt(i);
            if (!isCjkIdeograph(c) && !isNumber(String.valueOf(c))) {
                if (word.length() > 0) {
                    offset += word.length();
                    offsets.add(Integer.valueOf(offset));
                    word.setLength(0);
                }
                // Swallow the whole non-CJK run as one segment.
                while (i < length && !isCjkIdeograph(str.charAt(i))) {
                    word.append(str.charAt(i));
                    i++;
                }
                i--; // compensate for the increment at the bottom of the outer loop
                offset += word.length();
                offsets.add(Integer.valueOf(offset));
                word.setLength(0);
            } else if (word.length() == 0) {
                word.append(c);
            } else if (extendsCurrentWord(word, c, str, i)) {
                word.append(c);
                debugPrint(word.toString());
            } else {
                offset += word.length();
                offsets.add(Integer.valueOf(offset));
                word.setLength(0);
                word.append(c);
            }
            i++;
        }
        if (word.length() > 0) {
            offsets.add(Integer.valueOf(offset + word.length()));
        }
        return offsets;
    }

    /**
     * Adds a word to the lexicon at runtime. The word itself is stored as {@code "1"} and
     * each of its prefixes of two or more characters that is not yet present is stored as
     * {@code "2"}, exactly as the constructor does for lexicon-file entries, so the
     * segmenter can reach the full word one character at a time.
     *
     * @param str the word to add
     */
    public void addword(String str) {
        registerWord(str);
    }

    /**
     * Segments a text file line by line, writing the result (words separated by a single
     * space) to {@code str + ".seg"} in the same encoding. Any failure is reported on
     * {@code System.err}; both files are closed in every case.
     *
     * @param str  path of the input file
     * @param str2 name of the character encoding used for reading and writing
     */
    public void segmentFile(String str, String str2) {
        String outputPath = str + ".seg";
        try (FileInputStream in = new FileInputStream(str);
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, str2));
                FileOutputStream out = new FileOutputStream(outputPath);
                BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, str2))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String segmented = segmentLine(line, " ");
                if (this.debug) {
                    System.err.println("Output: " + new String(segmented.getBytes(str2)));
                }
                writer.write(segmented);
                writer.newLine();
            }
        } catch (Exception e) {
            System.err.println("Exception " + e.toString());
        }
    }

    /** Prints the command-line usage text and terminates the JVM with status 0. */
    public static void printHelp() {
        System.out.println("Usage:\njava -jar segmenter.jar [-b|-g|-8] inputfile.txt");
        System.out.println("\t-b Big5, -g GB2312, -8 UTF-8");
        System.out.println("  Segmented text will be saved to inputfile.txt.seg");
        System.exit(0);
    }

    /**
     * Command-line entry point. Options: {@code -b} (Big5, traditional), {@code -g}
     * (GBK, simplified), {@code -8} (UTF-8, both), {@code -s} / {@code -t} (character
     * form only), {@code -h} (help). Every other argument is a file or directory to
     * segment; directories are expanded to the entries they contain.
     *
     * @param strArr the command-line arguments
     */
    public static void main(String[] strArr) {
        Vector<String> paths = new Vector<>();
        String encoding = "BIG5";
        int mode = TRAD;
        for (String arg : strArr) {
            if (arg.equals("-b")) {
                encoding = "BIG5";
                mode = TRAD;
            } else if (arg.equals("-g")) {
                encoding = "GBK";
                mode = SIMP;
            } else if (arg.equals("-8")) {
                encoding = "UTF8";
                mode = BOTH;
            } else if (arg.equals("-s")) {
                mode = SIMP;
            } else if (arg.equals("-t")) {
                mode = TRAD;
            } else if (arg.equals("-h")) {
                printHelp();
            } else {
                paths.add(arg);
            }
        }
        if (paths.size() == 0) {
            System.out.println("ERROR: Please specify name of Chinese text file to segment.\n");
            printHelp();
        }
        System.err.println("Loading segmenter word list.  One moment please.");
        Segmenter segmenter = new Segmenter(mode, true);
        System.err.println("Total keys " + segmenter.zhwords.size());
        // Index loop on purpose: directory entries are appended to the list while it is walked.
        for (int index = 0; index < paths.size(); index++) {
            String path = paths.get(index);
            File file = new File(path);
            if (!file.exists()) {
                System.out.println("ERROR: Source file " + path + " does not exist.\n");
            } else if (file.isDirectory()) {
                String[] children = file.list();
                if (children != null) {
                    for (String child : children) {
                        paths.add(path + File.separator + child);
                    }
                }
            } else {
                System.err.println("Segmenting " + path + " with encoding " + encoding);
                segmenter.segmentFile(path, encoding);
            }
        }
    }
}
