package com.mandarintools;

import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenImpl;
import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenList;
import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenListImpl;
import edu.harvard.wcfia.yoshikoder.document.tokenizer.Tokenizer;
import java.io.File;
import java.io.PrintStream;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:com/mandarintools/SimplifiedChineseTokenizer.class */
/**
 * Dictionary-driven tokenizer for Simplified Chinese text.
 *
 * <p>The constructor fills a word map and four character/word sets through the
 * inherited {@code fillCharacterMap} and {@code loadset} helpers. Segmentation
 * only looks at runs of characters matched by the inherited {@code inCJK}
 * pattern; each run is split by {@link #getTokens(String, int)} using the word
 * map.
 */
public class SimplifiedChineseTokenizer extends AbstractChineseTokenizer implements Tokenizer {
    /** Word-map value for which the looked-up text is emitted as a token. */
    private static final String EMIT_MARKER = "1";

    /** Matches one or more characters of the CJK Unified Ideographs block. Compiled once. */
    private static final Pattern IN_CJK = Pattern.compile("[\\p{InCJKUnifiedIdeographs}]+");

    /** Matches one or more characters outside the CJK Unified Ideographs block. Compiled once. */
    private static final Pattern NOT_CJK = Pattern.compile("[\\P{InCJKUnifiedIdeographs}]+");

    private static final Logger log = Logger.getLogger("com.mandarintools.SimplifiedChineseTokenizer");

    /**
     * Word map filled from {@code simplexu8.txt}. Values are read as strings by
     * {@link #getTokens(String, int)}: {@code "1"} causes the key to be emitted
     * as a token, any other non-null value leaves the text pending.
     * NOTE(review): "2" presumably marks a prefix of a longer entry - confirm
     * against the dictionary loader.
     */
    protected Map zhwords;

    /** Filled from {@code data/ssurname_u8.txt}; not read anywhere in this class. */
    protected Set csurname;

    /** Filled from {@code data/sforeign_u8.txt}; not read anywhere in this class. */
    protected Set cforeign;

    /** Filled from {@code data/snumbers_u8.txt}; not read anywhere in this class. */
    protected Set cnumbers;

    /** Filled from {@code data/snotname_u8.txt}; not read anywhere in this class. */
    protected Set cnotname;

    /**
     * Creates the tokenizer and loads every dictionary resource it uses.
     */
    public SimplifiedChineseTokenizer() {
        log.info("Loading resources");
        this.csurname = new TreeSet();
        this.cforeign = new TreeSet();
        this.cnumbers = new TreeSet();
        this.cnotname = new TreeSet();
        this.zhwords = new TreeMap();
        loadset(this.cnumbers, "data/snumbers_u8.txt");
        loadset(this.cforeign, "data/sforeign_u8.txt");
        loadset(this.csurname, "data/ssurname_u8.txt");
        loadset(this.cnotname, "data/snotname_u8.txt");
        fillCharacterMap(this.zhwords, "simplexu8.txt");
    }

    /**
     * @return the shared pattern matching runs of CJK Unified Ideographs
     */
    @Override // com.mandarintools.AbstractChineseTokenizer
    protected Pattern getInCJKPattern() {
        return IN_CJK;
    }

    /**
     * @return the shared pattern matching runs of characters that are not CJK Unified Ideographs
     */
    @Override // com.mandarintools.AbstractChineseTokenizer
    protected Pattern getNotCJKPattern() {
        return NOT_CJK;
    }

    /**
     * @return a new {@code BITokenizerImpl} on every call
     */
    @Override // com.mandarintools.AbstractChineseTokenizer
    protected Tokenizer getFallbackTokenizer() {
        return new BITokenizerImpl();
    }

    /**
     * @return a new array holding {@link Locale#SIMPLIFIED_CHINESE} and {@link Locale#CHINESE}
     */
    @Override // com.mandarintools.AbstractChineseTokenizer, edu.harvard.wcfia.yoshikoder.document.tokenizer.Tokenizer
    public Locale[] getLocales() {
        return new Locale[]{Locale.SIMPLIFIED_CHINESE, Locale.CHINESE};
    }

    /**
     * Tokenizes every run of {@code str} matched by the inherited {@code inCJK}
     * pattern. Text between those runs is not tokenized by this method.
     *
     * @param str the text to segment
     * @return the tokens of all matched runs, in order of appearance, with
     *         offsets relative to {@code str}
     */
    @Override // com.mandarintools.AbstractChineseTokenizer
    protected TokenList segment(String str) {
        TokenListImpl tokens = new TokenListImpl();
        Matcher matcher = this.inCJK.matcher(str);
        while (matcher.find()) {
            tokens.addAll(getTokens(str.substring(matcher.start(), matcher.end()), matcher.start()));
        }
        return tokens;
    }

    /**
     * Splits {@code str} into tokens by scanning it left to right against the
     * word map.
     *
     * <p>Characters accumulate in a pending buffer. After each character:
     * <ul>
     *   <li>if the whole buffer is a map key with value {@code "1"}, it is
     *       emitted as a token and the buffer is cleared;</li>
     *   <li>if the whole buffer is a map key with any other value, nothing is
     *       emitted and scanning continues;</li>
     *   <li>otherwise, if the last two buffered characters form a map key, the
     *       characters before them are emitted as one token, and the pair is
     *       emitted too when its value is {@code "1"} (else it stays pending).</li>
     * </ul>
     * Whatever is still pending at the end of {@code str} is emitted as a final token.
     *
     * @param str the text to split
     * @param i   offset added to every token position, i.e. the position of
     *            {@code str} within the enclosing text
     * @return the tokens in order, each with start (inclusive) and end (exclusive) offsets
     */
    public TokenList getTokens(String str, int i) {
        TokenListImpl tokens = new TokenListImpl();
        StringBuilder pending = new StringBuilder();
        for (int pos = 0; pos < str.length(); pos++) {
            pending.append(str.charAt(pos));
            String candidate = pending.toString();
            String marker = (String) this.zhwords.get(candidate);
            if (marker != null) {
                if (EMIT_MARKER.equals(marker)) {
                    // The buffer ends at pos, so it starts (length - 1) characters earlier.
                    int start = (i + pos) - (candidate.length() - 1);
                    tokens.add(new TokenImpl(candidate.intern(), start, start + candidate.length()));
                    pending.setLength(0);
                }
                // Any other value: known key but not emitted yet; keep accumulating.
            } else if (pending.length() >= 2) {
                String tail = pending.substring(pending.length() - 2, pending.length());
                String tailMarker = (String) this.zhwords.get(tail);
                if (tailMarker != null) {
                    if (pending.length() > 2) {
                        // Flush everything before the recognised two-character tail.
                        String head = pending.substring(0, pending.length() - 2);
                        int headStart = (i + pos) - (pending.length() - 1);
                        tokens.add(new TokenImpl(head.intern(), headStart, headStart + head.length()));
                        pending.setLength(0);
                        pending.append(tail);
                    }
                    if (EMIT_MARKER.equals(tailMarker)) {
                        tokens.add(new TokenImpl(tail.intern(), (i + pos) - 1, i + pos + 1));
                        pending.setLength(0);
                    }
                    // Any other value: the tail stays pending for the next iteration.
                }
            }
        }
        if (pending.length() > 0) {
            String rest = pending.toString();
            tokens.add(new TokenImpl(rest.intern(), (i + str.length()) - rest.length(), i + str.length()));
        }
        return tokens;
    }

    /**
     * Tokenizes the UTF-8 file named by the first argument and prints the
     * token list's string form to standard output, encoded as UTF-8.
     *
     * @param strArr command-line arguments; {@code strArr[0]} is the input file path
     * @throws IllegalArgumentException if no file path is given
     * @throws Exception if reading or tokenizing the file fails
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr == null || strArr.length == 0) {
            throw new IllegalArgumentException("usage: SimplifiedChineseTokenizer <utf-8 text file>");
        }
        String rendered = new SimplifiedChineseTokenizer()
                .getTokens(FileUtil.slurp(new File(strArr[0]), "UTF-8"))
                .toString();
        // Encode explicitly as UTF-8. Decoding UTF-8 bytes with the platform
        // default charset corrupts the text on charsets with unmapped bytes.
        PrintStream out = new PrintStream(System.out, true, "UTF-8");
        out.println(rendered);
        out.flush();
    }
}
