package com.mandarintools;

import edu.harvard.wcfia.yoshikoder.document.tokenizer.Token;
import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenImpl;
import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenList;
import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenListImpl;
import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenizationException;
import edu.harvard.wcfia.yoshikoder.document.tokenizer.Tokenizer;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.Comparator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:com/mandarintools/AbstractChineseTokenizer.class */
/**
 * Base class for tokenizers that combine a subclass-provided segmenter with a
 * fallback {@link Tokenizer}.
 *
 * <p>{@link #getTokens(String)} merges the tokens produced by
 * {@link #segment(String)} with the tokens the fallback tokenizer produces for
 * every region of the text matched by {@link #notCJK}, and returns them ordered
 * by start offset.
 *
 * <p>NOTE: the three instance fields are initialised by calling the abstract
 * hook methods while this class is being constructed, i.e. before any subclass
 * constructor body or subclass field initialiser has run. Implementations of
 * those hooks must therefore not rely on subclass instance state.
 */
public abstract class AbstractChineseTokenizer {
    private static final Logger log = Logger.getLogger("com.mandarintools.AbstractChineseTokenizer");

    /** Directory handed to {@code Util.getResourceAsStream} when loading data files. */
    private static final String RESOURCE_DIR = "resources";

    /** Character encoding used for data files and for {@link #printUTF8(String)}. */
    private static final String UTF8 = "UTF-8";

    /** How many loaded entries pass between two progress log records. */
    private static final int PROGRESS_INTERVAL = 20000;

    /**
     * Orders tokens by ascending start offset. Operands that are not
     * {@link Token} instances (including {@code null}) compare as equal, which
     * keeps the lenient "never throw on a bad element" behaviour this
     * comparator has always had.
     */
    private static final Comparator<Object> tokenComp = new Comparator<Object>() {
        @Override
        public int compare(Object obj, Object obj2) {
            if (!(obj instanceof Token) || !(obj2 instanceof Token)) {
                return 0;
            }
            int leftStart = ((Token) obj).getStart();
            int rightStart = ((Token) obj2).getStart();
            // Explicit comparison instead of subtraction: cannot overflow.
            if (leftStart < rightStart) {
                return -1;
            }
            return leftStart > rightStart ? 1 : 0;
        }
    };

    /** Tokenizer applied to the regions matched by {@link #notCJK}. */
    protected Tokenizer fallbackTokenizer = getFallbackTokenizer();

    /** Pattern supplied by {@link #getInCJKPattern()}; not used by this class itself. */
    protected Pattern inCJK = getInCJKPattern();

    /** Pattern whose matches are handed to the fallback tokenizer. */
    protected Pattern notCJK = getNotCJKPattern();

    /**
     * Supplies the value of {@link #inCJK}.
     * Presumably matches runs of CJK text - confirm against the subclasses.
     */
    protected abstract Pattern getInCJKPattern();

    /**
     * Supplies the value of {@link #notCJK}; each match of this pattern is
     * tokenized by the fallback tokenizer.
     */
    protected abstract Pattern getNotCJKPattern();

    /** Supplies the value of {@link #fallbackTokenizer}. */
    protected abstract Tokenizer getFallbackTokenizer();

    /**
     * Returns the locales associated with this tokenizer.
     * (Decompiler note: the access modifier was originally {@code protected}.)
     */
    public abstract Locale[] getLocales();

    /**
     * Produces the subclass-specific tokens for the text. The returned list is
     * modified by {@link #getTokens(String)}, so it must be mutable.
     *
     * @param str the text to segment
     * @return the tokens found by the subclass
     */
    protected abstract TokenList segment(String str);

    /**
     * Tokenizes the text by combining {@link #segment(String)} with
     * {@link #fallbackSegment(String)}.
     *
     * @param str the text to tokenize
     * @return the list returned by {@code segment}, with the fallback tokens
     *         added and all tokens sorted by start offset
     * @throws TokenizationException if the fallback tokenizer fails
     */
    public TokenList getTokens(String str) throws TokenizationException {
        TokenList merged = segment(str);
        merged.addAll(fallbackSegment(str));
        Collections.sort(merged, tokenComp);
        return merged;
    }

    /**
     * Runs the fallback tokenizer over every match of {@link #notCJK} in the text.
     *
     * @param str the full text
     * @return the fallback tokens, with offsets relative to {@code str}
     * @throws TokenizationException if the fallback tokenizer fails
     */
    protected TokenList fallbackSegment(String str) throws TokenizationException {
        TokenListImpl collected = new TokenListImpl();
        Matcher matcher = this.notCJK.matcher(str);
        while (matcher.find()) {
            collected.addAll(getFallbackTokens(matcher.group(), matcher.start()));
        }
        return collected;
    }

    /**
     * Tokenizes a fragment with the fallback tokenizer and shifts every token's
     * offsets so they refer to the text the fragment was cut from.
     *
     * @param str the fragment to tokenize
     * @param i   the offset added to each token's start and end
     * @return new tokens with the same text and shifted offsets
     * @throws TokenizationException if the fallback tokenizer fails
     */
    protected TokenList getFallbackTokens(String str, int i) throws TokenizationException {
        TokenList<Token> tokens = this.fallbackTokenizer.getTokens(str);
        TokenListImpl shifted = new TokenListImpl();
        for (Token token : tokens) {
            shifted.add(new TokenImpl(token.getText(), token.getStart() + i, token.getEnd() + i));
        }
        return shifted;
    }

    /**
     * Prints the string followed by a line separator to standard output,
     * encoded as UTF-8 regardless of the platform default charset.
     *
     * @param str the text to print
     */
    public static void printUTF8(String str) {
        try {
            // Deliberately not closed: closing the wrapper would close System.out.
            PrintStream utf8Out = new PrintStream(System.out, true, UTF8);
            utf8Out.println(str);
            utf8Out.flush();
        } catch (UnsupportedEncodingException e) {
            log.log(Level.WARNING, "UTF-8 encoding is not supported", e);
        }
    }

    /**
     * Adds every usable line of a UTF-8 data file to the given set. Lines that
     * are empty or contain a {@code '#'} anywhere are skipped; the remaining
     * lines are interned before being added. Any failure is logged at WARNING
     * level and otherwise ignored, so lines read before the failure stay in
     * the set.
     * (Decompiler note: the access modifier was originally {@code protected}.)
     *
     * @param set the set receiving the lines
     * @param str the resource name passed to {@code Util.getResourceAsStream}
     */
    @SuppressWarnings({"rawtypes", "unchecked"}) // raw Set kept so the signature stays unchanged
    public void loadset(Set set, String str) {
        try {
            BufferedReader reader = openResource(str);
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.length() == 0 || line.indexOf('#') >= 0) {
                        continue;
                    }
                    set.add(line.intern());
                }
            } finally {
                // Close on every path, not only after a clean end of file.
                reader.close();
            }
        } catch (Exception e) {
            // Broad catch kept on purpose: loading has always been best-effort here.
            log.log(Level.WARNING, "Could not load data file " + str, (Throwable) e);
        }
    }

    /**
     * Fills the map from a UTF-8 data file. Every non-empty line that contains
     * no {@code '#'} and is shorter than five characters is stored as a key
     * with the value {@code "1"}. In addition, each leading substring of such a
     * line that is at least two characters long and shorter than the line is
     * stored with the value {@code "2"} unless that key is already present.
     * Presumably {@code "1"} marks a complete entry and {@code "2"} a prefix of
     * one - confirm against the code that reads the map.
     *
     * <p>An {@link IOException} is logged at WARNING level and otherwise
     * ignored, so entries read before the failure stay in the map.
     * (Decompiler note: the access modifier was originally {@code protected}.)
     *
     * @param map the map receiving the entries
     * @param str the resource name passed to {@code Util.getResourceAsStream}
     */
    @SuppressWarnings({"rawtypes", "unchecked"}) // raw Map kept so the signature stays unchanged
    public void fillCharacterMap(Map map, String str) {
        try {
            BufferedReader reader = openResource(str);
            try {
                int loaded = 0;
                String line;
                while ((line = reader.readLine()) != null) {
                    int length = line.length();
                    // Blank lines are skipped so "" never becomes a key (matches loadset).
                    if (length == 0 || length >= 5 || line.indexOf('#') != -1) {
                        continue;
                    }
                    map.put(line.intern(), "1");
                    for (int prefixLength = 2; prefixLength < length; prefixLength++) {
                        String prefix = line.substring(0, prefixLength).intern();
                        if (!map.containsKey(prefix)) {
                            map.put(prefix, "2");
                        }
                    }
                    loaded++;
                    if (loaded % PROGRESS_INTERVAL == 0) {
                        log.fine("Loaded " + loaded + " entries from " + str);
                    }
                }
            } finally {
                // Close on every path, not only after a clean end of file.
                reader.close();
            }
        } catch (IOException e) {
            log.log(Level.WARNING, "Failed to load resource from file system " + str, (Throwable) e);
        }
    }

    /** Opens the named data file as a buffered UTF-8 reader; the caller must close it. */
    private static BufferedReader openResource(String name) throws IOException {
        return new BufferedReader(
                new InputStreamReader(Util.getResourceAsStream(name, new File(RESOURCE_DIR)), UTF8));
    }

    /**
     * Small manual check: builds a list with two tokens in reverse order and
     * prints it before and after sorting by start offset.
     */
    public static void main(String[] strArr) throws Exception {
        TokenImpl first = new TokenImpl("foo", 0, 5);
        TokenImpl second = new TokenImpl("bar", 6, 10);
        TokenListImpl tokens = new TokenListImpl();
        tokens.add(second);
        tokens.add(first);
        System.out.println(tokens);
        Collections.sort(tokens, tokenComp);
        System.out.println(tokens);
    }
}
