/*
 * Decompiled with CFR 0.152.
 */
package weka.core;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.io.Serializable;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.Vector;
import weka.core.Aggregateable;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.SparseInstance;
import weka.core.Utils;
import weka.core.WekaException;
import weka.core.stemmers.NullStemmer;
import weka.core.stemmers.Stemmer;
import weka.core.stopwords.Null;
import weka.core.stopwords.StopwordsHandler;
import weka.core.tokenizers.Tokenizer;
import weka.core.tokenizers.WordTokenizer;
import weka.gui.ProgrammaticProperty;

public class DictionaryBuilder
implements Aggregateable<DictionaryBuilder>,
OptionHandler,
Serializable {
    /** Serialization version identifier. */
    private static final long serialVersionUID = 5579506627960356012L;
    /** String-free structure of the data passed to setup(). */
    protected Instances m_inputFormat;
    /** Header of the vectorized output; read by the vectorize methods and cleared by reset(). */
    protected Instances m_outputFormat;
    /**
     * One dictionary per class value (a single one when not operating per class).
     * Each entry maps a word to {term count, document count}, as accumulated by processInstance().
     */
    protected Map<String, int[]>[] m_dictsPerClass;
    /**
     * Dictionary used for vectorizing. Element 0 of each value is the word's index relative to the
     * first word attribute; element 1, when present, is the document count used for the IDF transform.
     */
    protected Map<String, int[]> m_consolidatedDict;
    /** Scratch map (word -> {term count, 1}) for the instance currently being processed; not serialized. */
    protected transient Map<String, int[]> m_inputVector;
    /** When true a single dictionary is used even if a class attribute is set. */
    protected boolean m_doNotOperateOnPerClassBasis;
    /** Output word counts rather than 0/1 presence indicators. */
    protected boolean m_outputCounts;
    /** Convert tokens to lower case before stemming and dictionary lookup. */
    protected boolean m_lowerCaseTokens;
    /** Stemmer applied to every token (never null when set through setStemmer()). */
    protected Stemmer m_stemmer = new NullStemmer();
    /** Stopwords handler consulted while building the dictionary. */
    protected StopwordsHandler m_stopwordsHandler = new Null();
    /** Number of words to attempt to keep (default 1000). */
    protected int m_wordsToKeep = 1000;
    /** Prune the dictionaries every this many processed instances; values <= 0 disable periodic pruning. */
    protected long m_periodicPruneRate;
    /** Minimum term count a word needs in order to survive pruning. */
    protected int m_minFrequency = 1;
    /** Number of instances folded into the dictionaries so far. */
    protected int m_count = 0;
    /** Tokenizer used to split string values into words. */
    protected Tokenizer m_tokenizer = new WordTokenizer();
    /** Attributes to tokenize; narrowed to string attributes by determineSelectedRange(). */
    protected Range m_selectedRange = new Range("first-last");
    /** Class index of the input format, or -1 when none has been recorded. */
    protected int m_classIndex = -1;
    /** Number of per-class dictionaries allocated by setup(). */
    protected int m_numClasses = 1;
    /** Prefix prepended to every generated word attribute name. */
    protected String m_Prefix = "";
    /** Apply log(1 + count) to word values when vectorizing. */
    protected boolean m_TFTransform;
    /** Multiply word values by log(m_count / document count) when vectorizing. */
    protected boolean m_IDFTransform;
    /** Scale word values of each vectorized instance to the average document length. */
    protected boolean m_normalize;
    /** Running sum of document lengths (Euclidean norm of term counts), accumulated only when normalizing. */
    protected double m_docLengthSum;
    /** Average document length used by normalizeInstance(); must be > 0 for normalization. */
    protected double m_avgDocLength;
    /** Use sorted (TreeMap) dictionaries instead of insertion-ordered (LinkedHashMap) ones. */
    protected boolean m_sortDictionary;
    /** Whether the data passed to setup() contained string attributes; when false most operations are no-ops. */
    protected boolean m_inputContainsStringAttributes;

    /**
     * Sets the average document length used when normalizing vectorized instances.
     *
     * @param averageDocLength the average document length to use
     */
    @ProgrammaticProperty
    public void setAverageDocLength(double averageDocLength) {
        m_avgDocLength = averageDocLength;
    }

    /**
     * Gets the average document length currently held by this builder.
     *
     * @return the average document length
     */
    public double getAverageDocLength() {
        return m_avgDocLength;
    }

    /**
     * Returns the help text for the sortDictionary property.
     *
     * @return a short description of the property
     */
    public String sortDictionaryTipText() {
        return "Sort the dictionary alphabetically";
    }

    /**
     * Sets whether the dictionaries are kept in sorted order.
     *
     * @param sortDictionary true to use sorted dictionaries
     */
    public void setSortDictionary(boolean sortDictionary) {
        m_sortDictionary = sortDictionary;
    }

    /**
     * Gets whether the dictionaries are kept in sorted order.
     *
     * @return true if sorted dictionaries are used
     */
    public boolean getSortDictionary() {
        return m_sortDictionary;
    }

    /**
     * Gets whether word counts (rather than 0/1 presence) are output.
     *
     * @return true if word counts are output
     */
    public boolean getOutputWordCounts() {
        return m_outputCounts;
    }

    /**
     * Sets whether word counts (rather than 0/1 presence) are output.
     *
     * @param outputWordCounts true to output word counts
     */
    public void setOutputWordCounts(boolean outputWordCounts) {
        m_outputCounts = outputWordCounts;
    }

    /**
     * Returns the help text for the outputWordCounts property.
     *
     * @return a short description of the property
     */
    public String outputWordCountsTipText() {
        return "Output word counts rather than boolean 0 or 1"
            + "(indicating presence or absence of a word).";
    }

    /**
     * Gets the range of attributes this builder operates on.
     *
     * @return the selected attribute range
     */
    public Range getSelectedRange() {
        return m_selectedRange;
    }

    /**
     * Replaces the selected attribute range with a new one built from the given range string.
     *
     * @param newSelectedRange the range specification
     */
    public void setSelectedRange(String newSelectedRange) {
        Range replacement = new Range(newSelectedRange);
        m_selectedRange = replacement;
    }

    /**
     * Returns the help text for the attributeIndices property.
     *
     * @return a short description of the property
     */
    public String attributeIndicesTipText() {
        return "Specify range of attributes to act on. "
            + "This is a comma separated list of attribute indices, with \"first\" and \"last\" valid values. "
            + "Specify an inclusive range with \"-\". "
            + "E.g: \"first-3,5,6-10,last\".";
    }

    /**
     * Gets the selected attribute range as a range-list string.
     *
     * @return the range list of the selected range
     */
    public String getAttributeIndices() {
        Range selected = m_selectedRange;
        return selected.getRanges();
    }

    /**
     * Updates the existing selected range in place with the given range list.
     *
     * @param rangeList the range specification to apply
     */
    public void setAttributeIndices(String rangeList) {
        Range selected = m_selectedRange;
        selected.setRanges(rangeList);
    }

    /**
     * Sets the selected attributes from an array of indices by converting it to a range list.
     *
     * @param attributes the attribute indices to select
     */
    public void setAttributeIndicesArray(int[] attributes) {
        String rangeList = Range.indicesToRangeList(attributes);
        setAttributeIndices(rangeList);
    }

    /**
     * Returns the help text for the invertSelection property.
     *
     * @return a short description of the property
     */
    public String invertSelectionTipText() {
        return "Set attribute selection mode. "
            + "If false, only selected attributes in the range will be worked on; "
            + "if true, only non-selected attributes will be processed.";
    }

    /**
     * Gets whether the selected range is inverted.
     *
     * @return the invert flag of the selected range
     */
    public boolean getInvertSelection() {
        Range selected = m_selectedRange;
        return selected.getInvert();
    }

    /**
     * Sets whether the selected range is inverted.
     *
     * @param invert true to invert the selection
     */
    public void setInvertSelection(boolean invert) {
        Range selected = m_selectedRange;
        selected.setInvert(invert);
    }

    /**
     * Gets the number of words to attempt to keep.
     *
     * @return the number of words to keep
     */
    public int getWordsToKeep() {
        return m_wordsToKeep;
    }

    /**
     * Sets the number of words to attempt to keep.
     *
     * @param newWordsToKeep the number of words to keep
     */
    public void setWordsToKeep(int newWordsToKeep) {
        m_wordsToKeep = newWordsToKeep;
    }

    /**
     * Returns the help text for the wordsToKeep property.
     *
     * @return a short description of the property
     */
    public String wordsToKeepTipText() {
        return "The number of words (per class if there is a class attribute assigned) "
            + "to attempt to keep.";
    }

    /**
     * Gets the periodic pruning rate.
     *
     * @return the pruning rate
     */
    public long getPeriodicPruning() {
        return m_periodicPruneRate;
    }

    /**
     * Sets the periodic pruning rate.
     *
     * @param newPeriodicPruning the pruning rate to use
     */
    public void setPeriodicPruning(long newPeriodicPruning) {
        m_periodicPruneRate = newPeriodicPruning;
    }

    /**
     * Returns the help text for the periodicPruning property.
     *
     * <p>The rate is a number of processed instances, not a percentage: the dictionaries are
     * pruned whenever the processed-instance count is a multiple of the rate.
     *
     * @return a short description of the property
     */
    public String periodicPruningTipText() {
        return "Specify the rate (every x instances) at which to periodically prune the dictionary. "
            + "wordsToKeep prunes after creating a full dictionary. "
            + "You may not have enough memory for this approach.";
    }

    /**
     * Gets whether the TF transform (log(1 + count)) is applied.
     *
     * @return true if the TF transform is applied
     */
    public boolean getTFTransform() {
        return m_TFTransform;
    }

    /**
     * Sets whether the TF transform (log(1 + count)) is applied.
     *
     * @param TFTransform true to apply the TF transform
     */
    public void setTFTransform(boolean TFTransform) {
        m_TFTransform = TFTransform;
    }

    /**
     * Returns the help text for the TFTransform property.
     *
     * @return a short description of the property
     */
    public String TFTransformTipText() {
        return "Sets whether if the word frequencies should be transformed into:\n"
            + "    log(1+fij) \n"
            + "       where fij is the frequency of word i in document (instance) j.";
    }

    /**
     * Gets the prefix prepended to generated word attribute names.
     *
     * @return the attribute name prefix
     */
    public String getAttributeNamePrefix() {
        return m_Prefix;
    }

    /**
     * Sets the prefix prepended to generated word attribute names.
     *
     * @param newPrefix the prefix to use
     */
    public void setAttributeNamePrefix(String newPrefix) {
        m_Prefix = newPrefix;
    }

    /**
     * Returns the help text for the attributeNamePrefix property.
     *
     * @return a short description of the property
     */
    public String attributeNamePrefixTipText() {
        return "Prefix for the created attribute names. "
            + "(default: \"\")";
    }

    /**
     * Gets whether the IDF transform is applied.
     *
     * @return true if the IDF transform is applied
     */
    public boolean getIDFTransform() {
        return m_IDFTransform;
    }

    /**
     * Sets whether the IDF transform is applied.
     *
     * @param IDFTransform true to apply the IDF transform
     */
    public void setIDFTransform(boolean IDFTransform) {
        m_IDFTransform = IDFTransform;
    }

    /**
     * Returns the help text for the IDFTransform property.
     *
     * @return a short description of the property
     */
    public String IDFTransformTipText() {
        return "Sets whether if the word frequencies in a document should be transformed into: \n"
            + "   fij*log(num of Docs/num of Docs with word i) \n"
            + "      where fij is the frequency of word i in document (instance) j.";
    }

    /**
     * Gets whether word values are normalized to the average document length.
     *
     * @return true if normalization is enabled
     */
    public boolean getNormalize() {
        return m_normalize;
    }

    /**
     * Sets whether word values are normalized to the average document length.
     *
     * @param n true to enable normalization
     */
    public void setNormalize(boolean n) {
        m_normalize = n;
    }

    /**
     * Returns the help text for the normalize property.
     *
     * @return a short description of the property
     */
    public String normalizeTipText() {
        return "Whether word frequencies for a document (instance) "
            + "should be normalized or not";
    }

    /**
     * Returns a help text describing document-length normalization.
     *
     * @return a short description of the setting
     */
    public String normalizeDocLengthTipText() {
        return "Sets whether if the word frequencies for a document (instance) "
            + "should be normalized or not.";
    }

    /**
     * Gets whether tokens are converted to lower case.
     *
     * @return true if tokens are lower-cased
     */
    public boolean getLowerCaseTokens() {
        return m_lowerCaseTokens;
    }

    /**
     * Sets whether tokens are converted to lower case.
     *
     * @param downCaseTokens true to lower-case tokens
     */
    public void setLowerCaseTokens(boolean downCaseTokens) {
        m_lowerCaseTokens = downCaseTokens;
    }

    /**
     * Returns the help text for the lowerCaseTokens property.
     *
     * @return a short description of the property
     */
    public String lowerCaseTokensTipText() {
        return "If set then all the word tokens are converted to lower case "
            + "before being added to the dictionary.";
    }

    /**
     * Returns the help text for the doNotOperateOnPerClassBasis property.
     *
     * @return a short description of the property
     */
    public String doNotOperateOnPerClassBasisTipText() {
        return "If this is set, the maximum number of words and the minimum term frequency "
            + "is not enforced on a per-class basis but based on the documents in all the classes "
            + "(even if a class attribute is set).";
    }

    /**
     * Gets whether per-class operation is disabled.
     *
     * @return true if a single dictionary is used regardless of the class attribute
     */
    public boolean getDoNotOperateOnPerClassBasis() {
        return m_doNotOperateOnPerClassBasis;
    }

    /**
     * Sets whether per-class operation is disabled.
     *
     * @param newDoNotOperateOnPerClassBasis true to use a single dictionary for all classes
     */
    public void setDoNotOperateOnPerClassBasis(boolean newDoNotOperateOnPerClassBasis) {
        m_doNotOperateOnPerClassBasis = newDoNotOperateOnPerClassBasis;
    }

    /**
     * Returns the help text for the minTermFreq property.
     *
     * @return a short description of the property
     */
    public String minTermFreqTipText() {
        return "Sets the minimum term frequency. "
            + "This is enforced on a per-class basis.";
    }

    /**
     * Gets the minimum term frequency.
     *
     * @return the minimum term frequency
     */
    public int getMinTermFreq() {
        return m_minFrequency;
    }

    /**
     * Sets the minimum term frequency.
     *
     * @param newMinTermFreq the minimum term frequency to use
     */
    public void setMinTermFreq(int newMinTermFreq) {
        m_minFrequency = newMinTermFreq;
    }

    /**
     * Gets the stemmer applied to tokens.
     *
     * @return the current stemmer
     */
    public Stemmer getStemmer() {
        return m_stemmer;
    }

    /**
     * Sets the stemmer applied to tokens; a null argument selects a {@link NullStemmer}.
     *
     * @param value the stemmer to use, or null for the no-op stemmer
     */
    public void setStemmer(Stemmer value) {
        if (value == null) {
            m_stemmer = new NullStemmer();
        } else {
            m_stemmer = value;
        }
    }

    /**
     * Returns the help text for the stemmer property.
     *
     * @return a short description of the property
     */
    public String stemmerTipText() {
        return "The stemming algorithm to use on the words.";
    }

    /**
     * Gets the stopwords handler.
     *
     * @return the current stopwords handler
     */
    public StopwordsHandler getStopwordsHandler() {
        return m_stopwordsHandler;
    }

    /**
     * Sets the stopwords handler; a null argument selects the {@link Null} handler.
     *
     * @param value the handler to use, or null for the Null handler
     */
    public void setStopwordsHandler(StopwordsHandler value) {
        if (value == null) {
            m_stopwordsHandler = new Null();
        } else {
            m_stopwordsHandler = value;
        }
    }

    /**
     * Returns the help text for the stopwordsHandler property.
     *
     * @return a short description of the property
     */
    public String stopwordsHandlerTipText() {
        return "The stopwords handler to use "
            + "(Null means no stopwords are used).";
    }

    /**
     * Gets the tokenizer used to split string values into words.
     *
     * @return the current tokenizer
     */
    public Tokenizer getTokenizer() {
        return m_tokenizer;
    }

    /**
     * Sets the tokenizer used to split string values into words.
     *
     * <p>A null argument selects the default {@link WordTokenizer}, mirroring how
     * {@code setStemmer} and {@code setStopwordsHandler} treat null. Without this a null
     * tokenizer would only surface later as a NullPointerException in {@code getOptions()}
     * or while tokenizing.
     *
     * @param value the tokenizer to use, or null for the default tokenizer
     */
    public void setTokenizer(Tokenizer value) {
        if (value == null) {
            m_tokenizer = new WordTokenizer();
        } else {
            m_tokenizer = value;
        }
    }

    /**
     * Returns the help text for the tokenizer property.
     *
     * @return a short description of the property
     */
    public String tokenizerTipText() {
        return "The tokenizing algorithm to use on the strings.";
    }

    /**
     * Lists the command-line options understood by {@link #setOptions(String[])}.
     *
     * <p>{@code -N} is described as an argument-less flag because that is how
     * {@code setOptions} reads it and how {@code getOptions} emits it. The stopwords handler
     * option is registered under the name {@code stopwords-handler} (no leading dash), like
     * every other option name in this list.
     *
     * @return an enumeration of the available options
     */
    @Override
    public Enumeration<Option> listOptions() {
        Vector<Option> result = new Vector<Option>();
        result.addElement(new Option("\tOutput word counts rather than boolean word presence.\n", "C", 0, "-C"));
        result.addElement(new Option("\tSpecify list of string attributes to convert to words (as weka Range).\n\t(default: select all string attributes)", "R", 1, "-R <index1,index2-index4,...>"));
        result.addElement(new Option("\tInvert matching sense of column indexes.", "V", 0, "-V"));
        result.addElement(new Option("\tSpecify a prefix for the created attribute names.\n\t(default: \"\")", "P", 1, "-P <attribute name prefix>"));
        result.addElement(new Option("\tSpecify approximate number of word fields to create.\n\tSurplus words will be discarded..\n\t(default: 1000)", "W", 1, "-W <number of words to keep>"));
        result.addElement(new Option("\tSpecify the rate (e.g., every x instances) at which to periodically prune the dictionary.\n\t-W prunes after creating a full dictionary. You may not have enough memory for this approach.\n\t(default: no periodic pruning)", "prune-rate", 1, "-prune-rate <every x instances>"));
        result.addElement(new Option("\tTransform the word frequencies into log(1+fij)\n\twhere fij is the frequency of word i in jth document(instance).\n", "T", 0, "-T"));
        result.addElement(new Option("\tTransform each word frequency into:\n\tfij*log(num of Documents/num of documents containing word i)\n\t  where fij if frequency of word i in jth document(instance)", "I", 0, "-I"));
        // -N is a plain flag: setOptions() reads it with getFlag and getOptions() emits it without a value.
        result.addElement(new Option("\tNormalize word frequencies to the average document length\n\t(default: don't normalize).", "N", 0, "-N"));
        result.addElement(new Option("\tConvert all tokens to lowercase before adding to the dictionary.", "L", 0, "-L"));
        result.addElement(new Option("\tThe stopwords handler to use (default Null).", "stopwords-handler", 1, "-stopwords-handler <spec>"));
        result.addElement(new Option("\tThe stemming algorithm (classname plus parameters) to use.", "stemmer", 1, "-stemmer <spec>"));
        result.addElement(new Option("\tThe minimum term frequency (default = 1).", "M", 1, "-M <int>"));
        result.addElement(new Option("\tIf this is set, the maximum number of words and the \n\tminimum term frequency is not enforced on a per-class \n\tbasis but based on the documents in all the classes \n\t(even if a class attribute is set).", "O", 0, "-O"));
        result.addElement(new Option("\tThe tokenizing algorihtm (classname plus parameters) to use.\n\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1, "-tokenizer <spec>"));
        return result.elements();
    }

    /**
     * Builds the option array describing the current configuration.
     *
     * <p>Range, words-to-keep, prune rate, minimum term frequency and tokenizer are always
     * emitted; flags and the prefix only when set; stemmer and stopwords handler only when
     * non-null.
     *
     * @return the current settings as command-line options
     */
    @Override
    public String[] getOptions() {
        List<String> opts = new ArrayList<String>();

        opts.add("-R");
        opts.add(getSelectedRange().getRanges());
        if (getInvertSelection()) {
            opts.add("-V");
        }

        String prefix = getAttributeNamePrefix();
        if (!"".equals(prefix)) {
            opts.add("-P");
            opts.add(prefix);
        }

        opts.add("-W");
        opts.add(String.valueOf(getWordsToKeep()));
        opts.add("-prune-rate");
        opts.add(String.valueOf(getPeriodicPruning()));

        if (getOutputWordCounts()) {
            opts.add("-C");
        }
        if (getTFTransform()) {
            opts.add("-T");
        }
        if (getIDFTransform()) {
            opts.add("-I");
        }
        if (getNormalize()) {
            opts.add("-N");
        }
        if (getLowerCaseTokens()) {
            opts.add("-L");
        }

        Stemmer stemmer = getStemmer();
        if (stemmer != null) {
            opts.add("-stemmer");
            String stemmerSpec = stemmer.getClass().getName();
            if (stemmer instanceof OptionHandler) {
                String[] stemmerOpts = ((OptionHandler) ((Object) stemmer)).getOptions();
                stemmerSpec = stemmerSpec + " " + Utils.joinOptions(stemmerOpts);
            }
            opts.add(stemmerSpec.trim());
        }

        StopwordsHandler handler = getStopwordsHandler();
        if (handler != null) {
            opts.add("-stopwords-handler");
            String handlerSpec = handler.getClass().getName();
            if (handler instanceof OptionHandler) {
                String[] handlerOpts = ((OptionHandler) ((Object) handler)).getOptions();
                handlerSpec = handlerSpec + " " + Utils.joinOptions(handlerOpts);
            }
            opts.add(handlerSpec.trim());
        }

        opts.add("-M");
        opts.add(String.valueOf(getMinTermFreq()));
        if (getDoNotOperateOnPerClassBasis()) {
            opts.add("-O");
        }

        opts.add("-tokenizer");
        Tokenizer tokenizer = getTokenizer();
        String tokenizerSpec = tokenizer.getClass().getName() + " " + Utils.joinOptions(tokenizer.getOptions());
        opts.add(tokenizerSpec.trim());

        return opts.toArray(new String[opts.size()]);
    }

    /**
     * Configures this builder from a list of command-line options.
     *
     * <p>Options that are absent reset the corresponding property to its default
     * (range "first-last", empty prefix, 1000 words, prune rate -1, minimum frequency 1,
     * default stemmer/stopwords handler, {@link WordTokenizer}). The prune rate is parsed as a
     * {@code long} to match the type of the property.
     *
     * @param options the options to parse; recognised entries are consumed
     * @throws Exception if a value cannot be parsed, a specification string is invalid, a
     *         class cannot be instantiated, or unrecognised options remain
     */
    @Override
    public void setOptions(String[] options) throws Exception {
        String value = Utils.getOption('R', options);
        setSelectedRange(value.length() != 0 ? value : "first-last");

        setInvertSelection(Utils.getFlag('V', options));

        value = Utils.getOption('P', options);
        setAttributeNamePrefix(value.length() != 0 ? value : "");

        value = Utils.getOption('W', options);
        setWordsToKeep(value.length() != 0 ? Integer.parseInt(value) : 1000);

        // The property is a long, so parse it as one rather than truncating the range to int.
        value = Utils.getOption("prune-rate", options);
        setPeriodicPruning(value.length() > 0 ? Long.parseLong(value) : -1L);

        value = Utils.getOption('M', options);
        setMinTermFreq(value.length() != 0 ? Integer.parseInt(value) : 1);

        setOutputWordCounts(Utils.getFlag('C', options));
        setTFTransform(Utils.getFlag('T', options));
        setIDFTransform(Utils.getFlag('I', options));
        setDoNotOperateOnPerClassBasis(Utils.getFlag('O', options));
        setNormalize(Utils.getFlag('N', options));
        setLowerCaseTokens(Utils.getFlag('L', options));

        String stemmerString = Utils.getOption("stemmer", options);
        if (stemmerString.length() == 0) {
            // null selects the default stemmer inside setStemmer()
            setStemmer(null);
        } else {
            String[] stemmerSpec = Utils.splitOptions(stemmerString);
            if (stemmerSpec.length == 0) {
                throw new Exception("Invalid stemmer specification string");
            }
            // First element is the class name; blank it so only its options remain.
            String stemmerName = stemmerSpec[0];
            stemmerSpec[0] = "";
            setStemmer((Stemmer) Utils.forName(Stemmer.class, stemmerName, stemmerSpec));
        }

        String stopwordsHandlerString = Utils.getOption("stopwords-handler", options);
        if (stopwordsHandlerString.length() == 0) {
            // null selects the default handler inside setStopwordsHandler()
            setStopwordsHandler(null);
        } else {
            String[] stopwordsHandlerSpec = Utils.splitOptions(stopwordsHandlerString);
            if (stopwordsHandlerSpec.length == 0) {
                throw new Exception("Invalid StopwordsHandler specification string");
            }
            String stopwordsHandlerName = stopwordsHandlerSpec[0];
            stopwordsHandlerSpec[0] = "";
            setStopwordsHandler((StopwordsHandler) Utils.forName(StopwordsHandler.class, stopwordsHandlerName, stopwordsHandlerSpec));
        }

        String tokenizerString = Utils.getOption("tokenizer", options);
        if (tokenizerString.length() == 0) {
            setTokenizer(new WordTokenizer());
        } else {
            String[] tokenizerSpec = Utils.splitOptions(tokenizerString);
            if (tokenizerSpec.length == 0) {
                throw new Exception("Invalid tokenizer specification string");
            }
            String tokenizerName = tokenizerSpec[0];
            tokenizerSpec[0] = "";
            setTokenizer((Tokenizer) Utils.forName(Tokenizer.class, tokenizerName, tokenizerSpec));
        }

        Utils.checkForRemainingOptions(options);
    }

    /**
     * Prepares this builder for the given input structure.
     *
     * <p>Records whether the input has string attributes and stores its string-free
     * structure. When string attributes are present, allocates one empty dictionary per class
     * value (or a single one) and narrows the selected range to string attributes.
     *
     * @param inputFormat the structure of the incoming data
     * @throws Exception declared for callers; propagated from the called Weka methods
     */
    public void setup(Instances inputFormat) throws Exception {
        m_inputContainsStringAttributes = inputFormat.checkForStringAttributes();
        m_inputFormat = inputFormat.stringFreeStructure();

        if (m_inputContainsStringAttributes) {
            if (!m_doNotOperateOnPerClassBasis && m_inputFormat.classIndex() >= 0) {
                m_numClasses = m_inputFormat.numClasses();
            } else {
                m_numClasses = 1;
            }

            if (m_sortDictionary) {
                m_dictsPerClass = new TreeMap[m_numClasses];
            } else {
                m_dictsPerClass = new LinkedHashMap[m_numClasses];
            }
            m_classIndex = m_inputFormat.classIndex();

            for (int slot = 0; slot < m_numClasses; slot++) {
                if (m_sortDictionary) {
                    m_dictsPerClass[slot] = new TreeMap();
                } else {
                    m_dictsPerClass[slot] = new LinkedHashMap();
                }
            }

            determineSelectedRange(inputFormat);
        }
    }

    /**
     * Gets the string-free input structure recorded by setup().
     *
     * @return the input format, or null if none has been set
     */
    public Instances getInputFormat() {
        return m_inputFormat;
    }

    /**
     * Reports whether both an input format and a consolidated dictionary are available.
     *
     * @return true if neither the input format nor the consolidated dictionary is null
     */
    public boolean readyToVectorize() {
        return !(m_inputFormat == null || m_consolidatedDict == null);
    }

    /**
     * Narrows the selected range so that it contains only string attributes.
     *
     * <p>If no range is set, all string attributes are selected first. The range is then
     * bounded to the input's attribute count and rewritten to the 1-based indices of the
     * attributes that are both inside the range and of string type.
     *
     * @param inputFormat the structure whose attributes are inspected
     */
    private void determineSelectedRange(Instances inputFormat) {
        int numAtts = inputFormat.numAttributes();

        if (m_selectedRange == null) {
            // No range configured: default to every string attribute.
            StringBuilder allStrings = new StringBuilder();
            for (int j = 0; j < numAtts; j++) {
                if (inputFormat.attribute(j).isString()) {
                    allStrings.append(j + 1).append(',');
                }
            }
            m_selectedRange = new Range(allStrings.toString());
        }
        m_selectedRange.setUpper(numAtts - 1);

        // Keep only the string attributes that fall inside the requested range.
        StringBuilder selectedStrings = new StringBuilder();
        for (int j = 0; j < numAtts; j++) {
            if (m_selectedRange.isInRange(j) && inputFormat.attribute(j).isString()) {
                selectedStrings.append(j + 1).append(',');
            }
        }
        m_selectedRange.setRanges(selectedStrings.toString());
        // Re-apply the upper bound after changing the range specification.
        m_selectedRange.setUpper(numAtts - 1);
    }

    /**
     * Returns the structure of the vectorized data.
     *
     * <p>Without string attributes in the input this is the input format itself. If an
     * output format is already stored it is returned; otherwise a new header is built from
     * the non-selected input attributes followed by one numeric attribute per dictionary word
     * (prefixed with the attribute name prefix). The new header is returned, not stored.
     *
     * @return the output structure
     * @throws Exception if no input format is available or the dictionary has not been finalized
     */
    public Instances getVectorizedFormat() throws Exception {
        if (m_inputFormat == null) {
            throw new Exception("No input format available. Call setup() and make sure a dictionary has been built first.");
        }
        if (!m_inputContainsStringAttributes) {
            return m_inputFormat;
        }
        if (m_consolidatedDict == null) {
            throw new Exception("Dictionary hasn't been built or finalized yet!");
        }
        if (m_outputFormat != null) {
            return m_outputFormat;
        }

        ArrayList<Attribute> outputAtts = new ArrayList<Attribute>();
        int outputClassIndex = -1;
        int numInputAtts = m_inputFormat.numAttributes();
        for (int attIdx = 0; attIdx < numInputAtts; attIdx++) {
            if (!m_selectedRange.isInRange(attIdx)) {
                if (attIdx == m_inputFormat.classIndex()) {
                    // The class keeps its relative position among the retained attributes.
                    outputClassIndex = outputAtts.size();
                }
                outputAtts.add((Attribute) m_inputFormat.attribute(attIdx).copy());
            }
        }

        for (String word : m_consolidatedDict.keySet()) {
            outputAtts.add(new Attribute(m_Prefix + word));
        }

        Instances header = new Instances(m_inputFormat.relationName(), outputAtts, 0);
        if (outputClassIndex >= 0) {
            header.setClassIndex(outputClassIndex);
        }
        return header;
    }

    /**
     * Vectorizes a whole batch of instances.
     *
     * <p>When {@code setAvgDocLength} is true, normalization is switched off while the
     * instances are converted, the average document length (Euclidean norm over the word
     * attributes) is computed from the batch, and, if normalization was requested, the
     * instances are then normalized against that average. The normalize setting is restored
     * in a {@code finally} block so that a failure during vectorization cannot leave
     * normalization permanently disabled.
     *
     * @param batch the instances to convert
     * @param setAvgDocLength true to compute the average document length from this batch
     * @return the vectorized instances, or {@code batch} itself if the input has no string attributes
     * @throws Exception if no input format or consolidated dictionary is available, or vectorization fails
     */
    public Instances vectorizeBatch(Instances batch, boolean setAvgDocLength) throws Exception {
        if (m_inputFormat == null) {
            throw new Exception("No input format available. Call setup() and make sure a dictionary has been built first.");
        }
        if (!m_inputContainsStringAttributes) {
            return batch;
        }
        if (m_consolidatedDict == null) {
            throw new Exception("Dictionary hasn't been built or consolidated yet!");
        }

        Instances vectorized = new Instances(m_outputFormat, batch.numInstances());
        final boolean normalizeRequested = m_normalize;
        if (setAvgDocLength) {
            // Normalization needs the average length, which is only known after the whole batch is converted.
            m_normalize = false;
        }
        try {
            if (batch.numInstances() > 0) {
                // Receives the index of the first word attribute from vectorizeInstance().
                int[] offsetHolder = new int[1];
                for (int i = 0; i < batch.numInstances(); i++) {
                    vectorized.add(vectorizeInstance(batch.instance(i), offsetHolder, true));
                }

                if (setAvgDocLength) {
                    double lengthSum = 0.0;
                    for (int i = 0; i < vectorized.numInstances(); i++) {
                        Instance inst = vectorized.instance(i);
                        double squaredLength = 0.0;
                        for (int j = 0; j < inst.numValues(); j++) {
                            if (inst.index(j) >= offsetHolder[0]) {
                                squaredLength += inst.valueSparse(j) * inst.valueSparse(j);
                            }
                        }
                        lengthSum += Math.sqrt(squaredLength);
                    }
                    m_avgDocLength = lengthSum / (double) vectorized.numInstances();

                    if (normalizeRequested) {
                        for (int i = 0; i < vectorized.numInstances(); i++) {
                            normalizeInstance(vectorized.instance(i), offsetHolder[0]);
                        }
                    }
                }
            }
        } finally {
            m_normalize = normalizeRequested;
        }

        vectorized.compactify();
        return vectorized;
    }

    /**
     * Vectorizes a single instance without retaining string attribute values in memory.
     *
     * @param input the instance to convert
     * @return the vectorized instance
     * @throws Exception if vectorization is not possible
     */
    public Instance vectorizeInstance(Instance input) throws Exception {
        int[] discardedOffset = new int[1];
        return vectorizeInstance(input, discardedOffset, false);
    }

    /**
     * Vectorizes a single instance.
     *
     * @param input the instance to convert
     * @param retainStringAttValuesInMemory true to add the values of non-selected string
     *        attributes to the output header instead of overwriting its single value
     * @return the vectorized instance
     * @throws Exception if vectorization is not possible
     */
    public Instance vectorizeInstance(Instance input, boolean retainStringAttValuesInMemory) throws Exception {
        int[] discardedOffset = new int[1];
        return vectorizeInstance(input, discardedOffset, retainStringAttValuesInMemory);
    }

    /**
     * Converts one input instance into a sparse instance laid out according to the output format.
     *
     * <p>Non-selected attributes are copied first, in input order; the word attributes follow,
     * addressed as "dictionary index + number of non-selected attributes". Optional TF and IDF
     * transforms and length normalization are applied to the word values only.
     *
     * @param input the instance to convert
     * @param offsetHolder single-element array that receives the number of non-selected
     *        attributes, i.e. the index of the first word attribute
     * @param retainStringAttValuesInMemory if true, values of non-selected string attributes are
     *        added to the output attribute; otherwise the output attribute's value is overwritten
     * @return {@code input} itself when the input format has no string attributes, otherwise a
     *         new sparse instance attached to the output format
     * @throws Exception if no input format or consolidated dictionary is available, or a word
     *         attribute has no usable dictionary entry for the IDF transform
     */
    private Instance vectorizeInstance(Instance input, int[] offsetHolder, boolean retainStringAttValuesInMemory) throws Exception {
        double[] val;
        int i;
        if (!this.m_inputContainsStringAttributes) {
            return input;
        }
        if (this.m_inputFormat == null) {
            throw new Exception("No input format available. Call setup() and make sure a dictionary has been built first.");
        }
        if (this.m_consolidatedDict == null) {
            throw new Exception("Dictionary hasn't been built or consolidated yet!");
        }
        int indexOffset = 0;
        int classIndex = this.m_outputFormat.classIndex();
        // Output attribute index -> single-element value holder; the TreeMap keeps indices ascending.
        TreeMap<Integer, double[]> contained = new TreeMap<Integer, double[]>();
        // Pass 1: copy the attributes that are NOT being tokenized.
        for (i = 0; i < this.m_inputFormat.numAttributes(); ++i) {
            if (this.m_selectedRange.isInRange(i)) continue;
            if (!this.m_inputFormat.attribute(i).isString() && !this.m_inputFormat.attribute(i).isRelationValued()) {
                // Plain value: only non-zero values are stored (sparse representation).
                if (input.value(i) != 0.0) {
                    contained.put(indexOffset, new double[]{input.value(i)});
                }
            } else if (input.isMissing(i)) {
                contained.put(indexOffset, new double[]{Utils.missingValue()});
            } else if (this.m_inputFormat.attribute(i).isString()) {
                String strVal = input.stringValue(i);
                if (retainStringAttValuesInMemory) {
                    double strIndex = this.m_outputFormat.attribute(indexOffset).addStringValue(strVal);
                    contained.put(indexOffset, new double[]{strIndex});
                } else {
                    this.m_outputFormat.attribute(indexOffset).setStringValue(strVal);
                    contained.put(indexOffset, new double[]{0.0});
                }
            } else {
                // Relation-valued attribute.
                // NOTE(review): the attribute's own header is registered first when it holds no
                // relations yet — presumably so real values never get index 0; confirm.
                if (this.m_outputFormat.attribute(indexOffset).numValues() == 0) {
                    Instances relationalHeader = this.m_outputFormat.attribute(indexOffset).relation();
                    this.m_outputFormat.attribute(indexOffset).addRelation(relationalHeader);
                }
                int newIndex = this.m_outputFormat.attribute(indexOffset).addRelation(input.relationalValue(i));
                contained.put(indexOffset, new double[]{newIndex});
            }
            ++indexOffset;
        }
        // Report where the word attributes start.
        offsetHolder[0] = indexOffset;
        // Pass 2: tokenize the selected attributes and look each word up in the dictionary.
        for (i = 0; i < this.m_inputFormat.numAttributes(); ++i) {
            if (!this.m_selectedRange.isInRange(i) || input.isMissing(i)) continue;
            this.m_tokenizer.tokenize(input.stringValue(i));
            while (this.m_tokenizer.hasMoreElements()) {
                int[] idxAndDocCount;
                String word = this.m_tokenizer.nextElement();
                if (this.m_lowerCaseTokens) {
                    word = word.toLowerCase();
                }
                // Words that are not in the dictionary are ignored.
                if ((idxAndDocCount = this.m_consolidatedDict.get(word = this.m_stemmer.stem(word))) == null) continue;
                if (this.m_outputCounts) {
                    // Count mode: increment an existing holder or start at 1.
                    double[] inputCount = (double[])contained.get(idxAndDocCount[0] + indexOffset);
                    if (inputCount != null) {
                        inputCount[0] = inputCount[0] + 1.0;
                        continue;
                    }
                    contained.put(idxAndDocCount[0] + indexOffset, new double[]{1.0});
                    continue;
                }
                // Presence mode: the value is always 1.
                contained.put(idxAndDocCount[0] + indexOffset, new double[]{1.0});
            }
        }
        if (this.m_TFTransform) {
            // TF transform on word attributes only: value -> log(value + 1).
            for (Map.Entry e : contained.entrySet()) {
                int index = (Integer)e.getKey();
                if (index < indexOffset) continue;
                val = (double[])e.getValue();
                val[0] = Math.log(val[0] + 1.0);
            }
        }
        if (this.m_IDFTransform) {
            // IDF transform on word attributes only: value -> value * log(m_count / docCount).
            for (Map.Entry e : contained.entrySet()) {
                int index = (Integer)e.getKey();
                if (index < indexOffset) continue;
                val = (double[])e.getValue();
                // Recover the dictionary key by stripping the prefix from the attribute name.
                String word = this.m_outputFormat.attribute(index).name();
                int[] idxAndDocCount = this.m_consolidatedDict.get(word = word.substring(this.m_Prefix.length()));
                if (idxAndDocCount == null) {
                    throw new Exception("This should never occur");
                }
                if (idxAndDocCount.length != 2) {
                    throw new Exception("Can't compute IDF transform as document counts are not available");
                }
                val[0] = val[0] * Math.log((double)this.m_count / (double)idxAndDocCount[1]);
            }
        }
        // Flatten the sorted map into the parallel arrays expected by SparseInstance.
        double[] values = new double[contained.size()];
        int[] indices = new int[contained.size()];
        int i2 = 0;
        for (Map.Entry e : contained.entrySet()) {
            values[i2] = ((double[])e.getValue())[0];
            indices[i2++] = (Integer)e.getKey();
        }
        SparseInstance inst = new SparseInstance(input.weight(), values, indices, this.m_outputFormat.numAttributes());
        inst.setDataset(this.m_outputFormat);
        if (this.m_normalize) {
            this.normalizeInstance(inst, indexOffset);
        }
        return inst;
    }

    /**
     * Scales the word values of a vectorized instance so that its Euclidean length equals the
     * average document length.
     *
     * <p>Only stored values at attribute index {@code offset} or above, excluding the output
     * class attribute, take part in the length computation and the scaling.
     *
     * @param inst the vectorized instance to modify in place
     * @param offset index of the first word attribute
     * @throws Exception if the average document length is not positive
     */
    private void normalizeInstance(Instance inst, int offset) throws Exception {
        int i;
        if (this.m_avgDocLength <= 0.0) {
            throw new Exception("Average document length is not set!");
        }
        // Sum of squares of the word values.
        double docLength = 0.0;
        for (i = 0; i < inst.numValues(); ++i) {
            if (inst.index(i) < offset || inst.index(i) == this.m_outputFormat.classIndex()) continue;
            docLength += inst.valueSparse(i) * inst.valueSparse(i);
        }
        docLength = Math.sqrt(docLength);
        for (i = 0; i < inst.numValues(); ++i) {
            if (inst.index(i) < offset || inst.index(i) == this.m_outputFormat.classIndex()) continue;
            double val = inst.valueSparse(i) * this.m_avgDocLength / docLength;
            inst.setValueSparse(i, val);
            if (val != 0.0) continue;
            System.err.println("setting value " + inst.index(i) + " to zero.");
            // NOTE(review): presumably setting a sparse value to zero removes the entry and shifts
            // later entries down, so the same position is visited again — confirm against SparseInstance.
            --i;
        }
    }

    /**
     * Folds one training instance into the dictionaries.
     *
     * <p>The selected string attributes are tokenized, optionally lower-cased, stemmed and
     * filtered through the stopwords handler. The per-document term counts are then merged
     * into the dictionary for the instance's class (or the single dictionary), the document
     * length is accumulated when normalizing, the instance counter is incremented and periodic
     * pruning is given a chance to run. Does nothing if the input has no string attributes,
     * and skips instances with a missing class when operating per class.
     *
     * @param inst the instance to process
     */
    public void processInstance(Instance inst) {
        if (!m_inputContainsStringAttributes) {
            return;
        }
        if (m_inputVector == null) {
            m_inputVector = new LinkedHashMap<String, int[]>();
        } else {
            m_inputVector.clear();
        }

        // Choose the dictionary: per class value, or the single one at index 0.
        int dIndex = 0;
        if (!m_doNotOperateOnPerClassBasis && m_classIndex >= 0) {
            if (inst.classIsMissing()) {
                return;
            }
            dIndex = (int) inst.classValue();
        }

        // Collect {term count, 1} per distinct word of this document.
        for (int j = 0; j < inst.numAttributes(); j++) {
            if (!m_selectedRange.isInRange(j) || inst.isMissing(j)) {
                continue;
            }
            m_tokenizer.tokenize(inst.stringValue(j));
            while (m_tokenizer.hasMoreElements()) {
                String word = m_tokenizer.nextElement();
                if (m_lowerCaseTokens) {
                    word = word.toLowerCase();
                }
                word = m_stemmer.stem(word);
                if (m_stopwordsHandler.isStopword(word)) {
                    continue;
                }
                int[] counts = m_inputVector.get(word);
                if (counts == null) {
                    m_inputVector.put(word, new int[]{1, 1});
                } else {
                    counts[0]++;
                }
            }
        }

        // Merge the document's counts into the class dictionary.
        double docLength = 0.0;
        Map<String, int[]> dictionary = m_dictsPerClass[dIndex];
        for (Map.Entry<String, int[]> e : m_inputVector.entrySet()) {
            int[] docCounts = e.getValue();
            int[] dictCounts = dictionary.get(e.getKey());
            if (dictCounts == null) {
                dictCounts = new int[2];
                dictionary.put(e.getKey(), dictCounts);
            }
            dictCounts[0] += docCounts[0];
            dictCounts[1] += docCounts[1];
            // Square in double arithmetic: an int product overflows once a term count exceeds 46340.
            double termCount = docCounts[0];
            docLength += termCount * termCount;
        }

        if (m_normalize) {
            m_docLengthSum += Math.sqrt(docLength);
        }
        ++m_count;
        pruneDictionary();
    }

    /**
     * Removes low-frequency terms from every per-class dictionary.
     *
     * <p>Pruning only takes place when a positive periodic prune rate is set
     * and the current instance count is an exact multiple of that rate. Terms
     * whose first counter is below {@code m_minFrequency} are dropped.
     */
    protected void pruneDictionary() {
        boolean pruningEnabled = this.m_periodicPruneRate > 0L;
        // Short-circuit keeps the modulo from ever seeing a zero prune rate.
        if (!pruningEnabled || (long)this.m_count % this.m_periodicPruneRate != 0L) {
            return;
        }
        for (int c = 0; c < this.m_dictsPerClass.length; ++c) {
            Iterator<Map.Entry<String, int[]>> it = this.m_dictsPerClass[c].entrySet().iterator();
            while (it.hasNext()) {
                int[] counts = it.next().getValue();
                if (counts[0] < this.m_minFrequency) {
                    it.remove();
                }
            }
        }
    }

    /**
     * Discards all dictionary state: the per-class and consolidated
     * dictionaries, the input and output formats, and the accumulated
     * instance count and document-length statistics.
     */
    public void reset() {
        // Dictionaries
        this.m_dictsPerClass = null;
        this.m_consolidatedDict = null;
        // Formats
        this.m_inputFormat = null;
        this.m_outputFormat = null;
        // Running statistics
        this.m_count = 0;
        this.m_docLengthSum = 0.0;
        this.m_avgDocLength = 0.0;
    }

    /**
     * Returns the per-class dictionaries.
     *
     * @param minFrequencyPrune if true, {@link #pruneDictionary()} is invoked
     *          before the dictionaries are returned
     * @return the per-class dictionary array held by this builder (not a copy)
     * @throws WekaException if no per-class dictionaries are present
     */
    public Map<String, int[]>[] getDictionaries(boolean minFrequencyPrune) throws WekaException {
        boolean nothingBuilt = this.m_dictsPerClass == null;
        if (nothingBuilt) {
            throw new WekaException("No dictionaries have been built yet!");
        }
        if (minFrequencyPrune) {
            this.pruneDictionary();
        }
        return this.m_dictsPerClass;
    }

    /**
     * Adds the per-class term counts, the instance count and the
     * document-length sum of another builder to this one.
     *
     * @param toAgg the builder whose (unpruned) dictionaries are merged in
     * @return this builder
     * @throws Exception if the other builder has no dictionaries or holds a
     *           different number of dictionaries than this one
     */
    @Override
    public DictionaryBuilder aggregate(DictionaryBuilder toAgg) throws Exception {
        Map<String, int[]>[] incoming = toAgg.getDictionaries(false);
        if (incoming.length != this.m_dictsPerClass.length) {
            throw new Exception("Number of dictionaries from the builder to be aggregated does not match our number of dictionaries");
        }

        int classIdx = 0;
        for (Map<String, int[]> theirs : incoming) {
            Map<String, int[]> ours = this.m_dictsPerClass[classIdx++];
            for (Map.Entry<String, int[]> entry : theirs.entrySet()) {
                String term = entry.getKey();
                int[] theirCounts = entry.getValue();
                int[] ourCounts = ours.get(term);
                if (ourCounts == null) {
                    // Term not seen by this builder yet: start from zero.
                    ourCounts = new int[2];
                    ours.put(term, ourCounts);
                }
                ourCounts[0] = ourCounts[0] + theirCounts[0];
                ourCounts[1] = ourCounts[1] + theirCounts[1];
            }
        }

        this.m_count += toAgg.m_count;
        this.m_docLengthSum += toAgg.m_docLengthSum;
        return this;
    }

    /**
     * Completes an aggregation by finalizing the dictionary; the map returned
     * by {@link #finalizeDictionary()} is discarded.
     *
     * @throws Exception if finalizing the dictionary fails
     */
    @Override
    public void finalizeAggregation() throws Exception {
        finalizeDictionary();
    }

    /**
     * Merges the per-class dictionaries into a single consolidated dictionary.
     *
     * <p>For each per-class dictionary a pruning threshold is computed: the
     * minimum frequency, or, when the dictionary holds at least
     * {@code m_wordsToKeep} terms, the larger of the minimum frequency and the
     * {@code m_wordsToKeep}-th highest term count. Terms at or above their
     * dictionary's threshold are kept. Each kept term maps to
     * {@code {index, count}}, where the index follows first-encounter order
     * and the count is the sum of the per-class second counters.
     *
     * <p>Afterwards the per-class dictionaries are released, the average
     * document length is computed when normalizing, and the output format is
     * rebuilt. Repeated calls return the already consolidated dictionary.
     *
     * @return the consolidated dictionary, or null if the input contains no
     *         string attributes
     * @throws Exception if no dictionary has been built, or if building the
     *           output format fails
     */
    public Map<String, int[]> finalizeDictionary() throws Exception {
        if (!this.m_inputContainsStringAttributes) {
            return null;
        }
        if (this.m_consolidatedDict != null) {
            return this.m_consolidatedDict;
        }
        if (this.m_dictsPerClass == null) {
            System.err.println(this.hashCode());
            throw new WekaException("No dictionary built yet!");
        }

        // Step 1: one pruning threshold per class dictionary.
        int numDicts = this.m_dictsPerClass.length;
        int[] thresholds = new int[numDicts];
        for (int c = 0; c < numDicts; ++c) {
            Map<String, int[]> dict = this.m_dictsPerClass[c];
            int[] freqs = new int[dict.size()];
            int pos = 0;
            for (int[] counts : dict.values()) {
                freqs[pos++] = counts[0];
            }
            if (freqs.length >= this.m_wordsToKeep) {
                Arrays.sort(freqs);
                int cutoff = freqs[freqs.length - this.m_wordsToKeep];
                thresholds[c] = Math.max(this.m_minFrequency, cutoff);
            } else {
                thresholds[c] = this.m_minFrequency;
            }
        }

        // Step 2: merge the surviving terms, assigning indices on first sight.
        LinkedHashMap<String, int[]> merged = new LinkedHashMap<String, int[]>();
        int nextIndex = 0;
        for (int c = 0; c < numDicts; ++c) {
            for (Map.Entry<String, int[]> entry : this.m_dictsPerClass[c].entrySet()) {
                int[] perClass = entry.getValue();
                if (perClass[0] >= thresholds[c]) {
                    String term = entry.getKey();
                    int[] slot = merged.get(term);
                    if (slot == null) {
                        slot = new int[2];
                        slot[0] = nextIndex++;
                        merged.put(term, slot);
                    }
                    slot[1] = slot[1] + perClass[1];
                }
            }
        }

        // Step 3: publish the result and derive dependent state.
        this.m_consolidatedDict = merged;
        this.m_dictsPerClass = null;
        if (this.m_normalize) {
            this.m_avgDocLength = this.m_docLengthSum / (double)this.m_count;
        }
        this.m_outputFormat = this.getVectorizedFormat();
        return this.m_consolidatedDict;
    }

    /**
     * Loads a dictionary from the file with the given name.
     *
     * @param filename path of the dictionary file
     * @param plainText true to read the file as text, false to read it as a
     *          serialized object stream
     * @throws IOException if loading fails
     */
    public void loadDictionary(String filename, boolean plainText) throws IOException {
        File dictionaryFile = new File(filename);
        this.loadDictionary(dictionaryFile, plainText);
    }

    /**
     * Loads a dictionary from a file.
     *
     * <p>The file is opened with a {@link FileReader} (platform default
     * charset) when {@code plainText} is true, otherwise with a
     * {@link FileInputStream}. The file handle is owned by this method and is
     * always closed, including when the delegate fails before it has taken
     * over the stream.
     *
     * @param toLoad the dictionary file
     * @param plainText true to read the file as text, false to read it as a
     *          serialized object stream
     * @throws IOException if the file cannot be opened or loading fails
     */
    public void loadDictionary(File toLoad, boolean plainText) throws IOException {
        if (plainText) {
            try (Reader reader = new FileReader(toLoad)) {
                this.loadDictionary(reader);
            }
        } else {
            // Closing here matters when the object-stream header is invalid:
            // the wrapper is then never created, so nothing else would close
            // the file.
            try (InputStream stream = new FileInputStream(toLoad)) {
                this.loadDictionary(stream);
            }
        }
    }

    /**
     * Loads a plain-text dictionary, replacing any consolidated dictionary
     * currently held, and rebuilds the output format.
     *
     * <p>Expected layout:
     * <ul>
     * <li>an optional first line {@code @@@<average document length>@@@}; an
     * unparsable value is reported on {@code System.err} and otherwise
     * ignored;</li>
     * <li>one term per line. If the text after the last comma of the first
     * term line parses as an integer, every line is read as
     * {@code term,count}; otherwise each whole line is taken as a term and no
     * counts are stored.</li>
     * </ul>
     * Terms are assigned consecutive indices in file order. The supplied
     * reader is closed before this method returns.
     *
     * @param reader source of the dictionary text
     * @throws IOException if the input contains no term lines, a line lacks a
     *           valid count when counts are expected, reading fails, or the
     *           output format cannot be built
     */
    public void loadDictionary(Reader reader) throws IOException {
        this.m_consolidatedDict = new LinkedHashMap<String, int[]>();
        try (BufferedReader br = new BufferedReader(reader)) {
            String line = br.readLine();
            if (line == null) {
                throw new IOException("Empty dictionary file!");
            }

            // Optional header carrying the average document length.
            if (line.startsWith("@@@") && line.endsWith("@@@")) {
                String avgS = line.replace("@@@", "");
                try {
                    this.m_avgDocLength = Double.parseDouble(avgS);
                } catch (NumberFormatException ex) {
                    System.err.println("Unable to parse average document length '" + avgS + "'");
                }
                line = br.readLine();
                if (line == null) {
                    throw new IOException("Empty dictionary file!");
                }
            }

            // The first term line decides whether the file carries counts.
            boolean hasDocCounts = false;
            int probeComma = line.lastIndexOf(",");
            if (probeComma > 0) {
                try {
                    Integer.parseInt(line.substring(probeComma + 1).trim());
                    hasDocCounts = true;
                } catch (NumberFormatException ignored) {
                    // No numeric suffix: treat the file as a plain term list.
                }
            }

            // Every term line, including the first, goes through the same
            // path, so a count-less first term is kept rather than dropped.
            int index = 0;
            do {
                int[] holder = new int[hasDocCounts ? 2 : 1];
                holder[0] = index++;
                String term = line;
                if (hasDocCounts) {
                    int comma = line.lastIndexOf(",");
                    if (comma < 0) {
                        throw new IOException("Missing document count in dictionary line '" + line + "'");
                    }
                    try {
                        holder[1] = Integer.parseInt(line.substring(comma + 1).trim());
                    } catch (NumberFormatException e) {
                        throw new IOException(e);
                    }
                    term = line.substring(0, comma);
                }
                this.m_consolidatedDict.put(term, holder);
            } while ((line = br.readLine()) != null);
        }

        try {
            this.m_outputFormat = this.getVectorizedFormat();
        } catch (Exception ex) {
            throw new IOException(ex);
        }
    }

    /**
     * Loads a dictionary from a Java-serialized stream holding a list whose
     * first element is the average document length ({@code Double}) and whose
     * second element is the consolidated dictionary map.
     *
     * <p>The supplied stream is closed before this method returns, including
     * when the object-stream header cannot be read.
     *
     * <p>NOTE(review): this uses Java native deserialization; only feed it
     * data from a trusted source.
     *
     * @param is the stream to read from
     * @throws IOException if reading fails or a class in the stream cannot be
     *           resolved
     */
    public void loadDictionary(InputStream is) throws IOException {
        // Declaring the raw stream as its own resource guarantees it is closed
        // even if the ObjectInputStream constructor throws.
        try (InputStream in = is;
             ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(in))) {
            List<?> holder = (List<?>) ois.readObject();
            Double avgDocLength = (Double) holder.get(0);
            // Element types cannot be verified at runtime because of erasure.
            @SuppressWarnings("unchecked")
            Map<String, int[]> dictionary = (Map<String, int[]>) holder.get(1);
            this.m_avgDocLength = avgDocLength;
            this.m_consolidatedDict = dictionary;
        } catch (ClassNotFoundException ex) {
            // readObject() declares this checked exception; surface it through
            // the method's IOException contract.
            throw new IOException(ex);
        }
    }

    /**
     * Saves the dictionary to the file with the given name.
     *
     * @param filename path of the file to write
     * @param plainText true to write text, false to write a serialized object
     *          stream
     * @throws IOException if saving fails
     */
    public void saveDictionary(String filename, boolean plainText) throws IOException {
        File target = new File(filename);
        this.saveDictionary(target, plainText);
    }

    /**
     * Saves the dictionary to a file.
     *
     * <p>The preconditions are checked before the file is opened, so an
     * existing file is not truncated when there is nothing to save. The file
     * handle is owned by this method and is always closed.
     *
     * @param toSave the file to write
     * @param plainText true to write text with a {@link FileWriter} (platform
     *          default charset), false to write a serialized object stream
     * @throws IOException if the input had no string attributes, no
     *           consolidated dictionary is available, or writing fails
     */
    public void saveDictionary(File toSave, boolean plainText) throws IOException {
        // Opening a FileWriter/FileOutputStream truncates the target, so fail
        // first with the same errors the stream-based overloads raise.
        if (!this.m_inputContainsStringAttributes) {
            throw new IOException("Input did not contain any string attributes!");
        }
        if (this.m_consolidatedDict == null) {
            throw new IOException("No dictionary to save!");
        }
        if (plainText) {
            try (Writer writer = new FileWriter(toSave)) {
                this.saveDictionary(writer);
            }
        } else {
            try (OutputStream stream = new FileOutputStream(toSave)) {
                this.saveDictionary(stream);
            }
        }
    }

    /**
     * Writes the consolidated dictionary as text.
     *
     * <p>If the average document length is positive, a first line
     * {@code @@@<average document length>@@@} is written. Each term then
     * follows on its own line as {@code term,count}; entries that carry no
     * count are written as {@code term,} with an empty count field. The
     * writer is flushed and closed once the precondition checks have passed,
     * whether or not writing succeeds.
     *
     * @param writer destination for the dictionary text
     * @throws IOException if the input had no string attributes, no
     *           consolidated dictionary is available, or writing fails
     */
    public void saveDictionary(Writer writer) throws IOException {
        if (!this.m_inputContainsStringAttributes) {
            throw new IOException("Input did not contain any string attributes!");
        }
        if (this.m_consolidatedDict == null) {
            throw new IOException("No dictionary to save!");
        }
        // try-with-resources closes the writer even if flushing fails, and
        // keeps a write failure as the primary exception.
        try (BufferedWriter bw = new BufferedWriter(writer)) {
            if (this.m_avgDocLength > 0.0) {
                bw.write("@@@" + this.m_avgDocLength + "@@@\n");
            }
            for (Map.Entry<String, int[]> e : this.m_consolidatedDict.entrySet()) {
                int[] v = e.getValue();
                String count = v.length > 1 ? Integer.toString(v[1]) : "";
                bw.write(e.getKey() + "," + count + "\n");
            }
        }
    }

    /**
     * Writes the dictionary as a Java-serialized list whose first element is
     * the average document length and whose second element is the
     * consolidated dictionary map.
     *
     * <p>The stream is flushed and closed once the precondition checks have
     * passed, including when the object-stream header cannot be written.
     *
     * @param os destination stream
     * @throws IOException if the input had no string attributes, no
     *           consolidated dictionary is available, or writing fails
     */
    public void saveDictionary(OutputStream os) throws IOException {
        if (!this.m_inputContainsStringAttributes) {
            throw new IOException("Input did not contain any string attributes!");
        }
        if (this.m_consolidatedDict == null) {
            throw new IOException("No dictionary to save!");
        }
        ArrayList<Object> holder = new ArrayList<Object>();
        holder.add(this.m_avgDocLength);
        holder.add(this.m_consolidatedDict);
        // The raw stream is its own resource so it is closed even if the
        // ObjectOutputStream constructor or its closing flush throws.
        try (OutputStream out = os;
             ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(out))) {
            oos.writeObject(holder);
        }
    }
}

