package org.carrot2.text.preprocessing;

import com.carrotsearch.hppc.BitSet;
import com.carrotsearch.hppc.ByteArrayList;
import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.IntStack;
import com.carrotsearch.hppc.ShortArrayList;
import com.carrotsearch.hppc.sorting.IndirectSort;
import java.util.ArrayList;
import java.util.Arrays;
import org.carrot2.core.attribute.Processing;
import org.carrot2.shaded.guava.common.collect.Lists;
import org.carrot2.text.util.CharArrayComparators;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.DefaultGroups;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Level;
import org.carrot2.util.attribute.constraint.IntRange;

/**
 * Builds the {@code allWords} tables of a {@link PreprocessingContext} from its
 * {@code allTokens} tables.
 *
 * <p>Tokens are visited in the order produced by
 * {@link CharArrayComparators#NORMALIZING_CHAR_ARRAY_COMPARATOR}. Consecutive tokens whose
 * images are equal under {@link CharArrayComparators#CASE_INSENSITIVE_CHAR_ARRAY_COMPARATOR}
 * form one word; the word's image is taken from its most frequent exact-case variant.
 */
@Bindable(prefix = "CaseNormalizer")
public final class CaseNormalizer {

    /**
     * Word document frequency threshold. A word is emitted only when the document stack
     * collected for it holds at least this many entries and half the length of its sparse
     * encoding is at least this value (presumably the number of distinct documents the word
     * occurs in; confirm against {@code SparseArray.toSparseEncoding}).
     */
    @Level(AttributeLevel.ADVANCED)
    @Input
    @Attribute
    @Group(DefaultGroups.PREPROCESSING)
    @Processing
    @IntRange(min = 1, max = 100)
    @Label("Word document frequency threshold")
    public int dfThreshold = 1;

    /**
     * Performs case normalization and writes the results back to the context.
     *
     * <p>Reads {@code allTokens.image}, {@code allTokens.type}, {@code allTokens.documentIndex},
     * {@code allTokens.fieldIndex} and {@code allFields.name}. Writes
     * {@code allTokens.wordIndex} (index of the emitted word for each token, {@code -1} when
     * the token belongs to no emitted word) and {@code allWords.image}, {@code allWords.tf},
     * {@code allWords.tfByDocument}, {@code allWords.fieldIndices} and {@code allWords.type}.
     *
     * <p>NOTE(review): the scan stops one element before the end of the sorted order and at
     * the first {@code null} image, so the last token in sort order never starts a word of
     * its own. This presumably relies on the token stream ending with a terminator token
     * whose image is {@code null} and sorts last; confirm against the tokenizer.
     *
     * @param preprocessingContext context whose token tables are read and whose word tables
     *     are replaced
     */
    public void normalize(PreprocessingContext preprocessingContext) {
        final char[][] tokenImages = preprocessingContext.allTokens.image;
        final short[] tokenTypes = preprocessingContext.allTokens.type;
        final int[] tokenDocuments = preprocessingContext.allTokens.documentIndex;
        final byte[] tokenFields = preprocessingContext.allTokens.fieldIndex;
        final int tokenCount = tokenImages.length;

        // Nothing to normalize: publish empty tables instead of failing on order[0] below.
        if (tokenCount == 0) {
            preprocessingContext.allTokens.wordIndex = new int[0];
            preprocessingContext.allWords.image = new char[0][];
            preprocessingContext.allWords.tf = new int[0];
            preprocessingContext.allWords.tfByDocument = new int[0][];
            preprocessingContext.allWords.fieldIndices = new byte[0];
            preprocessingContext.allWords.type = new short[0];
            return;
        }

        // order[k] is the index (into the token arrays) of the k-th token in sorted order.
        final int[] order = IndirectSort.mergesort(
                tokenImages, 0, tokenCount, CharArrayComparators.NORMALIZING_CHAR_ARRAY_COMPARATOR);

        // Per-word output columns; position n in each of them describes the n-th emitted word.
        final ArrayList<char[]> wordImages = Lists.newArrayList();
        final IntArrayList wordTf = new IntArrayList();
        final ArrayList<int[]> wordTfByDocument = Lists.newArrayList();
        final ByteArrayList wordFieldIndices = new ByteArrayList();
        final ShortArrayList wordTypes = new ShortArrayList();

        final int[] wordIndexes = new int[tokenCount];
        Arrays.fill(wordIndexes, -1);

        int variantTf = 1;                 // occurrences of the current exact-case variant
        int maxVariantTf = 1;              // highest variant count seen for the current word
        int maxVariantToken = order[0];    // token index of the variant holding that maximum
        int totalTf = 1;                   // occurrences of the current word, all variants
        int wordStart = 0;                 // position in 'order' where the current word starts

        // Fields and documents collected for the current word.
        final BitSet wordFields = new BitSet(preprocessingContext.allFields.name.length);
        final IntStack wordDocuments = new IntStack();
        if (tokenDocuments[order[0]] >= 0) {
            wordDocuments.push(tokenDocuments[order[0]]);
        }

        for (int i = 0; i < order.length - 1; i++) {
            final char[] image = tokenImages[order[i]];
            final char[] nextImage = tokenImages[order[i + 1]];
            final short type = tokenTypes[order[i]];
            final int nextDocument = tokenDocuments[order[i + 1]];

            // A null image ends the scan.
            if (image == null) {
                break;
            }

            // Tokens of non-indexed types never contribute to a word; restart at the next token.
            if (isNotIndexed(type)) {
                wordStart = i + 1;
                maxVariantToken = order[i + 1];
                resetForNewTokenImage(tokenDocuments, order, wordFields, wordDocuments, i);
                continue;
            }

            wordFields.set(tokenFields[order[i]]);

            // Next token has exactly the same image: same variant, just count it.
            if (CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR.compare(image, nextImage) == 0) {
                variantTf++;
                totalTf++;
                wordDocuments.push(nextDocument);
                continue;
            }

            // The exact-case variant ends here. Remember it if it is the most frequent so far,
            // and always restart the per-variant counter. Resetting it only when a new maximum
            // was found would let the next variant inherit this variant's count and win the
            // comparison with an inflated value.
            if (maxVariantTf < variantTf) {
                maxVariantTf = variantTf;
                maxVariantToken = order[i];
            }
            variantTf = 1;

            // Next token differs only by case: still the same word.
            if (CharArrayComparators.CASE_INSENSITIVE_CHAR_ARRAY_COMPARATOR.compare(image, nextImage) == 0) {
                totalTf++;
                wordDocuments.push(nextDocument);
                continue;
            }

            // The word ends here. The stack size is a cheap pre-check performed before the
            // stack is converted to its sparse encoding.
            if (wordDocuments.size() >= this.dfThreshold) {
                final int[] sparseEncoding = SparseArray.toSparseEncoding(wordDocuments);
                if ((sparseEncoding.length >> 1) >= this.dfThreshold) {
                    wordTfByDocument.add(sparseEncoding);
                    wordImages.add(tokenImages[maxVariantToken]);
                    wordTypes.add(tokenTypes[maxVariantToken]);
                    wordTf.add(totalTf);
                    // Only the low 8 bits of the first bit-set word fit into the byte.
                    wordFieldIndices.add((byte) wordFields.bits[0]);

                    // Point every token of this word at the word just added.
                    final int wordIndex = wordImages.size() - 1;
                    for (int j = wordStart; j <= i; j++) {
                        wordIndexes[order[j]] = wordIndex;
                    }
                }
            }

            // Start collecting the next word.
            totalTf = 1;
            variantTf = 1;
            maxVariantTf = 1;
            maxVariantToken = order[i + 1];
            wordStart = i + 1;
            resetForNewTokenImage(tokenDocuments, order, wordFields, wordDocuments, i);
        }

        preprocessingContext.allTokens.wordIndex = wordIndexes;
        // The array arguments must be arrays of arrays so that toArray yields char[][] / int[][].
        preprocessingContext.allWords.image = wordImages.toArray(new char[wordImages.size()][]);
        preprocessingContext.allWords.tf = wordTf.toArray();
        preprocessingContext.allWords.tfByDocument = wordTfByDocument.toArray(new int[wordTfByDocument.size()][]);
        preprocessingContext.allWords.fieldIndices = wordFieldIndices.toArray();
        preprocessingContext.allWords.type = wordTypes.toArray();
    }

    /**
     * Clears the per-word field set and document stack, then seeds the stack with the
     * document index of the token at sorted position {@code i + 1} unless that index is
     * negative.
     *
     * @param iArr document index of every token
     * @param iArr2 sorted order of token indices
     * @param bitSet field set to clear
     * @param intStack document stack to clear and re-seed
     * @param i sorted position of the token that ends the current run
     */
    private void resetForNewTokenImage(int[] iArr, int[] iArr2, BitSet bitSet, IntStack intStack, int i) {
        bitSet.clear();
        intStack.clear();
        final int nextDocument = iArr[iArr2[i + 1]];
        if (nextDocument >= 0) {
            intStack.push(nextDocument);
        }
    }

    /**
     * Tells whether tokens of the given type are excluded from the word tables.
     *
     * <p>NOTE(review): 3 and 6 are presumably tokenizer token-type codes and 256 a token
     * flag bit; confirm against the tokenizer's constants.
     *
     * @param i token type
     * @return {@code true} if the type equals 3 or 6, or has bit 256 set
     */
    private boolean isNotIndexed(int i) {
        return i == 3 || i == 6 || (i & 256) != 0;
    }
}
