/*
 * Decompiled with CFR 0.152.
 */
package org.springframework.ai.transformer.splitter;

import com.knuddels.jtokkit.Encodings;
import com.knuddels.jtokkit.api.Encoding;
import com.knuddels.jtokkit.api.EncodingRegistry;
import com.knuddels.jtokkit.api.EncodingType;
import com.knuddels.jtokkit.api.IntArrayList;
import java.util.ArrayList;
import java.util.List;
import org.springframework.ai.transformer.splitter.TextSplitter;
import org.springframework.util.Assert;

/**
 * A {@link TextSplitter} that cuts text into chunks measured in tokens of the
 * {@code CL100K_BASE} encoding.
 *
 * <p>The text is encoded once, then consumed front to back: up to {@code chunkSize}
 * tokens are decoded, the decoded text is optionally cut back to its last sentence
 * boundary ({@code '.'}, {@code '?'}, {@code '!'} or {@code '\n'}), and the result is
 * emitted if it is long enough. Use {@link #builder()} to configure an instance.
 */
public class TokenTextSplitter extends TextSplitter {

    private static final int DEFAULT_CHUNK_SIZE = 800;
    private static final int MIN_CHUNK_SIZE_CHARS = 350;
    private static final int MIN_CHUNK_LENGTH_TO_EMBED = 5;
    private static final int MAX_NUM_CHUNKS = 10000;
    private static final boolean KEEP_SEPARATOR = true;

    private final EncodingRegistry registry = Encodings.newLazyEncodingRegistry();
    private final Encoding encoding = this.registry.getEncoding(EncodingType.CL100K_BASE);
    private final int chunkSize;
    private final int minChunkSizeChars;
    private final int minChunkLengthToEmbed;
    private final int maxNumChunks;
    private final boolean keepSeparator;

    /**
     * Creates a splitter with the default settings (800 tokens per chunk, separators kept).
     */
    public TokenTextSplitter() {
        this(DEFAULT_CHUNK_SIZE, MIN_CHUNK_SIZE_CHARS, MIN_CHUNK_LENGTH_TO_EMBED, MAX_NUM_CHUNKS, KEEP_SEPARATOR);
    }

    /**
     * Creates a splitter with the default sizes and the given separator handling.
     *
     * @param keepSeparator when {@code false}, occurrences of {@link System#lineSeparator()}
     *                      in emitted chunks are replaced with a single space
     */
    public TokenTextSplitter(boolean keepSeparator) {
        this(DEFAULT_CHUNK_SIZE, MIN_CHUNK_SIZE_CHARS, MIN_CHUNK_LENGTH_TO_EMBED, MAX_NUM_CHUNKS, keepSeparator);
    }

    /**
     * Creates a fully configured splitter.
     *
     * @param chunkSize             maximum number of tokens decoded per chunk; must be positive
     * @param minChunkSizeChars     a chunk is cut back to its last sentence boundary only when
     *                              that boundary's character index is greater than this value
     * @param minChunkLengthToEmbed chunks whose trimmed length is not greater than this value
     *                              are dropped
     * @param maxNumChunks          maximum number of chunk-producing iterations; tokens left
     *                              over afterwards are emitted as one final chunk
     * @param keepSeparator         when {@code false}, occurrences of
     *                              {@link System#lineSeparator()} are replaced with a space
     * @throws IllegalArgumentException if {@code chunkSize} is not positive
     */
    public TokenTextSplitter(int chunkSize, int minChunkSizeChars, int minChunkLengthToEmbed, int maxNumChunks, boolean keepSeparator) {
        // A zero chunk size would make doSplit spin forever; fail at construction instead.
        Assert.isTrue(chunkSize > 0, "chunkSize must be greater than 0");
        this.chunkSize = chunkSize;
        this.minChunkSizeChars = minChunkSizeChars;
        this.minChunkLengthToEmbed = minChunkLengthToEmbed;
        this.maxNumChunks = maxNumChunks;
        this.keepSeparator = keepSeparator;
    }

    /**
     * Returns a new {@link Builder} pre-populated with the default settings.
     *
     * @return a fresh builder
     */
    public static Builder builder() {
        return new Builder();
    }

    @Override
    protected List<String> splitText(String text) {
        return this.doSplit(text, this.chunkSize);
    }

    /**
     * Splits {@code text} into chunks of at most {@code chunkSize} tokens each.
     *
     * @param text      the text to split; {@code null} or blank text yields an empty list
     * @param chunkSize maximum number of tokens decoded per chunk; must be positive
     * @return the chunks in document order; never {@code null}
     * @throws IllegalArgumentException if {@code chunkSize} is not positive
     */
    protected List<String> doSplit(String text, int chunkSize) {
        // Subclasses may pass their own size, so the constructor check alone is not enough.
        Assert.isTrue(chunkSize > 0, "chunkSize must be greater than 0");
        if (text == null || text.trim().isEmpty()) {
            return new ArrayList<>();
        }

        List<Integer> tokens = this.getEncodedTokens(text);
        int totalTokens = tokens.size();
        List<String> chunks = new ArrayList<>();

        // 'position' is a cursor into the token list; everything before it has been consumed.
        int position = 0;
        int numChunks = 0;
        while (position < totalTokens && numChunks < this.maxNumChunks) {
            // Written as position + min(...) so a very large chunkSize cannot overflow.
            int chunkEnd = position + Math.min(chunkSize, totalTokens - position);
            List<Integer> chunk = tokens.subList(position, chunkEnd);
            String chunkText = this.decodeTokens(chunk);

            // Whitespace-only windows are skipped and do not count towards maxNumChunks.
            if (chunkText.trim().isEmpty()) {
                position = chunkEnd;
                continue;
            }

            // By default the whole window is consumed; this is exact and needs no re-encoding.
            int consumedTokens = chunk.size();

            int lastPunctuation = lastSentenceBoundary(chunkText);
            if (lastPunctuation != -1 && lastPunctuation > this.minChunkSizeChars) {
                chunkText = chunkText.substring(0, lastPunctuation + 1);
                // Re-encoding the shortened text only estimates how many tokens it covers.
                // Clamp so the cursor always advances and never runs past the current window.
                int reEncoded = this.getEncodedTokens(chunkText).size();
                consumedTokens = Math.max(1, Math.min(reEncoded, chunk.size()));
            }

            String chunkTextToAppend = this.formatChunk(chunkText);
            if (chunkTextToAppend.length() > this.minChunkLengthToEmbed) {
                chunks.add(chunkTextToAppend);
            }

            position += consumedTokens;
            numChunks++;
        }

        // Only reached with tokens left when maxNumChunks stopped the loop early:
        // emit the rest as a single chunk, formatted like every other chunk.
        if (position < totalTokens) {
            String remainingText = this.formatChunk(this.decodeTokens(tokens.subList(position, totalTokens)));
            if (remainingText.length() > this.minChunkLengthToEmbed) {
                chunks.add(remainingText);
            }
        }
        return chunks;
    }

    /**
     * Returns the index of the last {@code '.'}, {@code '?'}, {@code '!'} or {@code '\n'}
     * in {@code text}, or {@code -1} if none of them occurs.
     */
    private static int lastSentenceBoundary(String text) {
        int lastTerminator = Math.max(text.lastIndexOf('.'), Math.max(text.lastIndexOf('?'), text.lastIndexOf('!')));
        return Math.max(lastTerminator, text.lastIndexOf('\n'));
    }

    /**
     * Applies the separator policy and trims surrounding whitespace.
     */
    private String formatChunk(String chunkText) {
        String formatted = this.keepSeparator ? chunkText : chunkText.replace(System.lineSeparator(), " ");
        return formatted.trim();
    }

    /**
     * Encodes {@code text} and returns the token ids as a boxed list.
     */
    private List<Integer> getEncodedTokens(String text) {
        Assert.notNull(text, "Text must not be null");
        return this.encoding.encode(text).boxed();
    }

    /**
     * Decodes the given token ids back into text.
     */
    private String decodeTokens(List<Integer> tokens) {
        Assert.notNull(tokens, "Tokens must not be null");
        IntArrayList tokensIntArray = new IntArrayList(tokens.size());
        tokens.forEach(tokensIntArray::add);
        return this.encoding.decode(tokensIntArray);
    }

    /**
     * Fluent builder for {@link TokenTextSplitter}; every setting starts at its default.
     */
    public static final class Builder {
        private int chunkSize = DEFAULT_CHUNK_SIZE;
        private int minChunkSizeChars = MIN_CHUNK_SIZE_CHARS;
        private int minChunkLengthToEmbed = MIN_CHUNK_LENGTH_TO_EMBED;
        private int maxNumChunks = MAX_NUM_CHUNKS;
        private boolean keepSeparator = KEEP_SEPARATOR;

        private Builder() {
        }

        /**
         * Sets the maximum number of tokens per chunk (must be positive when built).
         */
        public Builder withChunkSize(int chunkSize) {
            this.chunkSize = chunkSize;
            return this;
        }

        /**
         * Sets the character index a sentence boundary must exceed for the chunk to be cut there.
         */
        public Builder withMinChunkSizeChars(int minChunkSizeChars) {
            this.minChunkSizeChars = minChunkSizeChars;
            return this;
        }

        /**
         * Sets the trimmed length a chunk must exceed to be emitted.
         */
        public Builder withMinChunkLengthToEmbed(int minChunkLengthToEmbed) {
            this.minChunkLengthToEmbed = minChunkLengthToEmbed;
            return this;
        }

        /**
         * Sets the maximum number of chunk-producing iterations.
         */
        public Builder withMaxNumChunks(int maxNumChunks) {
            this.maxNumChunks = maxNumChunks;
            return this;
        }

        /**
         * Sets whether line separators are kept ({@code true}) or replaced with a space.
         */
        public Builder withKeepSeparator(boolean keepSeparator) {
            this.keepSeparator = keepSeparator;
            return this;
        }

        /**
         * Builds the splitter from the current settings.
         *
         * @return a new {@link TokenTextSplitter}
         * @throws IllegalArgumentException if the configured chunk size is not positive
         */
        public TokenTextSplitter build() {
            return new TokenTextSplitter(this.chunkSize, this.minChunkSizeChars, this.minChunkLengthToEmbed, this.maxNumChunks, this.keepSeparator);
        }
    }
}

