Spaces:
Running
Running
File size: 1,766 Bytes
6b6ca97 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 | import { getEncoding } from 'js-tiktoken';
// Zero-width split points. The pattern is a single lookbehind, so it matches
// the empty position immediately AFTER a delimiter; `String.prototype.split`
// therefore keeps each delimiter attached to the end of the preceding segment.
// Delimiters recognised: ". ", newline, "! ", "? ", "; ", ":" followed by
// whitespace, digits followed by "." and whitespace (such as "1. "), "- " and
// "* ".
const splitRegex = /(?<=\. |\n|! |\? |; |:\s|\d+\.\s|- |\* )/g;
// Encoder for the `cl100k_base` encoding, created once when the module is
// evaluated and reused by every token count below.
const enc = getEncoding('cl100k_base');
/**
 * Returns the number of tokens `enc` produces for `text`.
 *
 * If encoding throws, the error is swallowed and the count is estimated as
 * one token per four UTF-16 code units, rounded up.
 *
 * @param text - The string to measure.
 * @returns The encoded token count, or the length-based estimate on failure.
 */
const getTokenCount = (text: string): number => {
  let tokenCount: number;
  try {
    const encoded = enc.encode(text);
    tokenCount = encoded.length;
  } catch {
    // Best-effort fallback: approximate rather than propagate the failure.
    tokenCount = Math.ceil(text.length / 4);
  }
  return tokenCount;
};
/**
 * Splits `text` into token-bounded chunks with leading overlap.
 *
 * The text is first cut into segments at the positions matched by
 * `splitRegex`; segments are never broken apart. Consecutive segments are then
 * packed greedily into a chunk while their combined token count (as reported
 * by `getTokenCount`) stays within `maxTokens`. Every chunk after the first is
 * prefixed with the segments immediately preceding it, taken backwards for as
 * long as their combined token count stays within `overlapTokens`.
 *
 * Notes on sizing:
 * - The overlap is added on top of the packed chunk, so a returned string can
 *   hold up to `maxTokens + overlapTokens` tokens.
 * - A single segment that alone exceeds `maxTokens` is emitted as its own
 *   chunk. Each chunk always consumes at least one segment, so the function
 *   terminates for any `maxTokens`, including values <= 0.
 *
 * @param text - The text to split.
 * @param maxTokens - Token budget for the non-overlap part of each chunk.
 * @param overlapTokens - Token budget for the context prepended to each chunk.
 * @returns The chunks in document order; an empty array when `text` yields no
 *   non-empty segments.
 */
export const splitText = (
  text: string,
  maxTokens = 512,
  overlapTokens = 64,
): string[] => {
  const segments = text.split(splitRegex).filter(Boolean);
  if (segments.length === 0) {
    return [];
  }

  const segmentTokenCounts = segments.map((segment) => getTokenCount(segment));
  // Indexed reads are always in range here; `?? 0` only keeps the access
  // well-typed under `noUncheckedIndexedAccess`.
  const tokensAt = (index: number): number => segmentTokenCounts[index] ?? 0;

  const result: string[] = [];
  let chunkStart = 0;

  while (chunkStart < segments.length) {
    // Unconditionally take the first segment. Without this, a segment larger
    // than `maxTokens` would leave the chunk empty and the outer loop would
    // never advance.
    let chunkEnd = chunkStart + 1;
    let chunkTokenCount = tokensAt(chunkStart);

    // Greedily extend the chunk while the next segment still fits.
    while (
      chunkEnd < segments.length &&
      chunkTokenCount < maxTokens &&
      chunkTokenCount + tokensAt(chunkEnd) <= maxTokens
    ) {
      chunkTokenCount += tokensAt(chunkEnd);
      chunkEnd++;
    }

    // Walk backwards from the chunk start, collecting whole segments as
    // overlap until the next one would exceed the overlap budget.
    let overlapStart = chunkStart;
    let overlapTokenCount = 0;
    while (
      overlapStart > 0 &&
      overlapTokenCount < overlapTokens &&
      overlapTokenCount + tokensAt(overlapStart - 1) <= overlapTokens
    ) {
      overlapStart--;
      overlapTokenCount += tokensAt(overlapStart);
    }

    // Overlap and chunk are contiguous, so a single slice covers both.
    result.push(segments.slice(overlapStart, chunkEnd).join(''));
    chunkStart = chunkEnd;
  }

  return result;
};
|