Spaces:
Running
Running
| import { getEncoding } from 'js-tiktoken'; | |
// Zero-width split points for segmenting text. The lookbehind matches the
// position immediately after any of: ". ", a newline, "! ", "? ", "; ",
// ":" plus one whitespace character, digits plus "." plus one whitespace
// character (e.g. "1. "), "- ", or "* ". Because nothing is consumed,
// `String.prototype.split` keeps each delimiter at the end of the segment
// that precedes it.
const splitRegex = /(?<=\. |\n|! |\? |; |:\s|\d+\.\s|- |\* )/g;
// Tokenizer for the `cl100k_base` encoding, created once when the module is
// evaluated and shared by every call to `getTokenCount`.
const enc = getEncoding('cl100k_base');
/**
 * Returns the number of tokens `text` encodes to with the module-level `enc`
 * encoder.
 *
 * If encoding throws for any reason, the error is swallowed and a rough
 * estimate of one token per four characters (rounded up) is returned instead.
 *
 * @param text - The string to measure.
 * @returns The encoded token count, or the length-based estimate on failure.
 */
function getTokenCount(text: string): number {
  try {
    const tokens = enc.encode(text);
    return tokens.length;
  } catch {
    // Best-effort fallback: approximate instead of propagating the failure.
    const approxCharsPerToken = 4;
    return Math.ceil(text.length / approxCharsPerToken);
  }
}
/**
 * Splits `text` into token-bounded chunks, each prefixed with trailing
 * context from the text that precedes it.
 *
 * The text is first cut into segments with `splitRegex`. Whole segments are
 * then packed greedily, in order, into chunks whose own token total (as
 * measured by `getTokenCount`) does not exceed `maxTokens`. Every chunk is
 * prefixed with as many whole immediately-preceding segments as fit within
 * `overlapTokens`.
 *
 * Notes:
 * - The overlap prefix is not counted against `maxTokens`, so a returned
 *   string may hold up to roughly `maxTokens + overlapTokens` tokens.
 * - Segments are never subdivided. A single segment larger than `maxTokens`
 *   (or any segment when `maxTokens <= 0`) is emitted as a chunk of its own,
 *   which guarantees the loop always makes progress.
 *
 * @param text - The text to split.
 * @param maxTokens - Token budget for the non-overlap part of each chunk.
 * @param overlapTokens - Token budget for the context repeated from the
 *   preceding segments; `0` or a negative value disables overlap.
 * @returns The chunks in document order, or an empty array when `text`
 *   yields no non-empty segments.
 */
export const splitText = (
  text: string,
  maxTokens = 512,
  overlapTokens = 64,
): string[] => {
  const segments = text.split(splitRegex).filter(Boolean);
  if (segments.length === 0) {
    return [];
  }

  const segmentTokenCounts = segments.map((segment) => getTokenCount(segment));
  // Indices used below are always in range; the fallback only keeps the
  // lookup type-safe under `noUncheckedIndexedAccess`.
  const tokensAt = (index: number): number => segmentTokenCounts[index] ?? 0;

  const result: string[] = [];
  let chunkStart = 0;

  while (chunkStart < segments.length) {
    // Always consume the first segment, even when it alone exceeds the
    // budget. Otherwise `chunkEnd` would equal `chunkStart` and the outer
    // loop would never advance.
    let chunkEnd = chunkStart + 1;
    let currentTokenCount = tokensAt(chunkStart);
    while (
      chunkEnd < segments.length &&
      currentTokenCount + tokensAt(chunkEnd) <= maxTokens
    ) {
      currentTokenCount += tokensAt(chunkEnd);
      chunkEnd++;
    }

    // Walk backwards from the chunk start, collecting whole segments for the
    // overlap prefix until the next one would exceed the overlap budget.
    let overlapStartIndex = chunkStart;
    let overlapTokenCount = 0;
    while (
      overlapStartIndex > 0 &&
      overlapTokenCount + tokensAt(overlapStartIndex - 1) <= overlapTokens
    ) {
      overlapStartIndex--;
      overlapTokenCount += tokensAt(overlapStartIndex);
    }

    const overlapBeforeContent = segments
      .slice(overlapStartIndex, chunkStart)
      .join('');
    const chunkContent = segments.slice(chunkStart, chunkEnd).join('');
    result.push(overlapBeforeContent + chunkContent);

    chunkStart = chunkEnd;
  }

  return result;
};