| import { sentences as splitBySentences } from "sbd"; |
| import { MarkdownElementType, type MarkdownElement } from "../types"; |
|
|
/**
 * Produces a list of markdown elements whose `content` has been limited by `maxLength`.
 *
 * - Header elements are kept as a single element; their content is cut off after
 *   `maxLength` characters rather than being split.
 * - Every other element is replaced by one copy per chunk returned from
 *   `enforceMaxLength`, each copy keeping the element's other fields. An element
 *   for which no chunks are returned contributes nothing to the output.
 *
 * @param elements - Elements to process, in document order.
 * @param maxLength - Maximum number of characters allowed in a single `content`.
 * @returns The resulting elements, in the same relative order as the input.
 */
export function chunkElements(elements: MarkdownElement[], maxLength: number): MarkdownElement[] {
  const chunked: MarkdownElement[] = [];

  elements.forEach((element) => {
    const isHeader = element.type === MarkdownElementType.Header;
    if (isHeader) {
      // Headers stay in one piece: truncate instead of splitting.
      const truncated = element.content.slice(0, maxLength);
      chunked.push({ ...element, content: truncated });
      return;
    }

    for (const content of enforceMaxLength(element.content, maxLength)) {
      chunked.push({ ...element, content });
    }
  });

  return chunked;
}
|
|
/** Substrings at which an over-long sentence may be cut, ordered from most to least preferred. */
const delimitersByPriority = ["?", "!", ".", ";", ":", ",", "|", " - ", " ", "-"];

/**
 * Cuts a single sentence into consecutive pieces of at most `limit` characters.
 *
 * For each piece, the delimiters are tried in priority order; the first one that
 * occurs inside the current window decides the cut, which is placed immediately
 * before its last occurrence in that window (so the delimiter starts the next
 * piece). If no delimiter occurs in the window, the piece is cut at exactly
 * `limit` characters.
 *
 * @param sentence - The sentence to cut.
 * @param limit - Maximum piece length; callers must pass a value of at least 1.
 * @returns The pieces, in order; concatenating them yields `sentence`.
 */
function splitOversizedSentence(sentence: string, limit: number): string[] {
  if (sentence.length <= limit) return [sentence];

  const pieces: string[] = [];
  let start = 0;

  // Keep cutting while the remainder is still too long for one piece.
  while (sentence.length - start > limit) {
    const windowEnd = start + limit;
    let cut = windowEnd; // Hard cut, used when no delimiter lies inside the window.

    for (const delimiter of delimitersByPriority) {
      const position = sentence.lastIndexOf(delimiter, windowEnd);
      // Only accept delimiters strictly after `start`: an occurrence at or before
      // `start` belongs to an earlier piece (or would yield an empty piece) and
      // must not shadow lower-priority delimiters that are inside this window.
      if (position > start) {
        cut = position;
        break;
      }
    }

    pieces.push(sentence.slice(start, cut));
    start = cut;
  }

  pieces.push(sentence.slice(start));
  return pieces;
}

/**
 * Splits `text` into chunks of at most `maxLength` characters.
 *
 * Text that already fits is returned as a single chunk (or no chunk at all when
 * it is empty). Otherwise the text is split into sentences with `splitBySentences`,
 * sentences that are still too long are cut at the preferred delimiters, and
 * adjacent pieces are then greedily merged back together as long as the merged
 * chunk stays within the limit.
 *
 * A `maxLength` below 1 (or NaN) cannot be honoured for non-empty text; in that
 * case a limit of 1 is used so that splitting always makes progress.
 *
 * NOTE(review): merged pieces are concatenated without a separator. If
 * `splitBySentences` drops the whitespace between sentences (believed to be the
 * `sbd` default — confirm), sentences merged into one chunk are joined without a
 * space.
 *
 * @param text - The text to split.
 * @param maxLength - Maximum number of characters per chunk.
 * @returns The non-empty chunks, in order.
 */
function enforceMaxLength(text: string, maxLength: number): string[] {
  if (text.length <= maxLength) return [text].filter(Boolean);

  // Guard against limits that would never let the splitting loop advance.
  const limit = maxLength >= 1 ? Math.floor(maxLength) : 1;

  const pieces = splitBySentences(text).flatMap((sentence) =>
    splitOversizedSentence(sentence, limit)
  );

  // Greedily pack consecutive pieces into chunks without exceeding the limit.
  const chunks: string[] = [];
  let current = "";
  for (const piece of pieces) {
    if (current.length + piece.length <= limit) {
      current += piece;
      continue;
    }
    if (current) chunks.push(current);
    current = piece;
  }
  if (current) chunks.push(current);

  return chunks;
}
|
|