| import type { Chunk, Document } from "../types"; |
| import { CHUNK_SIZE_CHARS, CHUNK_OVERLAP_CHARS } from "../constants"; |
|
|
| |
/**
 * Default size, in characters, of the look-back window that is searched for a
 * cut point. Used as the default `windowChars` of `findBestCutoff` and
 * `splitIntoChunks`.
 */
const CHUNK_WINDOW_CHARS = 800;

/**
 * Candidate cut patterns as `[pattern, score, label]` tuples.
 *
 * Every pattern starts with `\n`, so a match index always points at the
 * newline that precedes the construct. Higher scores mark better places to
 * cut; when several patterns match at the same index, `scanBreakPoints` keeps
 * the highest-scoring one.
 */
const BREAK_PATTERNS: [RegExp, number, string][] = [
  // ATX headings h1..h6: exactly `level` hashes, scored 100 down to 50 in steps of 10.
  ...Array.from({ length: 6 }, (_, i): [RegExp, number, string] => {
    const level = i + 1;
    return [
      new RegExp(`\\n#{${level}}(?!#)`, "g"),
      110 - 10 * level,
      `h${level}`,
    ];
  }),
  // Code fence line (opening or closing).
  [/\n```/g, 80, "codeblock"],
  // Horizontal rule on its own line.
  [/\n(?:---|\*\*\*|___)\s*\n/g, 60, "hr"],
  // One or more blank lines.
  [/\n\n+/g, 20, "blank"],
  // Bullet list item.
  [/\n[-*]\s/g, 5, "list"],
  // Numbered list item.
  [/\n\d+\.\s/g, 5, "numlist"],
  // Any line break: the weakest fallback.
  [/\n/g, 1, "newline"],
];
|
|
/** A position in the text where a chunk may be cut. */
interface BreakPoint {
  /** Character index at which the break pattern matched. */
  pos: number;
  /** Base desirability of cutting here, taken from the matching pattern. */
  score: number;
  /** Label of the pattern that produced this break point (e.g. "h1", "blank"). */
  type: string;
}
|
|
/** Character span occupied by a fenced code block. */
interface CodeFenceRegion {
  /** Index at which the opening fence match begins. */
  start: number;
  /**
   * Index just past the closing fence marker, or the text length when the
   * fence is never closed.
   */
  end: number;
}
|
|
| |
| |
| |
| |
/**
 * Collects every position in `text` matched by one of `BREAK_PATTERNS`.
 *
 * When more than one pattern matches at the same index, only the entry with
 * the highest score is kept (the earlier pattern wins a tie).
 *
 * @param text - Text to scan.
 * @returns Break points sorted by ascending position, one per distinct index.
 */
function scanBreakPoints(text: string): BreakPoint[] {
  const bestAt = new Map<number, BreakPoint>();

  BREAK_PATTERNS.forEach(([regex, weight, label]) => {
    for (const hit of text.matchAll(regex)) {
      const at = hit.index!;
      const current = bestAt.get(at);
      // Keep the entry already recorded unless this pattern scores strictly higher.
      if (current && weight <= current.score) continue;
      bestAt.set(at, { pos: at, score: weight, type: label });
    }
  });

  return Array.from(bestAt.values()).sort((left, right) => left.pos - right.pos);
}
|
|
| |
| |
| |
/**
 * Locates fenced code blocks, i.e. spans delimited by lines that start with
 * three backticks.
 *
 * Fence lines are paired in order of appearance: the first opens a region,
 * the next closes it, and so on. A fence that is never closed yields a region
 * extending to the end of the text.
 *
 * @param text - Text to scan.
 * @returns Regions in document order. `start` is the index of the opening
 *   match (the preceding newline, or 0 for a fence on the very first line);
 *   `end` is the index just past the closing backticks.
 */
function findCodeFences(text: string): CodeFenceRegion[] {
  const regions: CodeFenceRegion[] = [];
  // A fence on the first line has no preceding "\n". Without the `^`
  // alternative it would be missed, and every later fence would be paired
  // the wrong way round. There is no `m` flag, so `^` is start-of-input only.
  const fencePattern = /(?:^|\n)```/g;
  let openStart: number | undefined;

  for (const match of text.matchAll(fencePattern)) {
    const at = match.index ?? 0;
    if (openStart === undefined) {
      openStart = at;
    } else {
      regions.push({ start: openStart, end: at + match[0].length });
      openStart = undefined;
    }
  }

  // Unclosed fence: treat everything up to the end of the text as code.
  if (openStart !== undefined) {
    regions.push({ start: openStart, end: text.length });
  }

  return regions;
}
|
|
/**
 * Tells whether `pos` lies strictly inside one of the given fence regions.
 *
 * Both boundaries are exclusive: a position equal to a region's `start` or
 * `end` is not considered inside it.
 *
 * @param pos - Character index to test.
 * @param fences - Fence regions to test against.
 * @returns `true` if some region satisfies `start < pos < end`.
 */
function isInsideCodeFence(pos: number, fences: CodeFenceRegion[]): boolean {
  for (const { start, end } of fences) {
    if (start < pos && pos < end) {
      return true;
    }
  }
  return false;
}
|
|
| |
| |
| |
| |
| |
/**
 * Picks the most attractive cut position at or before `targetCharPos`.
 *
 * Only break points within `[targetCharPos - windowChars, targetCharPos]` that
 * are not strictly inside a code fence are considered. Each candidate's score
 * is scaled by `1 - (distance / windowChars)^2 * decayFactor`, so break points
 * further back from the target are penalised quadratically.
 *
 * @param breakPoints - Candidates sorted by ascending `pos`.
 * @param targetCharPos - Latest acceptable cut position.
 * @param windowChars - How far back from the target to look.
 * @param decayFactor - Strength of the distance penalty.
 * @param codeFences - Regions in which cutting is not allowed.
 * @returns The position of the best candidate, or `targetCharPos` when no
 *   candidate qualifies.
 */
function findBestCutoff(
  breakPoints: BreakPoint[],
  targetCharPos: number,
  windowChars: number = CHUNK_WINDOW_CHARS,
  decayFactor: number = 0.7,
  codeFences: CodeFenceRegion[] = [],
): number {
  const earliestPos = targetCharPos - windowChars;
  let winner = { pos: targetCharPos, weighted: -1 };

  for (const { pos, score } of breakPoints) {
    if (pos < earliestPos) continue;
    // Input is sorted, so nothing after this point can qualify.
    if (pos > targetCharPos) break;
    if (isInsideCodeFence(pos, codeFences)) continue;

    const fractionBack = (targetCharPos - pos) / windowChars;
    const weighted = score * (1.0 - fractionBack * fractionBack * decayFactor);

    // Strict comparison: on a tie the earlier break point is kept.
    if (weighted > winner.weighted) {
      winner = { pos, weighted };
    }
  }

  return winner.pos;
}
|
|
| |
| |
| |
| |
/**
 * Splits `content` into overlapping chunks of at most `maxChars` characters,
 * preferring to cut at structural boundaries (headings, blank lines, ...)
 * and never at a break point inside a fenced code block.
 *
 * Content that already fits in `maxChars` is returned as a single chunk.
 * Otherwise each chunk ends at the best break point found within
 * `windowChars` before the size limit, or exactly at the limit when there is
 * none. The next chunk starts `overlapChars` before the previous cut, unless
 * that would not advance past the previous chunk's start, in which case it
 * starts at the cut itself.
 *
 * @param content - Text to split.
 * @param maxChars - Maximum length of a chunk.
 * @param overlapChars - Number of characters repeated between adjacent chunks.
 * @param windowChars - How far back from the size limit to look for a break.
 * @returns Chunks in document order, each with its start offset in `content`.
 * @throws RangeError if `content` does not fit in one chunk and `maxChars` is
 *   not greater than zero.
 */
function splitIntoChunks(
  content: string,
  maxChars: number = CHUNK_SIZE_CHARS,
  overlapChars: number = CHUNK_OVERLAP_CHARS,
  windowChars: number = CHUNK_WINDOW_CHARS,
): { text: string; pos: number }[] {
  if (content.length <= maxChars) {
    return [{ text: content, pos: 0 }];
  }

  // A non-positive (or NaN) limit can never make progress: every chunk would
  // be empty and the loop below would not terminate. Fail loudly instead.
  if (!(maxChars > 0)) {
    throw new RangeError(`maxChars must be a positive number, got ${maxChars}`);
  }

  let candidates = scanBreakPoints(content);
  const codeFences = findCodeFences(content);
  const chunks: { text: string; pos: number }[] = [];
  let charPos = 0;

  while (charPos < content.length) {
    const chunkStart = charPos;
    const targetEndPos = Math.min(chunkStart + maxChars, content.length);
    let endPos = targetEndPos;

    if (targetEndPos < content.length) {
      // Only break points after the chunk start are usable. Dropping the
      // others up front stops a high-scoring break point that lies before
      // this chunk (possible when the look-back window is wider than the
      // chunk) from winning and forcing a hard cut. Chunk starts only move
      // forward, so discarded break points are never needed again.
      candidates = candidates.filter((bp) => bp.pos > chunkStart);

      const bestCutoff = findBestCutoff(
        candidates,
        targetEndPos,
        windowChars,
        0.7,
        codeFences,
      );

      if (bestCutoff > chunkStart && bestCutoff <= targetEndPos) {
        endPos = bestCutoff;
      }
    }

    chunks.push({ text: content.slice(chunkStart, endPos), pos: chunkStart });

    if (endPos >= content.length) break;

    // Step back by the overlap, but always start strictly after the chunk
    // just emitted so the loop keeps advancing.
    const overlapStart = endPos - overlapChars;
    charPos = overlapStart <= chunkStart ? endPos : overlapStart;
  }

  return chunks;
}
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
/**
 * Derives a title for a document.
 *
 * Uses the text of the first level-1 ATX heading (`# Title`) found in
 * `content`. When there is no such heading, or it has no text, falls back to
 * `filename` with its last extension removed.
 *
 * @param content - Document text.
 * @param filename - Name used for the fallback title.
 * @returns The trimmed heading text, or the filename without its extension.
 *   A name whose only dot is its first character (e.g. ".env") is returned
 *   unchanged.
 */
export function extractTitle(content: string, filename: string): string {
  // `[ \t]+` rather than `\s+`: `\s` also matches newlines, which let an empty
  // "#" line swallow the line break and capture the *following* line as the
  // title. `\S` additionally requires the heading to contain visible text.
  const heading = content.match(/^#[ \t]+(\S.*)$/m)?.[1]?.trim();
  if (heading) return heading;

  const dot = filename.lastIndexOf(".");
  return dot > 0 ? filename.slice(0, dot) : filename;
}
|
|
| |
| |
| |
| |
/**
 * Splits a document's body into chunks and tags each one with the document's
 * id and title, its zero-based index, and its start offset within the body.
 *
 * @param doc - Document whose `body` is chunked.
 * @returns One `Chunk` per piece produced by `splitIntoChunks`, in order.
 */
export function chunkDocument(doc: Document): Chunk[] {
  return splitIntoChunks(doc.body).map(({ text, pos }, chunkIndex) => ({
    docId: doc.id,
    chunkIndex,
    text,
    startChar: pos,
    title: doc.title,
  }));
}
|
|
| |
// Internal helpers and their supporting types, exposed as named exports.
export {
  findBestCutoff,
  findCodeFences,
  isInsideCodeFence,
  scanBreakPoints,
  splitIntoChunks,
};
export type { BreakPoint, CodeFenceRegion };
|
|