qmd-web / src /pipeline /chunking.ts
shreyask's picture
add chunking module with markdown-aware token-based splitting
ce263b4 verified
import type { Chunk, Document } from "../types";
import { CHUNK_SIZE_CHARS, CHUNK_OVERLAP_CHARS } from "../constants";
// How far back from the target cut position to search for a break point (~200 tokens)
const CHUNK_WINDOW_CHARS = 800;
// Break point scoring — higher = better place to split.
// Order matters: more specific patterns first so headings beat generic newlines.
const BREAK_PATTERNS: [RegExp, number, string][] = [
[/\n#{1}(?!#)/g, 100, "h1"],
[/\n#{2}(?!#)/g, 90, "h2"],
[/\n#{3}(?!#)/g, 80, "h3"],
[/\n#{4}(?!#)/g, 70, "h4"],
[/\n#{5}(?!#)/g, 60, "h5"],
[/\n#{6}(?!#)/g, 50, "h6"],
[/\n```/g, 80, "codeblock"],
[/\n(?:---|\*\*\*|___)\s*\n/g, 60, "hr"],
[/\n\n+/g, 20, "blank"],
[/\n[-*]\s/g, 5, "list"],
[/\n\d+\.\s/g, 5, "numlist"],
[/\n/g, 1, "newline"],
];
interface BreakPoint {
pos: number;
score: number;
type: string;
}
interface CodeFenceRegion {
start: number;
end: number;
}
/**
* Scan text for all potential break points, returning them sorted by position.
* When multiple patterns match the same position, the highest score wins.
*/
function scanBreakPoints(text: string): BreakPoint[] {
const seen = new Map<number, BreakPoint>();
for (const [pattern, score, type] of BREAK_PATTERNS) {
for (const match of text.matchAll(pattern)) {
const pos = match.index!;
const existing = seen.get(pos);
if (!existing || score > existing.score) {
seen.set(pos, { pos, score, type });
}
}
}
return [...seen.values()].sort((a, b) => a.pos - b.pos);
}
/**
* Find all code fence regions (``` pairs). Never split inside these.
*/
function findCodeFences(text: string): CodeFenceRegion[] {
const regions: CodeFenceRegion[] = [];
const fencePattern = /\n```/g;
let inFence = false;
let fenceStart = 0;
for (const match of text.matchAll(fencePattern)) {
if (!inFence) {
fenceStart = match.index!;
inFence = true;
} else {
regions.push({ start: fenceStart, end: match.index! + match[0].length });
inFence = false;
}
}
// Unclosed fence extends to end of document
if (inFence) {
regions.push({ start: fenceStart, end: text.length });
}
return regions;
}
function isInsideCodeFence(pos: number, fences: CodeFenceRegion[]): boolean {
return fences.some((f) => pos > f.start && pos < f.end);
}
/**
* Find the best cut position near `targetCharPos` using scored break points
* with squared distance decay. Headings far back still beat low-quality breaks
* near the target.
*/
function findBestCutoff(
breakPoints: BreakPoint[],
targetCharPos: number,
windowChars: number = CHUNK_WINDOW_CHARS,
decayFactor: number = 0.7,
codeFences: CodeFenceRegion[] = [],
): number {
const windowStart = targetCharPos - windowChars;
let bestScore = -1;
let bestPos = targetCharPos;
for (const bp of breakPoints) {
if (bp.pos < windowStart) continue;
if (bp.pos > targetCharPos) break; // sorted — stop early
if (isInsideCodeFence(bp.pos, codeFences)) continue;
const distance = targetCharPos - bp.pos;
const normalizedDist = distance / windowChars;
const multiplier = 1.0 - normalizedDist * normalizedDist * decayFactor;
const finalScore = bp.score * multiplier;
if (finalScore > bestScore) {
bestScore = finalScore;
bestPos = bp.pos;
}
}
return bestPos;
}
/**
* Split content into overlapping character-based chunks, preferring markdown
* heading boundaries and avoiding splits inside code fences.
*/
function splitIntoChunks(
content: string,
maxChars: number = CHUNK_SIZE_CHARS,
overlapChars: number = CHUNK_OVERLAP_CHARS,
windowChars: number = CHUNK_WINDOW_CHARS,
): { text: string; pos: number }[] {
if (content.length <= maxChars) {
return [{ text: content, pos: 0 }];
}
const breakPoints = scanBreakPoints(content);
const codeFences = findCodeFences(content);
const chunks: { text: string; pos: number }[] = [];
let charPos = 0;
while (charPos < content.length) {
const targetEndPos = Math.min(charPos + maxChars, content.length);
let endPos = targetEndPos;
if (endPos < content.length) {
const bestCutoff = findBestCutoff(
breakPoints,
targetEndPos,
windowChars,
0.7,
codeFences,
);
if (bestCutoff > charPos && bestCutoff <= targetEndPos) {
endPos = bestCutoff;
}
}
// Ensure forward progress
if (endPos <= charPos) {
endPos = Math.min(charPos + maxChars, content.length);
}
chunks.push({ text: content.slice(charPos, endPos), pos: charPos });
if (endPos >= content.length) break;
charPos = endPos - overlapChars;
const lastChunkPos = chunks.at(-1)!.pos;
if (charPos <= lastChunkPos) {
charPos = endPos; // prevent infinite loop
}
}
return chunks;
}
// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------
/**
* Extract title from markdown content. Returns the first H1 heading text,
* or falls back to filename without extension.
*/
export function extractTitle(content: string, filename: string): string {
const match = content.match(/^#\s+(.+)$/m);
if (match) return match[1].trim();
// Strip extension from filename
const dot = filename.lastIndexOf(".");
return dot > 0 ? filename.slice(0, dot) : filename;
}
/**
* Chunk a single Document into overlapping Chunk objects suitable for
* embedding and search.
*/
export function chunkDocument(doc: Document): Chunk[] {
const raw = splitIntoChunks(doc.body);
return raw.map((c, i) => ({
docId: doc.id,
chunkIndex: i,
text: c.text,
startChar: c.pos,
title: doc.title,
}));
}
// Exported for testing
export { scanBreakPoints, findCodeFences, isInsideCodeFence, findBestCutoff, splitIntoChunks };
export type { BreakPoint, CodeFenceRegion };