Spaces:

shreyask
/

qmd-web

Running

App Files Files Community

qmd-web / src /pipeline /chunking.ts

shreyask

add chunking module with markdown-aware token-based splitting

ce263b4 verified about 1 month ago

raw

history blame contribute delete

5.97 kB

	import type { Chunk, Document } from "../types";
	import { CHUNK_SIZE_CHARS, CHUNK_OVERLAP_CHARS } from "../constants";

	// How far back from the target cut position to search for a break point (~200 tokens)
	const CHUNK_WINDOW_CHARS = 800;

	// Break point scoring — higher = better place to split.
	// Order matters: more specific patterns first so headings beat generic newlines.
	const BREAK_PATTERNS: [RegExp, number, string][] = [
	[/\n#{1}(?!#)/g, 100, "h1"],
	[/\n#{2}(?!#)/g, 90, "h2"],
	[/\n#{3}(?!#)/g, 80, "h3"],
	[/\n#{4}(?!#)/g, 70, "h4"],
	[/\n#{5}(?!#)/g, 60, "h5"],
	[/\n#{6}(?!#)/g, 50, "h6"],
	[/\n```/g, 80, "codeblock"],
	[/\n(?:---\|\\\\|___)\s\n/g, 60, "hr"],
	[/\n\n+/g, 20, "blank"],
	[/\n[-*]\s/g, 5, "list"],
	[/\n\d+\.\s/g, 5, "numlist"],
	[/\n/g, 1, "newline"],
	];

	interface BreakPoint {
	pos: number;
	score: number;
	type: string;
	}

	interface CodeFenceRegion {
	start: number;
	end: number;
	}

	/**
	* Scan text for all potential break points, returning them sorted by position.
	* When multiple patterns match the same position, the highest score wins.
	*/
	function scanBreakPoints(text: string): BreakPoint[] {
	const seen = new Map<number, BreakPoint>();

	for (const [pattern, score, type] of BREAK_PATTERNS) {
	for (const match of text.matchAll(pattern)) {
	const pos = match.index!;
	const existing = seen.get(pos);
	if (!existing \|\| score > existing.score) {
	seen.set(pos, { pos, score, type });
	}
	}
	}

	return [...seen.values()].sort((a, b) => a.pos - b.pos);
	}

	/**
	* Find all code fence regions (``` pairs). Never split inside these.
	*/
	function findCodeFences(text: string): CodeFenceRegion[] {
	const regions: CodeFenceRegion[] = [];
	const fencePattern = /\n```/g;
	let inFence = false;
	let fenceStart = 0;

	for (const match of text.matchAll(fencePattern)) {
	if (!inFence) {
	fenceStart = match.index!;
	inFence = true;
	} else {
	regions.push({ start: fenceStart, end: match.index! + match[0].length });
	inFence = false;
	}
	}

	// Unclosed fence extends to end of document
	if (inFence) {
	regions.push({ start: fenceStart, end: text.length });
	}

	return regions;
	}

	function isInsideCodeFence(pos: number, fences: CodeFenceRegion[]): boolean {
	return fences.some((f) => pos > f.start && pos < f.end);
	}

	/**
	* Find the best cut position near `targetCharPos` using scored break points
	* with squared distance decay. Headings far back still beat low-quality breaks
	* near the target.
	*/
	function findBestCutoff(
	breakPoints: BreakPoint[],
	targetCharPos: number,
	windowChars: number = CHUNK_WINDOW_CHARS,
	decayFactor: number = 0.7,
	codeFences: CodeFenceRegion[] = [],
	): number {
	const windowStart = targetCharPos - windowChars;
	let bestScore = -1;
	let bestPos = targetCharPos;

	for (const bp of breakPoints) {
	if (bp.pos < windowStart) continue;
	if (bp.pos > targetCharPos) break; // sorted — stop early

	if (isInsideCodeFence(bp.pos, codeFences)) continue;

	const distance = targetCharPos - bp.pos;
	const normalizedDist = distance / windowChars;
	const multiplier = 1.0 - normalizedDist * normalizedDist * decayFactor;
	const finalScore = bp.score * multiplier;

	if (finalScore > bestScore) {
	bestScore = finalScore;
	bestPos = bp.pos;
	}
	}

	return bestPos;
	}

	/**
	* Split content into overlapping character-based chunks, preferring markdown
	* heading boundaries and avoiding splits inside code fences.
	*/
	function splitIntoChunks(
	content: string,
	maxChars: number = CHUNK_SIZE_CHARS,
	overlapChars: number = CHUNK_OVERLAP_CHARS,
	windowChars: number = CHUNK_WINDOW_CHARS,
	): { text: string; pos: number }[] {
	if (content.length <= maxChars) {
	return [{ text: content, pos: 0 }];
	}

	const breakPoints = scanBreakPoints(content);
	const codeFences = findCodeFences(content);
	const chunks: { text: string; pos: number }[] = [];
	let charPos = 0;

	while (charPos < content.length) {
	const targetEndPos = Math.min(charPos + maxChars, content.length);
	let endPos = targetEndPos;

	if (endPos < content.length) {
	const bestCutoff = findBestCutoff(
	breakPoints,
	targetEndPos,
	windowChars,
	0.7,
	codeFences,
	);

	if (bestCutoff > charPos && bestCutoff <= targetEndPos) {
	endPos = bestCutoff;
	}
	}

	// Ensure forward progress
	if (endPos <= charPos) {
	endPos = Math.min(charPos + maxChars, content.length);
	}

	chunks.push({ text: content.slice(charPos, endPos), pos: charPos });

	if (endPos >= content.length) break;

	charPos = endPos - overlapChars;
	const lastChunkPos = chunks.at(-1)!.pos;
	if (charPos <= lastChunkPos) {
	charPos = endPos; // prevent infinite loop
	}
	}

	return chunks;
	}

	// ---------------------------------------------------------------------------
	// Public API
	// ---------------------------------------------------------------------------

	/**
	* Extract title from markdown content. Returns the first H1 heading text,
	* or falls back to filename without extension.
	*/
	export function extractTitle(content: string, filename: string): string {
	const match = content.match(/^#\s+(.+)$/m);
	if (match) return match[1].trim();

	// Strip extension from filename
	const dot = filename.lastIndexOf(".");
	return dot > 0 ? filename.slice(0, dot) : filename;
	}

	/**
	* Chunk a single Document into overlapping Chunk objects suitable for
	* embedding and search.
	*/
	export function chunkDocument(doc: Document): Chunk[] {
	const raw = splitIntoChunks(doc.body);

	return raw.map((c, i) => ({
	docId: doc.id,
	chunkIndex: i,
	text: c.text,
	startChar: c.pos,
	title: doc.title,
	}));
	}

	// Exported for testing
	export { scanBreakPoints, findCodeFences, isInsideCodeFence, findBestCutoff, splitIntoChunks };
	export type { BreakPoint, CodeFenceRegion };