add chunking module with markdown-aware token-based splitting
Browse files
Ports QMD's chunking algorithm: 900-token chunks (~3600 chars) with 15%
overlap, heading-preferred break points, code fence protection, and
squared distance decay for cut position scoring. Includes 23 vitest tests.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- package-lock.json +356 -1
- package.json +2 -1
- src/pipeline/chunking.test.ts +267 -0
- src/pipeline/chunking.ts +213 -0
package-lock.json
CHANGED
|
@@ -24,7 +24,8 @@
|
|
| 24 |
"globals": "^16.5.0",
|
| 25 |
"typescript": "~5.9.3",
|
| 26 |
"typescript-eslint": "^8.48.0",
|
| 27 |
-
"vite": "^7.3.1"
|
|
|
|
| 28 |
}
|
| 29 |
},
|
| 30 |
"node_modules/@babel/code-frame": {
|
|
@@ -1934,6 +1935,13 @@
|
|
| 1934 |
"win32"
|
| 1935 |
]
|
| 1936 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1937 |
"node_modules/@types/babel__core": {
|
| 1938 |
"version": "7.20.5",
|
| 1939 |
"resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz",
|
|
@@ -1979,6 +1987,24 @@
|
|
| 1979 |
"@babel/types": "^7.28.2"
|
| 1980 |
}
|
| 1981 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1982 |
"node_modules/@types/estree": {
|
| 1983 |
"version": "1.0.8",
|
| 1984 |
"resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz",
|
|
@@ -2338,6 +2364,117 @@
|
|
| 2338 |
"vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0"
|
| 2339 |
}
|
| 2340 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2341 |
"node_modules/acorn": {
|
| 2342 |
"version": "8.16.0",
|
| 2343 |
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.16.0.tgz",
|
|
@@ -2410,6 +2547,16 @@
|
|
| 2410 |
"dev": true,
|
| 2411 |
"license": "Python-2.0"
|
| 2412 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2413 |
"node_modules/balanced-match": {
|
| 2414 |
"version": "1.0.2",
|
| 2415 |
"resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz",
|
|
@@ -2513,6 +2660,16 @@
|
|
| 2513 |
],
|
| 2514 |
"license": "CC-BY-4.0"
|
| 2515 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2516 |
"node_modules/chalk": {
|
| 2517 |
"version": "4.1.2",
|
| 2518 |
"resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
|
|
@@ -2685,6 +2842,13 @@
|
|
| 2685 |
"node": ">= 0.4"
|
| 2686 |
}
|
| 2687 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2688 |
"node_modules/es6-error": {
|
| 2689 |
"version": "4.1.1",
|
| 2690 |
"resolved": "https://registry.npmjs.org/es6-error/-/es6-error-4.1.1.tgz",
|
|
@@ -2929,6 +3093,16 @@
|
|
| 2929 |
"node": ">=4.0"
|
| 2930 |
}
|
| 2931 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2932 |
"node_modules/esutils": {
|
| 2933 |
"version": "2.0.3",
|
| 2934 |
"resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz",
|
|
@@ -2939,6 +3113,16 @@
|
|
| 2939 |
"node": ">=0.10.0"
|
| 2940 |
}
|
| 2941 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2942 |
"node_modules/fast-deep-equal": {
|
| 2943 |
"version": "3.1.3",
|
| 2944 |
"resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz",
|
|
@@ -3391,6 +3575,16 @@
|
|
| 3391 |
"yallist": "^3.0.2"
|
| 3392 |
}
|
| 3393 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3394 |
"node_modules/matcher": {
|
| 3395 |
"version": "3.0.0",
|
| 3396 |
"resolved": "https://registry.npmjs.org/matcher/-/matcher-3.0.0.tgz",
|
|
@@ -3465,6 +3659,17 @@
|
|
| 3465 |
"node": ">= 0.4"
|
| 3466 |
}
|
| 3467 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3468 |
"node_modules/onnxruntime-common": {
|
| 3469 |
"version": "1.24.3",
|
| 3470 |
"resolved": "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.24.3.tgz",
|
|
@@ -3591,6 +3796,13 @@
|
|
| 3591 |
"node": ">=8"
|
| 3592 |
}
|
| 3593 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3594 |
"node_modules/picocolors": {
|
| 3595 |
"version": "1.1.1",
|
| 3596 |
"resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
|
|
@@ -3909,6 +4121,13 @@
|
|
| 3909 |
"node": ">=8"
|
| 3910 |
}
|
| 3911 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3912 |
"node_modules/source-map-js": {
|
| 3913 |
"version": "1.2.1",
|
| 3914 |
"resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz",
|
|
@@ -3925,6 +4144,20 @@
|
|
| 3925 |
"integrity": "sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==",
|
| 3926 |
"license": "BSD-3-Clause"
|
| 3927 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3928 |
"node_modules/strip-json-comments": {
|
| 3929 |
"version": "3.1.1",
|
| 3930 |
"resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz",
|
|
@@ -3951,6 +4184,23 @@
|
|
| 3951 |
"node": ">=8"
|
| 3952 |
}
|
| 3953 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3954 |
"node_modules/tinyglobby": {
|
| 3955 |
"version": "0.2.15",
|
| 3956 |
"resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.15.tgz",
|
|
@@ -3968,6 +4218,16 @@
|
|
| 3968 |
"url": "https://github.com/sponsors/SuperchupuDev"
|
| 3969 |
}
|
| 3970 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3971 |
"node_modules/ts-api-utils": {
|
| 3972 |
"version": "2.4.0",
|
| 3973 |
"resolved": "https://registry.npmjs.org/ts-api-utils/-/ts-api-utils-2.4.0.tgz",
|
|
@@ -4173,6 +4433,84 @@
|
|
| 4173 |
}
|
| 4174 |
}
|
| 4175 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4176 |
"node_modules/which": {
|
| 4177 |
"version": "2.0.2",
|
| 4178 |
"resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
|
|
@@ -4189,6 +4527,23 @@
|
|
| 4189 |
"node": ">= 8"
|
| 4190 |
}
|
| 4191 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4192 |
"node_modules/word-wrap": {
|
| 4193 |
"version": "1.2.5",
|
| 4194 |
"resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.5.tgz",
|
|
|
|
| 24 |
"globals": "^16.5.0",
|
| 25 |
"typescript": "~5.9.3",
|
| 26 |
"typescript-eslint": "^8.48.0",
|
| 27 |
+
"vite": "^7.3.1",
|
| 28 |
+
"vitest": "^4.0.18"
|
| 29 |
}
|
| 30 |
},
|
| 31 |
"node_modules/@babel/code-frame": {
|
|
|
|
| 1935 |
"win32"
|
| 1936 |
]
|
| 1937 |
},
|
| 1938 |
+
"node_modules/@standard-schema/spec": {
|
| 1939 |
+
"version": "1.1.0",
|
| 1940 |
+
"resolved": "https://registry.npmjs.org/@standard-schema/spec/-/spec-1.1.0.tgz",
|
| 1941 |
+
"integrity": "sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==",
|
| 1942 |
+
"dev": true,
|
| 1943 |
+
"license": "MIT"
|
| 1944 |
+
},
|
| 1945 |
"node_modules/@types/babel__core": {
|
| 1946 |
"version": "7.20.5",
|
| 1947 |
"resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz",
|
|
|
|
| 1987 |
"@babel/types": "^7.28.2"
|
| 1988 |
}
|
| 1989 |
},
|
| 1990 |
+
"node_modules/@types/chai": {
|
| 1991 |
+
"version": "5.2.3",
|
| 1992 |
+
"resolved": "https://registry.npmjs.org/@types/chai/-/chai-5.2.3.tgz",
|
| 1993 |
+
"integrity": "sha512-Mw558oeA9fFbv65/y4mHtXDs9bPnFMZAL/jxdPFUpOHHIXX91mcgEHbS5Lahr+pwZFR8A7GQleRWeI6cGFC2UA==",
|
| 1994 |
+
"dev": true,
|
| 1995 |
+
"license": "MIT",
|
| 1996 |
+
"dependencies": {
|
| 1997 |
+
"@types/deep-eql": "*",
|
| 1998 |
+
"assertion-error": "^2.0.1"
|
| 1999 |
+
}
|
| 2000 |
+
},
|
| 2001 |
+
"node_modules/@types/deep-eql": {
|
| 2002 |
+
"version": "4.0.2",
|
| 2003 |
+
"resolved": "https://registry.npmjs.org/@types/deep-eql/-/deep-eql-4.0.2.tgz",
|
| 2004 |
+
"integrity": "sha512-c9h9dVVMigMPc4bwTvC5dxqtqJZwQPePsWjPlpSOnojbor6pGqdk541lfA7AqFQr5pB1BRdq0juY9db81BwyFw==",
|
| 2005 |
+
"dev": true,
|
| 2006 |
+
"license": "MIT"
|
| 2007 |
+
},
|
| 2008 |
"node_modules/@types/estree": {
|
| 2009 |
"version": "1.0.8",
|
| 2010 |
"resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz",
|
|
|
|
| 2364 |
"vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0"
|
| 2365 |
}
|
| 2366 |
},
|
| 2367 |
+
"node_modules/@vitest/expect": {
|
| 2368 |
+
"version": "4.0.18",
|
| 2369 |
+
"resolved": "https://registry.npmjs.org/@vitest/expect/-/expect-4.0.18.tgz",
|
| 2370 |
+
"integrity": "sha512-8sCWUyckXXYvx4opfzVY03EOiYVxyNrHS5QxX3DAIi5dpJAAkyJezHCP77VMX4HKA2LDT/Jpfo8i2r5BE3GnQQ==",
|
| 2371 |
+
"dev": true,
|
| 2372 |
+
"license": "MIT",
|
| 2373 |
+
"dependencies": {
|
| 2374 |
+
"@standard-schema/spec": "^1.0.0",
|
| 2375 |
+
"@types/chai": "^5.2.2",
|
| 2376 |
+
"@vitest/spy": "4.0.18",
|
| 2377 |
+
"@vitest/utils": "4.0.18",
|
| 2378 |
+
"chai": "^6.2.1",
|
| 2379 |
+
"tinyrainbow": "^3.0.3"
|
| 2380 |
+
},
|
| 2381 |
+
"funding": {
|
| 2382 |
+
"url": "https://opencollective.com/vitest"
|
| 2383 |
+
}
|
| 2384 |
+
},
|
| 2385 |
+
"node_modules/@vitest/mocker": {
|
| 2386 |
+
"version": "4.0.18",
|
| 2387 |
+
"resolved": "https://registry.npmjs.org/@vitest/mocker/-/mocker-4.0.18.tgz",
|
| 2388 |
+
"integrity": "sha512-HhVd0MDnzzsgevnOWCBj5Otnzobjy5wLBe4EdeeFGv8luMsGcYqDuFRMcttKWZA5vVO8RFjexVovXvAM4JoJDQ==",
|
| 2389 |
+
"dev": true,
|
| 2390 |
+
"license": "MIT",
|
| 2391 |
+
"dependencies": {
|
| 2392 |
+
"@vitest/spy": "4.0.18",
|
| 2393 |
+
"estree-walker": "^3.0.3",
|
| 2394 |
+
"magic-string": "^0.30.21"
|
| 2395 |
+
},
|
| 2396 |
+
"funding": {
|
| 2397 |
+
"url": "https://opencollective.com/vitest"
|
| 2398 |
+
},
|
| 2399 |
+
"peerDependencies": {
|
| 2400 |
+
"msw": "^2.4.9",
|
| 2401 |
+
"vite": "^6.0.0 || ^7.0.0-0"
|
| 2402 |
+
},
|
| 2403 |
+
"peerDependenciesMeta": {
|
| 2404 |
+
"msw": {
|
| 2405 |
+
"optional": true
|
| 2406 |
+
},
|
| 2407 |
+
"vite": {
|
| 2408 |
+
"optional": true
|
| 2409 |
+
}
|
| 2410 |
+
}
|
| 2411 |
+
},
|
| 2412 |
+
"node_modules/@vitest/pretty-format": {
|
| 2413 |
+
"version": "4.0.18",
|
| 2414 |
+
"resolved": "https://registry.npmjs.org/@vitest/pretty-format/-/pretty-format-4.0.18.tgz",
|
| 2415 |
+
"integrity": "sha512-P24GK3GulZWC5tz87ux0m8OADrQIUVDPIjjj65vBXYG17ZeU3qD7r+MNZ1RNv4l8CGU2vtTRqixrOi9fYk/yKw==",
|
| 2416 |
+
"dev": true,
|
| 2417 |
+
"license": "MIT",
|
| 2418 |
+
"dependencies": {
|
| 2419 |
+
"tinyrainbow": "^3.0.3"
|
| 2420 |
+
},
|
| 2421 |
+
"funding": {
|
| 2422 |
+
"url": "https://opencollective.com/vitest"
|
| 2423 |
+
}
|
| 2424 |
+
},
|
| 2425 |
+
"node_modules/@vitest/runner": {
|
| 2426 |
+
"version": "4.0.18",
|
| 2427 |
+
"resolved": "https://registry.npmjs.org/@vitest/runner/-/runner-4.0.18.tgz",
|
| 2428 |
+
"integrity": "sha512-rpk9y12PGa22Jg6g5M3UVVnTS7+zycIGk9ZNGN+m6tZHKQb7jrP7/77WfZy13Y/EUDd52NDsLRQhYKtv7XfPQw==",
|
| 2429 |
+
"dev": true,
|
| 2430 |
+
"license": "MIT",
|
| 2431 |
+
"dependencies": {
|
| 2432 |
+
"@vitest/utils": "4.0.18",
|
| 2433 |
+
"pathe": "^2.0.3"
|
| 2434 |
+
},
|
| 2435 |
+
"funding": {
|
| 2436 |
+
"url": "https://opencollective.com/vitest"
|
| 2437 |
+
}
|
| 2438 |
+
},
|
| 2439 |
+
"node_modules/@vitest/snapshot": {
|
| 2440 |
+
"version": "4.0.18",
|
| 2441 |
+
"resolved": "https://registry.npmjs.org/@vitest/snapshot/-/snapshot-4.0.18.tgz",
|
| 2442 |
+
"integrity": "sha512-PCiV0rcl7jKQjbgYqjtakly6T1uwv/5BQ9SwBLekVg/EaYeQFPiXcgrC2Y7vDMA8dM1SUEAEV82kgSQIlXNMvA==",
|
| 2443 |
+
"dev": true,
|
| 2444 |
+
"license": "MIT",
|
| 2445 |
+
"dependencies": {
|
| 2446 |
+
"@vitest/pretty-format": "4.0.18",
|
| 2447 |
+
"magic-string": "^0.30.21",
|
| 2448 |
+
"pathe": "^2.0.3"
|
| 2449 |
+
},
|
| 2450 |
+
"funding": {
|
| 2451 |
+
"url": "https://opencollective.com/vitest"
|
| 2452 |
+
}
|
| 2453 |
+
},
|
| 2454 |
+
"node_modules/@vitest/spy": {
|
| 2455 |
+
"version": "4.0.18",
|
| 2456 |
+
"resolved": "https://registry.npmjs.org/@vitest/spy/-/spy-4.0.18.tgz",
|
| 2457 |
+
"integrity": "sha512-cbQt3PTSD7P2OARdVW3qWER5EGq7PHlvE+QfzSC0lbwO+xnt7+XH06ZzFjFRgzUX//JmpxrCu92VdwvEPlWSNw==",
|
| 2458 |
+
"dev": true,
|
| 2459 |
+
"license": "MIT",
|
| 2460 |
+
"funding": {
|
| 2461 |
+
"url": "https://opencollective.com/vitest"
|
| 2462 |
+
}
|
| 2463 |
+
},
|
| 2464 |
+
"node_modules/@vitest/utils": {
|
| 2465 |
+
"version": "4.0.18",
|
| 2466 |
+
"resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-4.0.18.tgz",
|
| 2467 |
+
"integrity": "sha512-msMRKLMVLWygpK3u2Hybgi4MNjcYJvwTb0Ru09+fOyCXIgT5raYP041DRRdiJiI3k/2U6SEbAETB3YtBrUkCFA==",
|
| 2468 |
+
"dev": true,
|
| 2469 |
+
"license": "MIT",
|
| 2470 |
+
"dependencies": {
|
| 2471 |
+
"@vitest/pretty-format": "4.0.18",
|
| 2472 |
+
"tinyrainbow": "^3.0.3"
|
| 2473 |
+
},
|
| 2474 |
+
"funding": {
|
| 2475 |
+
"url": "https://opencollective.com/vitest"
|
| 2476 |
+
}
|
| 2477 |
+
},
|
| 2478 |
"node_modules/acorn": {
|
| 2479 |
"version": "8.16.0",
|
| 2480 |
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.16.0.tgz",
|
|
|
|
| 2547 |
"dev": true,
|
| 2548 |
"license": "Python-2.0"
|
| 2549 |
},
|
| 2550 |
+
"node_modules/assertion-error": {
|
| 2551 |
+
"version": "2.0.1",
|
| 2552 |
+
"resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-2.0.1.tgz",
|
| 2553 |
+
"integrity": "sha512-Izi8RQcffqCeNVgFigKli1ssklIbpHnCYc6AknXGYoB6grJqyeby7jv12JUQgmTAnIDnbck1uxksT4dzN3PWBA==",
|
| 2554 |
+
"dev": true,
|
| 2555 |
+
"license": "MIT",
|
| 2556 |
+
"engines": {
|
| 2557 |
+
"node": ">=12"
|
| 2558 |
+
}
|
| 2559 |
+
},
|
| 2560 |
"node_modules/balanced-match": {
|
| 2561 |
"version": "1.0.2",
|
| 2562 |
"resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz",
|
|
|
|
| 2660 |
],
|
| 2661 |
"license": "CC-BY-4.0"
|
| 2662 |
},
|
| 2663 |
+
"node_modules/chai": {
|
| 2664 |
+
"version": "6.2.2",
|
| 2665 |
+
"resolved": "https://registry.npmjs.org/chai/-/chai-6.2.2.tgz",
|
| 2666 |
+
"integrity": "sha512-NUPRluOfOiTKBKvWPtSD4PhFvWCqOi0BGStNWs57X9js7XGTprSmFoz5F0tWhR4WPjNeR9jXqdC7/UpSJTnlRg==",
|
| 2667 |
+
"dev": true,
|
| 2668 |
+
"license": "MIT",
|
| 2669 |
+
"engines": {
|
| 2670 |
+
"node": ">=18"
|
| 2671 |
+
}
|
| 2672 |
+
},
|
| 2673 |
"node_modules/chalk": {
|
| 2674 |
"version": "4.1.2",
|
| 2675 |
"resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
|
|
|
|
| 2842 |
"node": ">= 0.4"
|
| 2843 |
}
|
| 2844 |
},
|
| 2845 |
+
"node_modules/es-module-lexer": {
|
| 2846 |
+
"version": "1.7.0",
|
| 2847 |
+
"resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-1.7.0.tgz",
|
| 2848 |
+
"integrity": "sha512-jEQoCwk8hyb2AZziIOLhDqpm5+2ww5uIE6lkO/6jcOCusfk6LhMHpXXfBLXTZ7Ydyt0j4VoUQv6uGNYbdW+kBA==",
|
| 2849 |
+
"dev": true,
|
| 2850 |
+
"license": "MIT"
|
| 2851 |
+
},
|
| 2852 |
"node_modules/es6-error": {
|
| 2853 |
"version": "4.1.1",
|
| 2854 |
"resolved": "https://registry.npmjs.org/es6-error/-/es6-error-4.1.1.tgz",
|
|
|
|
| 3093 |
"node": ">=4.0"
|
| 3094 |
}
|
| 3095 |
},
|
| 3096 |
+
"node_modules/estree-walker": {
|
| 3097 |
+
"version": "3.0.3",
|
| 3098 |
+
"resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-3.0.3.tgz",
|
| 3099 |
+
"integrity": "sha512-7RUKfXgSMMkzt6ZuXmqapOurLGPPfgj6l9uRZ7lRGolvk0y2yocc35LdcxKC5PQZdn2DMqioAQ2NoWcrTKmm6g==",
|
| 3100 |
+
"dev": true,
|
| 3101 |
+
"license": "MIT",
|
| 3102 |
+
"dependencies": {
|
| 3103 |
+
"@types/estree": "^1.0.0"
|
| 3104 |
+
}
|
| 3105 |
+
},
|
| 3106 |
"node_modules/esutils": {
|
| 3107 |
"version": "2.0.3",
|
| 3108 |
"resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz",
|
|
|
|
| 3113 |
"node": ">=0.10.0"
|
| 3114 |
}
|
| 3115 |
},
|
| 3116 |
+
"node_modules/expect-type": {
|
| 3117 |
+
"version": "1.3.0",
|
| 3118 |
+
"resolved": "https://registry.npmjs.org/expect-type/-/expect-type-1.3.0.tgz",
|
| 3119 |
+
"integrity": "sha512-knvyeauYhqjOYvQ66MznSMs83wmHrCycNEN6Ao+2AeYEfxUIkuiVxdEa1qlGEPK+We3n0THiDciYSsCcgW/DoA==",
|
| 3120 |
+
"dev": true,
|
| 3121 |
+
"license": "Apache-2.0",
|
| 3122 |
+
"engines": {
|
| 3123 |
+
"node": ">=12.0.0"
|
| 3124 |
+
}
|
| 3125 |
+
},
|
| 3126 |
"node_modules/fast-deep-equal": {
|
| 3127 |
"version": "3.1.3",
|
| 3128 |
"resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz",
|
|
|
|
| 3575 |
"yallist": "^3.0.2"
|
| 3576 |
}
|
| 3577 |
},
|
| 3578 |
+
"node_modules/magic-string": {
|
| 3579 |
+
"version": "0.30.21",
|
| 3580 |
+
"resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.30.21.tgz",
|
| 3581 |
+
"integrity": "sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==",
|
| 3582 |
+
"dev": true,
|
| 3583 |
+
"license": "MIT",
|
| 3584 |
+
"dependencies": {
|
| 3585 |
+
"@jridgewell/sourcemap-codec": "^1.5.5"
|
| 3586 |
+
}
|
| 3587 |
+
},
|
| 3588 |
"node_modules/matcher": {
|
| 3589 |
"version": "3.0.0",
|
| 3590 |
"resolved": "https://registry.npmjs.org/matcher/-/matcher-3.0.0.tgz",
|
|
|
|
| 3659 |
"node": ">= 0.4"
|
| 3660 |
}
|
| 3661 |
},
|
| 3662 |
+
"node_modules/obug": {
|
| 3663 |
+
"version": "2.1.1",
|
| 3664 |
+
"resolved": "https://registry.npmjs.org/obug/-/obug-2.1.1.tgz",
|
| 3665 |
+
"integrity": "sha512-uTqF9MuPraAQ+IsnPf366RG4cP9RtUi7MLO1N3KEc+wb0a6yKpeL0lmk2IB1jY5KHPAlTc6T/JRdC/YqxHNwkQ==",
|
| 3666 |
+
"dev": true,
|
| 3667 |
+
"funding": [
|
| 3668 |
+
"https://github.com/sponsors/sxzz",
|
| 3669 |
+
"https://opencollective.com/debug"
|
| 3670 |
+
],
|
| 3671 |
+
"license": "MIT"
|
| 3672 |
+
},
|
| 3673 |
"node_modules/onnxruntime-common": {
|
| 3674 |
"version": "1.24.3",
|
| 3675 |
"resolved": "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.24.3.tgz",
|
|
|
|
| 3796 |
"node": ">=8"
|
| 3797 |
}
|
| 3798 |
},
|
| 3799 |
+
"node_modules/pathe": {
|
| 3800 |
+
"version": "2.0.3",
|
| 3801 |
+
"resolved": "https://registry.npmjs.org/pathe/-/pathe-2.0.3.tgz",
|
| 3802 |
+
"integrity": "sha512-WUjGcAqP1gQacoQe+OBJsFA7Ld4DyXuUIjZ5cc75cLHvJ7dtNsTugphxIADwspS+AraAUePCKrSVtPLFj/F88w==",
|
| 3803 |
+
"dev": true,
|
| 3804 |
+
"license": "MIT"
|
| 3805 |
+
},
|
| 3806 |
"node_modules/picocolors": {
|
| 3807 |
"version": "1.1.1",
|
| 3808 |
"resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
|
|
|
|
| 4121 |
"node": ">=8"
|
| 4122 |
}
|
| 4123 |
},
|
| 4124 |
+
"node_modules/siginfo": {
|
| 4125 |
+
"version": "2.0.0",
|
| 4126 |
+
"resolved": "https://registry.npmjs.org/siginfo/-/siginfo-2.0.0.tgz",
|
| 4127 |
+
"integrity": "sha512-ybx0WO1/8bSBLEWXZvEd7gMW3Sn3JFlW3TvX1nREbDLRNQNaeNN8WK0meBwPdAaOI7TtRRRJn/Es1zhrrCHu7g==",
|
| 4128 |
+
"dev": true,
|
| 4129 |
+
"license": "ISC"
|
| 4130 |
+
},
|
| 4131 |
"node_modules/source-map-js": {
|
| 4132 |
"version": "1.2.1",
|
| 4133 |
"resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz",
|
|
|
|
| 4144 |
"integrity": "sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==",
|
| 4145 |
"license": "BSD-3-Clause"
|
| 4146 |
},
|
| 4147 |
+
"node_modules/stackback": {
|
| 4148 |
+
"version": "0.0.2",
|
| 4149 |
+
"resolved": "https://registry.npmjs.org/stackback/-/stackback-0.0.2.tgz",
|
| 4150 |
+
"integrity": "sha512-1XMJE5fQo1jGH6Y/7ebnwPOBEkIEnT4QF32d5R1+VXdXveM0IBMJt8zfaxX1P3QhVwrYe+576+jkANtSS2mBbw==",
|
| 4151 |
+
"dev": true,
|
| 4152 |
+
"license": "MIT"
|
| 4153 |
+
},
|
| 4154 |
+
"node_modules/std-env": {
|
| 4155 |
+
"version": "3.10.0",
|
| 4156 |
+
"resolved": "https://registry.npmjs.org/std-env/-/std-env-3.10.0.tgz",
|
| 4157 |
+
"integrity": "sha512-5GS12FdOZNliM5mAOxFRg7Ir0pWz8MdpYm6AY6VPkGpbA7ZzmbzNcBJQ0GPvvyWgcY7QAhCgf9Uy89I03faLkg==",
|
| 4158 |
+
"dev": true,
|
| 4159 |
+
"license": "MIT"
|
| 4160 |
+
},
|
| 4161 |
"node_modules/strip-json-comments": {
|
| 4162 |
"version": "3.1.1",
|
| 4163 |
"resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz",
|
|
|
|
| 4184 |
"node": ">=8"
|
| 4185 |
}
|
| 4186 |
},
|
| 4187 |
+
"node_modules/tinybench": {
|
| 4188 |
+
"version": "2.9.0",
|
| 4189 |
+
"resolved": "https://registry.npmjs.org/tinybench/-/tinybench-2.9.0.tgz",
|
| 4190 |
+
"integrity": "sha512-0+DUvqWMValLmha6lr4kD8iAMK1HzV0/aKnCtWb9v9641TnP/MFb7Pc2bxoxQjTXAErryXVgUOfv2YqNllqGeg==",
|
| 4191 |
+
"dev": true,
|
| 4192 |
+
"license": "MIT"
|
| 4193 |
+
},
|
| 4194 |
+
"node_modules/tinyexec": {
|
| 4195 |
+
"version": "1.0.2",
|
| 4196 |
+
"resolved": "https://registry.npmjs.org/tinyexec/-/tinyexec-1.0.2.tgz",
|
| 4197 |
+
"integrity": "sha512-W/KYk+NFhkmsYpuHq5JykngiOCnxeVL8v8dFnqxSD8qEEdRfXk1SDM6JzNqcERbcGYj9tMrDQBYV9cjgnunFIg==",
|
| 4198 |
+
"dev": true,
|
| 4199 |
+
"license": "MIT",
|
| 4200 |
+
"engines": {
|
| 4201 |
+
"node": ">=18"
|
| 4202 |
+
}
|
| 4203 |
+
},
|
| 4204 |
"node_modules/tinyglobby": {
|
| 4205 |
"version": "0.2.15",
|
| 4206 |
"resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.15.tgz",
|
|
|
|
| 4218 |
"url": "https://github.com/sponsors/SuperchupuDev"
|
| 4219 |
}
|
| 4220 |
},
|
| 4221 |
+
"node_modules/tinyrainbow": {
|
| 4222 |
+
"version": "3.0.3",
|
| 4223 |
+
"resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.0.3.tgz",
|
| 4224 |
+
"integrity": "sha512-PSkbLUoxOFRzJYjjxHJt9xro7D+iilgMX/C9lawzVuYiIdcihh9DXmVibBe8lmcFrRi/VzlPjBxbN7rH24q8/Q==",
|
| 4225 |
+
"dev": true,
|
| 4226 |
+
"license": "MIT",
|
| 4227 |
+
"engines": {
|
| 4228 |
+
"node": ">=14.0.0"
|
| 4229 |
+
}
|
| 4230 |
+
},
|
| 4231 |
"node_modules/ts-api-utils": {
|
| 4232 |
"version": "2.4.0",
|
| 4233 |
"resolved": "https://registry.npmjs.org/ts-api-utils/-/ts-api-utils-2.4.0.tgz",
|
|
|
|
| 4433 |
}
|
| 4434 |
}
|
| 4435 |
},
|
| 4436 |
+
"node_modules/vitest": {
|
| 4437 |
+
"version": "4.0.18",
|
| 4438 |
+
"resolved": "https://registry.npmjs.org/vitest/-/vitest-4.0.18.tgz",
|
| 4439 |
+
"integrity": "sha512-hOQuK7h0FGKgBAas7v0mSAsnvrIgAvWmRFjmzpJ7SwFHH3g1k2u37JtYwOwmEKhK6ZO3v9ggDBBm0La1LCK4uQ==",
|
| 4440 |
+
"dev": true,
|
| 4441 |
+
"license": "MIT",
|
| 4442 |
+
"dependencies": {
|
| 4443 |
+
"@vitest/expect": "4.0.18",
|
| 4444 |
+
"@vitest/mocker": "4.0.18",
|
| 4445 |
+
"@vitest/pretty-format": "4.0.18",
|
| 4446 |
+
"@vitest/runner": "4.0.18",
|
| 4447 |
+
"@vitest/snapshot": "4.0.18",
|
| 4448 |
+
"@vitest/spy": "4.0.18",
|
| 4449 |
+
"@vitest/utils": "4.0.18",
|
| 4450 |
+
"es-module-lexer": "^1.7.0",
|
| 4451 |
+
"expect-type": "^1.2.2",
|
| 4452 |
+
"magic-string": "^0.30.21",
|
| 4453 |
+
"obug": "^2.1.1",
|
| 4454 |
+
"pathe": "^2.0.3",
|
| 4455 |
+
"picomatch": "^4.0.3",
|
| 4456 |
+
"std-env": "^3.10.0",
|
| 4457 |
+
"tinybench": "^2.9.0",
|
| 4458 |
+
"tinyexec": "^1.0.2",
|
| 4459 |
+
"tinyglobby": "^0.2.15",
|
| 4460 |
+
"tinyrainbow": "^3.0.3",
|
| 4461 |
+
"vite": "^6.0.0 || ^7.0.0",
|
| 4462 |
+
"why-is-node-running": "^2.3.0"
|
| 4463 |
+
},
|
| 4464 |
+
"bin": {
|
| 4465 |
+
"vitest": "vitest.mjs"
|
| 4466 |
+
},
|
| 4467 |
+
"engines": {
|
| 4468 |
+
"node": "^20.0.0 || ^22.0.0 || >=24.0.0"
|
| 4469 |
+
},
|
| 4470 |
+
"funding": {
|
| 4471 |
+
"url": "https://opencollective.com/vitest"
|
| 4472 |
+
},
|
| 4473 |
+
"peerDependencies": {
|
| 4474 |
+
"@edge-runtime/vm": "*",
|
| 4475 |
+
"@opentelemetry/api": "^1.9.0",
|
| 4476 |
+
"@types/node": "^20.0.0 || ^22.0.0 || >=24.0.0",
|
| 4477 |
+
"@vitest/browser-playwright": "4.0.18",
|
| 4478 |
+
"@vitest/browser-preview": "4.0.18",
|
| 4479 |
+
"@vitest/browser-webdriverio": "4.0.18",
|
| 4480 |
+
"@vitest/ui": "4.0.18",
|
| 4481 |
+
"happy-dom": "*",
|
| 4482 |
+
"jsdom": "*"
|
| 4483 |
+
},
|
| 4484 |
+
"peerDependenciesMeta": {
|
| 4485 |
+
"@edge-runtime/vm": {
|
| 4486 |
+
"optional": true
|
| 4487 |
+
},
|
| 4488 |
+
"@opentelemetry/api": {
|
| 4489 |
+
"optional": true
|
| 4490 |
+
},
|
| 4491 |
+
"@types/node": {
|
| 4492 |
+
"optional": true
|
| 4493 |
+
},
|
| 4494 |
+
"@vitest/browser-playwright": {
|
| 4495 |
+
"optional": true
|
| 4496 |
+
},
|
| 4497 |
+
"@vitest/browser-preview": {
|
| 4498 |
+
"optional": true
|
| 4499 |
+
},
|
| 4500 |
+
"@vitest/browser-webdriverio": {
|
| 4501 |
+
"optional": true
|
| 4502 |
+
},
|
| 4503 |
+
"@vitest/ui": {
|
| 4504 |
+
"optional": true
|
| 4505 |
+
},
|
| 4506 |
+
"happy-dom": {
|
| 4507 |
+
"optional": true
|
| 4508 |
+
},
|
| 4509 |
+
"jsdom": {
|
| 4510 |
+
"optional": true
|
| 4511 |
+
}
|
| 4512 |
+
}
|
| 4513 |
+
},
|
| 4514 |
"node_modules/which": {
|
| 4515 |
"version": "2.0.2",
|
| 4516 |
"resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
|
|
|
|
| 4527 |
"node": ">= 8"
|
| 4528 |
}
|
| 4529 |
},
|
| 4530 |
+
"node_modules/why-is-node-running": {
|
| 4531 |
+
"version": "2.3.0",
|
| 4532 |
+
"resolved": "https://registry.npmjs.org/why-is-node-running/-/why-is-node-running-2.3.0.tgz",
|
| 4533 |
+
"integrity": "sha512-hUrmaWBdVDcxvYqnyh09zunKzROWjbZTiNy8dBEjkS7ehEDQibXJ7XvlmtbwuTclUiIyN+CyXQD4Vmko8fNm8w==",
|
| 4534 |
+
"dev": true,
|
| 4535 |
+
"license": "MIT",
|
| 4536 |
+
"dependencies": {
|
| 4537 |
+
"siginfo": "^2.0.0",
|
| 4538 |
+
"stackback": "0.0.2"
|
| 4539 |
+
},
|
| 4540 |
+
"bin": {
|
| 4541 |
+
"why-is-node-running": "cli.js"
|
| 4542 |
+
},
|
| 4543 |
+
"engines": {
|
| 4544 |
+
"node": ">=8"
|
| 4545 |
+
}
|
| 4546 |
+
},
|
| 4547 |
"node_modules/word-wrap": {
|
| 4548 |
"version": "1.2.5",
|
| 4549 |
"resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.5.tgz",
|
package.json
CHANGED
|
@@ -26,6 +26,7 @@
|
|
| 26 |
"globals": "^16.5.0",
|
| 27 |
"typescript": "~5.9.3",
|
| 28 |
"typescript-eslint": "^8.48.0",
|
| 29 |
-
"vite": "^7.3.1"
|
|
|
|
| 30 |
}
|
| 31 |
}
|
|
|
|
| 26 |
"globals": "^16.5.0",
|
| 27 |
"typescript": "~5.9.3",
|
| 28 |
"typescript-eslint": "^8.48.0",
|
| 29 |
+
"vite": "^7.3.1",
|
| 30 |
+
"vitest": "^4.0.18"
|
| 31 |
}
|
| 32 |
}
|
src/pipeline/chunking.test.ts
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { describe, it, expect } from "vitest";
|
| 2 |
+
import {
|
| 3 |
+
chunkDocument,
|
| 4 |
+
extractTitle,
|
| 5 |
+
scanBreakPoints,
|
| 6 |
+
findCodeFences,
|
| 7 |
+
isInsideCodeFence,
|
| 8 |
+
splitIntoChunks,
|
| 9 |
+
} from "./chunking";
|
| 10 |
+
import type { Document } from "../types";
|
| 11 |
+
import { CHUNK_SIZE_CHARS, CHUNK_OVERLAP_CHARS } from "../constants";
|
| 12 |
+
|
| 13 |
+
// ---------------------------------------------------------------------------
|
| 14 |
+
// extractTitle
|
| 15 |
+
// ---------------------------------------------------------------------------
|
| 16 |
+
describe("extractTitle", () => {
|
| 17 |
+
it("extracts the first H1 heading", () => {
|
| 18 |
+
expect(extractTitle("# My Document\n\nBody text", "file.md")).toBe(
|
| 19 |
+
"My Document",
|
| 20 |
+
);
|
| 21 |
+
});
|
| 22 |
+
|
| 23 |
+
it("ignores H2 headings and uses the H1", () => {
|
| 24 |
+
const content = "## Section\n\n# Title\n\nBody";
|
| 25 |
+
expect(extractTitle(content, "file.md")).toBe("Title");
|
| 26 |
+
});
|
| 27 |
+
|
| 28 |
+
it("falls back to filename without extension", () => {
|
| 29 |
+
expect(extractTitle("No headings here", "notes.md")).toBe("notes");
|
| 30 |
+
});
|
| 31 |
+
|
| 32 |
+
it("handles filename without extension", () => {
|
| 33 |
+
expect(extractTitle("No headings", "README")).toBe("README");
|
| 34 |
+
});
|
| 35 |
+
|
| 36 |
+
it("trims whitespace from heading", () => {
|
| 37 |
+
expect(extractTitle("# Spaced Title \n", "f.md")).toBe("Spaced Title");
|
| 38 |
+
});
|
| 39 |
+
});
|
| 40 |
+
|
| 41 |
+
// ---------------------------------------------------------------------------
|
| 42 |
+
// scanBreakPoints
|
| 43 |
+
// ---------------------------------------------------------------------------
|
| 44 |
+
describe("scanBreakPoints", () => {
|
| 45 |
+
it("detects heading break points", () => {
|
| 46 |
+
const text = "\n# H1\n## H2\n### H3";
|
| 47 |
+
const bps = scanBreakPoints(text);
|
| 48 |
+
const types = bps.map((bp) => bp.type);
|
| 49 |
+
expect(types).toContain("h1");
|
| 50 |
+
expect(types).toContain("h2");
|
| 51 |
+
expect(types).toContain("h3");
|
| 52 |
+
});
|
| 53 |
+
|
| 54 |
+
it("detects blank-line paragraph breaks", () => {
|
| 55 |
+
const text = "line1\n\nline2";
|
| 56 |
+
const bps = scanBreakPoints(text);
|
| 57 |
+
expect(bps.some((bp) => bp.type === "blank")).toBe(true);
|
| 58 |
+
});
|
| 59 |
+
|
| 60 |
+
it("higher-score pattern wins at same position", () => {
|
| 61 |
+
// A heading line also matches \n, but heading should win
|
| 62 |
+
const text = "\n# Heading";
|
| 63 |
+
const bps = scanBreakPoints(text);
|
| 64 |
+
const atZero = bps.find((bp) => bp.pos === 0);
|
| 65 |
+
expect(atZero?.type).toBe("h1");
|
| 66 |
+
expect(atZero?.score).toBe(100);
|
| 67 |
+
});
|
| 68 |
+
|
| 69 |
+
it("returns break points sorted by position", () => {
|
| 70 |
+
const text = "\n## B\n\n# A\ntext";
|
| 71 |
+
const bps = scanBreakPoints(text);
|
| 72 |
+
for (let i = 1; i < bps.length; i++) {
|
| 73 |
+
expect(bps[i].pos).toBeGreaterThanOrEqual(bps[i - 1].pos);
|
| 74 |
+
}
|
| 75 |
+
});
|
| 76 |
+
});
|
| 77 |
+
|
| 78 |
+
// ---------------------------------------------------------------------------
|
| 79 |
+
// findCodeFences / isInsideCodeFence
|
| 80 |
+
// ---------------------------------------------------------------------------
|
| 81 |
+
// Unit tests for findCodeFences: paired fences, an unclosed trailing fence,
// and several fences in one document.
describe("findCodeFences", () => {
  it("finds paired code fences", () => {
    const regions = findCodeFences("before\n```js\ncode\n```\nafter");

    expect(regions).toHaveLength(1);
    const [only] = regions;
    expect(only.start).toBeLessThan(only.end);
  });

  it("handles unclosed fence extending to end", () => {
    const source = "before\n```js\ncode without closing";
    const regions = findCodeFences(source);

    expect(regions).toHaveLength(1);
    // An opener with no closer is treated as running to the end of the text.
    expect(regions[0].end).toBe(source.length);
  });

  it("handles multiple code fence pairs", () => {
    const source = ["a", "```", "b", "```", "c", "```", "d", "```", "e"].join(
      "\n",
    );

    expect(findCodeFences(source)).toHaveLength(2);
  });
});
|
| 102 |
+
|
| 103 |
+
// Unit tests for isInsideCodeFence: interior, exterior and boundary positions
// of a single region spanning 10..50.
describe("isInsideCodeFence", () => {
  const makeFences = () => [{ start: 10, end: 50 }];

  it("returns true for position inside a fence", () => {
    expect(isInsideCodeFence(25, makeFences())).toBe(true);
  });

  it("returns false for position outside fences", () => {
    const fences = makeFences();

    for (const outside of [5, 55]) {
      expect(isInsideCodeFence(outside, fences)).toBe(false);
    }
  });

  it("returns false for position at fence boundary", () => {
    const fences = makeFences();

    // Both ends of a region are exclusive.
    for (const edge of [10, 50]) {
      expect(isInsideCodeFence(edge, fences)).toBe(false);
    }
  });
});
|
| 122 |
+
|
| 123 |
+
// ---------------------------------------------------------------------------
|
| 124 |
+
// splitIntoChunks
|
| 125 |
+
// ---------------------------------------------------------------------------
|
| 126 |
+
// Unit tests for splitIntoChunks: short input, overlap, heading-preferred cuts,
// code-fence protection of cut positions, and termination.
describe("splitIntoChunks", () => {
  it("returns a single chunk for short content", () => {
    const text = "Short content";
    const chunks = splitIntoChunks(text, 100, 15);

    expect(chunks).toHaveLength(1);
    expect(chunks[0].text).toBe(text);
    expect(chunks[0].pos).toBe(0);
  });

  it("splits long content into overlapping chunks", () => {
    // Build text longer than one chunk
    const line = "word ".repeat(20) + "\n"; // ~100 chars
    const text = line.repeat(50); // ~5000 chars
    const chunks = splitIntoChunks(text, 1000, 150, 200);
    expect(chunks.length).toBeGreaterThan(1);

    // Verify overlap: each chunk (except the first) should start before
    // the previous chunk ends
    for (let i = 1; i < chunks.length; i++) {
      const prevEnd = chunks[i - 1].pos + chunks[i - 1].text.length;
      expect(chunks[i].pos).toBeLessThan(prevEnd);
    }
  });

  it("prefers heading boundaries for splits", () => {
    // The heading's leading newline sits at offset 900, inside the 300-char
    // look-back window of the first target cut position (1000).
    const filler = "x".repeat(900);
    const text = filler + "\n## Section Two\n" + "y".repeat(900);
    const chunks = splitIntoChunks(text, 1000, 100, 300);

    expect(chunks.length).toBeGreaterThanOrEqual(2);

    // The first cut must land exactly on the heading, not mid-text at 1000.
    expect(chunks[0].text).toBe(filler);

    // The heading itself is carried by a later chunk.
    const laterChunkHasHeading = chunks
      .slice(1)
      .some((c) => c.text.includes("## Section Two"));
    expect(laterChunkHasHeading).toBe(true);
  });

  it("does not split inside code fences", () => {
    // Create a code fence that spans the would-be chunk boundary
    const before = "a".repeat(800);
    const codeFence =
      "\n```\n" + "code line\n".repeat(80) + "\n```\n"; // 810 chars
    const after = "b".repeat(400);
    const text = before + codeFence + after;

    const chunks = splitIntoChunks(text, 1000, 100, 200);
    expect(chunks.length).toBeGreaterThan(1);

    // Every cut (the end offset of a chunk) must fall outside the fenced
    // region. Only cut positions are protected: the overlap may make the
    // *start* of the following chunk land inside a fence, so per-chunk fence
    // marker parity is deliberately not asserted.
    const fences = findCodeFences(text);
    expect(fences).toHaveLength(1);
    for (const chunk of chunks) {
      const cutPos = chunk.pos + chunk.text.length;
      expect(isInsideCodeFence(cutPos, fences)).toBe(false);
    }
  });

  it("makes forward progress even with edge cases", () => {
    // No break points at all: every cut falls back to the target position.
    const text = "x".repeat(5000);
    const chunks = splitIntoChunks(text, 1000, 150, 200);
    expect(chunks.length).toBeGreaterThan(1);

    // Verify every chunk has content
    for (const chunk of chunks) {
      expect(chunk.text.length).toBeGreaterThan(0);
    }
  });
});
|
| 205 |
+
|
| 206 |
+
// ---------------------------------------------------------------------------
|
| 207 |
+
// chunkDocument (integration)
|
| 208 |
+
// ---------------------------------------------------------------------------
|
| 209 |
+
// Integration tests for chunkDocument: metadata mapping for a one-chunk
// document, multi-chunk bookkeeping, and use of the configured chunk size.
describe("chunkDocument", () => {
  it("produces Chunk objects with correct metadata", () => {
    const doc: Document = {
      id: "test-doc",
      title: "Test Document",
      body: "Hello world",
      filepath: "test.md",
    };

    const result = chunkDocument(doc);

    expect(result).toHaveLength(1);
    expect(result[0]).toEqual({
      docId: "test-doc",
      chunkIndex: 0,
      text: "Hello world",
      startChar: 0,
      title: "Test Document",
    });
  });

  it("chunks a long document into multiple pieces", () => {
    const paragraph = "paragraph text here. ".repeat(50) + "\n\n";
    const doc: Document = {
      id: "long-doc",
      title: "Long Document",
      body: paragraph.repeat(20),
      filepath: "long.md",
    };

    const result = chunkDocument(doc);
    expect(result.length).toBeGreaterThan(1);

    result.forEach((chunk, position) => {
      // All chunks reference the parent doc
      expect(chunk.docId).toBe("long-doc");
      expect(chunk.title).toBe("Long Document");
    });

    // Chunk indices are sequential
    result.forEach((chunk, position) => {
      expect(chunk.chunkIndex).toBe(position);
    });
  });

  it("uses configured CHUNK_SIZE_CHARS and CHUNK_OVERLAP_CHARS", () => {
    // A body with no break points, three times the configured chunk size.
    const doc: Document = {
      id: "sized",
      title: "Sized",
      body: "a".repeat(CHUNK_SIZE_CHARS * 3),
      filepath: "sized.md",
    };

    const result = chunkDocument(doc);
    expect(result.length).toBeGreaterThan(1);

    // First chunk should be close to CHUNK_SIZE_CHARS
    const firstLength = result[0].text.length;
    expect(firstLength).toBeLessThanOrEqual(CHUNK_SIZE_CHARS);
    expect(firstLength).toBeGreaterThan(CHUNK_SIZE_CHARS * 0.5);
  });
});
|
src/pipeline/chunking.ts
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { Chunk, Document } from "../types";
|
| 2 |
+
import { CHUNK_SIZE_CHARS, CHUNK_OVERLAP_CHARS } from "../constants";
|
| 3 |
+
|
| 4 |
+
// Look-back distance, in characters, from a target cut position within which
// break points are considered (~200 tokens).
const CHUNK_WINDOW_CHARS = 800;

// Candidate split patterns as [pattern, score, label] tuples; a higher score
// marks a better place to split. Every pattern starts at a "\n", so a match
// index is the offset of the newline that precedes the construct.
// Listed from most to least specific so headings are considered before the
// generic newline pattern.
const BREAK_PATTERNS: [RegExp, number, string][] = [
  { type: "h1", score: 100, pattern: /\n#{1}(?!#)/g },
  { type: "h2", score: 90, pattern: /\n#{2}(?!#)/g },
  { type: "h3", score: 80, pattern: /\n#{3}(?!#)/g },
  { type: "h4", score: 70, pattern: /\n#{4}(?!#)/g },
  { type: "h5", score: 60, pattern: /\n#{5}(?!#)/g },
  { type: "h6", score: 50, pattern: /\n#{6}(?!#)/g },
  { type: "codeblock", score: 80, pattern: /\n```/g },
  { type: "hr", score: 60, pattern: /\n(?:---|\*\*\*|___)\s*\n/g },
  { type: "blank", score: 20, pattern: /\n\n+/g },
  { type: "list", score: 5, pattern: /\n[-*]\s/g },
  { type: "numlist", score: 5, pattern: /\n\d+\.\s/g },
  { type: "newline", score: 1, pattern: /\n/g },
].map(({ pattern, score, type }): [RegExp, number, string] => [
  pattern,
  score,
  type,
]);
|
| 23 |
+
|
| 24 |
+
/** A candidate position at which the text may be split. */
interface BreakPoint {
  /** Character offset in the scanned text where the matching pattern begins. */
  pos: number;
  /** Desirability of splitting here; higher is better. */
  score: number;
  /** Label of the pattern that produced this break point (e.g. "h1", "blank"). */
  type: string;
}
|
| 29 |
+
|
| 30 |
+
/** Character range covered by a fenced code block, markers included. */
interface CodeFenceRegion {
  /** Offset at which the opening fence match begins. */
  start: number;
  /** Offset just past the closing fence marker, or the text length if unclosed. */
  end: number;
}
|
| 34 |
+
|
| 35 |
+
/**
 * Collect every candidate split position in `text`.
 *
 * Each entry of BREAK_PATTERNS is matched against the whole text. At most one
 * break point is kept per character offset: the one with the highest score
 * (on a tie, the pattern listed first is kept).
 *
 * @param text - The text to scan.
 * @returns Break points ordered by ascending position.
 */
function scanBreakPoints(text: string): BreakPoint[] {
  const bestByPos = new Map<number, BreakPoint>();

  BREAK_PATTERNS.forEach(([pattern, score, type]) => {
    for (const { index } of text.matchAll(pattern)) {
      const pos = index!;
      const current = bestByPos.get(pos);

      // Keep the entry already recorded unless this pattern strictly outscores it.
      if (current && score <= current.score) continue;

      bestByPos.set(pos, { pos, score, type });
    }
  });

  return Array.from(bestByPos.values()).sort(
    (left, right) => left.pos - right.pos,
  );
}
|
| 54 |
+
|
| 55 |
+
/**
 * Locate fenced code regions (pairs of ``` markers) in `text`.
 *
 * A marker is three backticks at the start of a line, including the very first
 * line of the text. Markers are paired in order of appearance: the first opens
 * a region, the next closes it, and so on.
 *
 * @param text - The text to scan.
 * @returns One region per fence, in document order. `start` is the offset
 *   where the opening match begins (the preceding newline, or 0 for a fence on
 *   the first line); `end` is the offset just past the closing backticks. A
 *   fence that is never closed extends to `text.length`.
 */
function findCodeFences(text: string): CodeFenceRegion[] {
  const regions: CodeFenceRegion[] = [];
  // Without the `m` flag `^` matches only at offset 0, so this accepts a fence
  // on the first line in addition to any fence that follows a newline. Missing
  // the first-line fence would flip opener/closer parity for the whole text.
  const fencePattern = /(?:^|\n)```/g;
  let openStart: number | undefined;

  for (const match of text.matchAll(fencePattern)) {
    const markerPos = match.index ?? 0;

    if (openStart === undefined) {
      openStart = markerPos;
    } else {
      regions.push({ start: openStart, end: markerPos + match[0].length });
      openStart = undefined;
    }
  }

  // Unclosed fence extends to end of document
  if (openStart !== undefined) {
    regions.push({ start: openStart, end: text.length });
  }

  return regions;
}
|
| 81 |
+
|
| 82 |
+
/**
 * Report whether `pos` lies strictly inside any of the given fence regions.
 * Both ends of a region are exclusive, so a position equal to `start` or
 * `end` counts as outside.
 *
 * @param pos - Character offset to test.
 * @param fences - Regions to test against.
 * @returns `true` if some region satisfies `start < pos < end`.
 */
function isInsideCodeFence(pos: number, fences: CodeFenceRegion[]): boolean {
  for (const { start, end } of fences) {
    if (start < pos && pos < end) {
      return true;
    }
  }
  return false;
}
|
| 85 |
+
|
| 86 |
+
/**
 * Choose where to cut at or before `targetCharPos`.
 *
 * Only break points within `[targetCharPos - windowChars, targetCharPos]` that
 * are not strictly inside a code fence are eligible. Each one's score is
 * scaled by `1 - (distance / windowChars)^2 * decayFactor`, where `distance`
 * is how far it lies before the target, so a strong break (a heading) some
 * way back can still beat a weak one right at the target.
 *
 * @param breakPoints - Candidates sorted by ascending position.
 * @param targetCharPos - The latest acceptable cut position.
 * @param windowChars - How far back from the target to look.
 * @param decayFactor - Strength of the distance penalty.
 * @param codeFences - Regions in which cutting is not allowed.
 * @returns The position of the best eligible break point, or `targetCharPos`
 *   itself when no break point is eligible.
 */
function findBestCutoff(
  breakPoints: BreakPoint[],
  targetCharPos: number,
  windowChars: number = CHUNK_WINDOW_CHARS,
  decayFactor: number = 0.7,
  codeFences: CodeFenceRegion[] = [],
): number {
  const earliestPos = targetCharPos - windowChars;
  let winner = { pos: targetCharPos, score: -1 };

  for (const candidate of breakPoints) {
    const { pos, score } = candidate;

    if (pos < earliestPos) continue;
    // Input is sorted, so nothing after this point can be at or before the target.
    if (pos > targetCharPos) break;
    if (isInsideCodeFence(pos, codeFences)) continue;

    const fractionBack = (targetCharPos - pos) / windowChars;
    const weight = 1.0 - fractionBack * fractionBack * decayFactor;
    const weighted = score * weight;

    if (weighted > winner.score) {
      winner = { pos, score: weighted };
    }
  }

  return winner.pos;
}
|
| 121 |
+
|
| 122 |
+
/**
 * Split content into overlapping character-based chunks, preferring markdown
 * heading boundaries as cut positions.
 *
 * Each chunk spans at most `maxChars` characters. Its end is chosen by
 * `findBestCutoff` within `windowChars` before the maximal end; break points
 * strictly inside a code fence are never chosen. If no eligible break point
 * exists in that window, the cut falls at the maximal end regardless of what
 * surrounds it. The next chunk starts `overlapChars` before the previous cut
 * (or at the cut itself when that would not advance), so a chunk's start is
 * not aligned to any break point.
 *
 * @param content - Text to split.
 * @param maxChars - Maximum chunk length in characters; must be at least 1
 *   when `content` is longer than `maxChars`.
 * @param overlapChars - Number of characters shared by consecutive chunks.
 * @param windowChars - Look-back distance used when searching for a cut.
 * @returns Chunks in document order, each with its text and start offset.
 *   Content no longer than `maxChars` is returned as a single chunk.
 * @throws RangeError if non-empty content must be split but `maxChars` is not
 *   at least 1 (the loop below could otherwise never advance).
 */
function splitIntoChunks(
  content: string,
  maxChars: number = CHUNK_SIZE_CHARS,
  overlapChars: number = CHUNK_OVERLAP_CHARS,
  windowChars: number = CHUNK_WINDOW_CHARS,
): { text: string; pos: number }[] {
  if (content.length <= maxChars) {
    return [{ text: content, pos: 0 }];
  }

  // With maxChars below 1 the target end never moves past the chunk start,
  // which would push empty chunks forever. Fail fast instead.
  if (content.length > 0 && !(maxChars >= 1)) {
    throw new RangeError(
      `splitIntoChunks: maxChars must be at least 1, got ${maxChars}`,
    );
  }

  const breakPoints = scanBreakPoints(content);
  const codeFences = findCodeFences(content);
  const chunks: { text: string; pos: number }[] = [];
  let charPos = 0;

  while (charPos < content.length) {
    // maxChars >= 1 and charPos < content.length, so targetEndPos > charPos.
    const targetEndPos = Math.min(charPos + maxChars, content.length);
    let endPos = targetEndPos;

    // The final chunk simply runs to the end of the content.
    if (targetEndPos < content.length) {
      const bestCutoff = findBestCutoff(
        breakPoints,
        targetEndPos,
        windowChars,
        0.7,
        codeFences,
      );

      // Ignore a cutoff at or before the chunk start: it would yield an empty chunk.
      if (bestCutoff > charPos && bestCutoff <= targetEndPos) {
        endPos = bestCutoff;
      }
    }

    chunks.push({ text: content.slice(charPos, endPos), pos: charPos });

    if (endPos >= content.length) break;

    // Step back by the overlap, unless that would not move past the start of
    // the chunk just emitted; then continue from the cut with no overlap.
    const overlappedStart = endPos - overlapChars;
    charPos = overlappedStart <= charPos ? endPos : overlappedStart;
  }

  return chunks;
}
|
| 177 |
+
|
| 178 |
+
// ---------------------------------------------------------------------------
|
| 179 |
+
// Public API
|
| 180 |
+
// ---------------------------------------------------------------------------
|
| 181 |
+
|
| 182 |
+
/**
 * Extract a title from markdown content.
 *
 * The title is the text of the first line that consists of a single `#`,
 * at least one space or tab, and some non-blank text. An empty `#` line does
 * not count and never captures text from the lines after it.
 *
 * @param content - Markdown source to search.
 * @param filename - Fallback name used when no H1 heading is found.
 * @returns The trimmed heading text, or `filename` with its last extension
 *   removed. A name whose only dot is its first character is returned as-is.
 */
export function extractTitle(content: string, filename: string): string {
  // `[ \t]+` rather than `\s+`: whitespace after `#` must stay on the heading
  // line, otherwise a bare "#" would swallow the newline and match the next
  // paragraph. `\S` requires the heading to contain visible text.
  const heading = /^#[ \t]+(\S.*)$/m.exec(content);
  if (heading) return heading[1].trim();

  // Strip extension from filename
  const dot = filename.lastIndexOf(".");
  return dot > 0 ? filename.slice(0, dot) : filename;
}
|
| 194 |
+
|
| 195 |
+
/**
 * Turn one Document into its list of overlapping Chunk records, suitable for
 * embedding and search.
 *
 * The document body is split with `splitIntoChunks` using its default sizes.
 *
 * @param doc - The document to chunk.
 * @returns One Chunk per piece, in document order. `chunkIndex` counts from 0,
 *   `startChar` is the piece's offset in `doc.body`, and `docId` / `title` are
 *   copied from the document.
 */
export function chunkDocument(doc: Document): Chunk[] {
  const pieces = splitIntoChunks(doc.body);
  const result: Chunk[] = [];

  for (const [chunkIndex, piece] of pieces.entries()) {
    result.push({
      docId: doc.id,
      chunkIndex,
      text: piece.text,
      startChar: piece.pos,
      title: doc.title,
    });
  }

  return result;
}
|
| 210 |
+
|
| 211 |
+
// Exported for testing
|
| 212 |
+
export { scanBreakPoints, findCodeFences, isInsideCodeFence, findBestCutoff, splitIntoChunks };
|
| 213 |
+
export type { BreakPoint, CodeFenceRegion };
|