| import { describe, it, expect } from "vitest"; |
| import { |
| chunkDocument, |
| extractTitle, |
| scanBreakPoints, |
| findCodeFences, |
| isInsideCodeFence, |
| splitIntoChunks, |
| } from "./chunking"; |
| import type { Document } from "../types"; |
| import { CHUNK_SIZE_CHARS } from "../constants"; |
|
|
| |
| |
| |
// extractTitle(content, filename): these cases pin that the text of an H1
// heading is returned (trimmed), and that the filename minus its extension is
// returned when the content has no heading.
describe("extractTitle", () => {
  it("extracts the first H1 heading", () => {
    const markdown = "# My Document\n\nBody text";
    const title = extractTitle(markdown, "file.md");
    expect(title).toBe("My Document");
  });

  it("ignores H2 headings and uses the H1", () => {
    // The H2 comes first in the content; the H1 must still be chosen.
    const title = extractTitle("## Section\n\n# Title\n\nBody", "file.md");
    expect(title).toBe("Title");
  });

  it("falls back to filename without extension", () => {
    const title = extractTitle("No headings here", "notes.md");
    expect(title).toBe("notes");
  });

  it("handles filename without extension", () => {
    // No dot in the filename: the name is expected back unchanged.
    const title = extractTitle("No headings", "README");
    expect(title).toBe("README");
  });

  it("trims whitespace from heading", () => {
    const title = extractTitle("# Spaced Title \n", "f.md");
    expect(title).toBe("Spaced Title");
  });
});
|
|
| |
| |
| |
// scanBreakPoints(text): the results are used here as objects carrying
// `pos`, `type` and `score`. These cases cover heading and blank-line
// detection, tie-breaking at a shared position, and output ordering.
describe("scanBreakPoints", () => {
  it("detects heading break points", () => {
    const found = scanBreakPoints("\n# H1\n## H2\n### H3");
    const kinds = found.map(({ type }) => type);
    expect(kinds).toContain("h1");
    expect(kinds).toContain("h2");
    expect(kinds).toContain("h3");
  });

  it("detects blank-line paragraph breaks", () => {
    const found = scanBreakPoints("line1\n\nline2");
    const hasBlank = found.some(({ type }) => type === "blank");
    expect(hasBlank).toBe(true);
  });

  it("higher-score pattern wins at same position", () => {
    // Position 0 holds the leading newline of the "\n# " heading; the entry
    // reported there is expected to be the h1 one with score 100.
    const found = scanBreakPoints("\n# Heading");
    const first = found.find(({ pos }) => pos === 0);
    expect(first?.type).toBe("h1");
    expect(first?.score).toBe(100);
  });

  it("returns break points sorted by position", () => {
    const found = scanBreakPoints("\n## B\n\n# A\ntext");
    // Compare each entry with its predecessor; the first has none to compare.
    for (const [index, current] of found.entries()) {
      if (index === 0) continue;
      expect(current.pos).toBeGreaterThanOrEqual(found[index - 1].pos);
    }
  });
});
|
|
| |
| |
| |
// findCodeFences(text): the results are used here as `{ start, end }`
// regions, one per fenced code block found in the text.
describe("findCodeFences", () => {
  it("finds paired code fences", () => {
    const regions = findCodeFences("before\n```js\ncode\n```\nafter");
    expect(regions).toHaveLength(1);
    const [only] = regions;
    expect(only.start).toBeLessThan(only.end);
  });

  it("handles unclosed fence extending to end", () => {
    // An opening fence with no closing marker should run to the end of text.
    const source = "before\n```js\ncode without closing";
    const regions = findCodeFences(source);
    expect(regions).toHaveLength(1);
    const [only] = regions;
    expect(only.end).toBe(source.length);
  });

  it("handles multiple code fence pairs", () => {
    // Four fence markers form two regions.
    const regions = findCodeFences("a\n```\nb\n```\nc\n```\nd\n```\ne");
    expect(regions).toHaveLength(2);
  });
});
|
|
// isInsideCodeFence(pos, fences): membership is expected to be strict, so a
// position equal to a region's start or end does not count as inside.
describe("isInsideCodeFence", () => {
  // Build a fresh fixture for every test so no state is shared between them.
  const singleFence = () => [{ start: 10, end: 50 }];

  it("returns true for position inside a fence", () => {
    const inside = isInsideCodeFence(25, singleFence());
    expect(inside).toBe(true);
  });

  it("returns false for position outside fences", () => {
    const fences = singleFence();
    // One position before the region and one after it.
    for (const position of [5, 55]) {
      expect(isInsideCodeFence(position, fences)).toBe(false);
    }
  });

  it("returns false for position at fence boundary", () => {
    const fences = singleFence();
    // Both edges of the region are treated as outside.
    for (const edge of [10, 50]) {
      expect(isInsideCodeFence(edge, fences)).toBe(false);
    }
  });
});
|
|
| |
| |
| |
// splitIntoChunks(text, ...numeric limits): the results are used here as
// `{ text, pos }` objects, where `pos` is the chunk's start offset in the
// input (the first chunk of a short input has pos 0).
// NOTE(review): the numeric arguments look like (max chunk size, overlap,
// break-point search window) -- confirm against ./chunking.
describe("splitIntoChunks", () => {
  it("returns a single chunk for short content", () => {
    const text = "Short content";
    const chunks = splitIntoChunks(text, 100, 15);
    expect(chunks).toHaveLength(1);
    expect(chunks[0].text).toBe(text);
    expect(chunks[0].pos).toBe(0);
  });

  it("splits long content into overlapping chunks", () => {
    // 50 lines of about 100 characters each, far more than one 1000-char chunk.
    const line = "word ".repeat(20) + "\n";
    const text = line.repeat(50);
    const chunks = splitIntoChunks(text, 1000, 150, 200);
    expect(chunks.length).toBeGreaterThan(1);

    // Overlap: every chunk must start before the previous chunk ends.
    for (let i = 1; i < chunks.length; i++) {
      const prevEnd = chunks[i - 1].pos + chunks[i - 1].text.length;
      expect(chunks[i].pos).toBeLessThan(prevEnd);
    }
  });

  it("prefers heading boundaries for splits", () => {
    // The heading sits at offset 900, close to the 1000-char limit.
    const filler = "x".repeat(900);
    const text = filler + "\n## Section Two\n" + "y".repeat(900);
    const chunks = splitIntoChunks(text, 1000, 100, 300);

    expect(chunks.length).toBeGreaterThanOrEqual(2);

    // The heading must appear in some chunk after the first one.
    const laterChunkHasHeading = chunks
      .slice(1)
      .some((c) => c.text.includes("## Section Two"));
    expect(laterChunkHasHeading).toBe(true);
  });

  it("does not split inside code fences", () => {
    // The fenced block starts at offset 800 and is about 810 characters long,
    // so a naive cut at the 1000-char limit would land inside it.
    const before = "a".repeat(800);
    const codeFence = "\n```\n" + "code line\n".repeat(80) + "\n```\n";
    const after = "b".repeat(400);
    const text = before + codeFence + after;

    const chunks = splitIntoChunks(text, 1000, 100, 200);
    expect(chunks.length).toBeGreaterThan(1);

    // Guard against a vacuous pass: the fixture must contain a fenced region.
    const fences = findCodeFences(text);
    expect(fences.length).toBeGreaterThan(0);

    // Every split point (the end of each chunk except the last, which ends at
    // the end of the document) must lie outside the fenced region. Region
    // edges count as outside, matching isInsideCodeFence's boundary behavior.
    for (const chunk of chunks.slice(0, -1)) {
      const cutPos = chunk.pos + chunk.text.length;
      expect(isInsideCodeFence(cutPos, fences)).toBe(false);
    }
  });

  it("makes forward progress even with edge cases", () => {
    // No newlines or headings anywhere, so there are no natural break points.
    const text = "x".repeat(5000);
    const chunks = splitIntoChunks(text, 1000, 150, 200);
    expect(chunks.length).toBeGreaterThan(1);

    // No chunk may be empty.
    for (const chunk of chunks) {
      expect(chunk.text.length).toBeGreaterThan(0);
    }
  });
});
|
|
| |
| |
| |
// chunkDocument(doc): the results are used here as objects with `docId`,
// `chunkIndex`, `text`, `startChar` and `title`, with the id and title taken
// from the source document.
describe("chunkDocument", () => {
  // Small factory so each test states only the fields it cares about.
  const makeDoc = (
    id: string,
    title: string,
    body: string,
    filepath: string,
  ): Document => ({ id, title, body, filepath });

  it("produces Chunk objects with correct metadata", () => {
    const doc = makeDoc("test-doc", "Test Document", "Hello world", "test.md");
    const result = chunkDocument(doc);
    expect(result).toHaveLength(1);
    expect(result[0]).toEqual({
      docId: "test-doc",
      chunkIndex: 0,
      text: "Hello world",
      startChar: 0,
      title: "Test Document",
    });
  });

  it("chunks a long document into multiple pieces", () => {
    // 20 paragraphs separated by blank lines.
    const paragraph = "paragraph text here. ".repeat(50) + "\n\n";
    const doc = makeDoc(
      "long-doc",
      "Long Document",
      paragraph.repeat(20),
      "long.md",
    );
    const result = chunkDocument(doc);
    expect(result.length).toBeGreaterThan(1);

    // Every chunk carries the parent document's id and title.
    result.forEach((piece) => {
      expect(piece.docId).toBe("long-doc");
      expect(piece.title).toBe("Long Document");
    });

    // Chunk indices are sequential, starting at 0.
    result.forEach((piece, position) => {
      expect(piece.chunkIndex).toBe(position);
    });
  });

  it("uses configured CHUNK_SIZE_CHARS and CHUNK_OVERLAP_CHARS", () => {
    // Three times the configured chunk size with no natural break points.
    const doc = makeDoc(
      "sized",
      "Sized",
      "a".repeat(CHUNK_SIZE_CHARS * 3),
      "sized.md",
    );
    const result = chunkDocument(doc);
    expect(result.length).toBeGreaterThan(1);

    // The first chunk must be within the size limit but more than half of it.
    // NOTE(review): only the size bound is asserted here; the overlap named
    // in the test title is not checked.
    const firstLength = result[0].text.length;
    expect(firstLength).toBeLessThanOrEqual(CHUNK_SIZE_CHARS);
    expect(firstLength).toBeGreaterThan(CHUNK_SIZE_CHARS * 0.5);
  });
});
|
|