| import { describe, expect, it } from "vitest"; |
| import type { EmbeddedChunk, ScoredChunk } from "../types"; |
| import { |
| extractIntentTerms, |
| extractQueryTerms, |
| hasStrongBm25Signal, |
| normalizeBm25Score, |
| selectBestChunkForRerank, |
| } from "./orchestrator"; |
|
|
/**
 * Builds a minimal `ScoredChunk` fixture attributed to the BM25 source.
 *
 * The nested chunk always sits at index 0 / offset 0, and its text and title
 * are derived from `docId` so that fixtures are easy to tell apart.
 *
 * @param docId - Identifier of the document the chunk belongs to.
 * @param score - Raw score to attach to the result.
 * @returns A scored chunk whose `source` is `"bm25"`.
 */
function makeScoredChunk(docId: string, score: number): ScoredChunk {
  const chunk = {
    docId,
    chunkIndex: 0,
    text: `chunk for ${docId}`,
    startChar: 0,
    title: `Title ${docId}`,
  };

  return { chunk, score, source: "bm25" };
}
|
|
/**
 * Builds an `EmbeddedChunk` fixture with deterministic, index-derived fields.
 *
 * The start offset is `chunkIndex * 100`, and the three-component embedding
 * carries `chunkIndex + 1` in its first slot (zeros elsewhere), so every chunk
 * index yields a distinct vector.
 *
 * @param text - Chunk text.
 * @param chunkIndex - Position of the chunk within its document.
 * @param docId - Owning document id; defaults to `"doc1"`.
 * @returns The assembled embedded chunk.
 */
function makeEmbeddedChunk(
  text: string,
  chunkIndex: number,
  docId = "doc1",
): EmbeddedChunk {
  const startChar = chunkIndex * 100;
  const title = `Title ${docId}`;
  const embedding = Float32Array.of(chunkIndex + 1, 0, 0);

  return { docId, chunkIndex, text, startChar, title, embedding };
}
|
|
// Pins the contract of normalizeBm25Score: non-positive input collapses to 0,
// positive input lands at fixed reference points, and larger raw scores
// normalize to larger values.
describe("normalizeBm25Score", () => {
  it("returns 0 for non-positive scores", () => {
    for (const rawScore of [0, -1]) {
      expect(normalizeBm25Score(rawScore)).toBe(0);
    }
  });

  it("maps positive scores into a stable 0..1 range", () => {
    // [raw score, expected normalized value]. These two points are consistent
    // with score / (1 + score), but only the points themselves are pinned here.
    const referencePoints: Array<[number, number]> = [
      [1, 0.5],
      [9, 0.9],
    ];

    for (const [rawScore, expected] of referencePoints) {
      expect(normalizeBm25Score(rawScore)).toBeCloseTo(expected, 10);
    }
  });

  it("is monotonic for stronger BM25 signals", () => {
    const stronger = normalizeBm25Score(10);
    const weaker = normalizeBm25Score(2);

    expect(stronger).toBeGreaterThan(weaker);
  });
});
|
|
// Checks hasStrongBm25Signal against three ranked result lists: a dominant
// top hit, a top hit whose raw score is lower, and a top hit that is closely
// followed by the runner-up.
describe("hasStrongBm25Signal", () => {
  it("detects a strong top result with a clear gap", () => {
    const top = makeScoredChunk("doc1", 10);
    const runnerUp = makeScoredChunk("doc2", 2);

    expect(hasStrongBm25Signal([top, runnerUp])).toBe(true);
  });

  it("rejects signals that are not strong enough", () => {
    // Same kind of gap as above, but the leading raw score is only 5.
    const top = makeScoredChunk("doc1", 5);
    const runnerUp = makeScoredChunk("doc2", 1);

    expect(hasStrongBm25Signal([top, runnerUp])).toBe(false);
  });

  it("rejects signals without enough separation from the runner-up", () => {
    // The leading score matches the "strong" case, but the runner-up is at 8.
    const top = makeScoredChunk("doc1", 10);
    const runnerUp = makeScoredChunk("doc2", 8);

    expect(hasStrongBm25Signal([top, runnerUp])).toBe(false);
  });
});
|
|
// Verifies that extractQueryTerms lower-cases, strips punctuation,
// de-duplicates, and discards very short tokens.
describe("extractQueryTerms", () => {
  it("normalizes punctuation and removes duplicates", () => {
    const terms = extractQueryTerms("API, api! auth?? login");

    // "API" and "api" are expected to collapse into a single lower-case entry.
    expect(terms).toEqual(["api", "auth", "login"]);
  });

  it("drops short noisy fragments", () => {
    const terms = extractQueryTerms("a an of API db");

    // Only the three-letter token is expected to survive.
    expect(terms).toEqual(["api"]);
  });
});
|
|
// Exercises selectBestChunkForRerank: overlap-based selection, the fallbacks
// for uninformative queries and empty input, and the optional intent argument.
describe("selectBestChunkForRerank", () => {
  it("chooses the chunk with the highest query-term overlap", () => {
    const bestMatch = "Consensus algorithms coordinate a distributed cluster";
    const chunks = [
      "Intro to distributed systems",
      bestMatch,
      "Coffee history and trade routes",
    ].map((text, index) => makeEmbeddedChunk(text, index));

    const selected = selectBestChunkForRerank(
      "distributed consensus algorithm",
      chunks,
    );

    expect(selected).toBe(bestMatch);
  });

  it("falls back to the first chunk when the query has no informative terms", () => {
    const chunks = ["First chunk", "Second chunk"].map((text, index) =>
      makeEmbeddedChunk(text, index),
    );

    const selected = selectBestChunkForRerank("an of to", chunks);

    expect(selected).toBe("First chunk");
  });

  it("returns an empty string for empty chunk lists", () => {
    const selected = selectBestChunkForRerank("query", []);

    expect(selected).toBe("");
  });

  it("boosts chunk selection using intent terms at 0.5x weight", () => {
    const serverTuning = "General performance tuning tips for servers";
    const webVitals = "Core web vitals and page load optimization";
    const chunks = [
      serverTuning,
      webVitals,
      "CPU benchmarking guide for gaming rigs",
    ].map((text, index) => makeEmbeddedChunk(text, index));

    // Without an intent hint, the only chunk mentioning "performance" wins.
    expect(selectBestChunkForRerank("performance", chunks)).toBe(serverTuning);

    // With the intent hint, the web-vitals chunk is expected to win instead.
    // Presumably its "web" / "page" / "load" intent hits, at the half weight
    // named in the test title, outweigh the single query-term hit above.
    expect(
      selectBestChunkForRerank("performance", chunks, "web page load times"),
    ).toBe(webVitals);
  });
});
|
|
// Verifies extractIntentTerms: filler words are removed, short acronyms are
// kept, and surrounding punctuation does not leak into the terms.
describe("extractIntentTerms", () => {
  it("filters stop words and preserves domain terms", () => {
    const terms = extractIntentTerms("looking for API performance in the database");

    for (const kept of ["api", "performance", "database"]) {
      expect(terms).toContain(kept);
    }
    for (const dropped of ["looking", "for", "the"]) {
      expect(terms).not.toContain(dropped);
    }
  });

  it("preserves short domain acronyms like API, SQL, CI", () => {
    const terms = extractIntentTerms("API SQL CI CD");

    // Two-letter acronyms are expected to survive, lower-cased and in input order.
    expect(terms).toEqual(["api", "sql", "ci", "cd"]);
  });

  it("strips Unicode punctuation", () => {
    const terms = extractIntentTerms('"web" (vitals) —performance—');

    // Quotes, parentheses and em dashes must not stick to the extracted words.
    for (const word of ["web", "vitals", "performance"]) {
      expect(terms).toContain(word);
    }
  });
});
|
|