// qmd-web/src/pipeline/orchestrator.test.ts
// shreyask's picture
// Deploy qmd-web
// 0ab4fc5 verified
import { describe, expect, it } from "vitest";
import type { EmbeddedChunk, ScoredChunk } from "../types";
import {
extractIntentTerms,
extractQueryTerms,
hasStrongBm25Signal,
normalizeBm25Score,
selectBestChunkForRerank,
} from "./orchestrator";
/**
 * Test fixture: wraps a synthetic chunk for `docId` in a scored result.
 *
 * The chunk is always the first one of its document (index 0, offset 0) with
 * text and title derived from `docId`; the result is tagged as coming from
 * the "bm25" source.
 *
 * @param docId - Identifier used for the chunk and embedded in its text/title.
 * @param score - Raw score attached to the result.
 * @returns A scored chunk suitable for feeding into the BM25 signal helpers.
 */
function makeScoredChunk(docId: string, score: number): ScoredChunk {
  const chunk = {
    docId,
    chunkIndex: 0,
    text: `chunk for ${docId}`,
    startChar: 0,
    title: `Title ${docId}`,
  };
  const scored: ScoredChunk = { chunk, score, source: "bm25" };
  return scored;
}
/**
 * Test fixture: builds an embedded chunk with deterministic, index-derived fields.
 *
 * `startChar` is `chunkIndex * 100`, and the embedding is a 3-element vector
 * whose first component is `chunkIndex + 1` (the rest are zero), so chunks at
 * different indices get distinct offsets and embeddings.
 *
 * @param text - Chunk body.
 * @param chunkIndex - Position of the chunk within its document.
 * @param docId - Owning document id; defaults to "doc1".
 * @returns The embedded chunk fixture.
 */
function makeEmbeddedChunk(
  text: string,
  chunkIndex: number,
  docId = "doc1",
): EmbeddedChunk {
  const embedding = Float32Array.of(chunkIndex + 1, 0, 0);
  const fixture: EmbeddedChunk = {
    docId,
    chunkIndex,
    text,
    startChar: chunkIndex * 100,
    title: `Title ${docId}`,
    embedding,
  };
  return fixture;
}
// normalizeBm25Score: non-positive input yields 0, positive input lands in
// 0..1, and a larger raw score yields a larger normalized score.
describe("normalizeBm25Score", () => {
  it("returns 0 for non-positive scores", () => {
    for (const rawScore of [0, -1]) {
      expect(normalizeBm25Score(rawScore)).toBe(0);
    }
  });

  it("maps positive scores into a stable 0..1 range", () => {
    // [raw score, expected normalized value], compared to 10 decimal places.
    const expectations: Array<[number, number]> = [
      [1, 0.5],
      [9, 0.9],
    ];
    for (const [rawScore, normalized] of expectations) {
      expect(normalizeBm25Score(rawScore)).toBeCloseTo(normalized, 10);
    }
  });

  it("is monotonic for stronger BM25 signals", () => {
    const stronger = normalizeBm25Score(10);
    const weaker = normalizeBm25Score(2);

    expect(stronger).toBeGreaterThan(weaker);
  });
});
// hasStrongBm25Signal: checked against two-result lists that vary the top
// score and its gap to the runner-up.
describe("hasStrongBm25Signal", () => {
  // Builds a ranked pair: "doc1" carries the top score, "doc2" the runner-up.
  const topTwo = (best: number, runnerUp: number): ScoredChunk[] => [
    makeScoredChunk("doc1", best),
    makeScoredChunk("doc2", runnerUp),
  ];

  it("detects a strong top result with a clear gap", () => {
    const verdict = hasStrongBm25Signal(topTwo(10, 2));

    expect(verdict).toBe(true);
  });

  it("rejects signals that are not strong enough", () => {
    const verdict = hasStrongBm25Signal(topTwo(5, 1));

    expect(verdict).toBe(false);
  });

  it("rejects signals without enough separation from the runner-up", () => {
    const verdict = hasStrongBm25Signal(topTwo(10, 8));

    expect(verdict).toBe(false);
  });
});
// extractQueryTerms: expected output is lowercase, punctuation-free,
// de-duplicated, and free of very short fragments.
describe("extractQueryTerms", () => {
  it("normalizes punctuation and removes duplicates", () => {
    const terms = extractQueryTerms("API, api! auth?? login");

    expect(terms).toEqual(["api", "auth", "login"]);
  });

  it("drops short noisy fragments", () => {
    // Of these tokens only "API" is expected to survive.
    const terms = extractQueryTerms("a an of API db");

    expect(terms).toEqual(["api"]);
  });
});
// selectBestChunkForRerank: returns the text of the chosen chunk. Covers
// query-term overlap, the two fallback cases, and the optional intent argument.
describe("selectBestChunkForRerank", () => {
  it("chooses the chunk with the highest query-term overlap", () => {
    const bestMatch = "Consensus algorithms coordinate a distributed cluster";
    const candidates = [
      makeEmbeddedChunk("Intro to distributed systems", 0),
      makeEmbeddedChunk(bestMatch, 1),
      makeEmbeddedChunk("Coffee history and trade routes", 2),
    ];

    const picked = selectBestChunkForRerank("distributed consensus algorithm", candidates);

    expect(picked).toBe(bestMatch);
  });

  it("falls back to the first chunk when the query has no informative terms", () => {
    const candidates = ["First chunk", "Second chunk"].map((text, index) =>
      makeEmbeddedChunk(text, index),
    );

    const picked = selectBestChunkForRerank("an of to", candidates);

    expect(picked).toBe("First chunk");
  });

  it("returns an empty string for empty chunk lists", () => {
    const picked = selectBestChunkForRerank("query", []);

    expect(picked).toBe("");
  });

  it("boosts chunk selection using intent terms at 0.5x weight", () => {
    const generalTips = "General performance tuning tips for servers";
    const webVitals = "Core web vitals and page load optimization";
    const candidates = [
      makeEmbeddedChunk(generalTips, 0),
      makeEmbeddedChunk(webVitals, 1),
      makeEmbeddedChunk("CPU benchmarking guide for gaming rigs", 2),
    ];

    // Query alone: only the first chunk mentions "performance", so it is picked.
    const withoutIntent = selectBestChunkForRerank("performance", candidates);
    expect(withoutIntent).toBe(generalTips);

    // With intent: the terms "web", "page" and "load" tip the choice to the second chunk.
    const withIntent = selectBestChunkForRerank("performance", candidates, "web page load times");
    expect(withIntent).toBe(webVitals);
  });
});
// extractIntentTerms: stop words are removed, short acronyms are kept, and
// surrounding punctuation does not leak into the terms.
describe("extractIntentTerms", () => {
  it("filters stop words and preserves domain terms", () => {
    const terms = extractIntentTerms("looking for API performance in the database");

    for (const kept of ["api", "performance", "database"]) {
      expect(terms).toContain(kept);
    }
    for (const dropped of ["looking", "for", "the"]) {
      expect(terms).not.toContain(dropped);
    }
  });

  it("preserves short domain acronyms like API, SQL, CI", () => {
    // Two-letter acronyms must come back too, lowercased and in input order.
    const terms = extractIntentTerms("API SQL CI CD");

    expect(terms).toEqual(["api", "sql", "ci", "cd"]);
  });

  it("strips Unicode punctuation", () => {
    const terms = extractIntentTerms('"web" (vitals) —performance—');

    for (const bareWord of ["web", "vitals", "performance"]) {
      expect(terms).toContain(bareWord);
    }
  });
});