Spaces:

shreyask
/

qmd-web

Running

App Files Files Community

qmd-web / src /pipeline /orchestrator.test.ts

shreyask

Deploy qmd-web

0ab4fc5 verified about 1 month ago

raw

history blame contribute delete

4.84 kB

	import { describe, expect, it } from "vitest";
	import type { EmbeddedChunk, ScoredChunk } from "../types";
	import {
	extractIntentTerms,
	extractQueryTerms,
	hasStrongBm25Signal,
	normalizeBm25Score,
	selectBestChunkForRerank,
	} from "./orchestrator";

	/**
	 * Builds a ScoredChunk fixture for the given document id and score.
	 *
	 * The nested chunk uses fixed placeholder positions (chunkIndex 0, startChar 0)
	 * and derives its text and title from `docId`; the result is tagged with
	 * source "bm25".
	 */
	function makeScoredChunk(docId: string, score: number): ScoredChunk {
		// Assemble the inner chunk first so the wrapper below stays flat.
		const chunk: ScoredChunk["chunk"] = {
			docId,
			chunkIndex: 0,
			text: `chunk for ${docId}`,
			startChar: 0,
			title: `Title ${docId}`,
		};

		return { chunk, score, source: "bm25" };
	}

	/**
	 * Builds an EmbeddedChunk fixture with the given text and chunk index.
	 *
	 * `startChar` is `chunkIndex * 100`, the title is derived from `docId`
	 * (default "doc1"), and the embedding is a 3-element Float32Array whose first
	 * component is `chunkIndex + 1` and whose remaining components are 0.
	 */
	function makeEmbeddedChunk(
		text: string,
		chunkIndex: number,
		docId = "doc1",
	): EmbeddedChunk {
		const startChar = chunkIndex * 100;
		const title = `Title ${docId}`;
		const embedding = Float32Array.of(chunkIndex + 1, 0, 0);

		return { docId, chunkIndex, text, startChar, title, embedding };
	}

	// Checks normalizeBm25Score at non-positive inputs, at two known positive
	// inputs, and for ordering between a larger and a smaller raw score.
	describe("normalizeBm25Score", () => {
		it("returns 0 for non-positive scores", () => {
			for (const rawScore of [0, -1]) {
				expect(normalizeBm25Score(rawScore)).toBe(0);
			}
		});

		it("maps positive scores into a stable 0..1 range", () => {
			// [raw score, expected normalized value], compared to 10 decimal digits.
			const expectations: Array<[number, number]> = [
				[1, 0.5],
				[9, 0.9],
			];

			for (const [rawScore, normalized] of expectations) {
				expect(normalizeBm25Score(rawScore)).toBeCloseTo(normalized, 10);
			}
		});

		it("is monotonic for stronger BM25 signals", () => {
			const stronger = normalizeBm25Score(10);
			const weaker = normalizeBm25Score(2);

			expect(stronger).toBeGreaterThan(weaker);
		});
	});

	// Checks hasStrongBm25Signal against two-result lists that differ only in the
	// top score and the runner-up score.
	describe("hasStrongBm25Signal", () => {
		// Evaluates the signal for a ["doc1", "doc2"] result pair with the given scores.
		const signalFor = (topScore: number, runnerUpScore: number) =>
			hasStrongBm25Signal([
				makeScoredChunk("doc1", topScore),
				makeScoredChunk("doc2", runnerUpScore),
			]);

		it("detects a strong top result with a clear gap", () => {
			expect(signalFor(10, 2)).toBe(true);
		});

		it("rejects signals that are not strong enough", () => {
			expect(signalFor(5, 1)).toBe(false);
		});

		it("rejects signals without enough separation from the runner-up", () => {
			expect(signalFor(10, 8)).toBe(false);
		});
	});

	// Checks the term list extractQueryTerms produces for a query with mixed
	// case, punctuation and repeats, and for a query made mostly of short words.
	describe("extractQueryTerms", () => {
		it("normalizes punctuation and removes duplicates", () => {
			const terms = extractQueryTerms("API, api! auth?? login");

			expect(terms).toEqual(["api", "auth", "login"]);
		});

		it("drops short noisy fragments", () => {
			const terms = extractQueryTerms("a an of API db");

			// Only "api" is expected to remain from this input.
			expect(terms).toEqual(["api"]);
		});
	});

	// Checks which chunk text selectBestChunkForRerank returns for overlapping
	// queries, uninformative queries, empty chunk lists, and an optional intent.
	describe("selectBestChunkForRerank", () => {
		// Turns a list of texts into chunks of the default document, indexed in order.
		// The explicit callback keeps map's third argument away from the docId parameter.
		const chunksFrom = (texts: string[]) =>
			texts.map((text, index) => makeEmbeddedChunk(text, index));

		it("chooses the chunk with the highest query-term overlap", () => {
			const consensusText = "Consensus algorithms coordinate a distributed cluster";
			const chunks = chunksFrom([
				"Intro to distributed systems",
				consensusText,
				"Coffee history and trade routes",
			]);

			const picked = selectBestChunkForRerank("distributed consensus algorithm", chunks);

			expect(picked).toBe(consensusText);
		});

		it("falls back to the first chunk when the query has no informative terms", () => {
			const chunks = chunksFrom(["First chunk", "Second chunk"]);

			const picked = selectBestChunkForRerank("an of to", chunks);

			expect(picked).toBe("First chunk");
		});

		it("returns an empty string for empty chunk lists", () => {
			const picked = selectBestChunkForRerank("query", []);

			expect(picked).toBe("");
		});

		it("boosts chunk selection using intent terms at 0.5x weight", () => {
			const serverTuningText = "General performance tuning tips for servers";
			const webVitalsText = "Core web vitals and page load optimization";
			const chunks = chunksFrom([
				serverTuningText,
				webVitalsText,
				"CPU benchmarking guide for gaming rigs",
			]);

			// Without intent, "performance" alone matches chunk 0 best
			const withoutIntent = selectBestChunkForRerank("performance", chunks);
			expect(withoutIntent).toBe(serverTuningText);

			// With intent, chunk 1 wins via intent terms "web", "page", "load"
			const withIntent = selectBestChunkForRerank(
				"performance",
				chunks,
				"web page load times",
			);
			expect(withIntent).toBe(webVitalsText);
		});
	});

	// Checks which terms extractIntentTerms keeps or drops for a sentence with
	// filler words, a list of short acronyms, and input wrapped in punctuation.
	describe("extractIntentTerms", () => {
		it("filters stop words and preserves domain terms", () => {
			const terms = extractIntentTerms("looking for API performance in the database");

			for (const kept of ["api", "performance", "database"]) {
				expect(terms).toContain(kept);
			}
			for (const dropped of ["looking", "for", "the"]) {
				expect(terms).not.toContain(dropped);
			}
		});

		it("preserves short domain acronyms like API, SQL, CI", () => {
			const acronyms = extractIntentTerms("API SQL CI CD");

			expect(acronyms).toEqual(["api", "sql", "ci", "cd"]);
		});

		it("strips Unicode punctuation", () => {
			// Input mixes quotes, parentheses and em dashes around the words.
			const terms = extractIntentTerms('"web" (vitals) —performance—');

			for (const word of ["web", "vitals", "performance"]) {
				expect(terms).toContain(word);
			}
		});
	});