qmd-web / src /pipeline /bm25.test.ts
shreyask's picture
add in-memory BM25 search index over chunks
4e354b6 verified
import { describe, it, expect } from "vitest";
import { tokenize, BM25Index } from "./bm25";
import type { Chunk } from "../types";
// ---------------------------------------------------------------------------
// Helper to create a Chunk with minimal fields
// ---------------------------------------------------------------------------
/**
 * Builds a Chunk fixture for the tests in this file.
 *
 * Only the text, document id and chunk index vary; `startChar` is always 0
 * and `title` is always "Test".
 *
 * @param text - Chunk body text.
 * @param docId - Document identifier; defaults to "doc1".
 * @param chunkIndex - Chunk position; defaults to 0.
 * @returns The assembled Chunk.
 */
function makeChunk(text: string, docId = "doc1", chunkIndex = 0): Chunk {
  // Annotated so the literal is checked against Chunk before it is returned.
  const fixture: Chunk = {
    text,
    title: "Test",
    startChar: 0,
    chunkIndex,
    docId,
  };
  return fixture;
}
// ---------------------------------------------------------------------------
// tokenize
// ---------------------------------------------------------------------------
describe("tokenize", () => {
  // One entry per test: its title plus the [input, expected tokens] pairs
  // that are asserted, in order, inside that test.
  const scenarios: Array<{ title: string; checks: Array<[string, string[]]> }> = [
    {
      title: "lowercases and splits on non-alphanumeric",
      checks: [["Hello World", ["hello", "world"]]],
    },
    {
      title: "filters tokens shorter than 2 characters",
      checks: [["I am a big fan", ["am", "big", "fan"]]],
    },
    {
      title: "handles numbers",
      checks: [
        // "2" and "0" are single characters, so both are expected to be dropped
        ["version 2.0 release", ["version", "release"]],
        // a multi-digit number is expected to survive as a token
        ["version 20 release", ["version", "20", "release"]],
      ],
    },
    {
      title: "returns empty array for empty string",
      checks: [["", []]],
    },
    {
      title: "handles punctuation-heavy text",
      checks: [["hello---world!!! foo", ["hello", "world", "foo"]]],
    },
  ];

  for (const { title, checks } of scenarios) {
    it(title, () => {
      for (const [input, expected] of checks) {
        expect(tokenize(input)).toEqual(expected);
      }
    });
  }
});
// ---------------------------------------------------------------------------
// BM25Index
// ---------------------------------------------------------------------------
describe("BM25Index", () => {
  // Shared corpus: d1 and d2 both contain "quick brown fox", d1 and d3 both
  // contain "lazy dog", and d4 shares only "the" with the others.
  const chunks: Chunk[] = [
    makeChunk("the quick brown fox jumps over the lazy dog", "d1", 0),
    makeChunk("the quick brown fox", "d2", 0),
    makeChunk("the lazy dog sleeps all day long", "d3", 0),
    makeChunk(
      "machine learning and artificial intelligence are transforming the world",
      "d4",
      0,
    ),
  ];

  // Each test builds its own index over the shared corpus.
  const buildIndex = () => new BM25Index(chunks);

  it("returns results with source bm25", () => {
    const hits = buildIndex().search("quick fox");
    expect(hits.length).toBeGreaterThan(0);
    for (const { source } of hits) {
      expect(source).toBe("bm25");
    }
  });

  it("ranks exact matches higher", () => {
    const hits = buildIndex().search("quick brown fox");
    // Both d1 and d2 contain every query term; d2 is the shorter chunk, so
    // BM25 is expected to place it first.
    expect(hits.length).toBeGreaterThanOrEqual(2);
    const [best, runnerUp] = hits;
    expect(best.chunk.docId).toBe("d2");
    expect(runnerUp.chunk.docId).toBe("d1");
  });

  it("returns empty for unmatched query", () => {
    const hits = buildIndex().search("quantum computing");
    expect(hits).toEqual([]);
  });

  it("returns empty for empty query", () => {
    const hits = buildIndex().search("");
    expect(hits).toEqual([]);
  });

  it("respects topK parameter", () => {
    const hits = buildIndex().search("the", 2);
    expect(hits.length).toBeLessThanOrEqual(2);
  });

  it("handles single-chunk index", () => {
    const loneIndex = new BM25Index([makeChunk("hello world")]);
    const hits = loneIndex.search("hello");
    expect(hits.length).toBe(1);
    const [onlyHit] = hits;
    expect(onlyHit.score).toBeGreaterThan(0);
  });

  it("handles empty index", () => {
    const emptyIndex = new BM25Index([]);
    const hits = emptyIndex.search("anything");
    expect(hits).toEqual([]);
  });

  it("scores are positive for matching documents", () => {
    const hits = buildIndex().search("lazy dog");
    for (const { score } of hits) {
      expect(score).toBeGreaterThan(0);
    }
  });

  it("partial query terms still return results", () => {
    // "machine" occurs only in the d4 chunk.
    const hits = buildIndex().search("machine");
    expect(hits.length).toBe(1);
    const [onlyHit] = hits;
    expect(onlyHit.chunk.docId).toBe("d4");
  });

  it("scores decrease with result rank", () => {
    const hits = buildIndex().search("the lazy dog");
    // Compare each result with its successor; ties are allowed.
    for (let prev = 0; prev + 1 < hits.length; prev++) {
      expect(hits[prev + 1].score).toBeLessThanOrEqual(hits[prev].score);
    }
  });
});