add in-memory BM25 search index over chunks
Implements tokenizer (lowercase, split on non-alphanumeric, filter <2 chars)
and BM25Index class with standard BM25 scoring (k1=1.2, b=0.75). Includes
15 tests covering tokenization, ranking, edge cases, and score ordering.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- src/pipeline/bm25.test.ts +129 -0
- src/pipeline/bm25.ts +101 -0
src/pipeline/bm25.test.ts
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import { describe, it, expect } from "vitest";
import { tokenize, BM25Index } from "./bm25";
import type { Chunk } from "../types";

// ---------------------------------------------------------------------------
// Helper to create a Chunk with minimal fields
// ---------------------------------------------------------------------------
// NOTE(review): assumes Chunk requires exactly these fields (docId, chunkIndex,
// text, startChar, title) — confirm against ../types if the shape changes.
function makeChunk(text: string, docId = "doc1", chunkIndex = 0): Chunk {
  return { docId, chunkIndex, text, startChar: 0, title: "Test" };
}

// ---------------------------------------------------------------------------
// tokenize
// ---------------------------------------------------------------------------
describe("tokenize", () => {
  it("lowercases and splits on non-alphanumeric", () => {
    expect(tokenize("Hello World")).toEqual(["hello", "world"]);
  });

  it("filters tokens shorter than 2 characters", () => {
    // "I" and "a" are single characters and must be dropped.
    expect(tokenize("I am a big fan")).toEqual(["am", "big", "fan"]);
  });

  it("handles numbers", () => {
    // "2" and "0" are single chars, filtered out
    expect(tokenize("version 2.0 release")).toEqual(["version", "release"]);
    // multi-digit numbers are kept
    expect(tokenize("version 20 release")).toEqual(["version", "20", "release"]);
  });

  it("returns empty array for empty string", () => {
    expect(tokenize("")).toEqual([]);
  });

  it("handles punctuation-heavy text", () => {
    // Runs of punctuation collapse into a single token boundary.
    expect(tokenize("hello---world!!! foo")).toEqual([
      "hello",
      "world",
      "foo",
    ]);
  });
});

// ---------------------------------------------------------------------------
// BM25Index
// ---------------------------------------------------------------------------
describe("BM25Index", () => {
  // Small fixed corpus: d1/d2 share "quick brown fox" (d2 shorter),
  // d1/d3 share "lazy dog", d4 is topically disjoint from the rest.
  const chunks: Chunk[] = [
    makeChunk("the quick brown fox jumps over the lazy dog", "d1", 0),
    makeChunk("the quick brown fox", "d2", 0),
    makeChunk("the lazy dog sleeps all day long", "d3", 0),
    makeChunk(
      "machine learning and artificial intelligence are transforming the world",
      "d4",
      0,
    ),
  ];

  it("returns results with source bm25", () => {
    const index = new BM25Index(chunks);
    const results = index.search("quick fox");
    expect(results.length).toBeGreaterThan(0);
    // Every result must be tagged with its retrieval source.
    for (const r of results) {
      expect(r.source).toBe("bm25");
    }
  });

  it("ranks exact matches higher", () => {
    const index = new BM25Index(chunks);
    const results = index.search("quick brown fox");
    // d1 and d2 both contain all query terms; d2 is shorter so BM25 should rank it higher
    expect(results.length).toBeGreaterThanOrEqual(2);
    expect(results[0].chunk.docId).toBe("d2");
    expect(results[1].chunk.docId).toBe("d1");
  });

  it("returns empty for unmatched query", () => {
    const index = new BM25Index(chunks);
    const results = index.search("quantum computing");
    expect(results).toEqual([]);
  });

  it("returns empty for empty query", () => {
    const index = new BM25Index(chunks);
    expect(index.search("")).toEqual([]);
  });

  it("respects topK parameter", () => {
    const index = new BM25Index(chunks);
    // "the" appears in all four chunks; cap the result count at 2.
    const results = index.search("the", 2);
    expect(results.length).toBeLessThanOrEqual(2);
  });

  it("handles single-chunk index", () => {
    const index = new BM25Index([makeChunk("hello world")]);
    const results = index.search("hello");
    expect(results.length).toBe(1);
    expect(results[0].score).toBeGreaterThan(0);
  });

  it("handles empty index", () => {
    const index = new BM25Index([]);
    expect(index.search("anything")).toEqual([]);
  });

  it("scores are positive for matching documents", () => {
    const index = new BM25Index(chunks);
    const results = index.search("lazy dog");
    for (const r of results) {
      expect(r.score).toBeGreaterThan(0);
    }
  });

  it("partial query terms still return results", () => {
    const index = new BM25Index(chunks);
    // "machine" appears only in d4
    const results = index.search("machine");
    expect(results.length).toBe(1);
    expect(results[0].chunk.docId).toBe("d4");
  });

  it("scores decrease with result rank", () => {
    const index = new BM25Index(chunks);
    const results = index.search("the lazy dog");
    // Results must come back sorted by descending score.
    for (let i = 1; i < results.length; i++) {
      expect(results[i].score).toBeLessThanOrEqual(results[i - 1].score);
    }
  });
});
src/pipeline/bm25.ts
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { Chunk, ScoredChunk } from "../types";
|
| 2 |
+
import { BM25_K1, BM25_B } from "../constants";
|
| 3 |
+
|
| 4 |
+
/**
|
| 5 |
+
* Simple tokenizer: lowercase, split on non-alphanumeric, filter tokens < 2 chars.
|
| 6 |
+
*/
|
| 7 |
+
export function tokenize(text: string): string[] {
|
| 8 |
+
return text
|
| 9 |
+
.toLowerCase()
|
| 10 |
+
.split(/[^a-z0-9]+/)
|
| 11 |
+
.filter((t) => t.length >= 2);
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
/**
|
| 15 |
+
* In-memory BM25 search index built over an array of Chunks.
|
| 16 |
+
*/
|
| 17 |
+
export class BM25Index {
|
| 18 |
+
private chunks: Chunk[];
|
| 19 |
+
private avgDocLength: number;
|
| 20 |
+
private docLengths: number[];
|
| 21 |
+
private termFreqs: Map<string, number[]>; // term -> per-chunk frequency
|
| 22 |
+
private docFreqs: Map<string, number>; // term -> number of chunks containing term
|
| 23 |
+
private N: number;
|
| 24 |
+
|
| 25 |
+
constructor(chunks: Chunk[]) {
|
| 26 |
+
this.chunks = chunks;
|
| 27 |
+
this.N = chunks.length;
|
| 28 |
+
this.docLengths = [];
|
| 29 |
+
this.termFreqs = new Map();
|
| 30 |
+
this.docFreqs = new Map();
|
| 31 |
+
|
| 32 |
+
let totalLength = 0;
|
| 33 |
+
|
| 34 |
+
for (let i = 0; i < this.N; i++) {
|
| 35 |
+
const tokens = tokenize(chunks[i].text);
|
| 36 |
+
this.docLengths.push(tokens.length);
|
| 37 |
+
totalLength += tokens.length;
|
| 38 |
+
|
| 39 |
+
// Count term frequencies for this chunk
|
| 40 |
+
const localFreq = new Map<string, number>();
|
| 41 |
+
for (const token of tokens) {
|
| 42 |
+
localFreq.set(token, (localFreq.get(token) ?? 0) + 1);
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
// Update inverted index
|
| 46 |
+
for (const [term, freq] of localFreq) {
|
| 47 |
+
let freqArray = this.termFreqs.get(term);
|
| 48 |
+
if (!freqArray) {
|
| 49 |
+
freqArray = new Array(this.N).fill(0);
|
| 50 |
+
this.termFreqs.set(term, freqArray);
|
| 51 |
+
}
|
| 52 |
+
freqArray[i] = freq;
|
| 53 |
+
|
| 54 |
+
this.docFreqs.set(term, (this.docFreqs.get(term) ?? 0) + 1);
|
| 55 |
+
}
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
this.avgDocLength = this.N > 0 ? totalLength / this.N : 0;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
search(query: string, topK: number = 20): ScoredChunk[] {
|
| 62 |
+
const queryTerms = tokenize(query);
|
| 63 |
+
if (queryTerms.length === 0 || this.N === 0) return [];
|
| 64 |
+
|
| 65 |
+
const scores = new Float64Array(this.N);
|
| 66 |
+
|
| 67 |
+
for (const term of queryTerms) {
|
| 68 |
+
const df = this.docFreqs.get(term);
|
| 69 |
+
if (df === undefined) continue;
|
| 70 |
+
|
| 71 |
+
const freqArray = this.termFreqs.get(term)!;
|
| 72 |
+
const idf = Math.log((this.N - df + 0.5) / (df + 0.5) + 1);
|
| 73 |
+
|
| 74 |
+
for (let i = 0; i < this.N; i++) {
|
| 75 |
+
const tf = freqArray[i];
|
| 76 |
+
if (tf === 0) continue;
|
| 77 |
+
|
| 78 |
+
const dl = this.docLengths[i];
|
| 79 |
+
const norm = BM25_K1 * (1 - BM25_B + BM25_B * (dl / this.avgDocLength));
|
| 80 |
+
const tfScore = (tf * (BM25_K1 + 1)) / (tf + norm);
|
| 81 |
+
scores[i] += idf * tfScore;
|
| 82 |
+
}
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
// Collect scored results, filtering out zero scores
|
| 86 |
+
const results: ScoredChunk[] = [];
|
| 87 |
+
for (let i = 0; i < this.N; i++) {
|
| 88 |
+
if (scores[i] > 0) {
|
| 89 |
+
results.push({
|
| 90 |
+
chunk: this.chunks[i],
|
| 91 |
+
score: scores[i],
|
| 92 |
+
source: "bm25",
|
| 93 |
+
});
|
| 94 |
+
}
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
// Sort descending by score and return top K
|
| 98 |
+
results.sort((a, b) => b.score - a.score);
|
| 99 |
+
return results.slice(0, topK);
|
| 100 |
+
}
|
| 101 |
+
}
|