shreyask Claude Opus 4.6 committed on
Commit
4e354b6
·
verified ·
1 Parent(s): ce263b4

add in-memory BM25 search index over chunks

Browse files

Implements tokenizer (lowercase, split on non-alphanumeric, filter <2 chars)
and BM25Index class with standard BM25 scoring (k1=1.2, b=0.75). Includes
15 tests covering tokenization, ranking, edge cases, and score ordering.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. src/pipeline/bm25.test.ts +129 -0
  2. src/pipeline/bm25.ts +101 -0
src/pipeline/bm25.test.ts ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { describe, it, expect } from "vitest";
2
+ import { tokenize, BM25Index } from "./bm25";
3
+ import type { Chunk } from "../types";
4
+
5
+ // ---------------------------------------------------------------------------
6
+ // Helper to create a Chunk with minimal fields
7
+ // ---------------------------------------------------------------------------
8
+ function makeChunk(text: string, docId = "doc1", chunkIndex = 0): Chunk {
9
+ return { docId, chunkIndex, text, startChar: 0, title: "Test" };
10
+ }
11
+
12
+ // ---------------------------------------------------------------------------
13
+ // tokenize
14
+ // ---------------------------------------------------------------------------
15
+ describe("tokenize", () => {
16
+ it("lowercases and splits on non-alphanumeric", () => {
17
+ expect(tokenize("Hello World")).toEqual(["hello", "world"]);
18
+ });
19
+
20
+ it("filters tokens shorter than 2 characters", () => {
21
+ expect(tokenize("I am a big fan")).toEqual(["am", "big", "fan"]);
22
+ });
23
+
24
+ it("handles numbers", () => {
25
+ // "2" and "0" are single chars, filtered out
26
+ expect(tokenize("version 2.0 release")).toEqual(["version", "release"]);
27
+ // multi-digit numbers are kept
28
+ expect(tokenize("version 20 release")).toEqual(["version", "20", "release"]);
29
+ });
30
+
31
+ it("returns empty array for empty string", () => {
32
+ expect(tokenize("")).toEqual([]);
33
+ });
34
+
35
+ it("handles punctuation-heavy text", () => {
36
+ expect(tokenize("hello---world!!! foo")).toEqual([
37
+ "hello",
38
+ "world",
39
+ "foo",
40
+ ]);
41
+ });
42
+ });
43
+
44
+ // ---------------------------------------------------------------------------
45
+ // BM25Index
46
+ // ---------------------------------------------------------------------------
47
+ describe("BM25Index", () => {
48
+ const chunks: Chunk[] = [
49
+ makeChunk("the quick brown fox jumps over the lazy dog", "d1", 0),
50
+ makeChunk("the quick brown fox", "d2", 0),
51
+ makeChunk("the lazy dog sleeps all day long", "d3", 0),
52
+ makeChunk(
53
+ "machine learning and artificial intelligence are transforming the world",
54
+ "d4",
55
+ 0,
56
+ ),
57
+ ];
58
+
59
+ it("returns results with source bm25", () => {
60
+ const index = new BM25Index(chunks);
61
+ const results = index.search("quick fox");
62
+ expect(results.length).toBeGreaterThan(0);
63
+ for (const r of results) {
64
+ expect(r.source).toBe("bm25");
65
+ }
66
+ });
67
+
68
+ it("ranks exact matches higher", () => {
69
+ const index = new BM25Index(chunks);
70
+ const results = index.search("quick brown fox");
71
+ // d1 and d2 both contain all query terms; d2 is shorter so BM25 should rank it higher
72
+ expect(results.length).toBeGreaterThanOrEqual(2);
73
+ expect(results[0].chunk.docId).toBe("d2");
74
+ expect(results[1].chunk.docId).toBe("d1");
75
+ });
76
+
77
+ it("returns empty for unmatched query", () => {
78
+ const index = new BM25Index(chunks);
79
+ const results = index.search("quantum computing");
80
+ expect(results).toEqual([]);
81
+ });
82
+
83
+ it("returns empty for empty query", () => {
84
+ const index = new BM25Index(chunks);
85
+ expect(index.search("")).toEqual([]);
86
+ });
87
+
88
+ it("respects topK parameter", () => {
89
+ const index = new BM25Index(chunks);
90
+ const results = index.search("the", 2);
91
+ expect(results.length).toBeLessThanOrEqual(2);
92
+ });
93
+
94
+ it("handles single-chunk index", () => {
95
+ const index = new BM25Index([makeChunk("hello world")]);
96
+ const results = index.search("hello");
97
+ expect(results.length).toBe(1);
98
+ expect(results[0].score).toBeGreaterThan(0);
99
+ });
100
+
101
+ it("handles empty index", () => {
102
+ const index = new BM25Index([]);
103
+ expect(index.search("anything")).toEqual([]);
104
+ });
105
+
106
+ it("scores are positive for matching documents", () => {
107
+ const index = new BM25Index(chunks);
108
+ const results = index.search("lazy dog");
109
+ for (const r of results) {
110
+ expect(r.score).toBeGreaterThan(0);
111
+ }
112
+ });
113
+
114
+ it("partial query terms still return results", () => {
115
+ const index = new BM25Index(chunks);
116
+ // "machine" appears only in d4
117
+ const results = index.search("machine");
118
+ expect(results.length).toBe(1);
119
+ expect(results[0].chunk.docId).toBe("d4");
120
+ });
121
+
122
+ it("scores decrease with result rank", () => {
123
+ const index = new BM25Index(chunks);
124
+ const results = index.search("the lazy dog");
125
+ for (let i = 1; i < results.length; i++) {
126
+ expect(results[i].score).toBeLessThanOrEqual(results[i - 1].score);
127
+ }
128
+ });
129
+ });
src/pipeline/bm25.ts ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { Chunk, ScoredChunk } from "../types";
2
+ import { BM25_K1, BM25_B } from "../constants";
3
+
4
+ /**
5
+ * Simple tokenizer: lowercase, split on non-alphanumeric, filter tokens < 2 chars.
6
+ */
7
+ export function tokenize(text: string): string[] {
8
+ return text
9
+ .toLowerCase()
10
+ .split(/[^a-z0-9]+/)
11
+ .filter((t) => t.length >= 2);
12
+ }
13
+
14
+ /**
15
+ * In-memory BM25 search index built over an array of Chunks.
16
+ */
17
+ export class BM25Index {
18
+ private chunks: Chunk[];
19
+ private avgDocLength: number;
20
+ private docLengths: number[];
21
+ private termFreqs: Map<string, number[]>; // term -> per-chunk frequency
22
+ private docFreqs: Map<string, number>; // term -> number of chunks containing term
23
+ private N: number;
24
+
25
+ constructor(chunks: Chunk[]) {
26
+ this.chunks = chunks;
27
+ this.N = chunks.length;
28
+ this.docLengths = [];
29
+ this.termFreqs = new Map();
30
+ this.docFreqs = new Map();
31
+
32
+ let totalLength = 0;
33
+
34
+ for (let i = 0; i < this.N; i++) {
35
+ const tokens = tokenize(chunks[i].text);
36
+ this.docLengths.push(tokens.length);
37
+ totalLength += tokens.length;
38
+
39
+ // Count term frequencies for this chunk
40
+ const localFreq = new Map<string, number>();
41
+ for (const token of tokens) {
42
+ localFreq.set(token, (localFreq.get(token) ?? 0) + 1);
43
+ }
44
+
45
+ // Update inverted index
46
+ for (const [term, freq] of localFreq) {
47
+ let freqArray = this.termFreqs.get(term);
48
+ if (!freqArray) {
49
+ freqArray = new Array(this.N).fill(0);
50
+ this.termFreqs.set(term, freqArray);
51
+ }
52
+ freqArray[i] = freq;
53
+
54
+ this.docFreqs.set(term, (this.docFreqs.get(term) ?? 0) + 1);
55
+ }
56
+ }
57
+
58
+ this.avgDocLength = this.N > 0 ? totalLength / this.N : 0;
59
+ }
60
+
61
+ search(query: string, topK: number = 20): ScoredChunk[] {
62
+ const queryTerms = tokenize(query);
63
+ if (queryTerms.length === 0 || this.N === 0) return [];
64
+
65
+ const scores = new Float64Array(this.N);
66
+
67
+ for (const term of queryTerms) {
68
+ const df = this.docFreqs.get(term);
69
+ if (df === undefined) continue;
70
+
71
+ const freqArray = this.termFreqs.get(term)!;
72
+ const idf = Math.log((this.N - df + 0.5) / (df + 0.5) + 1);
73
+
74
+ for (let i = 0; i < this.N; i++) {
75
+ const tf = freqArray[i];
76
+ if (tf === 0) continue;
77
+
78
+ const dl = this.docLengths[i];
79
+ const norm = BM25_K1 * (1 - BM25_B + BM25_B * (dl / this.avgDocLength));
80
+ const tfScore = (tf * (BM25_K1 + 1)) / (tf + norm);
81
+ scores[i] += idf * tfScore;
82
+ }
83
+ }
84
+
85
+ // Collect scored results, filtering out zero scores
86
+ const results: ScoredChunk[] = [];
87
+ for (let i = 0; i < this.N; i++) {
88
+ if (scores[i] > 0) {
89
+ results.push({
90
+ chunk: this.chunks[i],
91
+ score: scores[i],
92
+ source: "bm25",
93
+ });
94
+ }
95
+ }
96
+
97
+ // Sort descending by score and return top K
98
+ results.sort((a, b) => b.score - a.score);
99
+ return results.slice(0, topK);
100
+ }
101
+ }