add in-memory BM25 search index over chunks
Implements tokenizer (lowercase, split on non-alphanumeric, filter <2 chars)
and BM25Index class with standard BM25 scoring (k1=1.2, b=0.75). Includes
15 tests covering tokenization, ranking, edge cases, and score ordering.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- src/pipeline/bm25.test.ts +129 -0
- src/pipeline/bm25.ts +101 -0
src/pipeline/bm25.test.ts
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import { describe, it, expect } from "vitest";
import { tokenize, BM25Index } from "./bm25";
import type { Chunk } from "../types";

// ---------------------------------------------------------------------------
// Helper to create a Chunk with minimal fields
// ---------------------------------------------------------------------------
// NOTE(review): assumes Chunk requires exactly these fields (docId, chunkIndex,
// text, startChar, title) — confirm against ../types if the shape changes.
function makeChunk(text: string, docId = "doc1", chunkIndex = 0): Chunk {
  return { docId, chunkIndex, text, startChar: 0, title: "Test" };
}

// ---------------------------------------------------------------------------
// tokenize
// ---------------------------------------------------------------------------
describe("tokenize", () => {
  it("lowercases and splits on non-alphanumeric", () => {
    expect(tokenize("Hello World")).toEqual(["hello", "world"]);
  });

  it("filters tokens shorter than 2 characters", () => {
    // "I" and "a" are single characters and must be dropped.
    expect(tokenize("I am a big fan")).toEqual(["am", "big", "fan"]);
  });

  it("handles numbers", () => {
    // "2" and "0" are single chars, filtered out
    expect(tokenize("version 2.0 release")).toEqual(["version", "release"]);
    // multi-digit numbers are kept
    expect(tokenize("version 20 release")).toEqual(["version", "20", "release"]);
  });

  it("returns empty array for empty string", () => {
    expect(tokenize("")).toEqual([]);
  });

  it("handles punctuation-heavy text", () => {
    // Runs of punctuation collapse into a single token boundary.
    expect(tokenize("hello---world!!! foo")).toEqual([
      "hello",
      "world",
      "foo",
    ]);
  });
});

// ---------------------------------------------------------------------------
// BM25Index
// ---------------------------------------------------------------------------
describe("BM25Index", () => {
  // Small fixed corpus: d1/d2 share "quick brown fox" (d2 shorter),
  // d1/d3 share "lazy dog", d4 is topically disjoint from the rest.
  const chunks: Chunk[] = [
    makeChunk("the quick brown fox jumps over the lazy dog", "d1", 0),
    makeChunk("the quick brown fox", "d2", 0),
    makeChunk("the lazy dog sleeps all day long", "d3", 0),
    makeChunk(
      "machine learning and artificial intelligence are transforming the world",
      "d4",
      0,
    ),
  ];

  it("returns results with source bm25", () => {
    const index = new BM25Index(chunks);
    const results = index.search("quick fox");
    expect(results.length).toBeGreaterThan(0);
    // Every result must be tagged with its retrieval source.
    for (const r of results) {
      expect(r.source).toBe("bm25");
    }
  });

  it("ranks exact matches higher", () => {
    const index = new BM25Index(chunks);
    const results = index.search("quick brown fox");
    // d1 and d2 both contain all query terms; d2 is shorter so BM25 should rank it higher
    expect(results.length).toBeGreaterThanOrEqual(2);
    expect(results[0].chunk.docId).toBe("d2");
    expect(results[1].chunk.docId).toBe("d1");
  });

  it("returns empty for unmatched query", () => {
    const index = new BM25Index(chunks);
    const results = index.search("quantum computing");
    expect(results).toEqual([]);
  });

  it("returns empty for empty query", () => {
    const index = new BM25Index(chunks);
    expect(index.search("")).toEqual([]);
  });

  it("respects topK parameter", () => {
    const index = new BM25Index(chunks);
    // "the" appears in all four chunks; cap the result count at 2.
    const results = index.search("the", 2);
    expect(results.length).toBeLessThanOrEqual(2);
  });

  it("handles single-chunk index", () => {
    const index = new BM25Index([makeChunk("hello world")]);
    const results = index.search("hello");
    expect(results.length).toBe(1);
    expect(results[0].score).toBeGreaterThan(0);
  });

  it("handles empty index", () => {
    const index = new BM25Index([]);
    expect(index.search("anything")).toEqual([]);
  });

  it("scores are positive for matching documents", () => {
    const index = new BM25Index(chunks);
    const results = index.search("lazy dog");
    for (const r of results) {
      expect(r.score).toBeGreaterThan(0);
    }
  });

  it("partial query terms still return results", () => {
    const index = new BM25Index(chunks);
    // "machine" appears only in d4
    const results = index.search("machine");
    expect(results.length).toBe(1);
    expect(results[0].chunk.docId).toBe("d4");
  });

  it("scores decrease with result rank", () => {
    const index = new BM25Index(chunks);
    const results = index.search("the lazy dog");
    // Results must come back sorted by descending score.
    for (let i = 1; i < results.length; i++) {
      expect(results[i].score).toBeLessThanOrEqual(results[i - 1].score);
    }
  });
});
src/pipeline/bm25.ts
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { Chunk, ScoredChunk } from "../types";
|
| 2 |
+
import { BM25_K1, BM25_B } from "../constants";
|
| 3 |
+
|
| 4 |
+
/**
|
| 5 |
+
* Simple tokenizer: lowercase, split on non-alphanumeric, filter tokens < 2 chars.
|
| 6 |
+
*/
|
| 7 |
+
export function tokenize(text: string): string[] {
|
| 8 |
+
return text
|
| 9 |
+
.toLowerCase()
|
| 10 |
+
.split(/[^a-z0-9]+/)
|
| 11 |
+
.filter((t) => t.length >= 2);
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
/**
|
| 15 |
+
* In-memory BM25 search index built over an array of Chunks.
|
| 16 |
+
*/
|
| 17 |
+
export class BM25Index {
|
| 18 |
+
private chunks: Chunk[];
|
| 19 |
+
private avgDocLength: number;
|
| 20 |
+
private docLengths: number[];
|
| 21 |
+
private termFreqs: Map<string, number[]>; // term -> per-chunk frequency
|
| 22 |
+
private docFreqs: Map<string, number>; // term -> number of chunks containing term
|
| 23 |
+
private N: number;
|
| 24 |
+
|
| 25 |
+
constructor(chunks: Chunk[]) {
|
| 26 |
+
this.chunks = chunks;
|
| 27 |
+
this.N = chunks.length;
|
| 28 |
+
this.docLengths = [];
|
| 29 |
+
this.termFreqs = new Map();
|
| 30 |
+
this.docFreqs = new Map();
|
| 31 |
+
|
| 32 |
+
let totalLength = 0;
|
| 33 |
+
|
| 34 |
+
for (let i = 0; i < this.N; i++) {
|
| 35 |
+
const tokens = tokenize(chunks[i].text);
|
| 36 |
+
this.docLengths.push(tokens.length);
|
| 37 |
+
totalLength += tokens.length;
|
| 38 |
+
|
| 39 |
+
// Count term frequencies for this chunk
|
| 40 |
+
const localFreq = new Map<string, number>();
|
| 41 |
+
for (const token of tokens) {
|
| 42 |
+
localFreq.set(token, (localFreq.get(token) ?? 0) + 1);
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
// Update inverted index
|
| 46 |
+
for (const [term, freq] of localFreq) {
|
| 47 |
+
let freqArray = this.termFreqs.get(term);
|
| 48 |
+
if (!freqArray) {
|
| 49 |
+
freqArray = new Array(this.N).fill(0);
|
| 50 |
+
this.termFreqs.set(term, freqArray);
|
| 51 |
+
}
|
| 52 |
+
freqArray[i] = freq;
|
| 53 |
+
|
| 54 |
+
this.docFreqs.set(term, (this.docFreqs.get(term) ?? 0) + 1);
|
| 55 |
+
}
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
this.avgDocLength = this.N > 0 ? totalLength / this.N : 0;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
search(query: string, topK: number = 20): ScoredChunk[] {
|
| 62 |
+
const queryTerms = tokenize(query);
|
| 63 |
+
if (queryTerms.length === 0 || this.N === 0) return [];
|
| 64 |
+
|
| 65 |
+
const scores = new Float64Array(this.N);
|
| 66 |
+
|
| 67 |
+
for (const term of queryTerms) {
|
| 68 |
+
const df = this.docFreqs.get(term);
|
| 69 |
+
if (df === undefined) continue;
|
| 70 |
+
|
| 71 |
+
const freqArray = this.termFreqs.get(term)!;
|
| 72 |
+
const idf = Math.log((this.N - df + 0.5) / (df + 0.5) + 1);
|
| 73 |
+
|
| 74 |
+
for (let i = 0; i < this.N; i++) {
|
| 75 |
+
const tf = freqArray[i];
|
| 76 |
+
if (tf === 0) continue;
|
| 77 |
+
|
| 78 |
+
const dl = this.docLengths[i];
|
| 79 |
+
const norm = BM25_K1 * (1 - BM25_B + BM25_B * (dl / this.avgDocLength));
|
| 80 |
+
const tfScore = (tf * (BM25_K1 + 1)) / (tf + norm);
|
| 81 |
+
scores[i] += idf * tfScore;
|
| 82 |
+
}
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
// Collect scored results, filtering out zero scores
|
| 86 |
+
const results: ScoredChunk[] = [];
|
| 87 |
+
for (let i = 0; i < this.N; i++) {
|
| 88 |
+
if (scores[i] > 0) {
|
| 89 |
+
results.push({
|
| 90 |
+
chunk: this.chunks[i],
|
| 91 |
+
score: scores[i],
|
| 92 |
+
source: "bm25",
|
| 93 |
+
});
|
| 94 |
+
}
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
// Sort descending by score and return top K
|
| 98 |
+
results.sort((a, b) => b.score - a.score);
|
| 99 |
+
return results.slice(0, topK);
|
| 100 |
+
}
|
| 101 |
+
}
|