| import { describe, it, expect } from "vitest"; |
| import { tokenize, BM25Index } from "./bm25"; |
| import type { Chunk } from "../types"; |
|
|
| |
| |
| |
/**
 * Builds a Chunk fixture for the tests in this file.
 *
 * @param text - Value stored in the chunk's `text` field.
 * @param docId - Value stored in `docId`; defaults to "doc1".
 * @param chunkIndex - Value stored in `chunkIndex`; defaults to 0.
 * @returns A chunk whose `startChar` is always 0 and whose `title` is always "Test".
 */
function makeChunk(text: string, docId = "doc1", chunkIndex = 0): Chunk {
  const fixture: Chunk = {
    docId,
    chunkIndex,
    text,
    startChar: 0,
    title: "Test",
  };
  return fixture;
}
|
|
| |
| |
| |
// Tests for tokenize(): each case feeds one raw string through the tokenizer
// and compares the returned token list against a literal expectation.
describe("tokenize", () => {
  it("lowercases and splits on non-alphanumeric", () => {
    const tokens = tokenize("Hello World");
    expect(tokens).toEqual(["hello", "world"]);
  });

  it("filters tokens shorter than 2 characters", () => {
    // "I" and "a" are single characters and are expected to be dropped.
    const tokens = tokenize("I am a big fan");
    expect(tokens).toEqual(["am", "big", "fan"]);
  });

  it("handles numbers", () => {
    // Nothing from "2.0" is expected to survive: only the two words remain.
    const dotted = tokenize("version 2.0 release");
    expect(dotted).toEqual(["version", "release"]);

    // A two-digit number is expected to be kept as its own token.
    const twoDigit = tokenize("version 20 release");
    expect(twoDigit).toEqual(["version", "20", "release"]);
  });

  it("returns empty array for empty string", () => {
    const tokens = tokenize("");
    expect(tokens).toEqual([]);
  });

  it("handles punctuation-heavy text", () => {
    // Runs of punctuation must not produce empty or stray tokens.
    const expected = ["hello", "world", "foo"];
    const tokens = tokenize("hello---world!!! foo");
    expect(tokens).toEqual(expected);
  });
});
|
|
| |
| |
| |
// Tests for BM25Index.search(): every case builds a fresh index (over the
// shared four-chunk corpus unless noted) and inspects the returned hits.
describe("BM25Index", () => {
  // [docId, text] pairs; each becomes one chunk with chunkIndex 0.
  const corpus: ReadonlyArray<readonly [string, string]> = [
    ["d1", "the quick brown fox jumps over the lazy dog"],
    ["d2", "the quick brown fox"],
    ["d3", "the lazy dog sleeps all day long"],
    [
      "d4",
      "machine learning and artificial intelligence are transforming the world",
    ],
  ];
  const chunks: Chunk[] = corpus.map(([docId, text]) =>
    makeChunk(text, docId, 0),
  );

  // A new index per test keeps the cases independent of one another.
  const buildIndex = (): BM25Index => new BM25Index(chunks);

  it("returns results with source bm25", () => {
    const hits = buildIndex().search("quick fox");
    expect(hits.length).toBeGreaterThan(0);
    hits.forEach((hit) => {
      expect(hit.source).toBe("bm25");
    });
  });

  it("ranks exact matches higher", () => {
    const hits = buildIndex().search("quick brown fox");

    // d2's text is exactly the query; d1 contains the same words plus others,
    // so d2 is expected first and d1 second.
    expect(hits.length).toBeGreaterThanOrEqual(2);
    const [first, second] = [hits[0], hits[1]];
    expect(first.chunk.docId).toBe("d2");
    expect(second.chunk.docId).toBe("d1");
  });

  it("returns empty for unmatched query", () => {
    // Neither query word occurs anywhere in the corpus.
    const hits = buildIndex().search("quantum computing");
    expect(hits).toEqual([]);
  });

  it("returns empty for empty query", () => {
    const hits = buildIndex().search("");
    expect(hits).toEqual([]);
  });

  it("respects topK parameter", () => {
    const limit = 2;
    const hits = buildIndex().search("the", limit);
    expect(hits.length).toBeLessThanOrEqual(2);
  });

  it("handles single-chunk index", () => {
    const soloIndex = new BM25Index([makeChunk("hello world")]);
    const hits = soloIndex.search("hello");
    expect(hits.length).toBe(1);
    expect(hits[0].score).toBeGreaterThan(0);
  });

  it("handles empty index", () => {
    const emptyIndex = new BM25Index([]);
    const hits = emptyIndex.search("anything");
    expect(hits).toEqual([]);
  });

  it("scores are positive for matching documents", () => {
    const hits = buildIndex().search("lazy dog");
    hits.forEach((hit) => {
      expect(hit.score).toBeGreaterThan(0);
    });
  });

  it("partial query terms still return results", () => {
    // "machine" appears only in d4's text.
    const hits = buildIndex().search("machine");
    expect(hits.length).toBe(1);
    expect(hits[0].chunk.docId).toBe("d4");
  });

  it("scores decrease with result rank", () => {
    const hits = buildIndex().search("the lazy dog");
    // Compare each hit after the first with its predecessor: scores must be
    // non-increasing down the ranking.
    hits.slice(1).forEach((hit, offset) => {
      const previous = hits[offset];
      expect(hit.score).toBeLessThanOrEqual(previous.score);
    });
  });
});
|
|