"""Tests for BM25 sparse retrieval.""" import pytest from src.models import DocumentChunk, QueryResult from src.retrieval.bm25_search import BM25Search def _make_chunk(chunk_id: str, text: str) -> DocumentChunk: """Create a DocumentChunk helper.""" return DocumentChunk(chunk_id=chunk_id, document_id="doc1", text=text) class TestBM25Index: """Tests for index construction.""" def test_is_indexed_false_before_indexing(self) -> None: bm25 = BM25Search() assert bm25.is_indexed is False def test_is_indexed_true_after_indexing(self) -> None: bm25 = BM25Search() bm25.index([_make_chunk("1", "hello world")]) assert bm25.is_indexed is True def test_index_stores_chunks(self) -> None: bm25 = BM25Search() chunks = [_make_chunk("1", "hello world"), _make_chunk("2", "foo bar")] bm25.index(chunks) assert bm25._chunks == chunks assert bm25._index is not None def test_index_replaces_previous(self) -> None: bm25 = BM25Search() bm25.index([_make_chunk("1", "old text")]) bm25.index([_make_chunk("2", "new text")]) assert len(bm25._chunks) == 1 assert bm25._chunks[0].chunk_id == "2" def test_index_empty_list_raises(self) -> None: bm25 = BM25Search() with pytest.raises(ZeroDivisionError): bm25.index([]) class TestBM25Search: """Tests for query and ranking correctness.""" def test_search_returns_relevant_results(self) -> None: bm25 = BM25Search() bm25.index([ _make_chunk("1", "university policy on student enrollment"), _make_chunk("2", "library opening hours and access"), _make_chunk("3", "student enrollment deadline and requirements"), ]) results = bm25.search("student enrollment", top_k=3) assert len(results) >= 2 # The two chunks mentioning "student enrollment" should rank highest top_ids = [r.chunk.chunk_id for r in results[:2]] assert "1" in top_ids assert "3" in top_ids def test_search_respects_top_k(self) -> None: bm25 = BM25Search() bm25.index([ _make_chunk("1", "alpha beta gamma"), _make_chunk("2", "beta gamma delta"), _make_chunk("3", "gamma delta epsilon"), _make_chunk("4", "delta epsilon zeta"), ]) # "alpha" only in chunk 1, "beta" in 1&2 — at most 2 have nonzero scores results = bm25.search("alpha beta", top_k=2) assert len(results) <= 2 def test_search_scores_descending(self) -> None: bm25 = BM25Search() bm25.index([ _make_chunk("1", "data"), _make_chunk("2", "data data data"), _make_chunk("3", "data data"), ]) results = bm25.search("data", top_k=3) scores = [r.score for r in results] assert scores == sorted(scores, reverse=True) def test_search_result_fields(self) -> None: bm25 = BM25Search() bm25.index([ _make_chunk("1", "test document content"), _make_chunk("2", "unrelated other stuff"), _make_chunk("3", "more filler material here"), ]) results = bm25.search("test", top_k=1) assert len(results) == 1 r = results[0] assert isinstance(r, QueryResult) assert r.source == "bm25" assert r.score > 0.0 assert r.chunk.chunk_id == "1" def test_search_no_match_returns_empty(self) -> None: bm25 = BM25Search() bm25.index([_make_chunk("1", "hello world")]) results = bm25.search("zzzznotfound", top_k=5) assert results == [] def test_search_filters_zero_scores(self) -> None: bm25 = BM25Search() bm25.index([ _make_chunk("1", "relevant keyword"), _make_chunk("2", "completely unrelated text"), ]) results = bm25.search("keyword", top_k=10) for r in results: assert r.score > 0.0 class TestBM25Danish: """Tests for Danish text with æ, ø, å characters.""" def test_danish_characters_indexed_and_searchable(self) -> None: bm25 = BM25Search() bm25.index([ _make_chunk("1", "københavns universitet uddannelsespolitik"), _make_chunk("2", "studerende skal følge reglerne"), _make_chunk("3", "årsrapport for forskningsområdet"), ]) results = bm25.search("københavns", top_k=3) assert len(results) == 1 assert results[0].chunk.chunk_id == "1" def test_danish_oe_character(self) -> None: bm25 = BM25Search() bm25.index([ _make_chunk("1", "følgende bestemmelser gælder"), _make_chunk("2", "other english text here"), _make_chunk("3", "mere dansk tekst uden søgeord"), ]) results = bm25.search("følgende", top_k=3) assert len(results) == 1 assert results[0].chunk.chunk_id == "1" def test_danish_aa_character(self) -> None: bm25 = BM25Search() bm25.index([ _make_chunk("1", "åben adgang til dokumenter"), _make_chunk("2", "lukket periode for eksamen"), _make_chunk("3", "generel information om kurser"), ]) results = bm25.search("åben", top_k=3) assert len(results) == 1 assert results[0].chunk.chunk_id == "1" def test_danish_case_insensitive(self) -> None: bm25 = BM25Search() bm25.index([ _make_chunk("1", "Ændringer i studieordningen"), _make_chunk("2", "andet dokument uden relevans"), _make_chunk("3", "tredje dokument om noget helt andet"), ]) results = bm25.search("ændringer", top_k=3) assert len(results) == 1 class TestBM25EmptyIndex: """Tests for querying before or on an empty index.""" def test_search_before_indexing(self) -> None: bm25 = BM25Search() results = bm25.search("anything", top_k=5) assert results == [] def test_search_on_empty_index_not_possible(self) -> None: """BM25Okapi raises ZeroDivisionError on empty corpus, so searching an empty index is only possible if index() was never called.""" bm25 = BM25Search() with pytest.raises(ZeroDivisionError): bm25.index([])