"""Tests for BM25 sparse retrieval."""
import pytest
from src.models import DocumentChunk, QueryResult
from src.retrieval.bm25_search import BM25Search
def _make_chunk(chunk_id: str, text: str) -> DocumentChunk:
    """Build a ``DocumentChunk`` for tests, always filed under document ``"doc1"``."""
    fields = {"chunk_id": chunk_id, "document_id": "doc1", "text": text}
    return DocumentChunk(**fields)
class TestBM25Index:
    """Index construction behaviour of ``BM25Search``."""

    def test_is_indexed_false_before_indexing(self) -> None:
        """A freshly created searcher reports that nothing is indexed."""
        assert BM25Search().is_indexed is False

    def test_is_indexed_true_after_indexing(self) -> None:
        """Indexing a single chunk flips ``is_indexed`` to ``True``."""
        searcher = BM25Search()
        searcher.index([_make_chunk("1", "hello world")])
        assert searcher.is_indexed is True

    def test_index_stores_chunks(self) -> None:
        """The indexed chunks are kept and an internal index object is built."""
        corpus = [
            _make_chunk("1", "hello world"),
            _make_chunk("2", "foo bar"),
        ]
        searcher = BM25Search()
        searcher.index(corpus)
        assert searcher._chunks == corpus
        assert searcher._index is not None

    def test_index_replaces_previous(self) -> None:
        """A second ``index`` call discards the chunks from the first one."""
        searcher = BM25Search()
        for chunk_id, text in (("1", "old text"), ("2", "new text")):
            searcher.index([_make_chunk(chunk_id, text)])
        stored = searcher._chunks
        assert len(stored) == 1
        assert stored[0].chunk_id == "2"

    def test_index_empty_list_raises(self) -> None:
        """Indexing an empty corpus surfaces a ``ZeroDivisionError``."""
        searcher = BM25Search()
        no_chunks: list = []
        with pytest.raises(ZeroDivisionError):
            searcher.index(no_chunks)
class TestBM25Search:
    """Tests for query and ranking correctness.

    NOTE(review): several corpora below carry filler chunks on purpose.
    Assuming Okapi-style IDF, ``log((N - n + 0.5) / (n + 0.5))``, a term that
    occurs in half or more of the chunks gets a non-positive weight, and the
    search under test appears to drop non-positive scores. Without fillers the
    result list can be empty and the assertions would pass vacuously — confirm
    against ``BM25Search`` if these tests start failing.
    """

    def test_search_returns_relevant_results(self) -> None:
        """Chunks containing the query terms outrank the unrelated chunk."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "university policy on student enrollment"),
            _make_chunk("2", "library opening hours and access"),
            _make_chunk("3", "student enrollment deadline and requirements"),
        ])
        results = bm25.search("student enrollment", top_k=3)
        assert len(results) >= 2
        # The two chunks mentioning "student enrollment" should rank highest
        top_ids = [r.chunk.chunk_id for r in results[:2]]
        assert "1" in top_ids
        assert "3" in top_ids

    def test_search_respects_top_k(self) -> None:
        """The ranking is cut to ``top_k`` when more chunks match than that."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "alpha beta gamma"),
            _make_chunk("2", "beta gamma delta"),
            _make_chunk("3", "gamma delta epsilon"),
            _make_chunk("4", "delta epsilon zeta"),
        ])
        # "alpha" occurs only in chunk 1 and "zeta" only in chunk 4, so both
        # terms are rare and exactly two chunks are expected to score above
        # zero. Checking the unbounded count first proves that the limit of 1
        # below really truncates something.
        unbounded = bm25.search("alpha zeta", top_k=4)
        assert len(unbounded) == 2
        limited = bm25.search("alpha zeta", top_k=1)
        assert len(limited) == 1

    def test_search_scores_descending(self) -> None:
        """Results come back ordered from highest to lowest score."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "data"),
            _make_chunk("2", "data data data"),
            _make_chunk("3", "data data"),
            # Fillers keep "data" in a minority of chunks (3 of 7); otherwise
            # its weight is non-positive and the ordering check below would
            # run on an empty list.
            _make_chunk("4", "alpha"),
            _make_chunk("5", "beta"),
            _make_chunk("6", "gamma"),
            _make_chunk("7", "delta"),
        ])
        results = bm25.search("data", top_k=3)
        scores = [r.score for r in results]
        assert len(scores) == 3
        assert scores == sorted(scores, reverse=True)

    def test_search_result_fields(self) -> None:
        """Each hit is a ``QueryResult`` tagged ``"bm25"`` with a positive score."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "test document content"),
            _make_chunk("2", "unrelated other stuff"),
            _make_chunk("3", "more filler material here"),
        ])
        results = bm25.search("test", top_k=1)
        assert len(results) == 1
        r = results[0]
        assert isinstance(r, QueryResult)
        assert r.source == "bm25"
        assert r.score > 0.0
        assert r.chunk.chunk_id == "1"

    def test_search_no_match_returns_empty(self) -> None:
        """A query sharing no term with the corpus yields an empty list."""
        bm25 = BM25Search()
        bm25.index([_make_chunk("1", "hello world")])
        results = bm25.search("zzzznotfound", top_k=5)
        assert results == []

    def test_search_filters_zero_scores(self) -> None:
        """Chunks that do not match the query are left out of the results."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "relevant keyword"),
            _make_chunk("2", "completely unrelated text"),
            # Third chunk: with only two chunks "keyword" would sit in exactly
            # half of the corpus, its weight would be log(1) == 0, and the
            # result list would be empty — leaving nothing to check.
            _make_chunk("3", "another filler passage"),
        ])
        results = bm25.search("keyword", top_k=10)
        # Only the matching chunk survives even though top_k allows ten hits.
        assert [r.chunk.chunk_id for r in results] == ["1"]
        for r in results:
            assert r.score > 0.0
class TestBM25Danish:
    """Tests for Danish text with æ, ø, å characters.

    Every corpus has three chunks of which exactly one contains the query
    term, so each search is expected to return that single chunk.
    """

    def test_danish_characters_indexed_and_searchable(self) -> None:
        """A word containing ``ø`` is found in the chunk that holds it."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "københavns universitet uddannelsespolitik"),
            _make_chunk("2", "studerende skal følge reglerne"),
            _make_chunk("3", "årsrapport for forskningsområdet"),
        ])
        results = bm25.search("københavns", top_k=3)
        assert len(results) == 1
        assert results[0].chunk.chunk_id == "1"

    def test_danish_oe_character(self) -> None:
        """A query starting with ``fø`` matches only the Danish chunk using it."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "følgende bestemmelser gælder"),
            _make_chunk("2", "other english text here"),
            _make_chunk("3", "mere dansk tekst uden søgeord"),
        ])
        results = bm25.search("følgende", top_k=3)
        assert len(results) == 1
        assert results[0].chunk.chunk_id == "1"

    def test_danish_aa_character(self) -> None:
        """A query starting with ``å`` matches only the chunk containing it."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "åben adgang til dokumenter"),
            _make_chunk("2", "lukket periode for eksamen"),
            _make_chunk("3", "generel information om kurser"),
        ])
        results = bm25.search("åben", top_k=3)
        assert len(results) == 1
        assert results[0].chunk.chunk_id == "1"

    def test_danish_case_insensitive(self) -> None:
        """A lowercase ``æ`` query matches text indexed with uppercase ``Æ``."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "Ændringer i studieordningen"),
            _make_chunk("2", "andet dokument uden relevans"),
            _make_chunk("3", "tredje dokument om noget helt andet"),
        ])
        results = bm25.search("ændringer", top_k=3)
        assert len(results) == 1
        # Pin the matched chunk as the sibling tests do, so a hit on the wrong
        # chunk cannot slip through on the count alone.
        assert results[0].chunk.chunk_id == "1"
class TestBM25EmptyIndex:
    """Behaviour when nothing has been indexed, or indexing is given no chunks."""

    def test_search_before_indexing(self) -> None:
        """Searching a never-indexed instance returns an empty list."""
        hits = BM25Search().search("anything", top_k=5)
        assert hits == []

    def test_search_on_empty_index_not_possible(self) -> None:
        """Indexing an empty corpus fails, so an empty index cannot be searched.

        BM25Okapi raises ZeroDivisionError on an empty corpus; the only way to
        search with no documents is therefore to never call index() at all.
        """
        searcher = BM25Search()
        no_chunks: list = []
        with pytest.raises(ZeroDivisionError):
            searcher.index(no_chunks)
|