Dokumentassistent / tests /test_bm25.py
XQ
Update health check and cloud deployment
3f19c23
raw
history blame
6.41 kB
"""Tests for BM25 sparse retrieval."""
import pytest
from src.models import DocumentChunk, QueryResult
from src.retrieval.bm25_search import BM25Search
def _make_chunk(chunk_id: str, text: str) -> DocumentChunk:
    """Build a test DocumentChunk with the given id and text.

    Every chunk produced here is attached to the fixed document id ``doc1``.
    """
    chunk = DocumentChunk(chunk_id=chunk_id, document_id="doc1", text=text)
    return chunk
class TestBM25Index:
    """Index construction and the state it leaves on the searcher."""

    def test_is_indexed_false_before_indexing(self) -> None:
        """A freshly constructed searcher reports that nothing is indexed."""
        searcher = BM25Search()
        assert searcher.is_indexed is False

    def test_is_indexed_true_after_indexing(self) -> None:
        """Indexing a single chunk flips ``is_indexed`` to True."""
        searcher = BM25Search()
        only_chunk = _make_chunk("1", "hello world")
        searcher.index([only_chunk])
        assert searcher.is_indexed is True

    def test_index_stores_chunks(self) -> None:
        """The indexed chunks are kept and an index object is created."""
        corpus = [_make_chunk("1", "hello world"), _make_chunk("2", "foo bar")]
        searcher = BM25Search()
        searcher.index(corpus)
        assert searcher._chunks == corpus
        assert searcher._index is not None

    def test_index_replaces_previous(self) -> None:
        """A second ``index`` call replaces, rather than extends, the corpus."""
        searcher = BM25Search()
        # Index twice in a row; only the chunk from the last call should remain.
        for chunk_id, text in (("1", "old text"), ("2", "new text")):
            searcher.index([_make_chunk(chunk_id, text)])
        stored = searcher._chunks
        assert len(stored) == 1
        assert stored[0].chunk_id == "2"

    def test_index_empty_list_raises(self) -> None:
        """Indexing an empty list is expected to raise ZeroDivisionError."""
        searcher = BM25Search()
        no_chunks = []
        with pytest.raises(ZeroDivisionError):
            searcher.index(no_chunks)
class TestBM25Search:
    """Query behaviour: relevance, ordering, result shape and filtering."""

    def test_search_returns_relevant_results(self) -> None:
        """Chunks containing both query terms occupy the top two positions."""
        corpus = [
            _make_chunk("1", "university policy on student enrollment"),
            _make_chunk("2", "library opening hours and access"),
            _make_chunk("3", "student enrollment deadline and requirements"),
        ]
        searcher = BM25Search()
        searcher.index(corpus)
        hits = searcher.search("student enrollment", top_k=3)
        assert len(hits) >= 2
        # Chunks 1 and 3 both mention "student enrollment", so they must lead.
        leading_ids = [hit.chunk.chunk_id for hit in hits[:2]]
        for expected_id in ("1", "3"):
            assert expected_id in leading_ids

    def test_search_respects_top_k(self) -> None:
        """No more than ``top_k`` results come back."""
        texts_by_id = {
            "1": "alpha beta gamma",
            "2": "beta gamma delta",
            "3": "gamma delta epsilon",
            "4": "delta epsilon zeta",
        }
        searcher = BM25Search()
        searcher.index(
            [_make_chunk(chunk_id, text) for chunk_id, text in texts_by_id.items()]
        )
        # "alpha" occurs only in chunk 1 and "beta" in chunks 1 and 2, so at
        # most two chunks can match the query at all.
        # NOTE(review): this is only an upper bound; it also passes when fewer
        # than top_k chunks match, so truncation itself may go unexercised.
        hits = searcher.search("alpha beta", top_k=2)
        assert len(hits) <= 2

    def test_search_scores_descending(self) -> None:
        """Returned scores are ordered from highest to lowest."""
        searcher = BM25Search()
        searcher.index(
            [
                _make_chunk("1", "data"),
                _make_chunk("2", "data data data"),
                _make_chunk("3", "data data"),
            ]
        )
        hits = searcher.search("data", top_k=3)
        observed = [hit.score for hit in hits]
        assert observed == sorted(observed, reverse=True)

    def test_search_result_fields(self) -> None:
        """A hit is a QueryResult tagged ``bm25`` with a positive score."""
        corpus = [
            _make_chunk("1", "test document content"),
            _make_chunk("2", "unrelated other stuff"),
            _make_chunk("3", "more filler material here"),
        ]
        searcher = BM25Search()
        searcher.index(corpus)
        hits = searcher.search("test", top_k=1)
        assert len(hits) == 1
        top_hit = hits[0]
        assert isinstance(top_hit, QueryResult)
        assert top_hit.source == "bm25"
        assert top_hit.score > 0.0
        assert top_hit.chunk.chunk_id == "1"

    def test_search_no_match_returns_empty(self) -> None:
        """A query term absent from the corpus yields an empty list."""
        searcher = BM25Search()
        searcher.index([_make_chunk("1", "hello world")])
        hits = searcher.search("zzzznotfound", top_k=5)
        assert hits == []

    def test_search_filters_zero_scores(self) -> None:
        """Every returned hit carries a strictly positive score."""
        searcher = BM25Search()
        searcher.index(
            [
                _make_chunk("1", "relevant keyword"),
                _make_chunk("2", "completely unrelated text"),
            ]
        )
        hits = searcher.search("keyword", top_k=10)
        assert all(hit.score > 0.0 for hit in hits)
class TestBM25Danish:
    """Tests for Danish text containing the letters æ, ø and å."""

    def test_danish_characters_indexed_and_searchable(self) -> None:
        """A query word containing ø matches only the chunk that holds it."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "københavns universitet uddannelsespolitik"),
            _make_chunk("2", "studerende skal følge reglerne"),
            _make_chunk("3", "årsrapport for forskningsområdet"),
        ])
        results = bm25.search("københavns", top_k=3)
        assert len(results) == 1
        assert results[0].chunk.chunk_id == "1"

    def test_danish_oe_character(self) -> None:
        """A query word with ø in the middle matches only chunk 1."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "følgende bestemmelser gælder"),
            _make_chunk("2", "other english text here"),
            _make_chunk("3", "mere dansk tekst uden søgeord"),
        ])
        results = bm25.search("følgende", top_k=3)
        assert len(results) == 1
        assert results[0].chunk.chunk_id == "1"

    def test_danish_aa_character(self) -> None:
        """A query word starting with å matches only chunk 1."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "åben adgang til dokumenter"),
            _make_chunk("2", "lukket periode for eksamen"),
            _make_chunk("3", "generel information om kurser"),
        ])
        results = bm25.search("åben", top_k=3)
        assert len(results) == 1
        assert results[0].chunk.chunk_id == "1"

    def test_danish_case_insensitive(self) -> None:
        """A lowercase query matches a chunk whose word starts with capital Æ."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "Ændringer i studieordningen"),
            _make_chunk("2", "andet dokument uden relevans"),
            _make_chunk("3", "tredje dokument om noget helt andet"),
        ])
        results = bm25.search("ændringer", top_k=3)
        assert len(results) == 1
        # Pin which chunk matched, as the sibling tests do; a count alone
        # would also pass if the wrong chunk were returned.
        assert results[0].chunk.chunk_id == "1"
class TestBM25EmptyIndex:
    """Behaviour when nothing has been indexed, or an empty corpus is given."""

    def test_search_before_indexing(self) -> None:
        """Searching before any ``index`` call returns an empty list."""
        searcher = BM25Search()
        hits = searcher.search("anything", top_k=5)
        assert hits == []

    def test_search_on_empty_index_not_possible(self) -> None:
        """Indexing an empty corpus raises, so an empty index cannot be built.

        BM25Okapi raises ZeroDivisionError on an empty corpus; the only way to
        search with nothing indexed is therefore to never call ``index()``.
        """
        searcher = BM25Search()
        empty_corpus = []
        with pytest.raises(ZeroDivisionError):
            searcher.index(empty_corpus)