Spaces:

XQ
/

Dokumentassistent

Sleeping

File size: 6,410 Bytes

"""Tests for BM25 sparse retrieval."""

import pytest

from src.models import DocumentChunk, QueryResult
from src.retrieval.bm25_search import BM25Search


def _make_chunk(chunk_id: str, text: str) -> DocumentChunk:
    """Build a DocumentChunk for tests; every chunk belongs to document "doc1"."""
    fields = {"chunk_id": chunk_id, "document_id": "doc1", "text": text}
    return DocumentChunk(**fields)


class TestBM25Index:
    """Index construction: the is_indexed flag, stored chunks, and re-indexing."""

    def test_is_indexed_false_before_indexing(self) -> None:
        """A freshly constructed searcher reports that nothing is indexed."""
        searcher = BM25Search()
        assert searcher.is_indexed is False

    def test_is_indexed_true_after_indexing(self) -> None:
        """Indexing a single chunk flips is_indexed to True."""
        searcher = BM25Search()
        only_chunk = _make_chunk("1", "hello world")
        searcher.index([only_chunk])
        assert searcher.is_indexed is True

    def test_index_stores_chunks(self) -> None:
        """index() keeps the supplied chunks and builds an index object."""
        searcher = BM25Search()
        corpus = [
            _make_chunk("1", "hello world"),
            _make_chunk("2", "foo bar"),
        ]
        searcher.index(corpus)
        # White-box checks: these read the searcher's private attributes.
        assert searcher._chunks == corpus
        assert searcher._index is not None

    def test_index_replaces_previous(self) -> None:
        """A second index() call discards the chunks from the first call."""
        searcher = BM25Search()
        for chunk_id, text in (("1", "old text"), ("2", "new text")):
            searcher.index([_make_chunk(chunk_id, text)])
        stored = searcher._chunks
        assert len(stored) == 1
        assert stored[0].chunk_id == "2"

    def test_index_empty_list_raises(self) -> None:
        """Indexing an empty chunk list raises ZeroDivisionError."""
        searcher = BM25Search()
        no_chunks = []
        with pytest.raises(ZeroDivisionError):
            searcher.index(no_chunks)


class TestBM25Search:
    """Tests for query and ranking correctness.

    NOTE(review): fixtures assume BM25Search scores with rank_bm25's
    BM25Okapi and tokenises on whitespace -- confirm against the
    implementation. BM25Okapi's raw IDF for a term found in n of N chunks
    is log(N - n + 0.5) - log(n + 0.5), which is not positive once the term
    occurs in at least half of the chunks. Each fixture therefore keeps the
    query term in a minority of chunks so that hits carry positive scores
    and no assertion can pass vacuously on an empty result list.
    """

    # Vocabulary that shares no token with any query used in this class.
    _FILLER_TEXTS = (
        "eta theta iota",
        "kappa lambda mu",
        "nu xi omicron",
        "pi rho sigma",
        "tau upsilon phi",
    )

    @staticmethod
    def _filler_chunks(first_id: int) -> list:
        """Return unrelated chunks whose ids count up from ``first_id``."""
        return [
            _make_chunk(str(chunk_id), text)
            for chunk_id, text in enumerate(
                TestBM25Search._FILLER_TEXTS, start=first_id
            )
        ]

    def test_search_returns_relevant_results(self) -> None:
        """Chunks containing the query terms occupy the top of the ranking."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "university policy on student enrollment"),
            _make_chunk("2", "library opening hours and access"),
            _make_chunk("3", "student enrollment deadline and requirements"),
        ])
        results = bm25.search("student enrollment", top_k=3)
        assert len(results) >= 2
        # The two chunks mentioning "student enrollment" should rank highest
        top_ids = [r.chunk.chunk_id for r in results[:2]]
        assert "1" in top_ids
        assert "3" in top_ids

    def test_search_respects_top_k(self) -> None:
        """The result list is truncated when more chunks match than top_k."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "alpha beta gamma"),
            _make_chunk("2", "beta gamma delta"),
            _make_chunk("3", "gamma delta epsilon"),
            _make_chunk("4", "delta epsilon zeta"),
            *self._filler_chunks(5),
        ])
        # "gamma" occurs in chunks 1-3 (3 of 9), so there are more matching
        # chunks than top_k and the limit is what bounds the result length.
        results = bm25.search("gamma", top_k=2)
        assert len(results) == 2
        assert {r.chunk.chunk_id for r in results} <= {"1", "2", "3"}

    def test_search_scores_descending(self) -> None:
        """Results come back ordered from highest to lowest score."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "data"),
            _make_chunk("2", "data data data"),
            _make_chunk("3", "data data"),
            *self._filler_chunks(4),
        ])
        results = bm25.search("data", top_k=3)
        # All three "data" chunks must be present, otherwise the ordering
        # check below would hold trivially for an empty or one-item list.
        assert len(results) == 3
        scores = [r.score for r in results]
        assert scores == sorted(scores, reverse=True)

    def test_search_result_fields(self) -> None:
        """A hit is a QueryResult tagged "bm25" with a positive score."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "test document content"),
            _make_chunk("2", "unrelated other stuff"),
            _make_chunk("3", "more filler material here"),
        ])
        results = bm25.search("test", top_k=1)
        assert len(results) == 1
        r = results[0]
        assert isinstance(r, QueryResult)
        assert r.source == "bm25"
        assert r.score > 0.0
        assert r.chunk.chunk_id == "1"

    def test_search_no_match_returns_empty(self) -> None:
        """A query whose term is absent from the corpus yields no results."""
        bm25 = BM25Search()
        bm25.index([_make_chunk("1", "hello world")])
        results = bm25.search("zzzznotfound", top_k=5)
        assert results == []

    def test_search_filters_zero_scores(self) -> None:
        """Chunks that do not match are dropped even when top_k has room."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "relevant keyword"),
            _make_chunk("2", "completely unrelated text"),
            # Third chunk keeps "keyword" in a minority of the corpus so the
            # matching chunk scores above zero and actually appears.
            _make_chunk("3", "another passage without the term"),
        ])
        results = bm25.search("keyword", top_k=10)
        assert [r.chunk.chunk_id for r in results] == ["1"]
        assert all(r.score > 0.0 for r in results)


class TestBM25Danish:
    """Tests for Danish text with æ, ø, å characters.

    Every corpus has three chunks with the query term in exactly one of
    them, and each test expects that single chunk as the only result.
    """

    def test_danish_characters_indexed_and_searchable(self) -> None:
        """A word containing "ø" is found in a corpus mixing æ, ø and å."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "københavns universitet uddannelsespolitik"),
            _make_chunk("2", "studerende skal følge reglerne"),
            _make_chunk("3", "årsrapport for forskningsområdet"),
        ])
        results = bm25.search("københavns", top_k=3)
        assert len(results) == 1
        assert results[0].chunk.chunk_id == "1"

    def test_danish_oe_character(self) -> None:
        """A query containing "ø" matches only the chunk with that word."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "følgende bestemmelser gælder"),
            _make_chunk("2", "other english text here"),
            _make_chunk("3", "mere dansk tekst uden søgeord"),
        ])
        results = bm25.search("følgende", top_k=3)
        assert len(results) == 1
        assert results[0].chunk.chunk_id == "1"

    def test_danish_aa_character(self) -> None:
        """A query starting with "å" matches only the chunk with that word."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "åben adgang til dokumenter"),
            _make_chunk("2", "lukket periode for eksamen"),
            _make_chunk("3", "generel information om kurser"),
        ])
        results = bm25.search("åben", top_k=3)
        assert len(results) == 1
        assert results[0].chunk.chunk_id == "1"

    def test_danish_case_insensitive(self) -> None:
        """A lowercase "æ" query matches text indexed with uppercase "Æ"."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "Ændringer i studieordningen"),
            _make_chunk("2", "andet dokument uden relevans"),
            _make_chunk("3", "tredje dokument om noget helt andet"),
        ])
        results = bm25.search("ændringer", top_k=3)
        assert len(results) == 1
        # Pin which chunk matched, as the sibling tests do; a bare count
        # would also accept a hit on the wrong chunk.
        assert results[0].chunk.chunk_id == "1"


class TestBM25EmptyIndex:
    """Behaviour when nothing has been indexed or the corpus is empty."""

    def test_search_before_indexing(self) -> None:
        """Searching an instance that was never indexed returns an empty list."""
        assert BM25Search().search("anything", top_k=5) == []

    def test_search_on_empty_index_not_possible(self) -> None:
        """Indexing an empty chunk list raises ZeroDivisionError.

        BM25Okapi raises that error on an empty corpus, so an empty index
        can only be searched when index() was never called.
        """
        searcher = BM25Search()
        empty_corpus = []
        with pytest.raises(ZeroDivisionError):
            searcher.index(empty_corpus)