File size: 6,410 Bytes
31a2688
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f19c23
 
 
 
 
 
 
 
 
31a2688
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""Tests for BM25 sparse retrieval."""

import pytest

from src.models import DocumentChunk, QueryResult
from src.retrieval.bm25_search import BM25Search


def _make_chunk(chunk_id: str, text: str) -> DocumentChunk:
    """Build a ``DocumentChunk`` for tests, always filed under document ``"doc1"``."""
    fields = {"chunk_id": chunk_id, "document_id": "doc1", "text": text}
    return DocumentChunk(**fields)


class TestBM25Index:
    """Index lifecycle: the ``is_indexed`` flag, stored chunks and re-indexing."""

    def test_is_indexed_false_before_indexing(self) -> None:
        """A freshly constructed searcher reports that it is not indexed."""
        fresh = BM25Search()
        assert fresh.is_indexed is False

    def test_is_indexed_true_after_indexing(self) -> None:
        """Indexing a single chunk flips ``is_indexed`` to ``True``."""
        searcher = BM25Search()
        only_chunk = _make_chunk("1", "hello world")
        searcher.index([only_chunk])
        assert searcher.is_indexed is True

    def test_index_stores_chunks(self) -> None:
        """``index`` keeps the given chunk list and builds an internal index."""
        corpus = [
            _make_chunk(cid, body)
            for cid, body in (("1", "hello world"), ("2", "foo bar"))
        ]
        searcher = BM25Search()
        searcher.index(corpus)
        # Inspects private attributes on purpose: this test pins the stored state.
        assert searcher._chunks == corpus
        assert searcher._index is not None

    def test_index_replaces_previous(self) -> None:
        """A second ``index`` call discards the chunks from the first one."""
        searcher = BM25Search()
        for cid, body in (("1", "old text"), ("2", "new text")):
            searcher.index([_make_chunk(cid, body)])
        stored = searcher._chunks
        assert len(stored) == 1
        assert stored[0].chunk_id == "2"

    def test_index_empty_list_raises(self) -> None:
        """Indexing an empty chunk list is expected to raise ``ZeroDivisionError``."""
        searcher = BM25Search()
        no_chunks: list = []
        with pytest.raises(ZeroDivisionError):
            searcher.index(no_chunks)


class TestBM25Search:
    """Tests for query and ranking correctness."""

    def test_search_returns_relevant_results(self) -> None:
        """Chunks containing the query terms occupy the top two positions."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "university policy on student enrollment"),
            _make_chunk("2", "library opening hours and access"),
            _make_chunk("3", "student enrollment deadline and requirements"),
        ])
        results = bm25.search("student enrollment", top_k=3)
        assert len(results) >= 2
        # The two chunks mentioning "student enrollment" should rank highest
        top_ids = {r.chunk.chunk_id for r in results[:2]}
        assert top_ids == {"1", "3"}

    def test_search_respects_top_k(self) -> None:
        """``top_k`` truncates the result list below the number of matches."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "alpha beta gamma"),
            _make_chunk("2", "beta gamma delta"),
            _make_chunk("3", "gamma delta epsilon"),
            _make_chunk("4", "delta epsilon zeta"),
        ])
        # "alpha" occurs only in chunk 1 and "zeta" only in chunk 4, so the
        # query matches two different chunks.
        unbounded = bm25.search("alpha zeta", top_k=4)
        assert len(unbounded) >= 2
        # Requesting fewer results than there are matches is the only way to
        # observe truncation; a limit >= the match count would pass even if
        # top_k were ignored.
        limited = bm25.search("alpha zeta", top_k=1)
        assert len(limited) == 1

    def test_search_scores_descending(self) -> None:
        """Results are ordered from highest to lowest score."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "data"),
            _make_chunk("2", "data data data"),
            _make_chunk("3", "data data"),
            # Filler chunks keep "data" in fewer than half of the documents.
            # NOTE(review): assumes Okapi-style IDF, where a term present in
            # most documents gets a non-positive weight; without the fillers
            # the result list could be empty and the ordering check would
            # pass trivially - confirm against BM25Search.
            _make_chunk("4", "alpha"),
            _make_chunk("5", "beta"),
            _make_chunk("6", "gamma"),
            _make_chunk("7", "delta"),
        ])
        results = bm25.search("data", top_k=3)
        # Guard against a vacuous pass: all three "data" chunks must come back.
        assert len(results) == 3
        scores = [r.score for r in results]
        assert scores == sorted(scores, reverse=True)

    def test_search_result_fields(self) -> None:
        """A hit is a ``QueryResult`` tagged ``"bm25"`` with a positive score."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "test document content"),
            _make_chunk("2", "unrelated other stuff"),
            _make_chunk("3", "more filler material here"),
        ])
        results = bm25.search("test", top_k=1)
        assert len(results) == 1
        r = results[0]
        assert isinstance(r, QueryResult)
        assert r.source == "bm25"
        assert r.score > 0.0
        assert r.chunk.chunk_id == "1"

    def test_search_no_match_returns_empty(self) -> None:
        """A query term absent from the corpus yields an empty list."""
        bm25 = BM25Search()
        bm25.index([_make_chunk("1", "hello world")])
        results = bm25.search("zzzznotfound", top_k=5)
        assert results == []

    def test_search_filters_zero_scores(self) -> None:
        """Chunks that do not match the query are left out of the results."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "relevant keyword"),
            _make_chunk("2", "completely unrelated text"),
            # Third chunk mirrors the three-document corpora used by the other
            # single-match tests in this file.
            # NOTE(review): with only two documents an Okapi IDF for a term
            # found in exactly one of them is log(1.5 / 1.5) = 0, which would
            # leave nothing to check - confirm against BM25Search.
            _make_chunk("3", "more filler material here"),
        ])
        results = bm25.search("keyword", top_k=10)
        # Only the matching chunk may appear; the non-matching ones are dropped
        # even though top_k leaves room for them.
        assert [r.chunk.chunk_id for r in results] == ["1"]
        assert all(r.score > 0.0 for r in results)


class TestBM25Danish:
    """Tests for Danish text with æ, ø, å characters."""

    def test_danish_characters_indexed_and_searchable(self) -> None:
        """A query word containing ``ø`` finds the single chunk that has it."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "københavns universitet uddannelsespolitik"),
            _make_chunk("2", "studerende skal følge reglerne"),
            _make_chunk("3", "årsrapport for forskningsområdet"),
        ])
        results = bm25.search("københavns", top_k=3)
        assert len(results) == 1
        assert results[0].chunk.chunk_id == "1"

    def test_danish_oe_character(self) -> None:
        """``følgende`` matches only chunk 1, not other chunks containing ``ø``."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "følgende bestemmelser gælder"),
            _make_chunk("2", "other english text here"),
            _make_chunk("3", "mere dansk tekst uden søgeord"),
        ])
        results = bm25.search("følgende", top_k=3)
        assert len(results) == 1
        assert results[0].chunk.chunk_id == "1"

    def test_danish_aa_character(self) -> None:
        """A query word starting with ``å`` finds the single chunk that has it."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "åben adgang til dokumenter"),
            _make_chunk("2", "lukket periode for eksamen"),
            _make_chunk("3", "generel information om kurser"),
        ])
        results = bm25.search("åben", top_k=3)
        assert len(results) == 1
        assert results[0].chunk.chunk_id == "1"

    def test_danish_case_insensitive(self) -> None:
        """A lower-case ``æ`` query matches text indexed with upper-case ``Æ``."""
        bm25 = BM25Search()
        bm25.index([
            _make_chunk("1", "Ændringer i studieordningen"),
            _make_chunk("2", "andet dokument uden relevans"),
            _make_chunk("3", "tredje dokument om noget helt andet"),
        ])
        results = bm25.search("ændringer", top_k=3)
        assert len(results) == 1
        # Pin which chunk matched, as the other Danish tests do; a count of one
        # alone would also be satisfied by a wrong chunk.
        assert results[0].chunk.chunk_id == "1"


class TestBM25EmptyIndex:
    """Behaviour when nothing was indexed, or when an empty list is indexed."""

    def test_search_before_indexing(self) -> None:
        """Searching a never-indexed instance yields an empty result list."""
        searcher = BM25Search()
        hits = searcher.search("anything", top_k=5)
        assert hits == []

    def test_search_on_empty_index_not_possible(self) -> None:
        """Indexing an empty list raises, so an empty index cannot be built.

        BM25Okapi raises ``ZeroDivisionError`` for an empty corpus, which means
        the only way to search without indexed chunks is to never call
        ``index()`` at all.
        """
        searcher = BM25Search()
        empty_corpus: list = []
        with pytest.raises(ZeroDivisionError):
            searcher.index(empty_corpus)