File size: 3,063 Bytes
256edfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc20fed
 
 
 
 
 
256edfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4dc151e
256edfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc20fed
256edfa
 
 
 
 
3b8840f
fc20fed
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import json
import subprocess
from pathlib import Path

import pytest

from evaluation.retrievers.bm25 import BM25Retriever
from evaluation.retrievers.base import Context

class DummyHit:
    """Minimal stand-in for one search hit.

    Holds exactly the three attributes assigned in ``__init__``:
    ``docid``, ``raw`` and ``score``. Values are stored as given, with no
    conversion or validation.
    """

    def __init__(self, docid, raw, score):
        # Store all three fields in one step; nothing else is tracked.
        self.docid, self.raw, self.score = docid, raw, score


class DummySearcher:
    """Offline stand-in for the pyserini searcher used by these tests.

    The autouse fixture in this file installs this class as
    ``pyserini.search.SimpleSearcher``. It never touches an index on disk and
    always answers from the same canned pair of hits.
    """

    def __init__(self, index_dir):
        """Accept ``index_dir`` for signature compatibility; it is not used."""
        # Nothing to open: the stub never reads an index.
        pass

    def set_bm25(self, k1=0.9, b=0.4):
        """Accept and ignore BM25 tuning parameters.

        Taking optional ``k1``/``b`` lets callers that pass explicit values
        work against the stub as well as callers that pass none.
        NOTE(review): the defaults are meant to mirror pyserini's own;
        confirm against the installed version.
        """
        pass

    def search(self, query, k=10):
        """Return at most ``k`` canned hits, in descending score order.

        Args:
            query: Ignored; the same hits are returned for every query.
            k: Maximum number of hits to return. Non-positive values yield
                an empty list.

        Returns:
            A list of ``DummyHit`` objects (docid 0 first, then docid 1).
        """
        hits = [
            DummyHit(docid=0, raw="first doc text", score=2.0),
            DummyHit(docid=1, raw="second doc text", score=1.5),
        ]
        # Honor the requested cut-off so top_k < 2 behaves like a real searcher.
        return hits[:max(k, 0)]


@pytest.fixture(autouse=True)
def patch_subprocess_and_pyserini(monkeypatch):
    """Keep every test in this module from running external tools.

    Two patches are applied for the duration of each test:

    1. ``subprocess.run`` is replaced by a stub that spawns nothing and
       reports success.
    2. If ``pyserini.search`` can be imported, its ``SimpleSearcher``
       attribute is replaced by ``DummySearcher``.
    """

    def _fake_run(*args, **kwargs):
        # Mimic a successful run: real subprocess.run returns a
        # CompletedProcess, so callers that inspect the result keep working.
        cmd = args[0] if args else kwargs.get("args")
        return subprocess.CompletedProcess(cmd, 0)

    # ❶ Prevent subprocess.run from actually launching anything
    monkeypatch.setattr(subprocess, "run", _fake_run)

    # ❷ Stub out pyserini.search.SimpleSearcher if pyserini is available
    try:
        import pyserini.search
    except ImportError:
        # pyserini is not installed: there is nothing to stub.
        return

    # raising=False installs the stub even when this pyserini build does not
    # export SimpleSearcher; the default would fail fixture setup with
    # AttributeError instead.
    monkeypatch.setattr(
        pyserini.search, "SimpleSearcher", DummySearcher, raising=False
    )


def test_bm25_index_build_and_query(tmp_path):
    """Build a retriever on a fresh index directory and check its results.

    Writes a two-line JSONL doc store, constructs ``BM25Retriever`` pointing
    at an index directory that does not exist yet, then verifies that the
    directory exists afterwards and that ``retrieve`` returns two ``Context``
    objects with the expected id, text and score.
    """
    # Tiny doc store: one JSON object per line.
    corpus = (
        {"id": 0, "text": "Retrieval Augmented Generation"},
        {"id": 1, "text": "BM25 is strong"},
    )
    store_file = tmp_path / "docs.jsonl"
    store_file.write_text("".join(json.dumps(entry) + "\n" for entry in corpus))

    # The index directory must start out missing.
    index_dir = tmp_path / "bm25_index"
    assert not index_dir.exists()

    # Constructing the retriever is expected to create the index directory.
    retriever = BM25Retriever(bm25_idx=str(index_dir), doc_store=str(store_file))
    assert index_dir.exists()

    results = retriever.retrieve("any query", top_k=2)

    # Shape of the answer: a list of exactly two Context objects.
    assert isinstance(results, list)
    assert len(results) == 2
    assert all(isinstance(item, Context) for item in results)

    # Expected (id, text, score) per rank, checked in order.
    expected = (
        ("0", "first doc text", 2.0),
        ("1", "second doc text", 1.5),
    )
    for ctx, (want_id, want_text, want_score) in zip(results, expected):
        assert ctx.id == want_id
        assert ctx.text == want_text
        assert ctx.score == pytest.approx(want_score, rel=1e-6)

def test_bm25_retrieve_when_pyserini_missing(monkeypatch, tmp_path):
    """Check that ``retrieve`` returns ``[]`` when pyserini cannot be imported.

    The import failure is simulated, so the test behaves the same whether or
    not pyserini is installed.
    """
    import sys

    # A None entry in sys.modules makes any later import of that module name
    # fail with ImportError; monkeypatch restores the entry after the test.
    monkeypatch.setitem(sys.modules, "pyserini.search", None)

    doc_store_path = tmp_path / "docs.jsonl"
    doc_store_path.write_text('{"id":0,"text":"hello"}\n')

    index_dir = tmp_path / "bm25_index2"
    retriever = BM25Retriever(bm25_idx=str(index_dir), doc_store=str(doc_store_path))
    # If SimpleSearcher failed to import, retrieve() returns []
    assert retriever.retrieve("whatever", top_k=5) == []