Spaces:
Sleeping
Sleeping
| import json | |
| import subprocess | |
| from pathlib import Path | |
| import pytest | |
| from evaluation.retrievers.bm25 import BM25Retriever | |
| from evaluation.retrievers.base import Context | |
class DummyHit:
    """Minimal stand-in for a single search hit.

    Stores the three values it is constructed with as the attributes
    ``docid``, ``raw`` and ``score``.
    """

    def __init__(self, docid, raw, score):
        # Store the constructor arguments unchanged; no validation is done.
        self.docid, self.raw, self.score = docid, raw, score
class DummySearcher:
    """Test double for a searcher: ignores its inputs and returns fixed hits."""

    def __init__(self, index_dir):
        # The index directory is accepted for signature compatibility only;
        # nothing is opened or stored.
        pass

    def set_bm25(self):
        # Intentionally a no-op: there are no ranking parameters to configure.
        pass

    def search(self, query, k):
        """Return the same two hits regardless of ``query`` and ``k``."""
        canned = (
            (0, "first doc text", 2.0),
            (1, "second doc text", 1.5),
        )
        # Build fresh hit objects on every call so callers never share state.
        return [
            DummyHit(docid=doc_id, raw=body, score=relevance)
            for doc_id, body, relevance in canned
        ]
@pytest.fixture(autouse=True)
def patch_subprocess_and_pyserini(monkeypatch):
    """Stub out external tooling before every test in this module.

    Registered as an autouse fixture so that pytest actually runs it: a plain
    function taking ``monkeypatch`` is neither collected as a test nor
    requested by any test, so without the decorator the patches below would
    never be applied.

    Args:
        monkeypatch: pytest's built-in fixture; every patch made through it
            is undone automatically when the test finishes.
    """
    # ❶ Prevent subprocess.run from actually calling "pyserini.index"
    monkeypatch.setattr(subprocess, "run", lambda *args, **kwargs: None)

    # ❷ Stub out pyserini.search.SimpleSearcher if available
    try:
        import pyserini.search
    except ImportError:
        # pyserini is not installed; there is nothing to replace.
        return
    monkeypatch.setattr(pyserini.search, "SimpleSearcher", DummySearcher)
def test_bm25_index_build_and_query(tmp_path):
    """Build a retriever over a tiny doc store and check the retrieved contexts.

    The expected ids, texts and scores mirror the hits returned by
    ``DummySearcher``, so this test assumes the searcher and
    ``subprocess.run`` are stubbed while it runs.
    """
    # Write a two-document JSONL doc store, one JSON object per line.
    corpus = [
        {"id": 0, "text": "Retrieval Augmented Generation"},
        {"id": 1, "text": "BM25 is strong"},
    ]
    doc_store_path = tmp_path / "docs.jsonl"
    doc_store_path.write_text(
        "".join(json.dumps(entry) + "\n" for entry in corpus)
    )

    # The index directory must not exist before the retriever is created.
    index_dir = tmp_path / "bm25_index"
    assert not index_dir.exists()

    # Constructing the retriever is expected to create the index directory.
    retriever = BM25Retriever(bm25_idx=str(index_dir), doc_store=str(doc_store_path))
    assert index_dir.exists()

    results = retriever.retrieve("any query", top_k=2)

    # Two Context objects are expected, in the order the searcher returned them.
    assert isinstance(results, list)
    assert len(results) == 2
    assert all(isinstance(item, Context) for item in results)

    expected = [
        ("0", "first doc text", 2.0),
        ("1", "second doc text", 1.5),
    ]
    for ctx, (want_id, want_text, want_score) in zip(results, expected):
        assert ctx.id == want_id
        assert ctx.text == want_text
        assert ctx.score == pytest.approx(want_score, rel=1e-6)
def test_bm25_retrieve_when_pyserini_missing(monkeypatch, tmp_path):
    """Check that ``retrieve`` returns an empty list when pyserini cannot be imported."""
    import sys

    # A ``None`` entry in sys.modules makes ``import pyserini.search`` raise
    # ImportError; monkeypatch restores the original entry after the test.
    monkeypatch.setitem(sys.modules, "pyserini.search", None)

    doc_store_path = tmp_path / "docs.jsonl"
    doc_store_path.write_text('{"id":0,"text":"hello"}\n')
    index_dir = tmp_path / "bm25_index2"

    retriever = BM25Retriever(bm25_idx=str(index_dir), doc_store=str(doc_store_path))

    # With the searcher unavailable, retrieval is expected to yield nothing.
    hits = retriever.retrieve("whatever", top_k=5)
    assert hits == []