File size: 3,193 Bytes
256edfa
cdf4160
256edfa
cdf4160
 
 
256edfa
cdf4160
 
256edfa
 
 
fc20fed
 
 
 
cdf4160
256edfa
fc20fed
256edfa
 
 
 
 
 
 
fc20fed
256edfa
 
 
 
 
fc20fed
256edfa
 
 
 
fc20fed
256edfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc20fed
cdf4160
256edfa
 
 
cdf4160
 
256edfa
fc20fed
256edfa
 
 
 
 
fc20fed
256edfa
 
 
fc20fed
 
 
256edfa
 
 
 
 
 
fc20fed
256edfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc20fed
256edfa
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import json
import numpy as np
import pytest
from pathlib import Path

from evaluation.retrievers.dense import DenseRetriever
from evaluation.retrievers.base import Context


class DummyIndex:
    """Minimal stand-in for a FAISS index that returns canned search hits."""

    def __init__(self):
        self.ntotal = 3
        import faiss

        # Prefer inner-product if this faiss build exposes it; fall back to L2.
        self.metric_type = getattr(faiss, "METRIC_INNER_PRODUCT", faiss.METRIC_L2)

    def search(self, vec, top_k):
        """Ignore the query vector and hand back three fixed hits."""
        scores = np.array([[0.2, 0.15, 0.05]])
        doc_ids = np.array([[0, 1, 2]])
        return scores, doc_ids


class DummyEmbedder:
    """Stand-in sentence encoder that always emits one constant vector."""

    def encode(self, texts, normalize_embeddings):
        # The content of ``texts`` is irrelevant; callers only need a float32 vector.
        return np.array([0.1, 0.2, 0.3, 0.4], dtype="float32")


@pytest.fixture(autouse=True)
def patch_faiss_and_transformer(monkeypatch):
    """Swap faiss index loading and SentenceTransformer for in-memory dummies."""
    import faiss
    import sentence_transformers

    def fake_read_index(_path):
        # Any on-disk index path resolves to the canned DummyIndex.
        return DummyIndex()

    def fake_transformer(*args, **kwargs):
        # Model name / device arguments are accepted and ignored.
        return DummyEmbedder()

    monkeypatch.setattr(faiss, "read_index", fake_read_index)
    monkeypatch.setattr(
        sentence_transformers, "SentenceTransformer", fake_transformer
    )
    yield


def test_dense_index_build_and_search(tmp_path):
    """DenseRetriever persists a FAISS file and returns three scored contexts."""
    texts = ["Doc zero", "Doc one", "Doc two"]
    corpus = [{"id": n, "text": t} for n, t in enumerate(texts)]

    doc_store_path = tmp_path / "docs.jsonl"
    jsonl = "\n".join(json.dumps(entry) for entry in corpus) + "\n"
    doc_store_path.write_text(jsonl)

    faiss_idx = tmp_path / "index.faiss"
    if faiss_idx.exists():
        faiss_idx.unlink()

    # Instantiate DenseRetriever; should write a real FAISS file to disk
    retriever = DenseRetriever(
        faiss_index=faiss_idx,
        doc_store=doc_store_path,
        model_name="dummy-model-name",
        device="cpu",
    )

    # Construction is expected to have persisted the index file.
    assert faiss_idx.exists()

    results = retriever.retrieve("any query", top_k=3)
    assert isinstance(results, list)
    assert len(results) == 3

    # Scores mirror the distances emitted by DummyIndex.search.
    expected_scores = [0.2, 0.15, 0.05]
    valid_texts = {"Doc zero", "Doc one", "Doc two"}
    for rank, ctx in enumerate(results):
        assert isinstance(ctx, Context)
        assert ctx.id == str(rank)
        assert ctx.score == pytest.approx(expected_scores[rank], rel=1e-6)
        # The text must come from doc_store
        assert ctx.text in valid_texts


def test_dense_retrieve_when_faiss_or_transformer_fails(monkeypatch, tmp_path):
    """A failed index load must surface as an empty result list, not a crash."""
    import faiss

    def exploding_read_index(_path):
        # Simulate an unreadable / corrupt index file.
        raise Exception("fail")

    monkeypatch.setattr(faiss, "read_index", exploding_read_index)

    doc_store_path = tmp_path / "docs.jsonl"
    doc_store_path.write_text('{"id":0,"text":"hello"}\n')

    faiss_idx = tmp_path / "index2.faiss"
    if faiss_idx.exists():
        faiss_idx.unlink()

    retriever = DenseRetriever(
        faiss_index=faiss_idx,
        doc_store=doc_store_path,
        model_name="dummy-model-name",
        device="cpu",
    )

    # Since index load failed, retrieve() must return []
    assert retriever.retrieve("whatever", top_k=5) == []