File size: 2,001 Bytes
cdf4160
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import json
import shutil
import tempfile
import zlib
from pathlib import Path
from types import SimpleNamespace
from typing import List

import numpy as np
import pytest


@pytest.fixture(scope="session")
def tmp_doc_store(tmp_path_factory):
    """Write a three-document JSONL corpus and return the path to the file.

    The file is created once per test session inside a fresh ``docs``
    directory obtained from ``tmp_path_factory``. Each line is one JSON
    object with an integer ``id`` (0, 1, 2) and a ``text`` field.
    """
    texts = (
        "Retrieval Augmented Generation combines retrieval and generation.",
        "BM25 is a strong lexical baseline in information retrieval.",
        "FAISS enables efficient similarity search over dense embeddings.",
    )
    # Ids are simply the position of each text in the tuple above.
    records = [{"id": idx, "text": body} for idx, body in enumerate(texts)]

    target = tmp_path_factory.mktemp("docs") / "docs.jsonl"
    with target.open("w") as handle:
        # One serialised record per line, each terminated by a newline.
        handle.writelines(json.dumps(record) + "\n" for record in records)
    return target


class _DummyEmbedder:
    """Fast, deterministic replacement for SentenceTransformer during tests.

    * Encodes every text into a unit-length 16-dim ``float32`` vector.
    * The vector depends only on the text itself: the RNG is re-seeded from a
      CRC32 of the text's UTF-8 bytes before each draw. The builtin ``hash()``
      is deliberately not used because ``str`` hashes are salted per
      interpreter run, which would make the vectors differ between test runs.
    * Vectors are L2-normalised, so an inner-product score between two of
      them equals their cosine similarity.
    """

    _dim = 16

    def __init__(self, *args, **kwargs):
        """Accept and ignore any constructor arguments."""
        # The initial seed is irrelevant: ``encode`` re-seeds this generator
        # for every text. The attribute is kept so one RNG object is reused.
        self.rs = np.random.RandomState(42)

    @staticmethod
    def _seed_for(text):
        """Return a process-independent 32-bit seed derived from *text*."""
        # crc32 yields an unsigned 32-bit int, exactly the range that
        # ``RandomState.seed`` accepts.
        return zlib.crc32(str(text).encode("utf-8"))

    def encode(self, texts, **kw):
        """Embed *texts* and return a 2-D ``float32`` array.

        Args:
            texts: A single string or an iterable of strings. A single string
                is treated as a one-element batch.
            **kw: Accepted and ignored, so callers may pass the keyword
                options they would give the real model.

        Returns:
            Array of shape ``(len(texts), 16)`` whose rows have unit L2 norm.
            Empty input yields an array of shape ``(0, 16)``.
        """
        if isinstance(texts, str):
            texts = [texts]
        else:
            # Materialise so emptiness can be checked for any iterable.
            texts = list(texts)
        if not texts:
            # np.stack raises on an empty sequence; return an empty batch.
            return np.empty((0, self._dim), dtype="float32")

        vecs = []
        for t in texts:
            # Re-seed per text so the same text always maps to the same
            # vector, regardless of call order or batch composition.
            self.rs.seed(self._seed_for(t))
            v = self.rs.randn(self._dim)
            v = v / np.linalg.norm(v)
            vecs.append(v.astype("float32"))
        return np.stack(vecs)

    # Readable name for when the embedder is printed or formatted.
    def __str__(self):
        return "DummyEmbedder"

@pytest.fixture(autouse=True)
def patch_sentence_transformers(monkeypatch):
    """Swap the dense retriever's ``SentenceTransformer`` for ``_DummyEmbedder``.

    Declared ``autouse`` so it applies to every test without being requested.
    The replacement is made through ``monkeypatch``, so it is undone again
    when the test finishes.
    """
    # Imported here rather than at module top; the attribute is patched on
    # the retriever module, where the name is looked up.
    from evaluation.retrievers import dense as dense_module

    monkeypatch.setattr(
        dense_module,
        "SentenceTransformer",
        _DummyEmbedder,
    )
    # Hand control to the test with the patch in place.
    yield