File size: 1,863 Bytes
a34068e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from app.core.retriever import HybridRetriever


def test_rrf_fusion_basic():
    """Fuse two overlapping rankings and check the shared chunk comes first.

    "b" is the only chunk present in both input lists, so the test expects
    it at the top, and expects the three distinct chunk ids to produce
    three fused entries.
    """

    def make_hit(chunk_id, score):
        # Each call builds a fresh dict (and a fresh metadata dict).
        return {"chunk_id": chunk_id, "text": "doc " + chunk_id, "score": score, "metadata": {}}

    dense_hits = [make_hit("a", 0.9), make_hit("b", 0.8)]
    sparse_hits = [make_hit("b", 5.0), make_hit("c", 4.0)]

    fused = HybridRetriever.rrf_fuse([dense_hits, sparse_hits])

    ranked_ids = []
    for entry in fused:
        ranked_ids.append(entry["chunk_id"])

    # The chunk returned by both rankings must be ranked first.
    assert ranked_ids[0] == "b"
    # "a", "b" and "c" each show up exactly once in the fused output.
    assert len(fused) == 3


def test_rrf_fusion_empty():
    """Fusing two empty rankings is expected to give back an empty list."""
    empty_rankings = [[], []]
    assert HybridRetriever.rrf_fuse(empty_rankings) == []


def test_rrf_fusion_single_list():
    """A single one-item ranking should fuse to that same single chunk."""
    only_hit = {"chunk_id": "x", "text": "x", "score": 1.0, "metadata": {}}
    fused = HybridRetriever.rrf_fuse([[only_hit]])

    # Exactly one entry comes back...
    assert len(fused) == 1
    # ...and it is the chunk that was passed in.
    top_entry = fused[0]
    assert top_entry["chunk_id"] == "x"


def test_rrf_fusion_with_weights():
    """Check that per-list weights influence the fused ordering.

    The dense list gets weight 1.0 and the sparse list weight 0.0, so the
    test expects the dense-only chunk "a" to be ranked first.
    """
    dense_hits = [{"chunk_id": "a", "text": "a", "score": 0.9, "metadata": {}}]
    sparse_hits = [{"chunk_id": "b", "text": "b", "score": 5.0, "metadata": {}}]
    rankings = [dense_hits, sparse_hits]
    list_weights = [1.0, 0.0]

    fused = HybridRetriever.rrf_fuse(rankings, weights=list_weights)

    # Sparse contributes nothing at weight 0, so the dense hit leads.
    leader = fused[0]
    assert leader["chunk_id"] == "a"


def test_apply_filters():
    """Filter two results by doc_type="pdf" and expect only the PDF chunk."""
    from app.models.schemas import SearchFilters

    pdf_hit = {
        "chunk_id": "1",
        "text": "t",
        "score": 1,
        "metadata": {"doc_type": "pdf", "source": "a.pdf", "tags": []},
    }
    html_hit = {
        "chunk_id": "2",
        "text": "t",
        "score": 1,
        "metadata": {"doc_type": "html", "source": "b.html", "tags": []},
    }
    candidates = [pdf_hit, html_hit]

    pdf_only = SearchFilters(doc_type="pdf")
    kept = HybridRetriever._apply_filters(candidates, pdf_only)

    # Only the chunk whose metadata says "pdf" should survive the filter.
    assert len(kept) == 1
    assert kept[0]["chunk_id"] == "1"