File size: 4,103 Bytes
31a2688
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""Tests for the Qdrant vector store."""

from pathlib import Path

import pytest

from src.models import ChunkStrategy, DocumentChunk
from src.retrieval.vector_store import VectorStore

# Vector size used for the test collection; the fake embeddings below have this length.
DIMENSION = 4


def _make_chunk(chunk_id: str, text: str) -> DocumentChunk:
    """Build a ``DocumentChunk`` for tests from an id and a text body.

    Everything except ``chunk_id`` and ``text`` is held constant (document
    ``"doc-1"``, page-1 metadata, fixed-size strategy) so tests only vary the
    fields they care about.
    """
    page_metadata = {"page": 1}
    chunk = DocumentChunk(
        chunk_id=chunk_id,
        document_id="doc-1",
        text=text,
        metadata=page_metadata,
        strategy=ChunkStrategy.FIXED_SIZE,
    )
    return chunk


def _fake_embedding(seed: float) -> list[float]:
    """Return a deterministic fake embedding of length DIMENSION."""
    return [seed, 1.0 - seed, seed * 0.5, 0.1]


@pytest.fixture()
def store(tmp_path: Path) -> VectorStore:
    """Provide a ``VectorStore`` rooted in pytest's per-test temporary directory.

    Args:
        tmp_path: pytest's built-in temporary-directory fixture. It is a
            ``pathlib.Path`` (not a ``str``), which is what makes the ``/``
            join below valid.

    Returns:
        A store for the ``"test_collection"`` collection, configured with
        ``DIMENSION``-sized vectors and located at ``<tmp_path>/qdrant``.
    """
    return VectorStore(
        # VectorStore is handed a plain string path, so convert explicitly.
        path=str(tmp_path / "qdrant"),
        collection_name="test_collection",
        dimension=DIMENSION,
    )


class TestAddChunks:
    """Insertion behaviour of ``VectorStore.add_chunks``."""

    def test_add_and_search_returns_inserted_chunks(self, store: VectorStore) -> None:
        """Both inserted chunks come back from a search wide enough to hold them."""
        store.add_chunks(
            [_make_chunk("c1", "hello world"), _make_chunk("c2", "foo bar")],
            [_fake_embedding(0.1), _fake_embedding(0.9)],
        )

        hits = store.search(query_embedding=_fake_embedding(0.1), top_k=10)

        assert len(hits) == 2
        assert {hit.chunk.chunk_id for hit in hits} == {"c1", "c2"}

    def test_add_chunks_length_mismatch_raises(self, store: VectorStore) -> None:
        """One chunk paired with two embeddings is rejected with ``ValueError``."""
        single_chunk = [_make_chunk("c1", "text")]
        two_vectors = [_fake_embedding(seed) for seed in (0.1, 0.2)]

        with pytest.raises(ValueError, match="mismatch"):
            store.add_chunks(single_chunk, two_vectors)

    def test_add_empty_chunks_is_noop(self, store: VectorStore) -> None:
        """Adding nothing leaves the store with nothing to return."""
        store.add_chunks([], [])

        hits = store.search(query_embedding=_fake_embedding(0.5), top_k=10)

        assert hits == []


class TestSearch:
    """Tests for querying the vector store."""

    def test_top_k_limits_results(self, store: VectorStore) -> None:
        """With five stored chunks, ``top_k=3`` returns exactly three results."""
        chunks = [_make_chunk(f"c{i}", f"text {i}") for i in range(5)]
        embeddings = [_fake_embedding(i * 0.1) for i in range(5)]
        store.add_chunks(chunks, embeddings)

        results = store.search(query_embedding=_fake_embedding(0.0), top_k=3)
        assert len(results) == 3

    def test_search_empty_collection(self, store: VectorStore) -> None:
        """Searching a store that has never been written to returns an empty list."""
        results = store.search(query_embedding=_fake_embedding(0.5), top_k=5)
        assert results == []

    def test_results_contain_correct_metadata(self, store: VectorStore) -> None:
        """Every field of the stored chunk round-trips through a search result."""
        chunk = _make_chunk("c1", "some text")
        store.add_chunks([chunk], [_fake_embedding(0.3)])

        results = store.search(query_embedding=_fake_embedding(0.3), top_k=1)
        assert len(results) == 1
        r = results[0]
        assert r.chunk.chunk_id == "c1"
        assert r.chunk.document_id == "doc-1"
        assert r.chunk.text == "some text"
        assert r.chunk.metadata == {"page": 1}
        assert r.chunk.strategy == ChunkStrategy.FIXED_SIZE
        assert r.source == "dense"

    def test_results_sorted_by_relevance(self, store: VectorStore) -> None:
        """The chunk whose embedding equals the query is ranked first."""
        chunks = [_make_chunk("c1", "a"), _make_chunk("c2", "b")]
        embeddings = [_fake_embedding(0.9), _fake_embedding(0.1)]
        store.add_chunks(chunks, embeddings)

        results = store.search(query_embedding=_fake_embedding(0.9), top_k=2)

        # Guard the indexing below so a short result list fails with a clear
        # assertion instead of an IndexError.
        assert len(results) == 2
        # The query vector is identical to c1's embedding, so c1 must come
        # first. Comparing scores alone would also pass for a store that
        # returns equal scores in arbitrary order.
        assert [r.chunk.chunk_id for r in results] == ["c1", "c2"]
        assert results[0].score >= results[1].score


class TestDuplicateInsert:
    """Re-inserting a chunk replaces the stored entry instead of duplicating it."""

    def test_upsert_overwrites_existing_chunk(self, store: VectorStore) -> None:
        """Two single-chunk inserts with the same id leave only the latest text."""
        for version_text in ("version 1", "version 2"):
            store.add_chunks([_make_chunk("c1", version_text)], [_fake_embedding(0.5)])

        hits = store.search(query_embedding=_fake_embedding(0.5), top_k=10)

        # NOTE(review): per the original author's note, upsert uses the
        # enumerate index as the point id, so the same index → overwrite.
        # Confirm against VectorStore.add_chunks.
        assert len(hits) == 1
        assert hits[0].chunk.text == "version 2"