"""Tests for the Qdrant vector store."""
from pathlib import Path

import pytest

from src.models import ChunkStrategy, DocumentChunk
from src.retrieval.vector_store import VectorStore
# Vector size shared by the tests: handed to VectorStore as `dimension` in the
# `store` fixture and matched by the four-element lists _fake_embedding returns.
DIMENSION = 4
def _make_chunk(chunk_id: str, text: str) -> DocumentChunk:
    """Build a test chunk belonging to document ``doc-1``.

    Only the chunk id and the text vary between calls; the document id, the
    ``{"page": 1}`` metadata and the FIXED_SIZE strategy are always the same.
    """
    fields = {
        "chunk_id": chunk_id,
        "document_id": "doc-1",
        "text": text,
        "metadata": {"page": 1},
        "strategy": ChunkStrategy.FIXED_SIZE,
    }
    return DocumentChunk(**fields)
def _fake_embedding(seed: float) -> list[float]:
    """Derive a reproducible four-component embedding from ``seed``.

    The components are ``seed``, its complement ``1.0 - seed``, half of
    ``seed``, and a constant ``0.1``; the same seed always yields the same
    vector.
    """
    complement = 1.0 - seed
    halved = seed * 0.5
    return [seed, complement, halved, 0.1]
@pytest.fixture()
def store(tmp_path: Path) -> VectorStore:
    """Provide a VectorStore that lives under this test's temporary directory.

    Args:
        tmp_path: pytest's built-in per-test temporary directory. It is a
            ``pathlib.Path`` (not a ``str``), which is what allows the ``/``
            join below.

    Returns:
        A store created at ``<tmp_path>/qdrant`` with collection name
        ``test_collection`` and ``DIMENSION``-sized vectors.
    """
    return VectorStore(
        # The store is given the location as a plain string.
        path=str(tmp_path / "qdrant"),
        collection_name="test_collection",
        dimension=DIMENSION,
    )
class TestAddChunks:
    """Insertion behaviour of ``VectorStore.add_chunks``."""

    def test_add_and_search_returns_inserted_chunks(self, store: VectorStore) -> None:
        """Both inserted chunks come back when top_k exceeds the store size."""
        inserted = [_make_chunk("c1", "hello world"), _make_chunk("c2", "foo bar")]
        vectors = [_fake_embedding(0.1), _fake_embedding(0.9)]
        store.add_chunks(inserted, vectors)

        hits = store.search(query_embedding=_fake_embedding(0.1), top_k=10)

        assert len(hits) == 2
        assert {hit.chunk.chunk_id for hit in hits} == {"c1", "c2"}

    def test_add_chunks_length_mismatch_raises(self, store: VectorStore) -> None:
        """One chunk paired with two embeddings is rejected with ValueError."""
        lone_chunk = [_make_chunk("c1", "text")]
        two_vectors = [_fake_embedding(0.1), _fake_embedding(0.2)]

        with pytest.raises(ValueError, match="mismatch"):
            store.add_chunks(lone_chunk, two_vectors)

    def test_add_empty_chunks_is_noop(self, store: VectorStore) -> None:
        """Adding nothing leaves the store with nothing to find."""
        store.add_chunks([], [])

        hits = store.search(query_embedding=_fake_embedding(0.5), top_k=10)

        assert hits == []
class TestSearch:
    """Query behaviour of ``VectorStore.search``."""

    def test_top_k_limits_results(self, store: VectorStore) -> None:
        """With five chunks stored, top_k=3 yields exactly three hits."""
        indices = range(5)
        chunks = [_make_chunk(f"c{i}", f"text {i}") for i in indices]
        vectors = [_fake_embedding(i * 0.1) for i in indices]
        store.add_chunks(chunks, vectors)

        hits = store.search(query_embedding=_fake_embedding(0.0), top_k=3)

        assert len(hits) == 3

    def test_search_empty_collection(self, store: VectorStore) -> None:
        """Searching a store that was never written to returns an empty list."""
        hits = store.search(query_embedding=_fake_embedding(0.5), top_k=5)

        assert hits == []

    def test_results_contain_correct_metadata(self, store: VectorStore) -> None:
        """Every chunk field survives the store round trip; source is "dense"."""
        original = _make_chunk("c1", "some text")
        store.add_chunks([original], [_fake_embedding(0.3)])

        hits = store.search(query_embedding=_fake_embedding(0.3), top_k=1)

        assert len(hits) == 1
        hit = hits[0]
        found = hit.chunk
        assert found.chunk_id == "c1"
        assert found.document_id == "doc-1"
        assert found.text == "some text"
        assert found.metadata == {"page": 1}
        assert found.strategy == ChunkStrategy.FIXED_SIZE
        assert hit.source == "dense"

    def test_results_sorted_by_relevance(self, store: VectorStore) -> None:
        """The first hit never scores lower than the second."""
        chunks = [_make_chunk("c1", "a"), _make_chunk("c2", "b")]
        vectors = [_fake_embedding(0.9), _fake_embedding(0.1)]
        store.add_chunks(chunks, vectors)

        hits = store.search(query_embedding=_fake_embedding(0.9), top_k=2)

        leading, trailing = hits[0], hits[1]
        assert leading.score >= trailing.score
class TestDuplicateInsert:
    """Re-inserting a chunk replaces the stored copy rather than duplicating it."""

    def test_upsert_overwrites_existing_chunk(self, store: VectorStore) -> None:
        """Two single-chunk inserts of id ``c1`` leave only the newer text."""
        versions = [_make_chunk("c1", label) for label in ("version 1", "version 2")]
        for chunk in versions:
            store.add_chunks([chunk], [_fake_embedding(0.5)])

        hits = store.search(query_embedding=_fake_embedding(0.5), top_k=10)

        # Per the note that accompanied this test: upsert uses the enumerate
        # index as the point id, so both single-chunk batches target the same
        # point and the second insert overwrites the first.
        assert len(hits) == 1
        assert hits[0].chunk.text == "version 2"
|