"""Tests for the Qdrant vector store.""" import pytest from src.models import ChunkStrategy, DocumentChunk from src.retrieval.vector_store import VectorStore DIMENSION = 4 def _make_chunk(chunk_id: str, text: str) -> DocumentChunk: return DocumentChunk( chunk_id=chunk_id, document_id="doc-1", text=text, metadata={"page": 1}, strategy=ChunkStrategy.FIXED_SIZE, ) def _fake_embedding(seed: float) -> list[float]: """Return a deterministic fake embedding of length DIMENSION.""" return [seed, 1.0 - seed, seed * 0.5, 0.1] @pytest.fixture() def store(tmp_path: str) -> VectorStore: return VectorStore( path=str(tmp_path / "qdrant"), collection_name="test_collection", dimension=DIMENSION, ) class TestAddChunks: """Tests for inserting documents into the store.""" def test_add_and_search_returns_inserted_chunks(self, store: VectorStore) -> None: chunks = [_make_chunk("c1", "hello world"), _make_chunk("c2", "foo bar")] embeddings = [_fake_embedding(0.1), _fake_embedding(0.9)] store.add_chunks(chunks, embeddings) results = store.search(query_embedding=_fake_embedding(0.1), top_k=10) assert len(results) == 2 returned_ids = {r.chunk.chunk_id for r in results} assert returned_ids == {"c1", "c2"} def test_add_chunks_length_mismatch_raises(self, store: VectorStore) -> None: chunks = [_make_chunk("c1", "text")] embeddings = [_fake_embedding(0.1), _fake_embedding(0.2)] with pytest.raises(ValueError, match="mismatch"): store.add_chunks(chunks, embeddings) def test_add_empty_chunks_is_noop(self, store: VectorStore) -> None: store.add_chunks([], []) results = store.search(query_embedding=_fake_embedding(0.5), top_k=10) assert results == [] class TestSearch: """Tests for querying the vector store.""" def test_top_k_limits_results(self, store: VectorStore) -> None: chunks = [_make_chunk(f"c{i}", f"text {i}") for i in range(5)] embeddings = [_fake_embedding(i * 0.1) for i in range(5)] store.add_chunks(chunks, embeddings) results = store.search(query_embedding=_fake_embedding(0.0), top_k=3) assert len(results) == 3 def test_search_empty_collection(self, store: VectorStore) -> None: results = store.search(query_embedding=_fake_embedding(0.5), top_k=5) assert results == [] def test_results_contain_correct_metadata(self, store: VectorStore) -> None: chunk = _make_chunk("c1", "some text") store.add_chunks([chunk], [_fake_embedding(0.3)]) results = store.search(query_embedding=_fake_embedding(0.3), top_k=1) assert len(results) == 1 r = results[0] assert r.chunk.chunk_id == "c1" assert r.chunk.document_id == "doc-1" assert r.chunk.text == "some text" assert r.chunk.metadata == {"page": 1} assert r.chunk.strategy == ChunkStrategy.FIXED_SIZE assert r.source == "dense" def test_results_sorted_by_relevance(self, store: VectorStore) -> None: chunks = [_make_chunk("c1", "a"), _make_chunk("c2", "b")] embeddings = [_fake_embedding(0.9), _fake_embedding(0.1)] store.add_chunks(chunks, embeddings) results = store.search(query_embedding=_fake_embedding(0.9), top_k=2) assert results[0].score >= results[1].score class TestDuplicateInsert: """Tests for upserting (re-inserting) documents.""" def test_upsert_overwrites_existing_chunk(self, store: VectorStore) -> None: chunk_v1 = _make_chunk("c1", "version 1") chunk_v2 = _make_chunk("c1", "version 2") store.add_chunks([chunk_v1], [_fake_embedding(0.5)]) store.add_chunks([chunk_v2], [_fake_embedding(0.5)]) results = store.search(query_embedding=_fake_embedding(0.5), top_k=10) # Upsert uses enumerate index as point id, so same index → overwrite assert len(results) == 1 assert results[0].chunk.text == "version 2"