Dokumentassistent / tests /test_vector_store.py
XQ
Code cleaning
db45c50
raw
history blame
4.1 kB
"""Tests for the Qdrant vector store."""
from pathlib import Path

import pytest

from src.models import ChunkStrategy, DocumentChunk
from src.retrieval.vector_store import VectorStore
# Vector size passed to the test VectorStore; _fake_embedding returns this many values.
DIMENSION = 4
def _make_chunk(chunk_id: str, text: str) -> DocumentChunk:
    """Build a test chunk of document ``doc-1`` with the given id and text.

    Every chunk carries the same metadata (``{"page": 1}``) and the
    ``FIXED_SIZE`` chunking strategy; only ``chunk_id`` and ``text`` vary.
    """
    fields = {
        "chunk_id": chunk_id,
        "document_id": "doc-1",
        "text": text,
        "metadata": {"page": 1},
        "strategy": ChunkStrategy.FIXED_SIZE,
    }
    return DocumentChunk(**fields)
def _fake_embedding(seed: float) -> list[float]:
    """Build a reproducible four-component vector derived from ``seed``.

    The components are ``seed``, ``1.0 - seed``, ``seed * 0.5`` and the
    constant ``0.1``, so equal seeds always yield equal vectors.
    """
    complement = 1.0 - seed
    halved = seed * 0.5
    return [seed, complement, halved, 0.1]
@pytest.fixture()
def store(tmp_path: Path) -> VectorStore:
    """Provide a ``VectorStore`` rooted in a per-test temporary directory.

    Args:
        tmp_path: pytest's built-in temporary-directory fixture. It is a
            ``pathlib.Path`` (not a ``str``), which is what makes the ``/``
            join below valid.

    Returns:
        A store created at ``tmp_path / "qdrant"`` for the collection
        ``"test_collection"``, configured with ``dimension=DIMENSION``.
    """
    return VectorStore(
        # The constructor is handed a plain string path, so convert explicitly.
        path=str(tmp_path / "qdrant"),
        collection_name="test_collection",
        dimension=DIMENSION,
    )
class TestAddChunks:
    """Insertion behaviour of ``VectorStore.add_chunks``."""

    def test_add_and_search_returns_inserted_chunks(self, store: VectorStore) -> None:
        """Both inserted chunks come back from a search wide enough to hold them."""
        first = _make_chunk("c1", "hello world")
        second = _make_chunk("c2", "foo bar")
        vectors = [_fake_embedding(0.1), _fake_embedding(0.9)]
        store.add_chunks([first, second], vectors)

        hits = store.search(query_embedding=_fake_embedding(0.1), top_k=10)

        assert len(hits) == 2
        assert {hit.chunk.chunk_id for hit in hits} == {"c1", "c2"}

    def test_add_chunks_length_mismatch_raises(self, store: VectorStore) -> None:
        """One chunk paired with two embeddings is rejected with ``ValueError``."""
        lone_chunk = [_make_chunk("c1", "text")]
        two_vectors = [_fake_embedding(seed) for seed in (0.1, 0.2)]

        with pytest.raises(ValueError, match="mismatch"):
            store.add_chunks(lone_chunk, two_vectors)

    def test_add_empty_chunks_is_noop(self, store: VectorStore) -> None:
        """Adding nothing leaves a subsequent search with no results."""
        no_chunks, no_vectors = [], []
        store.add_chunks(no_chunks, no_vectors)

        hits = store.search(query_embedding=_fake_embedding(0.5), top_k=10)

        assert hits == []
class TestSearch:
    """Query behaviour of ``VectorStore.search``."""

    def test_top_k_limits_results(self, store: VectorStore) -> None:
        """With five chunks stored, ``top_k=3`` yields exactly three results."""
        indices = range(5)
        chunks = [_make_chunk(f"c{i}", f"text {i}") for i in indices]
        vectors = [_fake_embedding(i * 0.1) for i in indices]
        store.add_chunks(chunks, vectors)

        hits = store.search(query_embedding=_fake_embedding(0.0), top_k=3)

        assert len(hits) == 3

    def test_search_empty_collection(self, store: VectorStore) -> None:
        """Searching before anything is inserted returns an empty list."""
        hits = store.search(query_embedding=_fake_embedding(0.5), top_k=5)

        assert hits == []

    def test_results_contain_correct_metadata(self, store: VectorStore) -> None:
        """A returned result carries every chunk field plus the ``dense`` source tag."""
        vector = _fake_embedding(0.3)
        store.add_chunks([_make_chunk("c1", "some text")], [vector])

        hits = store.search(query_embedding=_fake_embedding(0.3), top_k=1)

        assert len(hits) == 1
        hit = hits[0]
        assert hit.chunk.chunk_id == "c1"
        assert hit.chunk.document_id == "doc-1"
        assert hit.chunk.text == "some text"
        assert hit.chunk.metadata == {"page": 1}
        assert hit.chunk.strategy == ChunkStrategy.FIXED_SIZE
        assert hit.source == "dense"

    def test_results_sorted_by_relevance(self, store: VectorStore) -> None:
        """The first result's score is at least as high as the second's."""
        chunks = [_make_chunk("c1", "a"), _make_chunk("c2", "b")]
        vectors = [_fake_embedding(0.9), _fake_embedding(0.1)]
        store.add_chunks(chunks, vectors)

        hits = store.search(query_embedding=_fake_embedding(0.9), top_k=2)

        best, runner_up = hits[0], hits[1]
        assert best.score >= runner_up.score
class TestDuplicateInsert:
    """Behaviour when the same chunk id is inserted a second time."""

    def test_upsert_overwrites_existing_chunk(self, store: VectorStore) -> None:
        """Re-adding chunk ``c1`` leaves one stored result holding the newer text."""
        vector_seed = 0.5
        for body in ("version 1", "version 2"):
            store.add_chunks([_make_chunk("c1", body)], [_fake_embedding(vector_seed)])

        hits = store.search(query_embedding=_fake_embedding(vector_seed), top_k=10)

        # NOTE(review): the original author's note says the store uses the
        # enumerate index within a batch as the point id, so the second
        # single-item batch replaces the first. Confirm against
        # VectorStore.add_chunks, which is not visible from this file.
        assert len(hits) == 1
        assert hits[0].chunk.text == "version 2"