Spaces:
Running
Running
"""
tests/test_phase2.py
====================
Phase 2 — Hybrid Retrieval Engine Tests

Tests:
  - BM25Retriever: loading, search, empty index, tokenization
  - VectorRetriever: embedding, search, empty collection
  - HybridRetriever: RRF merge mathematics, cross-encoder reranking,
                     diversity filter, query expansion
  - ContextBuilder: output format, citation map, history integration

Run with: pytest tests/test_phase2.py -v
"""
| from __future__ import annotations | |
| import pickle | |
| import uuid | |
| from pathlib import Path | |
| import numpy as np | |
| import pytest | |
| from voicevault.models import Citation, RetrievalResult | |
| # ------------------------------------------------------------------ # | |
| # Fixtures # | |
| # ------------------------------------------------------------------ # | |
@pytest.fixture
def populated_kb(tmp_path: Path, tmp_db: Path):
    """
    Build a small knowledge base with 5 indexed chunks covering distinct topics.

    The same five texts are written to a pickled BM25 index, added to a
    ChromaStore together with their sentence embeddings, and registered
    through the sqlite_store helpers.

    Args:
        tmp_path: Per-test temporary directory (pytest built-in fixture).
        tmp_db: Database path passed to the sqlite_store helpers. It is not
            defined in this file -- presumably a conftest fixture; confirm.

    Returns:
        A 6-tuple ``(kb_name, bm25_pkl_path, chroma_persist_dir, db_path,
        chunk_ids, texts)`` where ``chunk_ids[i]`` is the id of ``texts[i]``.
    """
    from rank_bm25 import BM25Okapi
    from sentence_transformers import SentenceTransformer
    from voicevault.models import DocumentChunk
    from voicevault.storage.chroma_store import ChromaStore
    from voicevault.storage.sqlite_store import create_kb, register_document, register_chunk

    kb_name = "test-retrieval-kb"
    bm25_path = tmp_path / "bm25.pkl"
    chroma_dir = tmp_path / "chroma"

    # 5 sample chunks
    texts = [
        "Machine learning is a subset of artificial intelligence that enables systems to learn from data.",
        "Deep learning uses neural networks with many layers to model complex patterns.",
        "Natural language processing helps computers understand and generate human language.",
        "Reinforcement learning trains agents to make decisions by rewarding desired behaviors.",
        "Computer vision allows machines to interpret and understand images and videos.",
    ]
    chunk_ids = [str(uuid.uuid4()) for _ in texts]

    # Build and save BM25 index
    tokenized = [t.lower().split() for t in texts]
    bm25 = BM25Okapi(tokenized)
    with open(bm25_path, "wb") as f:
        pickle.dump({"corpus": tokenized, "chunk_ids": chunk_ids, "bm25": bm25}, f)

    # Build ChromaDB collection. __new__ bypasses ChromaStore.__init__, so the
    # private attributes are assigned by hand to point at the temp directory.
    store = ChromaStore.__new__(ChromaStore)
    store._kb_name = kb_name
    store._persist_dir = chroma_dir
    store._client = None
    store._collection = None

    embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    embeddings = embedder.encode(texts).tolist()
    chunks = [
        DocumentChunk(
            chunk_id=chunk_id,
            kb_name=kb_name,
            source_file="ml_overview.pdf",
            page_number=index + 1,
            section="Overview",
            chunk_index=index,
            text=text,
            text_hash=f"hash_{index}",
            token_count=len(text.split()),
        )
        for index, (chunk_id, text) in enumerate(zip(chunk_ids, texts))
    ]
    store.add_chunks(chunks, embeddings)

    # Register in SQLite
    create_kb(tmp_db, kb_name, "Test Retrieval KB")
    register_document(tmp_db, "doc-001", kb_name, "ml_overview.pdf", "file_hash_001",
                      page_count=5, chunk_count=5)
    for index, (chunk_id, text) in enumerate(zip(chunk_ids, texts)):
        register_chunk(tmp_db, chunk_id, kb_name, "doc-001",
                       "ml_overview.pdf", index + 1, "Overview", index, f"hash_{index}",
                       len(text.split()))

    return kb_name, bm25_path, chroma_dir, tmp_db, chunk_ids, texts
| # ------------------------------------------------------------------ # | |
| # BM25Retriever Tests # | |
| # ------------------------------------------------------------------ # | |
class TestBM25Retriever:
    """Keyword-search tests for voicevault.retrieval.bm25_retriever.BM25Retriever."""

    @staticmethod
    def _fresh_retriever(kb_name, index_path):
        """Return a not-yet-loaded BM25Retriever aimed at *index_path*.

        The instance is created with ``__new__`` so ``__init__`` never runs;
        the private attributes are then assigned directly.
        """
        from voicevault.retrieval.bm25_retriever import BM25Retriever

        instance = BM25Retriever.__new__(BM25Retriever)
        instance._kb_name = kb_name
        instance._bm25_path = index_path
        instance._bm25 = None
        instance._chunk_ids = []
        instance._corpus = []
        instance._loaded = False
        return instance

    def test_search_returns_results(self, populated_kb, tmp_path: Path) -> None:
        """A matching query yields at least one hit carrying an id and a BM25 score."""
        kb_name, bm25_path, _, _, _, _ = populated_kb
        searcher = self._fresh_retriever(kb_name, bm25_path)

        hits = searcher.search("machine learning", top_k=3)

        assert len(hits) >= 1
        assert all("chunk_id" in hit for hit in hits)
        assert all("bm25_score" in hit for hit in hits)

    def test_search_ranks_relevant_first(self, populated_kb) -> None:
        """The deep-learning chunk lands in the top two for a deep-learning query."""
        kb_name, bm25_path, _, _, chunk_ids, _ = populated_kb
        searcher = self._fresh_retriever(kb_name, bm25_path)

        hits = searcher.search("neural networks deep learning", top_k=5)

        # The deep learning chunk should rank high
        leading_ids = [hit["chunk_id"] for hit in hits[:2]]
        assert chunk_ids[1] in leading_ids  # "Deep learning uses neural networks..."

    def test_search_empty_index_returns_empty(self, tmp_path: Path) -> None:
        """Searching with a missing index file returns an empty list."""
        searcher = self._fresh_retriever("empty-kb", tmp_path / "nonexistent_bm25.pkl")

        assert searcher.search("anything") == []

    def test_scores_are_sorted_descending(self, populated_kb) -> None:
        """Hits come back ordered from highest to lowest BM25 score."""
        kb_name, bm25_path, _, _, _, _ = populated_kb
        searcher = self._fresh_retriever(kb_name, bm25_path)

        hits = searcher.search("learning data intelligence systems", top_k=5)

        observed = [hit["bm25_score"] for hit in hits]
        assert observed == sorted(observed, reverse=True)

    def test_top_k_limits_results(self, populated_kb) -> None:
        """No more than ``top_k`` hits are returned."""
        kb_name, bm25_path, _, _, _, _ = populated_kb
        searcher = self._fresh_retriever(kb_name, bm25_path)

        hits = searcher.search("learning", top_k=2)

        assert len(hits) <= 2

    def test_tokenize_lowercases(self) -> None:
        """``_tokenize`` splits on whitespace and lowercases each token."""
        from voicevault.retrieval.bm25_retriever import BM25Retriever

        assert BM25Retriever._tokenize("Machine Learning Is FUN") == [
            "machine", "learning", "is", "fun",
        ]

    def test_is_ready_false_when_no_index(self, tmp_path: Path) -> None:
        """``is_ready`` reports False when the index file does not exist."""
        searcher = self._fresh_retriever("noindex", tmp_path / "no_bm25.pkl")

        assert searcher.is_ready() is False

    def test_is_ready_true_when_loaded(self, populated_kb) -> None:
        """``is_ready`` reports True when a saved index is available."""
        kb_name, bm25_path, _, _, _, _ = populated_kb
        searcher = self._fresh_retriever(kb_name, bm25_path)

        assert searcher.is_ready() is True
| # ------------------------------------------------------------------ # | |
| # VectorRetriever Tests # | |
| # ------------------------------------------------------------------ # | |
class TestVectorRetriever:
    """Embedding-search tests for voicevault.retrieval.vector_retriever.VectorRetriever."""

    @staticmethod
    def _fresh_retriever(kb_name, persist_dir):
        """Return a VectorRetriever backed by a ChromaStore at *persist_dir*.

        Both objects are created with ``__new__`` so neither ``__init__``
        runs; their private attributes are assigned directly.
        """
        from voicevault.retrieval.vector_retriever import VectorRetriever
        from voicevault.storage.chroma_store import ChromaStore

        instance = VectorRetriever.__new__(VectorRetriever)
        instance._kb_name = kb_name
        instance._embedder = None
        instance._chroma = ChromaStore.__new__(ChromaStore)
        instance._chroma._kb_name = kb_name
        instance._chroma._persist_dir = persist_dir
        instance._chroma._client = None
        instance._chroma._collection = None
        return instance

    def test_search_returns_results(self, populated_kb) -> None:
        """A query against the populated collection yields at least one hit."""
        kb_name, _, chroma_dir, _, _, _ = populated_kb
        searcher = self._fresh_retriever(kb_name, chroma_dir)

        hits = searcher.search("what is machine learning", top_k=5)

        assert len(hits) >= 1

    def test_search_returns_vector_scores(self, populated_kb) -> None:
        """Every hit carries a ``vector_score`` inside the closed range [0, 1]."""
        kb_name, _, chroma_dir, _, _, _ = populated_kb
        searcher = self._fresh_retriever(kb_name, chroma_dir)

        hits = searcher.search("neural network deep learning", top_k=5)

        assert all("vector_score" in hit for hit in hits)
        assert all(0.0 <= hit["vector_score"] <= 1.0 for hit in hits)

    def test_embed_query_returns_384_dim(self, populated_kb) -> None:
        """``embed_query`` returns a plain list with 384 entries."""
        kb_name, _, chroma_dir, _, _, _ = populated_kb
        searcher = self._fresh_retriever(kb_name, chroma_dir)

        vector = searcher.embed_query("machine learning")

        assert isinstance(vector, list)
        assert len(vector) == 384

    def test_search_empty_collection_returns_empty(self, tmp_path: Path) -> None:
        """Searching a store that was never populated returns an empty list."""
        searcher = self._fresh_retriever("empty-vec-kb", tmp_path / "empty-chroma")

        assert searcher.search("anything") == []
| # ------------------------------------------------------------------ # | |
| # RRF Mathematics Tests # | |
| # ------------------------------------------------------------------ # | |
class TestRRFMerge:
    """Check ``HybridRetriever._rrf_merge`` against hand-computed inputs."""

    def _make_retriever(self) -> object:
        """Return a HybridRetriever built without ``__init__``, with rrf_k=60 and no reranker."""
        from voicevault.retrieval.hybrid_retriever import HybridRetriever

        hybrid = HybridRetriever.__new__(HybridRetriever)
        # Assigned in the same order as a hand-written attribute list would be.
        for attribute, value in (
            ("_rrf_k", 60),
            ("_final_top_k", 5),
            ("_rerank_top_k", 20),
            ("_use_reranker", False),
            ("_kb_names", []),
            ("_bm25_retrievers", {}),
            ("_vector_retrievers", {}),
            ("_cross_encoder", None),
        ):
            setattr(hybrid, attribute, value)
        return hybrid

    def test_rrf_chunk_in_both_lists_gets_higher_score(self) -> None:
        """A chunk appearing in both BM25 and vector results must score higher than one appearing in only one."""
        keyword_hits = {
            "chunk-A": {"chunk_id": "chunk-A", "bm25_score": 5.0, "rank": 1},
            "chunk-B": {"chunk_id": "chunk-B", "bm25_score": 3.0, "rank": 2},
        }
        embedding_hits = {
            "chunk-A": {"chunk_id": "chunk-A", "vector_score": 0.9, "rank": 1},
            "chunk-C": {"chunk_id": "chunk-C", "vector_score": 0.8, "rank": 2},
        }

        fused = self._make_retriever()._rrf_merge(keyword_hits, embedding_hits)

        assert fused["chunk-A"] > fused["chunk-B"]
        assert fused["chunk-A"] > fused["chunk-C"]

    def test_rrf_score_formula(self) -> None:
        """Verify RRF score = 1/(60+1) + 1/(60+1) = 2/61 for rank-1 in both lists."""
        keyword_hits = {"chunk-X": {"chunk_id": "chunk-X", "bm25_score": 10.0, "rank": 1}}
        embedding_hits = {"chunk-X": {"chunk_id": "chunk-X", "vector_score": 0.99, "rank": 1}}

        fused = self._make_retriever()._rrf_merge(keyword_hits, embedding_hits)

        rank_one_term = 1.0 / (60 + 1)
        expected = rank_one_term + rank_one_term
        assert abs(fused["chunk-X"] - expected) < 1e-9

    def test_rrf_higher_rank_gets_lower_score(self) -> None:
        """Rank 1 must score higher than rank 5 in RRF."""
        keyword_hits = {
            "rank1": {"chunk_id": "rank1", "bm25_score": 10.0, "rank": 1},
            "rank5": {"chunk_id": "rank5", "bm25_score": 6.0, "rank": 5},
        }

        fused = self._make_retriever()._rrf_merge(keyword_hits, {})

        assert fused["rank1"] > fused["rank5"]

    def test_rrf_empty_inputs(self) -> None:
        """Merging two empty result maps produces an empty score map."""
        fused = self._make_retriever()._rrf_merge({}, {})

        assert fused == {}

    def test_rrf_single_method_only(self) -> None:
        """RRF should work with results from only one method."""
        keyword_hits = {"chunk-Z": {"chunk_id": "chunk-Z", "bm25_score": 3.0, "rank": 1}}

        fused = self._make_retriever()._rrf_merge(keyword_hits, {})

        assert "chunk-Z" in fused
        assert fused["chunk-Z"] == pytest.approx(1.0 / (60 + 1))
| # ------------------------------------------------------------------ # | |
| # Diversity Filter Tests # | |
| # ------------------------------------------------------------------ # | |
class TestDiversityFilter:
    """Test the diversity filter logic (``HybridRetriever._diversity_filter``)."""

    def _make_retriever(self) -> object:
        """Return a HybridRetriever built without ``__init__``.

        ``__new__`` is used so the constructor never runs; the private
        attributes are then assigned directly, with the reranker disabled
        and no per-KB retrievers attached.
        """
        from voicevault.retrieval.hybrid_retriever import HybridRetriever

        r = HybridRetriever.__new__(HybridRetriever)
        r._rrf_k = 60
        r._final_top_k = 5
        r._rerank_top_k = 20
        r._use_reranker = False
        r._kb_names = []
        r._bm25_retrievers = {}
        r._vector_retrievers = {}
        r._cross_encoder = None
        return r

    def _make_result(self, chunk_id: str, source: str, page: int, score: float = 0.5) -> RetrievalResult:
        """Return a RetrievalResult with placeholder text for the given source/page.

        Args:
            chunk_id: Identifier stored on the result.
            source: Value for ``source_file``.
            page: Value for ``page_number``.
            score: Value for ``rerank_score``.
        """
        return RetrievalResult(
            chunk_id=chunk_id, text="text", source_file=source,
            page_number=page, rerank_score=score,
        )

    def test_allows_max_chunks_per_page(self) -> None:
        """At most ``cfg.max_chunks_per_page`` results from one page survive the filter."""
        from config import cfg

        retriever = self._make_retriever()
        limit = cfg.max_chunks_per_page  # Should be 2
        # Two more same-page results than the limit permits.
        results = [self._make_result(f"c{i}", "doc.pdf", 1) for i in range(limit + 2)]

        filtered = retriever._diversity_filter(results)

        from_page_1 = [r for r in filtered if r.source_file == "doc.pdf" and r.page_number == 1]
        assert len(from_page_1) <= limit

    def test_different_pages_all_pass(self) -> None:
        """Results from three different pages of one file are all kept."""
        retriever = self._make_retriever()
        results = [
            self._make_result("c1", "doc.pdf", 1),
            self._make_result("c2", "doc.pdf", 2),
            self._make_result("c3", "doc.pdf", 3),
        ]

        filtered = retriever._diversity_filter(results)

        assert len(filtered) == 3

    def test_different_sources_all_pass(self) -> None:
        """Results sharing a page number but from different files are all kept."""
        retriever = self._make_retriever()
        results = [
            self._make_result("c1", "a.pdf", 1),
            self._make_result("c2", "b.pdf", 1),
            self._make_result("c3", "c.pdf", 1),
        ]

        filtered = retriever._diversity_filter(results)

        assert len(filtered) == 3
| # ------------------------------------------------------------------ # | |
| # Query Expansion Tests # | |
| # ------------------------------------------------------------------ # | |
class TestQueryExpansion:
    """Check the query variants produced by ``HybridRetriever._expand_query``."""

    def _make_retriever(self):
        """Return a HybridRetriever built without ``__init__`` and with no KBs attached."""
        from voicevault.retrieval.hybrid_retriever import HybridRetriever

        hybrid = HybridRetriever.__new__(HybridRetriever)
        hybrid._kb_names = []
        return hybrid

    def test_expand_includes_original(self) -> None:
        """The untouched query is one of the returned variants."""
        expanded = self._make_retriever()._expand_query("what is machine learning?")

        assert "what is machine learning?" in expanded

    def test_expand_question_to_declarative(self) -> None:
        """At least one variant contains the bare topic phrase."""
        expanded = self._make_retriever()._expand_query("what is machine learning")

        containing_topic = [variant for variant in expanded if "machine learning" in variant]
        assert containing_topic

    def test_expand_returns_at_most_3(self) -> None:
        """Expansion never returns more than three variants."""
        expanded = self._make_retriever()._expand_query("how does transformer work")

        assert len(expanded) <= 3

    def test_expand_empty_query(self) -> None:
        """An empty query comes back as the first variant, unchanged."""
        expanded = self._make_retriever()._expand_query("")

        assert expanded[0] == ""
| # ------------------------------------------------------------------ # | |
| # ContextBuilder Tests # | |
| # ------------------------------------------------------------------ # | |
class TestContextBuilder:
    """Tests for voicevault.retrieval.context_builder.ContextBuilder."""

    def _make_results(self, n: int = 3) -> list[RetrievalResult]:
        """Return *n* RetrievalResults, each with its own file, page and section.

        Result ``i`` uses ``doc{i}.pdf``, page ``i + 1`` and ``Section {i}``;
        both scores decrease as ``i`` grows.
        """
        return [
            RetrievalResult(
                chunk_id=f"chunk-{i}",
                text=f"Sample text for chunk {i}. It contains relevant information.",
                source_file=f"doc{i}.pdf",
                page_number=i + 1,
                section=f"Section {i}",
                rrf_score=0.05 - i * 0.01,
                rerank_score=0.9 - i * 0.1,
            )
            for i in range(n)
        ]

    def test_build_returns_non_empty_context(self) -> None:
        """Building from three results gives a non-empty context and three citations."""
        from voicevault.retrieval.context_builder import ContextBuilder

        builder = ContextBuilder()
        context, citations = builder.build(self._make_results(3))
        assert context
        assert len(citations) == 3

    def test_build_empty_results(self) -> None:
        """Building from no results gives an empty string and an empty citation list."""
        from voicevault.retrieval.context_builder import ContextBuilder

        builder = ContextBuilder()
        context, citations = builder.build([])
        assert context == ""
        assert citations == []

    def test_context_contains_source_headers(self) -> None:
        """The context text names each source file and its page as ``p.N``."""
        from voicevault.retrieval.context_builder import ContextBuilder

        builder = ContextBuilder()
        results = self._make_results(2)
        context, _ = builder.build(results)
        assert "doc0.pdf" in context
        assert "doc1.pdf" in context
        assert "p.1" in context
        assert "p.2" in context

    def test_context_contains_section_when_present(self) -> None:
        """A result's section name shows up in the context text."""
        from voicevault.retrieval.context_builder import ContextBuilder

        builder = ContextBuilder()
        results = self._make_results(1)
        context, _ = builder.build(results)
        assert "Section 0" in context

    def test_citation_map_matches_results(self) -> None:
        """Citations line up one-to-one, in order, with the results they came from."""
        from voicevault.retrieval.context_builder import ContextBuilder

        builder = ContextBuilder()
        results = self._make_results(3)
        _, citations = builder.build(results)
        # zip() stops at the shorter sequence, so without this check a builder
        # that dropped citations would still pass the pairwise comparison.
        assert len(citations) == len(results)
        for result, citation in zip(results, citations):
            assert citation.source_file == result.source_file
            assert citation.page_number == result.page_number

    def test_context_includes_conversation_history(self) -> None:
        """Supplied history turns and a history heading appear in the context."""
        from voicevault.retrieval.context_builder import ContextBuilder

        builder = ContextBuilder()
        history = [("What is AI?", "AI is artificial intelligence.")]
        context, _ = builder.build(self._make_results(1), history=history)
        assert "What is AI?" in context
        assert "Conversation History" in context

    def test_history_limited_to_max_turns(self) -> None:
        """With ``max_history_turns=3`` the newest turn is kept and the oldest dropped."""
        from voicevault.retrieval.context_builder import ContextBuilder

        builder = ContextBuilder()
        history = [(f"Q{i}", f"A{i}") for i in range(10)]
        context, _ = builder.build(self._make_results(1), history=history, max_history_turns=3)
        # Only last 3 turns should appear
        assert "Q9" in context
        assert "Q0" not in context

    def test_citation_instructions_returned(self) -> None:
        """The citation instructions mention the source tag and the not-found fallback."""
        from voicevault.retrieval.context_builder import ContextBuilder

        instructions = ContextBuilder.format_citation_instructions()
        assert "[Source:" in instructions
        assert "I could not find this in your documents" in instructions

    def test_citation_excerpts_truncated(self) -> None:
        """A very long chunk produces a citation excerpt of at most 204 characters."""
        from voicevault.retrieval.context_builder import ContextBuilder

        builder = ContextBuilder()
        long_result = RetrievalResult(
            chunk_id="long-chunk",
            text="word " * 500,  # 500 words >> 200 char excerpt limit
            source_file="long.pdf",
            page_number=1,
            rerank_score=0.9,
        )
        _, citations = builder.build([long_result])
        assert len(citations[0].excerpt) <= 204  # 200 + "..."