"""
tests/test_search.py — C1.3
Tests for SearchIndex, VectorIndex, HybridSearch, and synonym expansion.
"""

import sys
import os
import pytest

# Make the sibling ``src`` directory importable so ``from search import ...``
# resolves when the tests are run from the repository checkout.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))


# ---------------------------------------------------------------------------
# SearchIndex unit tests
# ---------------------------------------------------------------------------

class TestSearchIndex:
    """Unit tests for ``search.SearchIndex``: add/remove/search and dirty flag."""

    def _make_obs(self, obs_id, title, narrative="", concepts=None, obs_type="other"):
        """Build a minimal observation dict of the shape passed to ``SearchIndex.add``.

        ``concepts`` defaults to a fresh empty list; ``sessionId`` and ``files``
        are fixed because no test in this class varies them.
        """
        return {
            "id": obs_id,
            "sessionId": "sess_test",
            "title": title,
            "narrative": narrative,
            "concepts": concepts or [],
            "files": [],
            "type": obs_type,
        }

    def test_add_and_exact_match_returns_rank_one(self):
        """A query made of title words returns the indexed document first."""
        from search import SearchIndex
        idx = SearchIndex()
        obs = self._make_obs("obs_001", "authentication middleware refactor")
        idx.add(obs)
        results = idx.search("authentication middleware")
        assert len(results) > 0
        assert results[0]["obsId"] == "obs_001"

    def test_prefix_matching(self):
        """A truncated query term ('authen') still finds the document."""
        from search import SearchIndex
        idx = SearchIndex()
        obs = self._make_obs("obs_002", "authentication token validation")
        idx.add(obs)
        results = idx.search("authen")
        assert any(r["obsId"] == "obs_002" for r in results)

    def test_synonym_expansion_db_conn(self):
        """'db conn' should find document indexed with 'database connection'."""
        from search import SearchIndex
        idx = SearchIndex()
        obs = self._make_obs(
            "obs_003",
            "database connection pooling setup",
            "configure database connection pool",
        )
        idx.add(obs)
        results = idx.search("db conn")
        assert any(r["obsId"] == "obs_003" for r in results)

    def test_remove_document(self):
        """A removed document no longer appears in search results."""
        from search import SearchIndex
        idx = SearchIndex()
        obs = self._make_obs("obs_004", "deploy kubernetes service mesh")
        idx.add(obs)
        idx.remove("obs_004")
        results = idx.search("kubernetes")
        assert not any(r["obsId"] == "obs_004" for r in results)

    def test_empty_index_returns_empty(self):
        """Searching an index with no documents yields an empty list."""
        from search import SearchIndex
        idx = SearchIndex()
        results = idx.search("anything")
        assert results == []

    def test_multiple_docs_rank_order(self):
        """With several documents indexed, the matching one is ranked first."""
        from search import SearchIndex
        idx = SearchIndex()
        idx.add(self._make_obs("obs_a", "authentication login system", "user authentication flow"))
        idx.add(self._make_obs("obs_b", "database migration script", "run database migration"))
        idx.add(self._make_obs("obs_c", "deployment pipeline CI", "CI CD pipeline deployment"))
        results = idx.search("authentication")
        # Guard before indexing (as test_add_and_exact_match_returns_rank_one
        # does) so an empty result fails as an assertion, not an IndexError.
        assert len(results) > 0
        assert results[0]["obsId"] == "obs_a"

    def test_size_property(self):
        """``size`` tracks the number of documents across add and remove."""
        from search import SearchIndex
        idx = SearchIndex()
        assert idx.size == 0
        idx.add(self._make_obs("x1", "title one"))
        idx.add(self._make_obs("x2", "title two"))
        assert idx.size == 2
        idx.remove("x1")
        assert idx.size == 1

    def test_clear(self):
        """``clear`` empties the index: size drops to 0 and searches return []."""
        from search import SearchIndex
        idx = SearchIndex()
        idx.add(self._make_obs("x1", "something"))
        idx.clear()
        assert idx.size == 0
        assert idx.search("something") == []

    def test_dirty_flag_set_on_add(self):
        """A fresh index starts with ``_dirty`` False; ``add`` sets it to True."""
        from search import SearchIndex
        idx = SearchIndex()
        assert idx._dirty is False
        idx.add(self._make_obs("x1", "test dirty flag"))
        assert idx._dirty is True

    def test_dirty_flag_set_on_remove(self):
        """``remove`` sets ``_dirty`` to True even after it was cleared."""
        from search import SearchIndex
        idx = SearchIndex()
        idx.add(self._make_obs("x1", "test remove dirty"))
        idx._dirty = False  # reset manually
        idx.remove("x1")
        assert idx._dirty is True

    def test_dirty_flag_reset_after_restore(self):
        """An index populated via ``restore_from_data`` is not dirty."""
        from search import SearchIndex
        idx = SearchIndex()
        idx.add(self._make_obs("x1", "test restore"))
        data = idx.serialize_data()
        idx2 = SearchIndex()
        idx2.restore_from_data(data)
        assert idx2._dirty is False

    def test_has_method(self):
        """``has`` reflects membership before add, after add, and after remove."""
        from search import SearchIndex
        idx = SearchIndex()
        obs = self._make_obs("obs_has", "has method test")
        assert not idx.has("obs_has")
        idx.add(obs)
        assert idx.has("obs_has")
        idx.remove("obs_has")
        assert not idx.has("obs_has")


# ---------------------------------------------------------------------------
# VectorIndex unit tests
# ---------------------------------------------------------------------------

class TestVectorIndex:
    """Dirty-flag tests for ``search.VectorIndex``."""

    def test_dirty_flag_on_add(self):
        """A fresh index starts with ``_dirty`` False; ``add`` sets it to True."""
        from search import VectorIndex
        vi = VectorIndex()
        assert vi._dirty is False
        vi.add("v1", "sess", [0.1, 0.2, 0.3])
        assert vi._dirty is True

    def test_dirty_flag_on_remove(self):
        """``remove`` sets ``_dirty`` to True even after it was cleared."""
        from search import VectorIndex
        vi = VectorIndex()
        vi.add("v1", "sess", [0.1, 0.2, 0.3])
        vi._dirty = False
        vi.remove("v1")
        assert vi._dirty is True

    def test_dirty_flag_reset_after_restore(self):
        """An index populated via ``restore_from_data`` is not dirty."""
        from search import VectorIndex
        vi = VectorIndex()
        vi.add("v1", "sess", [0.1, 0.2, 0.3])
        data = vi.serialize_data()
        vi2 = VectorIndex()
        vi2.restore_from_data(data)
        assert vi2._dirty is False


# ---------------------------------------------------------------------------
# HybridSearch in BM25-only mode
# ---------------------------------------------------------------------------

class TestHybridSearchBM25Only:
    """Tests for ``search.HybridSearch`` constructed without an embedding provider."""

    def _make_obs(self, obs_id, title, narrative=""):
        """Build a minimal observation dict for the hybrid-search tests."""
        return {
            "id": obs_id,
            "sessionId": "sess_hybrid",
            "title": title,
            "narrative": narrative,
            "concepts": [],
            "files": [],
            "type": "other",
        }

    def test_hybrid_bm25_only_returns_same_results_as_search_index(self):
        """Hybrid search and direct ``SearchIndex.search`` agree on the top hit."""
        from search import SearchIndex, VectorIndex, HybridSearch
        bm25 = SearchIndex()
        vector = VectorIndex()
        docs = [
            self._make_obs("h1", "authentication middleware implementation"),
            self._make_obs("h2", "database migration scripts"),
            self._make_obs("h3", "deployment kubernetes configuration"),
        ]
        for d in docs:
            bm25.add(d)

        # HybridSearch with no embedding provider — BM25 only
        # NOTE(review): the meaning of the two trailing None arguments is
        # defined in search.py, which is not visible here — confirm.
        hybrid = HybridSearch(bm25, vector, None, None)

        bm25_direct = bm25.search("authentication", 10)
        hybrid_results = hybrid.search("authentication", 10)

        bm25_ids = [r["obsId"] for r in bm25_direct]
        hybrid_ids = [r["obsId"] for r in hybrid_results]

        # Guard before indexing so a regression that returns nothing fails as
        # a readable assertion rather than an IndexError.
        assert len(bm25_ids) > 0
        assert len(hybrid_ids) > 0
        # The same document should appear at the top in both
        assert bm25_ids[0] == hybrid_ids[0]

    def test_hybrid_returns_empty_for_no_matches(self):
        """A query against an empty index returns an empty list."""
        from search import SearchIndex, VectorIndex, HybridSearch
        bm25 = SearchIndex()
        hybrid = HybridSearch(bm25, VectorIndex(), None, None)
        assert hybrid.search("zzznomatch", 10) == []


# ---------------------------------------------------------------------------
# Serialization round-trip
# ---------------------------------------------------------------------------

class TestSearchIndexSerialization:
    """Round-trip test for ``SearchIndex.serialize_data`` / ``restore_from_data``."""

    def test_roundtrip_preserves_search_results(self):
        """A document indexed before serialization is findable after restore."""
        from search import SearchIndex
        idx = SearchIndex()
        idx.add({
            "id": "rt_001",
            "sessionId": "sess_rt",
            "title": "serialization round trip test",
            "narrative": "verify that index survives serialize/restore",
            "concepts": ["test"],
            "files": [],
            "type": "other",
        })
        data = idx.serialize_data()
        idx2 = SearchIndex()
        idx2.restore_from_data(data)
        results = idx2.search("serialization round trip")
        assert any(r["obsId"] == "rt_001" for r in results)