"""Unit tests for ``app.vector_store``.

Covers embedding validation, the Supabase dense / full-text search calls,
result merging, parent-window expansion, context formatting, reciprocal rank
fusion, hybrid search fallback, and chunk selection.
"""

import asyncio
import sys
import types
from pathlib import Path

# Put the directory above this file's folder on sys.path so ``app`` imports.
sys.path.insert(0, str(Path(__file__).parent.parent))

import pytest

import app.vector_store as vector_store


# =============================================================================
# Shared fakes and monkeypatch helpers
# =============================================================================


class FakeResponse:
    """Minimal stand-in for an HTTP response: status code, text, JSON payload."""

    def __init__(self, status_code=200, payload=None, text="ok"):
        self.status_code = status_code
        self.text = text
        # A falsy payload (None or empty) is normalized to an empty list.
        self._payload = payload if payload else []

    def json(self):
        """Return the payload supplied at construction time."""
        return self._payload


class FakeAsyncClient:
    """Async-context-manager client that records each ``post`` call."""

    def __init__(self, *args, **kwargs):
        self.calls = []

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc, tb):
        # Returning False lets exceptions raised in the ``async with`` body propagate.
        return False

    async def post(self, url, headers=None, json=None):
        """Record the request and answer with a single canned knowledge row."""
        self.calls.append({"url": url, "headers": headers, "json": json})
        canned_row = {
            "source": "Doc",
            "filename": "doc.pdf",
            "page_number": 2,
            "content": "Expanded horticultural context",
            "similarity": 0.91,
        }
        return FakeResponse(payload=[canned_row])


def _fake_supabase(monkeypatch, make_client):
    """Mark the store as configured and route ``httpx.AsyncClient`` to a fake.

    ``make_client`` is a zero-argument callable; it is invoked every time the
    code under test constructs an ``AsyncClient``.
    """

    def client_factory(timeout=10.0):
        return make_client()

    monkeypatch.setattr(vector_store, "is_configured", lambda: True)
    monkeypatch.setattr(vector_store, "SUPABASE_URL", "https://example.supabase.co")
    monkeypatch.setattr(vector_store.httpx, "AsyncClient", client_factory)


def _passthrough_expansion(chunks):
    """Pair every chunk with ``None`` in place of a parent window."""
    return [(chunk, None) for chunk in chunks]


def _stub_expansion(monkeypatch):
    """Replace ``expand_knowledge_results`` with the no-window passthrough."""
    monkeypatch.setattr(vector_store, "expand_knowledge_results", _passthrough_expansion)


def _empty_corpus(monkeypatch):
    """Patch the module-level corpus and its lookup table to be empty."""
    for attr_name, empty_value in (("_corpus", []), ("_corpus_lookup", {})):
        monkeypatch.setattr(vector_store, attr_name, empty_value)


# =============================================================================
# Thresholds, embedding, and dense search
# =============================================================================


def test_verification_thresholds_calibrated_for_bge_m3():
    """Pin the verification thresholds to the expected constants."""
    # User-preferred conservative thresholds for BGE-M3 embedding space
    expected_thresholds = (
        ("VERIFIED_DENSE_THRESHOLD", 0.70),
        ("VERIFIED_HYBRID_THRESHOLD", 0.65),
    )
    for constant_name, expected in expected_thresholds:
        assert getattr(vector_store, constant_name) == expected


def test_embed_text_rejects_non_1024_vector(monkeypatch):
    """``embed_text`` raises ``ValueError`` when the model yields a 2-value vector."""

    class TinyEmbedding:
        def __init__(self, values):
            self._values = values

        def tolist(self):
            return self._values

    class TinyModel:
        def query_embed(self, texts):
            yield TinyEmbedding([0.1, 0.2])

    monkeypatch.setattr(vector_store, "_get_embedding_model", lambda: TinyModel())

    with pytest.raises(ValueError, match="expected 1024"):
        vector_store.embed_text("lettuce tipburn")


def test_search_knowledge_logs_hyde_query_label_and_embeds_transformed_query(monkeypatch, capsys):
    """The given query text is what gets embedded, and the label shows up in stdout."""
    hyde_text = "Expanded agronomic explanation for lettuce humidity"
    embedded_texts = []
    client = FakeAsyncClient()

    def recording_embed(text):
        embedded_texts.append(text)
        return [0.1, 0.2]

    _fake_supabase(monkeypatch, lambda: client)
    monkeypatch.setattr(vector_store, "embed_text", recording_embed)

    chunks = asyncio.run(
        vector_store.search_knowledge(query=hyde_text, query_label="hyde")
    )
    stdout = capsys.readouterr().out

    first_hit = chunks[0]
    assert first_hit["source"] == "Doc"
    assert "filename" in first_hit, "Supabase response must include 'filename' for parent expansion"
    assert embedded_texts[-1] == hyde_text
    assert client.calls[0]["json"]["match_count"] == vector_store.DEFAULT_MATCH_COUNT
    assert "[VectorRAG:hyde]" in stdout


def test_search_knowledge_defaults_to_raw_query_label(monkeypatch, capsys):
    """Without an explicit label, the logged tag is ``raw``."""
    client = FakeAsyncClient()
    _fake_supabase(monkeypatch, lambda: client)
    monkeypatch.setattr(vector_store, "embed_text", lambda text: [0.1, 0.2])

    asyncio.run(vector_store.search_knowledge(query="plain query"))

    stdout = capsys.readouterr().out
    assert "[VectorRAG:raw]" in stdout


# =============================================================================
# merge_knowledge_results tests
# =============================================================================


def test_merge_knowledge_results_deduplicates_by_filename_page_content_keeps_higher_similarity():
    """Identical filename/page/content collapses to one entry with the higher similarity."""
    primary = [
        {
            "filename": "cornell.pdf",
            "source": "Cornell-Lettuce",
            "page_number": 5,
            "content": "tipburn info",
            "similarity": 0.75,
        },
        {
            "filename": "kubis.pdf",
            "source": "Kubis-Guide",
            "page_number": 2,
            "content": "cabbage info",
            "similarity": 0.82,
        },
    ]
    english = [
        {
            "filename": "cornell.pdf",
            "source": "Cornell-Lettuce",
            "page_number": 5,
            "content": "tipburn info",
            "similarity": 0.85,
        },
        {
            "filename": "new.pdf",
            "source": "New-Source",
            "page_number": 1,
            "content": "new content",
            "similarity": 0.90,
        },
    ]

    merged = vector_store.merge_knowledge_results([primary, english])

    by_source = {}
    for item in merged:
        # Keep the first occurrence per source, mirroring a first-match lookup.
        by_source.setdefault(item["source"], item)

    assert "Cornell-Lettuce" in by_source  # deduped
    assert "Kubis-Guide" in by_source
    assert "New-Source" in by_source
    assert len(merged) == 3  # no duplicates
    assert by_source["Cornell-Lettuce"]["similarity"] == 0.85  # higher similarity kept


def test_merge_knowledge_results_does_not_dedup_different_content_same_page():
    """Two chunks on the same page with different content both survive."""
    same_page = [
        {"filename": "doc.pdf", "page_number": 3, "content": "first paragraph", "similarity": 0.80},
        {"filename": "doc.pdf", "page_number": 3, "content": "second paragraph", "similarity": 0.79},
    ]

    merged = vector_store.merge_knowledge_results([same_page])

    assert len(merged) == 2  # different content → not deduped


def test_merge_knowledge_results_respects_top_k():
    """``top_k=2`` trims four candidates down to two."""
    candidates = []
    for i in range(4):
        candidates.append(
            {"source": f"Doc{i}", "page_number": i, "content": "x", "similarity": 0.9 - i * 0.01}
        )

    merged = vector_store.merge_knowledge_results([candidates], top_k=2)

    assert len(merged) == 2


def test_merge_knowledge_results_handles_empty_inputs():
    """No result lists, or only empty result lists, merge to an empty list."""
    for empty_input in ([], [[], []]):
        assert vector_store.merge_knowledge_results(empty_input) == []


# =============================================================================
# expand_knowledge_results tests
# =============================================================================


def test_expand_knowledge_results_passthrough_on_empty_corpus(monkeypatch):
    """With an empty corpus, chunks come back in order, each paired with ``None``."""
    _empty_corpus(monkeypatch)
    hits = [
        {"filename": "a.pdf", "page_number": 1, "content": "hello", "similarity": 0.80},
        {"filename": "b.pdf", "page_number": 2, "content": "world", "similarity": 0.75},
    ]

    pairs = vector_store.expand_knowledge_results(hits)

    assert len(pairs) == 2
    assert all(window is None for _, window in pairs)
    assert pairs[0][0]["filename"] == "a.pdf"
    assert pairs[1][0]["filename"] == "b.pdf"


def test_expand_knowledge_results_returns_none_window_when_corpus_empty(monkeypatch):
    """A single chunk against an empty corpus keeps its content and gets no window."""
    _empty_corpus(monkeypatch)
    hit = {"filename": "doc.pdf", "page_number": 1, "content": "some text", "similarity": 0.85}

    pairs = vector_store.expand_knowledge_results([hit])

    assert len(pairs) == 1
    original, window = pairs[0]
    assert original["content"] == "some text"
    assert window is None


def test_expand_knowledge_results_returns_window_when_match_found(monkeypatch):
    """The window returned by the patched ``find_and_expand`` is passed through."""
    from app import parent_context
    from app.knowledge_chunking import NormalizedChildChunk
    from app.parent_context import ParentWindow

    hit = {"filename": "guide.pdf", "page_number": 2, "content": "matched text", "similarity": 0.91}
    corpus_chunk = NormalizedChildChunk(
        child_id="guide.pdf::p2::i0",
        source="Guide",
        filename="guide.pdf",
        page_number=2,
        content="matched text",
        corpus_ordinal=0,
    )
    expected_window = ParentWindow(
        primary_child=corpus_chunk,
        left_neighbor=None,
        right_neighbor=None,
        combined_text="matched text",
    )

    def always_expand(hit, corpus, lookup):
        return expected_window

    monkeypatch.setattr(vector_store, "_corpus", [corpus_chunk])
    monkeypatch.setattr(
        vector_store,
        "_corpus_lookup",
        {("guide.pdf", 2, "matched text"): corpus_chunk},
    )
    monkeypatch.setattr(parent_context, "find_and_expand", always_expand)

    pairs = vector_store.expand_knowledge_results([hit])

    assert len(pairs) == 1
    original, window = pairs[0]
    assert original["similarity"] == 0.91
    assert window is not None
    assert window is expected_window


# =============================================================================
# format_knowledge_context with parent windows
# =============================================================================


def test_format_knowledge_context_renders_matched_paragraph_label(monkeypatch):
    """The primary text is labelled ``[MATCHED PARAGRAPH]`` and carries a cite marker."""
    _empty_corpus(monkeypatch)
    hit = {
        "source": "Guide",
        "page_number": 1,
        "content": "tipburn info",
        "similarity": 0.91,
        "filename": "guide.pdf",
    }

    rendered = vector_store.format_knowledge_context([hit])

    for expected_fragment in ("[MATCHED PARAGRAPH]", "tipburn info", "CITE AS 📖"):
        assert expected_fragment in rendered


def test_format_knowledge_context_renders_supporting_context_when_window_present(monkeypatch):
    """When the window has neighbors, their text is rendered as Supporting context."""
    from app.knowledge_chunking import NormalizedChildChunk
    from app.parent_context import ParentWindow

    left = NormalizedChildChunk("f::p0::i0", "Guide", "guide.pdf", 0, "left neighbor text", 0)
    primary = NormalizedChildChunk("f::p1::i1", "Guide", "guide.pdf", 1, "primary text", 1)
    right = NormalizedChildChunk("f::p2::i2", "Guide", "guide.pdf", 2, "right neighbor text", 2)
    window = ParentWindow(
        primary_child=primary,
        left_neighbor=left,
        right_neighbor=right,
        combined_text="left neighbor text\n\nprimary text\n\nright neighbor text",
    )
    hit = {
        "filename": "guide.pdf",
        "page_number": 1,
        "content": "primary text",
        "similarity": 0.91,
        "source": "Guide",
    }

    def expand_with_window(chunks):
        return [(chunks[0], window)]

    monkeypatch.setattr(vector_store, "expand_knowledge_results", expand_with_window)

    rendered = vector_store.format_knowledge_context([hit])

    expected_fragments = (
        "[MATCHED PARAGRAPH]",
        "primary text",
        "Supporting context",
        "left neighbor text",
        "right neighbor text",
        "CITE AS",
    )
    for expected_fragment in expected_fragments:
        assert expected_fragment in rendered


# =============================================================================
# format_knowledge_context — plant_aliases citation filter
# =============================================================================


def test_format_knowledge_context_plant_alias_filter_promotes_matching_chunk(monkeypatch):
    """A chunk whose CONTENT mentions an alias keeps the 📖 citation."""
    _stub_expansion(monkeypatch)
    hit = {
        "source": "Petunjuk Teknis Budidaya Sayuran Dataran Rendah",
        "filename": "sayuran.pdf",
        "page_number": 22,
        "content": "Hama yang menyerang tanaman kangkung antara lain ulat grayak.",
        "similarity": 0.72,
        "retrieval_modes": ["dense"],
    }

    rendered = vector_store.format_knowledge_context(
        [hit],
        plant_aliases=["kangkung", "Water Spinach", "Ipomoea aquatica"],
    )

    assert "CITE AS 📖" in rendered
    assert "Background Context" not in rendered


def test_format_knowledge_context_plant_alias_matches_source_name(monkeypatch):
    """An alias in the SOURCE NAME qualifies the chunk even if the content omits it.

    This covers dedicated crop documents (e.g. 'Budidaya Cabe Di Perkotaan')
    where ~59% of chunks never repeat the crop name inside the paragraph body.
    """
    _stub_expansion(monkeypatch)
    hit = {
        "source": "Budidaya Cabe Di Perkotaan",  # "Cabe" is in the source name
        "filename": "budidaya-cabe.pdf",
        "page_number": 33,
        "content": "Layu Fusarium / Fusarium wilt disebabkan oleh jamur Fusarium oxysporum.",
        "similarity": 0.72,
        "retrieval_modes": ["dense"],
    }

    rendered = vector_store.format_knowledge_context(
        [hit],
        plant_aliases=["Cabe", "Cabai", "Chili", "Capsicum annuum"],
    )

    assert "CITE AS 📖" in rendered
    assert "Background Context" not in rendered


def test_format_knowledge_context_plant_alias_filter_demotes_non_matching_chunk(monkeypatch):
    """No alias in content AND none in the source name → Background Context."""
    _stub_expansion(monkeypatch)
    hit = {
        "source": "Melon Pest Guide",
        "filename": "melon.pdf",
        "page_number": 38,
        "content": "Patogen masuk ke dalam tanaman melalui ujung-ujung akar.",
        "similarity": 0.72,
        "retrieval_modes": ["dense"],
    }

    rendered = vector_store.format_knowledge_context(
        [hit],
        plant_aliases=["kangkung", "Water Spinach", "Ipomoea aquatica"],
    )

    assert "CITE AS 📖" not in rendered
    assert "Background Context" in rendered


def test_format_knowledge_context_no_plant_aliases_skips_filter(monkeypatch):
    """With ``plant_aliases=None`` (general query) the chunk keeps its 📖 citation."""
    _stub_expansion(monkeypatch)
    hit = {
        "source": "Melon Pest Guide",
        "filename": "melon.pdf",
        "page_number": 38,
        "content": "Patogen masuk ke dalam tanaman melon melalui ujung-ujung akar.",
        "similarity": 0.72,
        "retrieval_modes": ["dense"],
    }

    rendered = vector_store.format_knowledge_context([hit], plant_aliases=None)

    assert "CITE AS 📖" in rendered


def test_format_knowledge_context_plant_alias_case_insensitive(monkeypatch):
    """A lowercase alias matches capitalized text in the content."""
    _stub_expansion(monkeypatch)
    hit = {
        "source": "Guide",
        "filename": "guide.pdf",
        "page_number": 1,
        "content": "Water Spinach is susceptible to Pythium root rot.",
        "similarity": 0.75,
        "retrieval_modes": ["dense"],
    }

    rendered = vector_store.format_knowledge_context(
        [hit],
        plant_aliases=["water spinach"],  # lowercase
    )

    assert "CITE AS 📖" in rendered


def test_format_knowledge_context_uses_selected_chunk_order(monkeypatch):
    """Output follows the order returned by ``select_knowledge_chunks``."""

    def reversed_selection(
        chunks,
        plant_aliases=None,
        stage=None,
        max_verified_chunks=3,
        max_background_chunks=1,
    ):
        # Deliberately return the second input chunk ahead of the first.
        return [
            {**chunks[1], "selection_score": 0.93, "selection_promoted_background": False},
            {**chunks[0], "selection_score": 0.51, "selection_promoted_background": False},
        ]

    monkeypatch.setattr(vector_store, "select_knowledge_chunks", reversed_selection)
    _stub_expansion(monkeypatch)

    hits = [
        {
            "filename": "generic.pdf",
            "page_number": 1,
            "source": "Generic",
            "content": "generic",
            "similarity": 0.72,
            "retrieval_modes": ["dense"],
        },
        {
            "filename": "lettuce.pdf",
            "page_number": 2,
            "source": "Lettuce",
            "content": "lettuce",
            "similarity": 0.68,
            "retrieval_modes": ["dense", "lexical"],
        },
    ]

    rendered = vector_store.format_knowledge_context(
        hits, plant_aliases=["lettuce"], stage="vegetative"
    )

    lettuce_at = rendered.index("lettuce")
    generic_at = rendered.index("generic")
    assert lettuce_at < generic_at


def test_format_knowledge_context_drops_extra_background_chunks(monkeypatch):
    """Chunks left out by the selector do not appear in the rendered context."""

    def keep_first_only(
        chunks,
        plant_aliases=None,
        stage=None,
        max_verified_chunks=3,
        max_background_chunks=1,
    ):
        return [
            {**chunks[0], "selection_score": 0.44, "selection_promoted_background": False}
        ]

    monkeypatch.setattr(vector_store, "select_knowledge_chunks", keep_first_only)
    _stub_expansion(monkeypatch)

    hits = [
        {
            "filename": "bg-0.pdf",
            "page_number": 1,
            "source": "BG 0",
            "content": "first context",
            "similarity": 0.41,
            "retrieval_modes": ["dense"],
        },
        {
            "filename": "bg-1.pdf",
            "page_number": 2,
            "source": "BG 1",
            "content": "second context",
            "similarity": 0.40,
            "retrieval_modes": ["dense"],
        },
    ]

    rendered = vector_store.format_knowledge_context(
        hits, plant_aliases=["lettuce"], stage="vegetative"
    )

    assert "first context" in rendered
    assert "second context" not in rendered


def test_format_knowledge_context_returns_empty_when_selected_empty(monkeypatch):
    """An empty selection renders as the empty string."""

    def select_nothing(
        chunks,
        plant_aliases=None,
        stage=None,
        max_verified_chunks=3,
        max_background_chunks=1,
    ):
        return []

    monkeypatch.setattr(vector_store, "select_knowledge_chunks", select_nothing)

    hits = [
        {
            "filename": "bg-0.pdf",
            "page_number": 1,
            "source": "BG 0",
            "content": "first context",
            "similarity": 0.41,
            "retrieval_modes": ["dense"],
        },
    ]

    rendered = vector_store.format_knowledge_context(
        hits, plant_aliases=["lettuce"], stage="vegetative"
    )

    assert rendered == ""


# =============================================================================
# Lexical retrieval and RRF utilities
# =============================================================================


def test_search_knowledge_fts_posts_query_text_to_match_knowledge_fts(monkeypatch):
    """The FTS search posts ``query_text`` and ``match_count`` to the FTS RPC URL."""
    client = FakeAsyncClient()
    _fake_supabase(monkeypatch, lambda: client)

    rows = asyncio.run(vector_store.search_knowledge_fts("pythium root rot", match_count=6))

    request = client.calls[0]
    assert request["url"].endswith("/rpc/match_knowledge_fts")
    assert request["json"]["query_text"] == "pythium root rot"
    assert request["json"]["match_count"] == 6
    assert rows[0]["filename"] == "doc.pdf"


def test_search_knowledge_fts_returns_empty_when_not_configured(monkeypatch):
    """When ``is_configured`` is false the FTS search returns an empty list."""
    monkeypatch.setattr(vector_store, "is_configured", lambda: False)

    rows = asyncio.run(vector_store.search_knowledge_fts("pythium root rot"))

    assert rows == []


def test_search_knowledge_fts_returns_empty_on_non_200_response(monkeypatch):
    """A 503 response from the backend yields an empty result list."""

    class ErrorClient:
        async def __aenter__(self):
            return self

        async def __aexit__(self, *a):
            return False

        async def post(self, *a, **kw):
            return FakeResponse(status_code=503, payload=[], text="Service Unavailable")

    # A fresh ErrorClient is built for every AsyncClient construction.
    _fake_supabase(monkeypatch, ErrorClient)

    rows = asyncio.run(vector_store.search_knowledge_fts("query"))

    assert rows == []


def test_reciprocal_rank_fuse_prefers_chunk_seen_by_both_lists():
    """A chunk present in both rankings comes first, tagged with both modes."""
    dense = [
        {
            "filename": "leafy.pdf",
            "page_number": 79,
            "content": "Pythium Root Rot...",
            "similarity": 0.68,
        },
        {
            "filename": "pumpkin.pdf",
            "page_number": 13,
            "content": "Fusarium crown rot...",
            "similarity": 0.67,
        },
    ]
    lexical = [
        {
            "filename": "leafy.pdf",
            "page_number": 79,
            "content": "Pythium Root Rot...",
            "lexical_score": 0.42,
        },
        {
            "filename": "manual.pdf",
            "page_number": 47,
            "content": "Growing plants...",
            "lexical_score": 0.31,
        },
    ]

    fused = vector_store.reciprocal_rank_fuse(dense, lexical, top_k=2, rrf_k=60)

    top_hit = fused[0]
    assert top_hit["filename"] == "leafy.pdf"
    assert top_hit["retrieval_modes"] == ["dense", "lexical"]


def test_search_knowledge_hybrid_falls_back_to_dense_only_on_lexical_failure(monkeypatch):
    """If the lexical search raises, the hybrid search still returns the dense hit."""

    async def dense_stub(query, match_count=7, match_threshold=0.30, query_label="raw"):
        return [
            {"filename": "doc.pdf", "page_number": 1, "content": "dense hit", "similarity": 0.8}
        ]

    async def failing_fts_stub(query, match_count=7):
        raise RuntimeError("fts down")

    monkeypatch.setattr(vector_store, "search_knowledge", dense_stub)
    monkeypatch.setattr(vector_store, "search_knowledge_fts", failing_fts_stub)

    dense_queries = [types.SimpleNamespace(text="dense query", label="hyde")]
    fused_list, _ = asyncio.run(
        vector_store.search_knowledge_hybrid(raw_query="raw query", dense_queries=dense_queries)
    )

    assert fused_list[0]["content"] == "dense hit"


# =============================================================================
# select_knowledge_chunks tests
# =============================================================================


def test_select_knowledge_chunks_prefers_cross_modal_plant_match():
    """The dense+lexical chunk wins when plant and stage signals are equal.

    Both chunks share the same source, content and similarity; they differ
    only in ``retrieval_modes``, so the chunk with the extra lexical signal
    must rank first with a strictly higher selection score.
    """
    candidates = [
        {
            "filename": "lettuce.pdf",
            "page_number": 8,
            "source": "Generic Lettuce Guide",
            "content": "Lettuce crop management. Vegetative stage note.",
            "similarity": 0.67,
            "retrieval_modes": ["dense", "lexical"],
        },
        {
            "filename": "generic.pdf",
            "page_number": 2,
            "source": "Generic Lettuce Guide",
            "content": "Lettuce crop management. Vegetative stage note.",
            "similarity": 0.67,
            "retrieval_modes": ["dense"],
        },
    ]

    selected = vector_store.select_knowledge_chunks(
        candidates,
        plant_aliases=["lettuce"],
        stage="vegetative",
        max_verified_chunks=2,
        max_background_chunks=1,
    )

    winner, runner_up = selected[0], selected[1]
    assert winner["filename"] == "lettuce.pdf"
    assert winner["selection_score"] > runner_up["selection_score"]


def test_select_knowledge_chunks_limits_background_chunks():
    """With no verified slots and one background slot, only ``bg-0.pdf`` is kept."""
    candidates = []
    for i in range(4):
        candidates.append(
            {
                "filename": f"bg-{i}.pdf",
                "page_number": i,
                "source": f"Background {i}",
                "content": f"Generic context {i}",
                "similarity": 0.41 - (i * 0.01),
                "retrieval_modes": ["dense"],
            }
        )

    selected = vector_store.select_knowledge_chunks(
        candidates,
        plant_aliases=["lettuce"],
        stage="vegetative",
        max_verified_chunks=0,
        max_background_chunks=1,
    )

    assert len(selected) == 1
    assert selected[0]["filename"] == "bg-0.pdf"


def test_select_knowledge_chunks_rewards_stage_match():
    """The stage mention breaks the tie when every other signal is equal.

    Both chunks share the same source, similarity and retrieval modes, and
    mention the plant identically; only the stage named in the content
    differs, so the vegetative chunk must come first for stage="vegetative".
    """
    candidates = [
        {
            "filename": "veg.pdf",
            "page_number": 4,
            "source": "Generic Lettuce Guide",
            "content": "Lettuce crop management. Vegetative stage details.",
            "similarity": 0.62,
            "retrieval_modes": ["dense"],
        },
        {
            "filename": "fruiting.pdf",
            "page_number": 9,
            "source": "Generic Lettuce Guide",
            "content": "Lettuce crop management. Fruiting stage details.",
            "similarity": 0.62,
            "retrieval_modes": ["dense"],
        },
    ]

    selected = vector_store.select_knowledge_chunks(
        candidates,
        plant_aliases=["lettuce"],
        stage="vegetative",
        max_verified_chunks=2,
        max_background_chunks=0,
    )

    assert selected[0]["filename"] == "veg.pdf"


def test_selection_promoted_background_flag():
    """Check ``selection_promoted_background`` on promoted and ordinary chunks.

    - The one truly verified chunk is selected with the flag False.
    - The best background chunk fills the remaining verified slot and is
      flagged True.
    - The background chunk appended afterwards has the flag False.
    """
    candidates = [
        # True verified chunk (similarity >= 0.70)
        {
            "filename": "verified.pdf",
            "page_number": 1,
            "source": "Verified Guide",
            "content": "Verified authoritative content",
            "similarity": 0.72,
            "retrieval_modes": ["dense"],
        },
        # Background chunks (below verified threshold)
        {
            "filename": "bg-promoted.pdf",
            "page_number": 2,
            "source": "Background Source",
            "content": "Relevant background content A",
            "similarity": 0.60,
            "retrieval_modes": ["dense"],
        },
        {
            "filename": "bg-normal.pdf",
            "page_number": 3,
            "source": "Background Source",
            "content": "Relevant background content B",
            "similarity": 0.59,
            "retrieval_modes": ["dense"],
        },
    ]

    selected = vector_store.select_knowledge_chunks(
        candidates,
        plant_aliases=None,
        stage=None,
        max_verified_chunks=2,
        max_background_chunks=1,
    )

    # Expect order: verified (non-promoted), bg-promoted (promoted into
    # verified), then bg-normal (background)
    expected_order = [
        ("verified.pdf", False),
        ("bg-promoted.pdf", True),
        ("bg-normal.pdf", False),
    ]
    assert len(selected) == 3
    for chunk, (expected_filename, expected_promoted) in zip(selected, expected_order):
        assert chunk["filename"] == expected_filename
        assert chunk["selection_promoted_background"] is expected_promoted