Spaces:
Running
Running
| import asyncio | |
| import sys | |
| import types | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| import pytest | |
| import app.vector_store as vector_store | |
def test_verification_thresholds_calibrated_for_bge_m3():
    """Pin the dense and hybrid verification thresholds exposed by vector_store."""
    # Conservative, user-preferred values for the BGE-M3 embedding space.
    expected_thresholds = {
        "VERIFIED_DENSE_THRESHOLD": 0.70,
        "VERIFIED_HYBRID_THRESHOLD": 0.65,
    }
    for constant_name, expected_value in expected_thresholds.items():
        assert getattr(vector_store, constant_name) == expected_value
def test_embed_text_rejects_non_1024_vector(monkeypatch):
    """embed_text must raise ValueError when the model yields a 2-value vector."""

    class FakeEmbedding:
        """Stand-in embedding that only offers ``tolist()``."""

        def __init__(self, values):
            self._values = values

        def tolist(self):
            return self._values

    class FakeModel:
        """Model double whose ``query_embed`` yields one undersized embedding."""

        def query_embed(self, texts):
            yield FakeEmbedding([0.1, 0.2])

    def build_fake_model():
        return FakeModel()

    monkeypatch.setattr(vector_store, "_get_embedding_model", build_fake_model)

    with pytest.raises(ValueError, match="expected 1024"):
        vector_store.embed_text("lettuce tipburn")
class FakeResponse:
    """Minimal HTTP response double exposing ``status_code``, ``text`` and ``json()``.

    Args:
        status_code: Status code to report; defaults to 200.
        payload: Object returned by ``json()``. Only ``None`` is replaced by an
            empty list, so falsy payloads such as ``{}`` are returned unchanged.
        text: Body text to report; defaults to ``"ok"``.
    """

    def __init__(self, status_code=200, payload=None, text="ok"):
        self.status_code = status_code
        # `payload or []` would silently turn {} / 0 / "" into []; test for None only.
        self._payload = [] if payload is None else payload
        self.text = text

    def json(self):
        """Return the payload supplied at construction time."""
        return self._payload
class FakeAsyncClient:
    """Async HTTP client double that records ``post`` calls and returns a canned hit."""

    def __init__(self, *args, **kwargs):
        # Constructor arguments are accepted and ignored.
        self.calls = []

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc, tb):
        # False: exceptions raised in the ``async with`` body are not suppressed.
        return False

    async def post(self, url, headers=None, json=None):
        """Record the request arguments and answer with one fixed result row."""
        recorded_request = {"url": url, "headers": headers, "json": json}
        self.calls.append(recorded_request)
        canned_row = {
            "source": "Doc",
            "filename": "doc.pdf",
            "page_number": 2,
            "content": "Expanded horticultural context",
            "similarity": 0.91,
        }
        return FakeResponse(payload=[canned_row])
def test_search_knowledge_logs_hyde_query_label_and_embeds_transformed_query(monkeypatch, capsys):
    """search_knowledge embeds the given query text and prints the ``hyde`` label."""
    query_text = "Expanded agronomic explanation for lettuce humidity"
    seen = {}
    fake_client = FakeAsyncClient()

    def fake_embed_text(text):
        # Remember what was embedded so the test can compare it to the query.
        seen["embedded_query"] = text
        return [0.1, 0.2]

    def client_factory(timeout=10.0):
        return fake_client

    monkeypatch.setattr(vector_store, "is_configured", lambda: True)
    monkeypatch.setattr(vector_store, "SUPABASE_URL", "https://example.supabase.co")
    monkeypatch.setattr(vector_store, "embed_text", fake_embed_text)
    monkeypatch.setattr(vector_store.httpx, "AsyncClient", client_factory)

    pending_search = vector_store.search_knowledge(query=query_text, query_label="hyde")
    chunks = asyncio.run(pending_search)
    printed = capsys.readouterr().out

    first_hit = chunks[0]
    assert first_hit["source"] == "Doc"
    assert "filename" in first_hit, "Supabase response must include 'filename' for parent expansion"
    assert seen["embedded_query"] == query_text
    assert fake_client.calls[0]["json"]["match_count"] == vector_store.DEFAULT_MATCH_COUNT
    assert "[VectorRAG:hyde]" in printed
def test_search_knowledge_defaults_to_raw_query_label(monkeypatch, capsys):
    """Without an explicit ``query_label`` the printed tag is ``[VectorRAG:raw]``."""
    fake_client = FakeAsyncClient()

    def client_factory(timeout=10.0):
        return fake_client

    def fake_embed_text(text):
        return [0.1, 0.2]

    monkeypatch.setattr(vector_store, "is_configured", lambda: True)
    monkeypatch.setattr(vector_store, "SUPABASE_URL", "https://example.supabase.co")
    monkeypatch.setattr(vector_store, "embed_text", fake_embed_text)
    monkeypatch.setattr(vector_store.httpx, "AsyncClient", client_factory)

    asyncio.run(vector_store.search_knowledge(query="plain query"))

    printed = capsys.readouterr().out
    assert "[VectorRAG:raw]" in printed
def test_merge_knowledge_results_deduplicates_by_filename_page_content_keeps_higher_similarity():
    """A hit repeated across two result lists appears once, with similarity 0.85."""
    from app.vector_store import merge_knowledge_results

    def hit(filename, source, page_number, content, similarity):
        return {
            "filename": filename,
            "source": source,
            "page_number": page_number,
            "content": content,
            "similarity": similarity,
        }

    primary = [
        hit("cornell.pdf", "Cornell-Lettuce", 5, "tipburn info", 0.75),
        hit("kubis.pdf", "Kubis-Guide", 2, "cabbage info", 0.82),
    ]
    english = [
        # Same filename/page/content as the first primary hit, higher similarity.
        hit("cornell.pdf", "Cornell-Lettuce", 5, "tipburn info", 0.85),
        hit("new.pdf", "New-Source", 1, "new content", 0.90),
    ]

    merged = merge_knowledge_results([primary, english])

    merged_sources = [row["source"] for row in merged]
    for expected_source in ("Cornell-Lettuce", "Kubis-Guide", "New-Source"):
        assert expected_source in merged_sources
    assert len(merged) == 3  # the duplicate Cornell hit was collapsed

    cornell_chunk = next(row for row in merged if row["source"] == "Cornell-Lettuce")
    assert cornell_chunk["similarity"] == 0.85  # the higher score survives
def test_merge_knowledge_results_does_not_dedup_different_content_same_page():
    """Two hits on the same file and page but with different text are both kept."""
    from app.vector_store import merge_knowledge_results

    first_paragraph = {
        "filename": "doc.pdf",
        "page_number": 3,
        "content": "first paragraph",
        "similarity": 0.80,
    }
    second_paragraph = {
        "filename": "doc.pdf",
        "page_number": 3,
        "content": "second paragraph",
        "similarity": 0.79,
    }

    merged = merge_knowledge_results([[first_paragraph, second_paragraph]])

    # Content differs, so neither entry is treated as a duplicate.
    assert len(merged) == 2
def test_merge_knowledge_results_respects_top_k():
    """With four distinct hits and ``top_k=2`` only two results come back."""
    from app.vector_store import merge_knowledge_results

    single_list = []
    for i in range(4):
        single_list.append(
            {
                "source": f"Doc{i}",
                "page_number": i,
                "content": "x",
                "similarity": 0.9 - i * 0.01,
            }
        )

    merged = merge_knowledge_results([single_list], top_k=2)

    assert len(merged) == 2
def test_merge_knowledge_results_handles_empty_inputs():
    """No result lists, or only empty result lists, merge to an empty list."""
    from app.vector_store import merge_knowledge_results

    for empty_input in ([], [[], []]):
        assert merge_knowledge_results(empty_input) == []
| # ============================================================================= | |
| # expand_knowledge_results tests | |
| # ============================================================================= | |
def test_expand_knowledge_results_passthrough_on_empty_corpus(monkeypatch):
    """With an empty corpus every chunk is returned in order, paired with ``None``."""
    from app import vector_store

    monkeypatch.setattr(vector_store, "_corpus", [])
    monkeypatch.setattr(vector_store, "_corpus_lookup", {})

    first_chunk = {"filename": "a.pdf", "page_number": 1, "content": "hello", "similarity": 0.80}
    second_chunk = {"filename": "b.pdf", "page_number": 2, "content": "world", "similarity": 0.75}

    pairs = vector_store.expand_knowledge_results([first_chunk, second_chunk])

    assert len(pairs) == 2
    assert all(window is None for _original, window in pairs)
    assert [pair[0]["filename"] for pair in pairs] == ["a.pdf", "b.pdf"]
def test_expand_knowledge_results_returns_none_window_when_corpus_empty(monkeypatch):
    """A single chunk expanded against an empty corpus keeps its content and gets no window."""
    from app import vector_store

    monkeypatch.setattr(vector_store, "_corpus", [])
    monkeypatch.setattr(vector_store, "_corpus_lookup", {})

    only_chunk = {
        "filename": "doc.pdf",
        "page_number": 1,
        "content": "some text",
        "similarity": 0.85,
    }

    pairs = vector_store.expand_knowledge_results([only_chunk])

    assert len(pairs) == 1
    returned_chunk, returned_window = pairs[0]
    assert returned_chunk["content"] == "some text"
    assert returned_window is None
def test_expand_knowledge_results_returns_window_when_match_found(monkeypatch):
    """When the patched ``find_and_expand`` returns a window, that exact object is paired with the chunk."""
    import app.parent_context as pc_mod
    from app import vector_store
    from app.knowledge_chunking import NormalizedChildChunk
    from app.parent_context import ParentWindow

    fake_chunk = NormalizedChildChunk(
        child_id="guide.pdf::p2::i0",
        source="Guide",
        filename="guide.pdf",
        page_number=2,
        content="matched text",
        corpus_ordinal=0,
    )
    fake_window = ParentWindow(
        primary_child=fake_chunk,
        left_neighbor=None,
        right_neighbor=None,
        combined_text="matched text",
    )
    lookup_table = {("guide.pdf", 2, "matched text"): fake_chunk}

    def fake_find_and_expand(hit, corpus, lookup):
        return fake_window

    monkeypatch.setattr(vector_store, "_corpus", [fake_chunk])
    monkeypatch.setattr(vector_store, "_corpus_lookup", lookup_table)
    monkeypatch.setattr(pc_mod, "find_and_expand", fake_find_and_expand)

    search_hit = {
        "filename": "guide.pdf",
        "page_number": 2,
        "content": "matched text",
        "similarity": 0.91,
    }
    pairs = vector_store.expand_knowledge_results([search_hit])

    assert len(pairs) == 1
    returned_chunk, returned_window = pairs[0]
    assert returned_chunk["similarity"] == 0.91
    assert returned_window is not None
    assert returned_window is fake_window
| # ============================================================================= | |
| # format_knowledge_context with parent windows | |
| # ============================================================================= | |
def test_format_knowledge_context_renders_matched_paragraph_label(monkeypatch):
    """The rendered context carries the [MATCHED PARAGRAPH] label, the chunk text and a citation cue."""
    from app import vector_store

    monkeypatch.setattr(vector_store, "_corpus", [])
    monkeypatch.setattr(vector_store, "_corpus_lookup", {})

    only_chunk = {
        "source": "Guide",
        "page_number": 1,
        "content": "tipburn info",
        "similarity": 0.91,
        "filename": "guide.pdf",
    }

    rendered = vector_store.format_knowledge_context([only_chunk])

    for expected_fragment in ("[MATCHED PARAGRAPH]", "tipburn info", "CITE AS 📖"):
        assert expected_fragment in rendered
def test_format_knowledge_context_renders_supporting_context_when_window_present(monkeypatch):
    """With left and right neighbors in the window, both neighbor texts and a "Supporting context" section are rendered."""
    from app import vector_store
    from app.knowledge_chunking import NormalizedChildChunk
    from app.parent_context import ParentWindow

    left = NormalizedChildChunk("f::p0::i0", "Guide", "guide.pdf", 0, "left neighbor text", 0)
    primary = NormalizedChildChunk("f::p1::i1", "Guide", "guide.pdf", 1, "primary text", 1)
    right = NormalizedChildChunk("f::p2::i2", "Guide", "guide.pdf", 2, "right neighbor text", 2)
    fake_window = ParentWindow(
        primary_child=primary,
        left_neighbor=left,
        right_neighbor=right,
        combined_text="left neighbor text\n\nprimary text\n\nright neighbor text",
    )

    def fake_expand(chunks):
        # Pair the first chunk with the prepared window, ignoring any others.
        return [(chunks[0], fake_window)]

    monkeypatch.setattr(vector_store, "expand_knowledge_results", fake_expand)

    search_hit = {
        "filename": "guide.pdf",
        "page_number": 1,
        "content": "primary text",
        "similarity": 0.91,
        "source": "Guide",
    }
    rendered = vector_store.format_knowledge_context([search_hit])

    expected_fragments = (
        "[MATCHED PARAGRAPH]",
        "primary text",
        "Supporting context",
        "left neighbor text",
        "right neighbor text",
        "CITE AS",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
| # ============================================================================= | |
| # format_knowledge_context — plant_aliases citation filter | |
| # ============================================================================= | |
def test_format_knowledge_context_plant_alias_filter_promotes_matching_chunk(monkeypatch):
    """A chunk whose content mentions one of the aliases keeps the 📖 citation."""
    from app import vector_store

    def passthrough_expand(chunks):
        return [(item, None) for item in chunks]

    monkeypatch.setattr(vector_store, "expand_knowledge_results", passthrough_expand)

    aliases = ["kangkung", "Water Spinach", "Ipomoea aquatica"]
    matching_chunk = {
        "source": "Petunjuk Teknis Budidaya Sayuran Dataran Rendah",
        "filename": "sayuran.pdf",
        "page_number": 22,
        "content": "Hama yang menyerang tanaman kangkung antara lain ulat grayak.",
        "similarity": 0.72,
        "retrieval_modes": ["dense"],
    }

    rendered = vector_store.format_knowledge_context([matching_chunk], plant_aliases=aliases)

    assert "CITE AS 📖" in rendered
    assert "Background Context" not in rendered
def test_format_knowledge_context_plant_alias_matches_source_name(monkeypatch):
    """An alias found only in the SOURCE NAME is enough to keep the 📖 citation.

    Targets dedicated crop documents (e.g. 'Budidaya Cabe Di Perkotaan'), where
    per the original note ~59% of chunks never repeat the crop name in the
    paragraph body.
    """
    from app import vector_store

    def passthrough_expand(chunks):
        return [(item, None) for item in chunks]

    monkeypatch.setattr(vector_store, "expand_knowledge_results", passthrough_expand)

    aliases = ["Cabe", "Cabai", "Chili", "Capsicum annuum"]
    source_only_chunk = {
        # "Cabe" appears here but nowhere in the content below.
        "source": "Budidaya Cabe Di Perkotaan",
        "filename": "budidaya-cabe.pdf",
        "page_number": 33,
        "content": "Layu Fusarium / Fusarium wilt disebabkan oleh jamur Fusarium oxysporum.",
        "similarity": 0.72,
        "retrieval_modes": ["dense"],
    }

    rendered = vector_store.format_knowledge_context([source_only_chunk], plant_aliases=aliases)

    assert "CITE AS 📖" in rendered
    assert "Background Context" not in rendered
def test_format_knowledge_context_plant_alias_filter_demotes_non_matching_chunk(monkeypatch):
    """A chunk with no alias in content or source name is rendered as Background Context."""
    from app import vector_store

    def passthrough_expand(chunks):
        return [(item, None) for item in chunks]

    monkeypatch.setattr(vector_store, "expand_knowledge_results", passthrough_expand)

    aliases = ["kangkung", "Water Spinach", "Ipomoea aquatica"]
    unrelated_chunk = {
        "source": "Melon Pest Guide",
        "filename": "melon.pdf",
        "page_number": 38,
        "content": "Patogen masuk ke dalam tanaman melalui ujung-ujung akar.",
        "similarity": 0.72,
        "retrieval_modes": ["dense"],
    }

    rendered = vector_store.format_knowledge_context([unrelated_chunk], plant_aliases=aliases)

    assert "CITE AS 📖" not in rendered
    assert "Background Context" in rendered
def test_format_knowledge_context_no_plant_aliases_skips_filter(monkeypatch):
    """With ``plant_aliases=None`` (general query) the chunk still receives the 📖 citation."""
    from app import vector_store

    def passthrough_expand(chunks):
        return [(item, None) for item in chunks]

    monkeypatch.setattr(vector_store, "expand_knowledge_results", passthrough_expand)

    melon_chunk = {
        "source": "Melon Pest Guide",
        "filename": "melon.pdf",
        "page_number": 38,
        "content": "Patogen masuk ke dalam tanaman melon melalui ujung-ujung akar.",
        "similarity": 0.72,
        "retrieval_modes": ["dense"],
    }

    rendered = vector_store.format_knowledge_context([melon_chunk], plant_aliases=None)

    assert "CITE AS 📖" in rendered
def test_format_knowledge_context_plant_alias_case_insensitive(monkeypatch):
    """A lowercase alias matches title-cased text in the chunk content."""
    from app import vector_store

    def passthrough_expand(chunks):
        return [(item, None) for item in chunks]

    monkeypatch.setattr(vector_store, "expand_knowledge_results", passthrough_expand)

    title_cased_chunk = {
        "source": "Guide",
        "filename": "guide.pdf",
        "page_number": 1,
        "content": "Water Spinach is susceptible to Pythium root rot.",
        "similarity": 0.75,
        "retrieval_modes": ["dense"],
    }
    lowercase_aliases = ["water spinach"]

    rendered = vector_store.format_knowledge_context(
        [title_cased_chunk],
        plant_aliases=lowercase_aliases,
    )

    assert "CITE AS 📖" in rendered
def test_format_knowledge_context_uses_selected_chunk_order(monkeypatch):
    """Output follows the order returned by the patched selector, not the input order."""
    from app import vector_store

    def fake_select(chunks, plant_aliases=None, stage=None, max_verified_chunks=3, max_background_chunks=1):
        # Return the two inputs swapped, each annotated with selection metadata.
        return [
            {**chunks[1], "selection_score": 0.93, "selection_promoted_background": False},
            {**chunks[0], "selection_score": 0.51, "selection_promoted_background": False},
        ]

    def passthrough_expand(chunks):
        return [(item, None) for item in chunks]

    monkeypatch.setattr(vector_store, "select_knowledge_chunks", fake_select)
    monkeypatch.setattr(vector_store, "expand_knowledge_results", passthrough_expand)

    generic_chunk = {
        "filename": "generic.pdf",
        "page_number": 1,
        "source": "Generic",
        "content": "generic",
        "similarity": 0.72,
        "retrieval_modes": ["dense"],
    }
    lettuce_chunk = {
        "filename": "lettuce.pdf",
        "page_number": 2,
        "source": "Lettuce",
        "content": "lettuce",
        "similarity": 0.68,
        "retrieval_modes": ["dense", "lexical"],
    }

    rendered = vector_store.format_knowledge_context(
        [generic_chunk, lettuce_chunk],
        plant_aliases=["lettuce"],
        stage="vegetative",
    )

    lettuce_position = rendered.index("lettuce")
    generic_position = rendered.index("generic")
    assert lettuce_position < generic_position
def test_format_knowledge_context_drops_extra_background_chunks(monkeypatch):
    """Only the chunk returned by the patched selector appears in the output."""
    from app import vector_store

    def fake_select(chunks, plant_aliases=None, stage=None, max_verified_chunks=3, max_background_chunks=1):
        kept = {**chunks[0], "selection_score": 0.44, "selection_promoted_background": False}
        return [kept]

    def passthrough_expand(chunks):
        return [(item, None) for item in chunks]

    monkeypatch.setattr(vector_store, "select_knowledge_chunks", fake_select)
    monkeypatch.setattr(vector_store, "expand_knowledge_results", passthrough_expand)

    kept_chunk = {
        "filename": "bg-0.pdf",
        "page_number": 1,
        "source": "BG 0",
        "content": "first context",
        "similarity": 0.41,
        "retrieval_modes": ["dense"],
    }
    dropped_chunk = {
        "filename": "bg-1.pdf",
        "page_number": 2,
        "source": "BG 1",
        "content": "second context",
        "similarity": 0.40,
        "retrieval_modes": ["dense"],
    }

    rendered = vector_store.format_knowledge_context(
        [kept_chunk, dropped_chunk],
        plant_aliases=["lettuce"],
        stage="vegetative",
    )

    assert "first context" in rendered
    assert "second context" not in rendered
def test_format_knowledge_context_returns_empty_when_selected_empty(monkeypatch):
    """When the patched selector returns nothing, the formatted context is the empty string."""
    from app import vector_store

    def fake_select(chunks, plant_aliases=None, stage=None, max_verified_chunks=3, max_background_chunks=1):
        return []

    monkeypatch.setattr(vector_store, "select_knowledge_chunks", fake_select)

    background_chunk = {
        "filename": "bg-0.pdf",
        "page_number": 1,
        "source": "BG 0",
        "content": "first context",
        "similarity": 0.41,
        "retrieval_modes": ["dense"],
    }

    rendered = vector_store.format_knowledge_context(
        [background_chunk],
        plant_aliases=["lettuce"],
        stage="vegetative",
    )

    assert rendered == ""
| # ============================================================================= | |
| # Lexical retrieval and RRF utilities | |
| # ============================================================================= | |
def test_search_knowledge_fts_posts_query_text_to_match_knowledge_fts(monkeypatch):
    """The FTS search posts ``query_text`` and ``match_count`` to the match_knowledge_fts RPC."""
    fake_client = FakeAsyncClient()

    def client_factory(timeout=10.0):
        return fake_client

    monkeypatch.setattr(vector_store, "is_configured", lambda: True)
    monkeypatch.setattr(vector_store, "SUPABASE_URL", "https://example.supabase.co")
    monkeypatch.setattr(vector_store.httpx, "AsyncClient", client_factory)

    rows = asyncio.run(vector_store.search_knowledge_fts("pythium root rot", match_count=6))

    first_call = fake_client.calls[0]
    assert first_call["url"].endswith("/rpc/match_knowledge_fts")
    assert first_call["json"]["query_text"] == "pythium root rot"
    assert first_call["json"]["match_count"] == 6
    assert rows[0]["filename"] == "doc.pdf"
def test_search_knowledge_fts_returns_empty_when_not_configured(monkeypatch):
    """If ``is_configured`` reports False the FTS search yields an empty list."""

    def not_configured():
        return False

    monkeypatch.setattr(vector_store, "is_configured", not_configured)

    rows = asyncio.run(vector_store.search_knowledge_fts("pythium root rot"))

    assert rows == []
def test_search_knowledge_fts_returns_empty_on_non_200_response(monkeypatch):
    """A 503 response from the backend makes the FTS search return an empty list."""

    class ErrorClient:
        """Async client double whose ``post`` always answers 503."""

        async def __aenter__(self):
            return self

        async def __aexit__(self, *exc_details):
            return False

        async def post(self, *args, **kwargs):
            return FakeResponse(status_code=503, payload=[], text="Service Unavailable")

    def client_factory(timeout=10.0):
        return ErrorClient()

    monkeypatch.setattr(vector_store, "is_configured", lambda: True)
    monkeypatch.setattr(vector_store, "SUPABASE_URL", "https://example.supabase.co")
    monkeypatch.setattr(vector_store.httpx, "AsyncClient", client_factory)

    rows = asyncio.run(vector_store.search_knowledge_fts("query"))

    assert rows == []
def test_reciprocal_rank_fuse_prefers_chunk_seen_by_both_lists():
    """The chunk present in both the dense and lexical lists ranks first and carries both modes."""
    shared_location = {"filename": "leafy.pdf", "page_number": 79, "content": "Pythium Root Rot..."}

    dense_hits = [
        {**shared_location, "similarity": 0.68},
        {"filename": "pumpkin.pdf", "page_number": 13, "content": "Fusarium crown rot...", "similarity": 0.67},
    ]
    lexical_hits = [
        {**shared_location, "lexical_score": 0.42},
        {"filename": "manual.pdf", "page_number": 47, "content": "Growing plants...", "lexical_score": 0.31},
    ]

    fused = vector_store.reciprocal_rank_fuse(dense_hits, lexical_hits, top_k=2, rrf_k=60)

    top_hit = fused[0]
    assert top_hit["filename"] == "leafy.pdf"
    assert top_hit["retrieval_modes"] == ["dense", "lexical"]
def test_search_knowledge_hybrid_falls_back_to_dense_only_on_lexical_failure(monkeypatch):
    """If the lexical search raises, the hybrid search still returns the dense hit."""

    async def fake_search_knowledge(query, match_count=7, match_threshold=0.30, query_label="raw"):
        dense_hit = {"filename": "doc.pdf", "page_number": 1, "content": "dense hit", "similarity": 0.8}
        return [dense_hit]

    async def fake_search_knowledge_fts(query, match_count=7):
        # Simulate the lexical backend being unavailable.
        raise RuntimeError("fts down")

    monkeypatch.setattr(vector_store, "search_knowledge", fake_search_knowledge)
    monkeypatch.setattr(vector_store, "search_knowledge_fts", fake_search_knowledge_fts)

    hyde_query = types.SimpleNamespace(text="dense query", label="hyde")
    pending_search = vector_store.search_knowledge_hybrid(
        raw_query="raw query",
        dense_queries=[hyde_query],
    )
    fused_list, _ignored = asyncio.run(pending_search)

    assert fused_list[0]["content"] == "dense hit"
def test_select_knowledge_chunks_prefers_cross_modal_plant_match():
    """A dense+lexical chunk outranks an otherwise identical dense-only chunk.

    Both chunks share source, content and similarity, so plant and stage
    mentions are equal; only ``retrieval_modes`` (plus filename/page) differ.
    """
    from app import vector_store

    def lettuce_chunk(filename, page_number, retrieval_modes):
        return {
            "filename": filename,
            "page_number": page_number,
            "source": "Generic Lettuce Guide",
            "content": "Lettuce crop management. Vegetative stage note.",
            "similarity": 0.67,
            "retrieval_modes": retrieval_modes,
        }

    candidates = [
        lettuce_chunk("lettuce.pdf", 8, ["dense", "lexical"]),
        lettuce_chunk("generic.pdf", 2, ["dense"]),
    ]

    selected = vector_store.select_knowledge_chunks(
        candidates,
        plant_aliases=["lettuce"],
        stage="vegetative",
        max_verified_chunks=2,
        max_background_chunks=1,
    )

    winner, runner_up = selected[0], selected[1]
    assert winner["filename"] == "lettuce.pdf"
    assert winner["selection_score"] > runner_up["selection_score"]
def test_select_knowledge_chunks_limits_background_chunks():
    """With no verified slots and one background slot, only ``bg-0.pdf`` is selected."""
    from app import vector_store

    candidates = []
    for i in range(4):
        candidates.append(
            {
                "filename": f"bg-{i}.pdf",
                "page_number": i,
                "source": f"Background {i}",
                "content": f"Generic context {i}",
                # Similarity decreases with i, so bg-0 has the highest value.
                "similarity": 0.41 - (i * 0.01),
                "retrieval_modes": ["dense"],
            }
        )

    selected = vector_store.select_knowledge_chunks(
        candidates,
        plant_aliases=["lettuce"],
        stage="vegetative",
        max_verified_chunks=0,
        max_background_chunks=1,
    )

    assert len(selected) == 1
    assert selected[0]["filename"] == "bg-0.pdf"
def test_select_knowledge_chunks_rewards_stage_match():
    """The chunk mentioning the requested stage is ranked first.

    The two candidates share source, similarity and retrieval modes and both
    mention the plant; their content differs only in the stage named
    ("Vegetative" vs "Fruiting"). With ``stage="vegetative"`` the vegetative
    chunk is expected on top.
    """
    from app import vector_store

    def stage_chunk(filename, page_number, content):
        return {
            "filename": filename,
            "page_number": page_number,
            "source": "Generic Lettuce Guide",
            "content": content,
            "similarity": 0.62,
            "retrieval_modes": ["dense"],
        }

    candidates = [
        stage_chunk("veg.pdf", 4, "Lettuce crop management. Vegetative stage details."),
        stage_chunk("fruiting.pdf", 9, "Lettuce crop management. Fruiting stage details."),
    ]

    selected = vector_store.select_knowledge_chunks(
        candidates,
        plant_aliases=["lettuce"],
        stage="vegetative",
        max_verified_chunks=2,
        max_background_chunks=0,
    )

    assert selected[0]["filename"] == "veg.pdf"
def test_selection_promoted_background_flag():
    """Check ``selection_promoted_background`` on each of the three selected chunks.

    Expected outcome with two verified slots and one background slot:
    - the chunk at similarity 0.72 comes first with the flag False;
    - the best background chunk (0.60) fills the second verified slot and is
      flagged True;
    - the remaining background chunk (0.59) is appended with the flag False.
    """
    from app import vector_store

    def dense_chunk(filename, page_number, source, content, similarity):
        return {
            "filename": filename,
            "page_number": page_number,
            "source": source,
            "content": content,
            "similarity": similarity,
            "retrieval_modes": ["dense"],
        }

    candidates = [
        # Meets the verified threshold (similarity >= 0.70).
        dense_chunk("verified.pdf", 1, "Verified Guide", "Verified authoritative content", 0.72),
        # Below the verified threshold: background candidates.
        dense_chunk("bg-promoted.pdf", 2, "Background Source", "Relevant background content A", 0.60),
        dense_chunk("bg-normal.pdf", 3, "Background Source", "Relevant background content B", 0.59),
    ]

    selected = vector_store.select_knowledge_chunks(
        candidates,
        plant_aliases=None,
        stage=None,
        max_verified_chunks=2,
        max_background_chunks=1,
    )

    # Order: verified, then the promoted background chunk, then the plain background chunk.
    expected_outcome = [
        ("verified.pdf", False),
        ("bg-promoted.pdf", True),
        ("bg-normal.pdf", False),
    ]
    assert len(selected) == 3
    for position, (expected_filename, expected_flag) in enumerate(expected_outcome):
        assert selected[position]["filename"] == expected_filename
        assert selected[position]["selection_promoted_background"] is expected_flag