PGC-AI-Chatbot / tests /test_vector_store.py
Jacooo's picture
Deploy from GitHub: f66aedb
c397c36 verified
import asyncio
import sys
import types
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import pytest
import app.vector_store as vector_store
def test_verification_thresholds_calibrated_for_bge_m3():
    """Pin the verification cut-offs exported by ``vector_store``.

    These are the user-preferred conservative thresholds for the BGE-M3
    embedding space.
    """
    expected_thresholds = {
        "VERIFIED_DENSE_THRESHOLD": 0.70,
        "VERIFIED_HYBRID_THRESHOLD": 0.65,
    }
    for constant_name, expected_value in expected_thresholds.items():
        assert getattr(vector_store, constant_name) == expected_value
def test_embed_text_rejects_non_1024_vector(monkeypatch):
    """``embed_text`` must raise ``ValueError`` when the model yields a 2-element vector."""

    class FakeEmbedding:
        """Embedding stand-in that only exposes ``tolist()``."""

        def __init__(self, values):
            self._values = values

        def tolist(self):
            return self._values

    class FakeModel:
        """Model stub whose ``query_embed`` yields a single two-value embedding."""

        def query_embed(self, texts):
            yield FakeEmbedding([0.1, 0.2])

    def load_fake_model():
        return FakeModel()

    monkeypatch.setattr(vector_store, "_get_embedding_model", load_fake_model)

    with pytest.raises(ValueError, match="expected 1024"):
        vector_store.embed_text("lettuce tipburn")
class FakeResponse:
    """Minimal stand-in for an HTTP response, used by the fake clients in this module.

    Attributes:
        status_code: Status code the fake response reports.
        text: Body text the fake response reports.
    """

    def __init__(self, status_code=200, payload=None, text="ok"):
        self.status_code = status_code
        # Fall back to an empty list only when no payload was given, so an
        # explicitly supplied falsy payload (e.g. ``{}``) is returned unchanged.
        self._payload = [] if payload is None else payload
        self.text = text

    def json(self):
        """Return the payload supplied at construction (``[]`` when omitted)."""
        return self._payload
class FakeAsyncClient:
    """Async-context-manager test double that records every ``post`` call.

    Each ``post`` appends its arguments to ``self.calls`` and replies with a
    ``FakeResponse`` carrying one canned knowledge row.
    """

    def __init__(self, *args, **kwargs):
        # Constructor arguments are accepted but not used.
        self.calls = []

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc, tb):
        # False: exceptions raised inside the ``async with`` body propagate.
        return False

    async def post(self, url, headers=None, json=None):
        recorded_call = {"url": url, "headers": headers, "json": json}
        self.calls.append(recorded_call)
        canned_row = {
            "source": "Doc",
            "filename": "doc.pdf",
            "page_number": 2,
            "content": "Expanded horticultural context",
            "similarity": 0.91,
        }
        return FakeResponse(payload=[canned_row])
def test_search_knowledge_logs_hyde_query_label_and_embeds_transformed_query(monkeypatch, capsys):
    """A ``hyde``-labelled search embeds the given query text and prints ``[VectorRAG:hyde]``."""
    hyde_query = "Expanded agronomic explanation for lettuce humidity"
    seen = {}
    fake_client = FakeAsyncClient()

    def record_embedded_text(text):
        # Remember what was embedded so the assertion below can inspect it.
        seen["embedded_query"] = text
        return [0.1, 0.2]

    def client_factory(timeout=10.0):
        return fake_client

    monkeypatch.setattr(vector_store, "is_configured", lambda: True)
    monkeypatch.setattr(vector_store, "SUPABASE_URL", "https://example.supabase.co")
    monkeypatch.setattr(vector_store, "embed_text", record_embedded_text)
    monkeypatch.setattr(vector_store.httpx, "AsyncClient", client_factory)

    search_coro = vector_store.search_knowledge(query=hyde_query, query_label="hyde")
    results = asyncio.run(search_coro)
    stdout_text = capsys.readouterr().out

    first_hit = results[0]
    assert first_hit["source"] == "Doc"
    assert "filename" in first_hit, "Supabase response must include 'filename' for parent expansion"
    assert seen["embedded_query"] == hyde_query
    assert fake_client.calls[0]["json"]["match_count"] == vector_store.DEFAULT_MATCH_COUNT
    assert "[VectorRAG:hyde]" in stdout_text
def test_search_knowledge_defaults_to_raw_query_label(monkeypatch, capsys):
    """Without an explicit ``query_label`` the printed tag is ``[VectorRAG:raw]``."""
    fake_client = FakeAsyncClient()

    def stub_embed(text):
        return [0.1, 0.2]

    def client_factory(timeout=10.0):
        return fake_client

    monkeypatch.setattr(vector_store, "is_configured", lambda: True)
    monkeypatch.setattr(vector_store, "SUPABASE_URL", "https://example.supabase.co")
    monkeypatch.setattr(vector_store, "embed_text", stub_embed)
    monkeypatch.setattr(vector_store.httpx, "AsyncClient", client_factory)

    asyncio.run(vector_store.search_knowledge(query="plain query"))

    stdout_text = capsys.readouterr().out
    assert "[VectorRAG:raw]" in stdout_text
def test_merge_knowledge_results_deduplicates_by_filename_page_content_keeps_higher_similarity():
    """Rows sharing filename, page and content collapse into the higher-similarity copy."""
    from app.vector_store import merge_knowledge_results

    def row(filename, source, page_number, content, similarity):
        return {
            "filename": filename,
            "source": source,
            "page_number": page_number,
            "content": content,
            "similarity": similarity,
        }

    primary_hits = [
        row("cornell.pdf", "Cornell-Lettuce", 5, "tipburn info", 0.75),
        row("kubis.pdf", "Kubis-Guide", 2, "cabbage info", 0.82),
    ]
    english_hits = [
        # Same filename/page/content as the first primary hit, higher similarity.
        row("cornell.pdf", "Cornell-Lettuce", 5, "tipburn info", 0.85),
        row("new.pdf", "New-Source", 1, "new content", 0.90),
    ]

    merged = merge_knowledge_results([primary_hits, english_hits])

    merged_sources = [hit["source"] for hit in merged]
    for expected_source in ("Cornell-Lettuce", "Kubis-Guide", "New-Source"):
        assert expected_source in merged_sources
    assert len(merged) == 3  # the duplicated Cornell row appears once

    cornell_hit = next(hit for hit in merged if hit["source"] == "Cornell-Lettuce")
    assert cornell_hit["similarity"] == 0.85  # higher similarity kept
def test_merge_knowledge_results_does_not_dedup_different_content_same_page():
    """Two rows on the same file and page but with different content are both kept."""
    from app.vector_store import merge_knowledge_results

    first_paragraph = {"filename": "doc.pdf", "page_number": 3, "content": "first paragraph", "similarity": 0.80}
    second_paragraph = {"filename": "doc.pdf", "page_number": 3, "content": "second paragraph", "similarity": 0.79}

    merged = merge_knowledge_results([[first_paragraph, second_paragraph]])

    assert len(merged) == 2  # different content → not deduped
def test_merge_knowledge_results_respects_top_k():
    """With ``top_k=2`` only two of the four candidate rows are returned."""
    from app.vector_store import merge_knowledge_results

    candidates = []
    for i in range(4):
        candidates.append(
            {"source": f"Doc{i}", "page_number": i, "content": "x", "similarity": 0.9 - i * 0.01}
        )

    merged = merge_knowledge_results([candidates], top_k=2)

    assert len(merged) == 2
def test_merge_knowledge_results_handles_empty_inputs():
    """No result lists, or only empty result lists, merge to an empty list."""
    from app.vector_store import merge_knowledge_results

    no_lists = merge_knowledge_results([])
    assert no_lists == []

    only_empty_lists = merge_knowledge_results([[], []])
    assert only_empty_lists == []
# =============================================================================
# expand_knowledge_results tests
# =============================================================================
def test_expand_knowledge_results_passthrough_on_empty_corpus(monkeypatch):
    """With an empty corpus every chunk comes back in order, paired with ``None``."""
    from app import vector_store

    monkeypatch.setattr(vector_store, "_corpus", [])
    monkeypatch.setattr(vector_store, "_corpus_lookup", {})

    first = {"filename": "a.pdf", "page_number": 1, "content": "hello", "similarity": 0.80}
    second = {"filename": "b.pdf", "page_number": 2, "content": "world", "similarity": 0.75}

    pairs = vector_store.expand_knowledge_results([first, second])

    assert len(pairs) == 2
    for _original, window in pairs:
        assert window is None
    first_pair, second_pair = pairs[0], pairs[1]
    assert first_pair[0]["filename"] == "a.pdf"
    assert second_pair[0]["filename"] == "b.pdf"
def test_expand_knowledge_results_returns_none_window_when_corpus_empty(monkeypatch):
    """A single chunk expanded against an empty corpus yields ``(chunk, None)``."""
    from app import vector_store

    monkeypatch.setattr(vector_store, "_corpus", [])
    monkeypatch.setattr(vector_store, "_corpus_lookup", {})

    hit = {"filename": "doc.pdf", "page_number": 1, "content": "some text", "similarity": 0.85}

    pairs = vector_store.expand_knowledge_results([hit])

    assert len(pairs) == 1
    returned_hit, returned_window = pairs[0]
    assert returned_hit["content"] == "some text"
    assert returned_window is None
def test_expand_knowledge_results_returns_window_when_match_found(monkeypatch):
    """When ``find_and_expand`` supplies a window, it is paired with the original chunk."""
    from app import vector_store
    from app.knowledge_chunking import NormalizedChildChunk
    from app.parent_context import ParentWindow
    import app.parent_context as pc_mod

    hit = {"filename": "guide.pdf", "page_number": 2, "content": "matched text", "similarity": 0.91}

    corpus_child = NormalizedChildChunk(
        child_id="guide.pdf::p2::i0",
        source="Guide",
        filename="guide.pdf",
        page_number=2,
        content="matched text",
        corpus_ordinal=0,
    )
    expected_window = ParentWindow(
        primary_child=corpus_child,
        left_neighbor=None,
        right_neighbor=None,
        combined_text="matched text",
    )

    def stub_find_and_expand(hit, corpus, lookup):
        return expected_window

    lookup_table = {("guide.pdf", 2, "matched text"): corpus_child}
    monkeypatch.setattr(vector_store, "_corpus", [corpus_child])
    monkeypatch.setattr(vector_store, "_corpus_lookup", lookup_table)
    monkeypatch.setattr(pc_mod, "find_and_expand", stub_find_and_expand)

    pairs = vector_store.expand_knowledge_results([hit])

    assert len(pairs) == 1
    returned_hit, returned_window = pairs[0]
    assert returned_hit["similarity"] == 0.91
    assert returned_window is not None
    assert returned_window is expected_window
# =============================================================================
# format_knowledge_context with parent windows
# =============================================================================
def test_format_knowledge_context_renders_matched_paragraph_label(monkeypatch):
    """The formatted context labels the primary text with ``[MATCHED PARAGRAPH]``."""
    from app import vector_store

    monkeypatch.setattr(vector_store, "_corpus", [])
    monkeypatch.setattr(vector_store, "_corpus_lookup", {})

    hit = {"source": "Guide", "page_number": 1, "content": "tipburn info", "similarity": 0.91, "filename": "guide.pdf"}

    rendered = vector_store.format_knowledge_context([hit])

    for expected_fragment in ("[MATCHED PARAGRAPH]", "tipburn info", "CITE AS 📖"):
        assert expected_fragment in rendered
def test_format_knowledge_context_renders_supporting_context_when_window_present(monkeypatch):
    """A window with both neighbors renders a "Supporting context" section with their text."""
    from app import vector_store
    from app.knowledge_chunking import NormalizedChildChunk
    from app.parent_context import ParentWindow

    left_child = NormalizedChildChunk("f::p0::i0", "Guide", "guide.pdf", 0, "left neighbor text", 0)
    primary_child = NormalizedChildChunk("f::p1::i1", "Guide", "guide.pdf", 1, "primary text", 1)
    right_child = NormalizedChildChunk("f::p2::i2", "Guide", "guide.pdf", 2, "right neighbor text", 2)
    window_with_neighbors = ParentWindow(
        primary_child=primary_child,
        left_neighbor=left_child,
        right_neighbor=right_child,
        combined_text="left neighbor text\n\nprimary text\n\nright neighbor text",
    )

    def stub_expand(chunks):
        # Pair only the first chunk with the prepared window.
        return [(chunks[0], window_with_neighbors)]

    monkeypatch.setattr(vector_store, "expand_knowledge_results", stub_expand)

    hit = {"filename": "guide.pdf", "page_number": 1, "content": "primary text", "similarity": 0.91, "source": "Guide"}
    rendered = vector_store.format_knowledge_context([hit])

    expected_fragments = (
        "[MATCHED PARAGRAPH]",
        "primary text",
        "Supporting context",
        "left neighbor text",
        "right neighbor text",
        "CITE AS",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
# =============================================================================
# format_knowledge_context — plant_aliases citation filter
# =============================================================================
def test_format_knowledge_context_plant_alias_filter_promotes_matching_chunk(monkeypatch):
    """A chunk whose CONTENT mentions an alias keeps the 📖 citation."""
    from app import vector_store

    def passthrough_expand(chunks):
        return [(c, None) for c in chunks]

    monkeypatch.setattr(vector_store, "expand_knowledge_results", passthrough_expand)

    kangkung_hit = {
        "source": "Petunjuk Teknis Budidaya Sayuran Dataran Rendah",
        "filename": "sayuran.pdf",
        "page_number": 22,
        "content": "Hama yang menyerang tanaman kangkung antara lain ulat grayak.",
        "similarity": 0.72,
        "retrieval_modes": ["dense"],
    }
    aliases = ["kangkung", "Water Spinach", "Ipomoea aquatica"]

    rendered = vector_store.format_knowledge_context([kangkung_hit], plant_aliases=aliases)

    assert "CITE AS 📖" in rendered
    assert "Background Context" not in rendered
def test_format_knowledge_context_plant_alias_matches_source_name(monkeypatch):
    """An alias in the SOURCE NAME qualifies a chunk even when its content omits it.

    Covers dedicated crop documents (e.g. 'Budidaya Cabe Di Perkotaan') where
    ~59% of chunks never repeat the crop name inside the paragraph body.
    """
    from app import vector_store

    def passthrough_expand(chunks):
        return [(c, None) for c in chunks]

    monkeypatch.setattr(vector_store, "expand_knowledge_results", passthrough_expand)

    fusarium_hit = {
        "source": "Budidaya Cabe Di Perkotaan",  # the alias "Cabe" appears here only
        "filename": "budidaya-cabe.pdf",
        "page_number": 33,
        "content": "Layu Fusarium / Fusarium wilt disebabkan oleh jamur Fusarium oxysporum.",
        "similarity": 0.72,
        "retrieval_modes": ["dense"],
    }
    aliases = ["Cabe", "Cabai", "Chili", "Capsicum annuum"]

    rendered = vector_store.format_knowledge_context([fusarium_hit], plant_aliases=aliases)

    assert "CITE AS 📖" in rendered
    assert "Background Context" not in rendered
def test_format_knowledge_context_plant_alias_filter_demotes_non_matching_chunk(monkeypatch):
    """No alias in content and none in the source name → rendered as Background Context."""
    from app import vector_store

    def passthrough_expand(chunks):
        return [(c, None) for c in chunks]

    monkeypatch.setattr(vector_store, "expand_knowledge_results", passthrough_expand)

    melon_hit = {
        "source": "Melon Pest Guide",
        "filename": "melon.pdf",
        "page_number": 38,
        "content": "Patogen masuk ke dalam tanaman melalui ujung-ujung akar.",
        "similarity": 0.72,
        "retrieval_modes": ["dense"],
    }
    aliases = ["kangkung", "Water Spinach", "Ipomoea aquatica"]

    rendered = vector_store.format_knowledge_context([melon_hit], plant_aliases=aliases)

    assert "CITE AS 📖" not in rendered
    assert "Background Context" in rendered
def test_format_knowledge_context_no_plant_aliases_skips_filter(monkeypatch):
    """With ``plant_aliases=None`` (general query) the chunk still gets the 📖 citation."""
    from app import vector_store

    def passthrough_expand(chunks):
        return [(c, None) for c in chunks]

    monkeypatch.setattr(vector_store, "expand_knowledge_results", passthrough_expand)

    melon_hit = {
        "source": "Melon Pest Guide",
        "filename": "melon.pdf",
        "page_number": 38,
        "content": "Patogen masuk ke dalam tanaman melon melalui ujung-ujung akar.",
        "similarity": 0.72,
        "retrieval_modes": ["dense"],
    }

    rendered = vector_store.format_knowledge_context([melon_hit], plant_aliases=None)

    assert "CITE AS 📖" in rendered
def test_format_knowledge_context_plant_alias_case_insensitive(monkeypatch):
    """A lowercase alias matches content that spells the plant name in title case."""
    from app import vector_store

    def passthrough_expand(chunks):
        return [(c, None) for c in chunks]

    monkeypatch.setattr(vector_store, "expand_knowledge_results", passthrough_expand)

    title_case_hit = {
        "source": "Guide",
        "filename": "guide.pdf",
        "page_number": 1,
        "content": "Water Spinach is susceptible to Pythium root rot.",
        "similarity": 0.75,
        "retrieval_modes": ["dense"],
    }
    lowercase_aliases = ["water spinach"]

    rendered = vector_store.format_knowledge_context([title_case_hit], plant_aliases=lowercase_aliases)

    assert "CITE AS 📖" in rendered
def test_format_knowledge_context_uses_selected_chunk_order(monkeypatch):
    """Output follows the order returned by ``select_knowledge_chunks``, not the input order."""
    from app import vector_store

    def fake_select(chunks, plant_aliases=None, stage=None, max_verified_chunks=3, max_background_chunks=1):
        # Swap the two inputs so the second chunk is returned first.
        promoted_first = dict(chunks[1], selection_score=0.93, selection_promoted_background=False)
        demoted_second = dict(chunks[0], selection_score=0.51, selection_promoted_background=False)
        return [promoted_first, demoted_second]

    def passthrough_expand(chunks):
        return [(c, None) for c in chunks]

    monkeypatch.setattr(vector_store, "select_knowledge_chunks", fake_select)
    monkeypatch.setattr(vector_store, "expand_knowledge_results", passthrough_expand)

    generic_hit = {"filename": "generic.pdf", "page_number": 1, "source": "Generic", "content": "generic", "similarity": 0.72, "retrieval_modes": ["dense"]}
    lettuce_hit = {"filename": "lettuce.pdf", "page_number": 2, "source": "Lettuce", "content": "lettuce", "similarity": 0.68, "retrieval_modes": ["dense", "lexical"]}

    rendered = vector_store.format_knowledge_context(
        [generic_hit, lettuce_hit],
        plant_aliases=["lettuce"],
        stage="vegetative",
    )

    lettuce_position = rendered.index("lettuce")
    generic_position = rendered.index("generic")
    assert lettuce_position < generic_position
def test_format_knowledge_context_drops_extra_background_chunks(monkeypatch):
    """Chunks left out by ``select_knowledge_chunks`` do not appear in the output."""
    from app import vector_store

    def keep_first_only(chunks, plant_aliases=None, stage=None, max_verified_chunks=3, max_background_chunks=1):
        kept = dict(chunks[0], selection_score=0.44, selection_promoted_background=False)
        return [kept]

    def passthrough_expand(chunks):
        return [(c, None) for c in chunks]

    monkeypatch.setattr(vector_store, "select_knowledge_chunks", keep_first_only)
    monkeypatch.setattr(vector_store, "expand_knowledge_results", passthrough_expand)

    background_hits = [
        {"filename": "bg-0.pdf", "page_number": 1, "source": "BG 0", "content": "first context", "similarity": 0.41, "retrieval_modes": ["dense"]},
        {"filename": "bg-1.pdf", "page_number": 2, "source": "BG 1", "content": "second context", "similarity": 0.40, "retrieval_modes": ["dense"]},
    ]

    rendered = vector_store.format_knowledge_context(
        background_hits,
        plant_aliases=["lettuce"],
        stage="vegetative",
    )

    assert "first context" in rendered
    assert "second context" not in rendered
def test_format_knowledge_context_returns_empty_when_selected_empty(monkeypatch):
    """When selection keeps nothing, the formatted context is the empty string."""
    from app import vector_store

    def select_nothing(chunks, plant_aliases=None, stage=None, max_verified_chunks=3, max_background_chunks=1):
        return []

    monkeypatch.setattr(vector_store, "select_knowledge_chunks", select_nothing)

    lone_hit = {"filename": "bg-0.pdf", "page_number": 1, "source": "BG 0", "content": "first context", "similarity": 0.41, "retrieval_modes": ["dense"]}

    rendered = vector_store.format_knowledge_context(
        [lone_hit],
        plant_aliases=["lettuce"],
        stage="vegetative",
    )

    assert rendered == ""
# =============================================================================
# Lexical retrieval and RRF utilities
# =============================================================================
def test_search_knowledge_fts_posts_query_text_to_match_knowledge_fts(monkeypatch):
    """FTS search posts ``query_text`` and ``match_count`` to the ``match_knowledge_fts`` RPC."""
    fake_client = FakeAsyncClient()

    def client_factory(timeout=10.0):
        return fake_client

    monkeypatch.setattr(vector_store, "is_configured", lambda: True)
    monkeypatch.setattr(vector_store, "SUPABASE_URL", "https://example.supabase.co")
    monkeypatch.setattr(vector_store.httpx, "AsyncClient", client_factory)

    rows = asyncio.run(vector_store.search_knowledge_fts("pythium root rot", match_count=6))

    first_call = fake_client.calls[0]
    assert first_call["url"].endswith("/rpc/match_knowledge_fts")
    assert first_call["json"]["query_text"] == "pythium root rot"
    assert first_call["json"]["match_count"] == 6
    assert rows[0]["filename"] == "doc.pdf"
def test_search_knowledge_fts_returns_empty_when_not_configured(monkeypatch):
    """FTS search returns ``[]`` when ``is_configured()`` reports False."""

    def not_configured():
        return False

    monkeypatch.setattr(vector_store, "is_configured", not_configured)

    rows = asyncio.run(vector_store.search_knowledge_fts("pythium root rot"))

    assert rows == []
def test_search_knowledge_fts_returns_empty_on_non_200_response(monkeypatch):
    """FTS search returns ``[]`` when the HTTP client answers with status 503."""

    class ErrorClient:
        """Async client stub whose ``post`` always answers 503."""

        async def __aenter__(self):
            return self

        async def __aexit__(self, *exc_info):
            return False

        async def post(self, *args, **kwargs):
            return FakeResponse(status_code=503, payload=[], text="Service Unavailable")

    def client_factory(timeout=10.0):
        return ErrorClient()

    monkeypatch.setattr(vector_store, "is_configured", lambda: True)
    monkeypatch.setattr(vector_store, "SUPABASE_URL", "https://example.supabase.co")
    monkeypatch.setattr(vector_store.httpx, "AsyncClient", client_factory)

    rows = asyncio.run(vector_store.search_knowledge_fts("query"))

    assert rows == []
def test_reciprocal_rank_fuse_prefers_chunk_seen_by_both_lists():
    """The chunk present in both rankings comes first and is tagged with both retrieval modes."""
    dense_hits = [
        {"filename": "leafy.pdf", "page_number": 79, "content": "Pythium Root Rot...", "similarity": 0.68},
        {"filename": "pumpkin.pdf", "page_number": 13, "content": "Fusarium crown rot...", "similarity": 0.67},
    ]
    lexical_hits = [
        {"filename": "leafy.pdf", "page_number": 79, "content": "Pythium Root Rot...", "lexical_score": 0.42},
        {"filename": "manual.pdf", "page_number": 47, "content": "Growing plants...", "lexical_score": 0.31},
    ]

    fused = vector_store.reciprocal_rank_fuse(dense_hits, lexical_hits, top_k=2, rrf_k=60)

    top_hit = fused[0]
    assert top_hit["filename"] == "leafy.pdf"
    assert top_hit["retrieval_modes"] == ["dense", "lexical"]
def test_search_knowledge_hybrid_falls_back_to_dense_only_on_lexical_failure(monkeypatch):
    """When the FTS search raises, hybrid search still returns the dense hit."""

    async def dense_stub(query, match_count=7, match_threshold=0.30, query_label="raw"):
        dense_hit = {"filename": "doc.pdf", "page_number": 1, "content": "dense hit", "similarity": 0.8}
        return [dense_hit]

    async def failing_fts_stub(query, match_count=7):
        raise RuntimeError("fts down")

    monkeypatch.setattr(vector_store, "search_knowledge", dense_stub)
    monkeypatch.setattr(vector_store, "search_knowledge_fts", failing_fts_stub)

    hyde_query = types.SimpleNamespace(text="dense query", label="hyde")
    hybrid_coro = vector_store.search_knowledge_hybrid(raw_query="raw query", dense_queries=[hyde_query])
    fused_hits, _ = asyncio.run(hybrid_coro)

    assert fused_hits[0]["content"] == "dense hit"
def test_select_knowledge_chunks_prefers_cross_modal_plant_match():
    """The dense+lexical chunk outranks an otherwise identical dense-only chunk.

    Both chunks share source, content and similarity, so plant and stage
    mentions are equal; only the presence of the lexical retrieval mode differs.
    """
    from app import vector_store

    shared_fields = {
        "source": "Generic Lettuce Guide",
        "content": "Lettuce crop management. Vegetative stage note.",
        "similarity": 0.67,
    }
    cross_modal_chunk = {
        "filename": "lettuce.pdf",
        "page_number": 8,
        **shared_fields,
        "retrieval_modes": ["dense", "lexical"],
    }
    dense_only_chunk = {
        "filename": "generic.pdf",
        "page_number": 2,
        **shared_fields,
        "retrieval_modes": ["dense"],
    }

    selected = vector_store.select_knowledge_chunks(
        [cross_modal_chunk, dense_only_chunk],
        plant_aliases=["lettuce"],
        stage="vegetative",
        max_verified_chunks=2,
        max_background_chunks=1,
    )

    winner, runner_up = selected[0], selected[1]
    assert winner["filename"] == "lettuce.pdf"
    assert winner["selection_score"] > runner_up["selection_score"]
def test_select_knowledge_chunks_limits_background_chunks():
    """With no verified slots and one background slot, only ``bg-0.pdf`` is selected."""
    from app import vector_store

    background_chunks = []
    for i in range(4):
        background_chunks.append(
            {
                "filename": f"bg-{i}.pdf",
                "page_number": i,
                "source": f"Background {i}",
                "content": f"Generic context {i}",
                "similarity": 0.41 - (i * 0.01),
                "retrieval_modes": ["dense"],
            }
        )

    selected = vector_store.select_knowledge_chunks(
        background_chunks,
        plant_aliases=["lettuce"],
        stage="vegetative",
        max_verified_chunks=0,
        max_background_chunks=1,
    )

    assert len(selected) == 1
    only_chunk = selected[0]
    assert only_chunk["filename"] == "bg-0.pdf"
def test_select_knowledge_chunks_rewards_stage_match():
    """The chunk mentioning the requested stage is ranked first when all else is equal.

    Both chunks share source, similarity and retrieval modes and mention the
    plant equally; only the stage named in the content differs. With
    stage="vegetative" the vegetative chunk must come first.
    """
    from app import vector_store

    def lettuce_chunk(filename, page_number, content):
        # Everything except filename, page and content is held constant.
        return {
            "filename": filename,
            "page_number": page_number,
            "source": "Generic Lettuce Guide",
            "content": content,
            "similarity": 0.62,
            "retrieval_modes": ["dense"],
        }

    candidates = [
        lettuce_chunk("veg.pdf", 4, "Lettuce crop management. Vegetative stage details."),
        lettuce_chunk("fruiting.pdf", 9, "Lettuce crop management. Fruiting stage details."),
    ]

    selected = vector_store.select_knowledge_chunks(
        candidates,
        plant_aliases=["lettuce"],
        stage="vegetative",
        max_verified_chunks=2,
        max_background_chunks=0,
    )

    top_choice = selected[0]
    assert top_choice["filename"] == "veg.pdf"
def test_selection_promoted_background_flag():
    """Check ``selection_promoted_background`` on promoted and non-promoted chunks.

    Expected selection, in order:
    - the one truly verified chunk, flag False;
    - the highest-scoring background chunk, promoted into the remaining
      verified slot, flag True;
    - the additionally appended background chunk, flag False.
    """
    from app import vector_store

    def dense_chunk(filename, page_number, source, content, similarity):
        return {
            "filename": filename,
            "page_number": page_number,
            "source": source,
            "content": content,
            "similarity": similarity,
            "retrieval_modes": ["dense"],
        }

    candidates = [
        # True verified chunk (similarity >= 0.70)
        dense_chunk("verified.pdf", 1, "Verified Guide", "Verified authoritative content", 0.72),
        # Background chunks (below verified threshold)
        dense_chunk("bg-promoted.pdf", 2, "Background Source", "Relevant background content A", 0.60),
        dense_chunk("bg-normal.pdf", 3, "Background Source", "Relevant background content B", 0.59),
    ]

    selected = vector_store.select_knowledge_chunks(
        candidates,
        plant_aliases=None,
        stage=None,
        max_verified_chunks=2,
        max_background_chunks=1,
    )

    # (filename, promoted flag) in the expected output order.
    expected_order = [
        ("verified.pdf", False),
        ("bg-promoted.pdf", True),
        ("bg-normal.pdf", False),
    ]
    assert len(selected) == 3
    for position, (expected_filename, expected_flag) in enumerate(expected_order):
        assert selected[position]["filename"] == expected_filename
        assert selected[position]["selection_promoted_background"] is expected_flag