PGC-AI-Chatbot / scripts /test_fallback.py
Jacooo's picture
Deploy from GitHub: c8fdee9
ac1bc25 verified
#!/usr/bin/env python3
"""
Fallback Retrieval Diagnostic Script
Tests the 4 zero-context RAG cases against all retrieval layers:
Layer 1: search_knowledge() — dense at 0.30 threshold
Layer 2: search_knowledge() — fallback at 0.20 threshold
Layer 3: search_knowledge_fts() — simplified query
Layer 4: search_knowledge_hybrid() — full pipeline
Usage: python scripts/test_fallback.py
"""
import asyncio
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from dotenv import load_dotenv
load_dotenv(Path(__file__).resolve().parent.parent / ".env")
from app.vector_store import search_knowledge, search_knowledge_fts, search_knowledge_hybrid
# Similarity cut-off named for the fallback dense-retrieval pass.
# NOTE(review): nothing in the visible code reads this constant; the Layer 2
# call passes the literal 0.20 itself — confirm whether they should be linked.
FALLBACK_THRESHOLD = 0.20

# Diagnostic cases. Each entry holds:
#   case_id          - short identifier printed in the report header
#   query            - the text sent unchanged to every retrieval layer
#   expected_source  - the document name printed as the expected source
ZERO_CONTEXT_CASES = [
    {
        "case_id": "rag_terung_suhu_ph",
        "query": "Berapa suhu dan pH yang sesuai untuk budidaya terung?",
        "expected_source": "Budidaya Terung.pdf",
    },
    {
        "case_id": "rag_marigold_production",
        "query": "What are the ideal temperature and spacing requirements for marigold flower production?",
        "expected_source": "Marigold flower production guideline.pdf",
    },
    {
        "case_id": "rag_chinese_kale_cultivation",
        "query": "What are the temperature, spacing, and fertilizer requirements for Chinese kale?",
        "expected_source": "Chinese Kale (Brassica oleracea), Cultivar Group.pdf",
    },
    {
        "case_id": "rag_home_hydroponics_lighting",
        "query": "What factors should a home hydroponic grower consider when setting up lighting for leafy greens?",
        "expected_source": "A Guide to Home Hydroponics for Leafy Greens.pdf",
    },
]
def _dense_score(chunk: dict) -> str:
    """Format a dense-search chunk's ``similarity`` (default 0) as a percentage."""
    return f"sim={chunk.get('similarity', 0):.2%}"


def _fts_score(chunk: dict) -> str:
    """Format a full-text chunk's ``fts_rank``, falling back to ``similarity``, then 0."""
    return f"rank={chunk.get('fts_rank', chunk.get('similarity', 0))}"


def _hybrid_score(chunk: dict) -> str:
    """Format a hybrid chunk's ``similarity``, falling back to ``rrf_score``, then 0.

    Float scores are shown as a percentage; anything else is printed as-is.
    """
    score = chunk.get("similarity", chunk.get("rrf_score", 0))
    if isinstance(score, float):
        return f"sim={score:.2%}"
    return f"score={score}"


def _print_chunks(chunks, describe_score) -> None:
    """Print the chunk count, then one numbered line per chunk.

    Each line shows the score text produced by ``describe_score(chunk)``, the
    chunk's ``source`` and the first 80 characters of its ``content`` with
    newlines flattened to spaces.
    """
    print(f" Chunks returned: {len(chunks)}")
    for i, chunk in enumerate(chunks, 1):
        src = chunk.get("source", "?")
        preview = chunk.get("content", "")[:80].replace("\n", " ")
        print(f' [{i}] {describe_score(chunk)} | {src} | "{preview}..."')


async def _report_layer(title: str, search, describe_score) -> None:
    """Run one retrieval layer and print its results, or the error it raised.

    ``search`` is a zero-argument callable returning an awaitable. It is
    invoked inside the ``try`` so that call-time failures (for example an
    unexpected keyword argument) are reported like any other layer error
    instead of aborting the whole diagnostic run.
    """
    print(f"\n {title}")
    try:
        result = await search()
        # A layer may hand back either a bare list of chunks or a mapping
        # that carries them under "chunks".
        chunks = result if isinstance(result, list) else result.get("chunks", [])
        _print_chunks(chunks, describe_score)
    except Exception as e:  # diagnostic boundary: report and move on to the next layer
        print(f" ERROR: {e}")


async def test_case(case: dict) -> None:
    """Run one diagnostic case through all four retrieval layers and print the results.

    Args:
        case: Mapping with the keys ``case_id``, ``query`` and
            ``expected_source``.

    Raises:
        KeyError: If ``case`` lacks one of the three required keys.

    Errors raised by an individual layer are printed and do not stop the
    remaining layers from running.
    """
    cid = case["case_id"]
    q = case["query"]
    expected = case["expected_source"]

    banner = "=" * 70
    print(f"\n{banner}")
    print(f" {cid}")
    print(f" Query: {q}")
    print(f" Expected source: {expected}")
    print(banner)

    # Layer 1: dense search at the 0.30 threshold
    await _report_layer(
        "[Layer 1] search_knowledge(threshold=0.30)",
        lambda: search_knowledge(q, match_count=7, match_threshold=0.30, query_label="layer1"),
        _dense_score,
    )

    # Layer 2: dense search at the 0.20 (fallback) threshold
    await _report_layer(
        "[Layer 2] search_knowledge(threshold=0.20) — fallback",
        lambda: search_knowledge(q, match_count=7, match_threshold=0.20, query_label="layer2"),
        _dense_score,
    )

    # Layer 3: full-text search
    await _report_layer(
        "[Layer 3] search_knowledge_fts()",
        lambda: search_knowledge_fts(q, match_count=7),
        _fts_score,
    )

    # Layer 4: full hybrid pipeline. Source and preview are now printed for
    # every chunk, including those whose score is a float.
    await _report_layer(
        "[Layer 4] search_knowledge_hybrid() — full pipeline",
        lambda: search_knowledge_hybrid(q, top_k=7),
        _hybrid_score,
    )

    # Blank line separating this case from the next one.
    print()
async def main() -> None:
    """Run every case in ``ZERO_CONTEXT_CASES`` through ``test_case``, in order.

    Prints a banner before the first case and a closing "Done" banner after
    the last. The case count in the banner is taken from the list itself so
    it stays correct when cases are added or removed.
    """
    rule = "=" * 70
    print(rule)
    print(" FALLBACK RETRIEVAL DIAGNOSTIC")
    print(f" Testing {len(ZERO_CONTEXT_CASES)} zero-context RAG cases")
    print(rule)

    # Cases run one after another so each report stays contiguous in the output.
    for case in ZERO_CONTEXT_CASES:
        await test_case(case)

    print(rule)
    print(" Done")
    print(rule)


if __name__ == "__main__":
    asyncio.run(main())