#!/usr/bin/env python3
"""
Fallback Retrieval Diagnostic Script
Tests the 4 zero-context RAG cases against all retrieval layers:
Layer 1: search_knowledge() — dense at 0.30 threshold
Layer 2: search_knowledge() — fallback at 0.20 threshold
Layer 3: search_knowledge_fts() — simplified query
Layer 4: search_knowledge_hybrid() — full pipeline
Usage: python scripts/test_fallback.py
"""
import asyncio
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from dotenv import load_dotenv
load_dotenv(Path(__file__).resolve().parent.parent / ".env")
from app.vector_store import search_knowledge, search_knowledge_fts, search_knowledge_hybrid
# Similarity threshold of the Layer 2 dense-search fallback described in the
# module docstring.
# NOTE(review): test_case() passes 0.20 as a literal instead of reading this
# constant -- keep the two values in sync.
FALLBACK_THRESHOLD = 0.20

# The zero-context RAG cases exercised by this script. Each entry carries an
# identifier, the user query, and the document the query is expected to hit.
ZERO_CONTEXT_CASES = [
    {"case_id": case_id, "query": query, "expected_source": expected_source}
    for case_id, query, expected_source in (
        (
            "rag_terung_suhu_ph",
            "Berapa suhu dan pH yang sesuai untuk budidaya terung?",
            "Budidaya Terung.pdf",
        ),
        (
            "rag_marigold_production",
            "What are the ideal temperature and spacing requirements for marigold flower production?",
            "Marigold flower production guideline.pdf",
        ),
        (
            "rag_chinese_kale_cultivation",
            "What are the temperature, spacing, and fertilizer requirements for Chinese kale?",
            "Chinese Kale (Brassica oleracea), Cultivar Group.pdf",
        ),
        (
            "rag_home_hydroponics_lighting",
            "What factors should a home hydroponic grower consider when setting up lighting for leafy greens?",
            "A Guide to Home Hydroponics for Leafy Greens.pdf",
        ),
    )
]
def _print_chunks(chunks, describe_score) -> None:
    """Print the number of retrieved chunks, then one summary line per chunk.

    Args:
        chunks: Sized iterable of dict-like chunks. ``source`` and ``content``
            are read with ``.get`` so a missing key falls back to "?" / "".
        describe_score: Callable that maps a chunk to its score text
            (for example ``"sim=42.00%"``); it leads the chunk's line.
    """
    print(f" Chunks returned: {len(chunks)}")
    for position, chunk in enumerate(chunks, 1):
        source = chunk.get("source", "?")
        # `or ""` keeps a chunk whose content is None from aborting the listing.
        preview = (chunk.get("content") or "")[:80].replace("\n", " ")
        print(f" [{position}] {describe_score(chunk)} | {source} | \"{preview}...\"")


def _dense_score(chunk) -> str:
    """Return the similarity of a dense-search chunk as a percentage."""
    similarity = chunk.get("similarity", 0)
    return f"sim={similarity:.2%}"


def _fts_score(chunk) -> str:
    """Return the FTS rank of a chunk, falling back to its similarity."""
    rank = chunk.get("fts_rank", chunk.get("similarity", 0))
    return f"rank={rank}"


def _hybrid_score(chunk) -> str:
    """Return the hybrid score: a percentage for floats, the raw value otherwise."""
    score = chunk.get("similarity", chunk.get("rrf_score", 0))
    if isinstance(score, float):
        return f"sim={score:.2%}"
    return f"score={score}"


async def test_case(case: dict) -> None:
    """Run one query through the four retrieval layers and print each result.

    Every layer runs inside its own ``try``/``except`` so that a failure in one
    layer is reported as an ``ERROR`` line and the remaining layers still run.

    Args:
        case: Mapping with ``case_id``, ``query`` and ``expected_source`` keys.
    """
    cid = case["case_id"]
    q = case["query"]
    expected = case["expected_source"]
    rule = "=" * 70

    print(f"\n{rule}")
    print(f" {cid}")
    print(f" Query: {q}")
    print(f" Expected source: {expected}")
    print(rule)

    # Layer 1: Dense at 0.30
    print("\n [Layer 1] search_knowledge(threshold=0.30)")
    try:
        chunks = await search_knowledge(q, match_count=7, match_threshold=0.30, query_label="layer1")
        _print_chunks(chunks, _dense_score)
    except Exception as e:  # deliberate: a diagnostic must survive a broken layer
        print(f" ERROR: {e}")

    # Layer 2: Dense at 0.20 (fallback)
    print("\n [Layer 2] search_knowledge(threshold=0.20) — fallback")
    try:
        chunks = await search_knowledge(q, match_count=7, match_threshold=0.20, query_label="layer2")
        _print_chunks(chunks, _dense_score)
    except Exception as e:
        print(f" ERROR: {e}")

    # Layer 3: full-text search
    print("\n [Layer 3] search_knowledge_fts()")
    try:
        chunks = await search_knowledge_fts(q, match_count=7)
        _print_chunks(chunks, _fts_score)
    except Exception as e:
        print(f" ERROR: {e}")

    # Layer 4: Full hybrid pipeline
    print("\n [Layer 4] search_knowledge_hybrid() — full pipeline")
    try:
        result = await search_knowledge_hybrid(q, top_k=7)
        # The pipeline result is accepted either as a bare list of chunks or as
        # a mapping that carries them under "chunks".
        chunks = result if isinstance(result, list) else result.get("chunks", [])
        # The score text is built separately from the rest of the line, so the
        # source and preview are printed for float and non-float scores alike.
        _print_chunks(chunks, _hybrid_score)
    except Exception as e:
        print(f" ERROR: {e}")

    # Blank line separating this case from the next one.
    print()
async def main():
    """Print the banner, run every zero-context case in order, print the footer.

    Cases are awaited one at a time, so their output is never interleaved.
    """
    rule = "=" * 70
    print(rule)
    print(" FALLBACK RETRIEVAL DIAGNOSTIC")
    # Derive the count from the case list so the banner cannot go stale.
    print(f" Testing {len(ZERO_CONTEXT_CASES)} zero-context RAG cases")
    print(rule)

    for case in ZERO_CONTEXT_CASES:
        await test_case(case)

    print(rule)
    print(" Done")
    print(rule)
# Run the diagnostic only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())