Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Fallback Retrieval Diagnostic Script | |
| Tests the 4 zero-context RAG cases against all retrieval layers: | |
| Layer 1: search_knowledge() — dense at 0.30 threshold | |
| Layer 2: search_knowledge() — fallback at 0.20 threshold | |
| Layer 3: search_knowledge_fts() — simplified query | |
| Layer 4: search_knowledge_hybrid() — full pipeline | |
| Usage: python scripts/test_fallback.py | |
| """ | |
| import asyncio | |
| import sys | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) | |
| from dotenv import load_dotenv | |
| load_dotenv(Path(__file__).resolve().parent.parent / ".env") | |
| from app.vector_store import search_knowledge, search_knowledge_fts, search_knowledge_hybrid | |
# Similarity threshold of the fallback dense search (Layer 2 in the module
# docstring); the stricter Layer 1 pass uses 0.30.
FALLBACK_THRESHOLD = 0.20

# The RAG cases under diagnosis. Each entry carries:
#   case_id         - short identifier printed in the report header
#   query           - the user question sent to every retrieval layer
#   expected_source - the document the answer is expected to come from
ZERO_CONTEXT_CASES = [
    dict(
        case_id="rag_terung_suhu_ph",
        query="Berapa suhu dan pH yang sesuai untuk budidaya terung?",
        expected_source="Budidaya Terung.pdf",
    ),
    dict(
        case_id="rag_marigold_production",
        query="What are the ideal temperature and spacing requirements for marigold flower production?",
        expected_source="Marigold flower production guideline.pdf",
    ),
    dict(
        case_id="rag_chinese_kale_cultivation",
        query="What are the temperature, spacing, and fertilizer requirements for Chinese kale?",
        expected_source="Chinese Kale (Brassica oleracea), Cultivar Group.pdf",
    ),
    dict(
        case_id="rag_home_hydroponics_lighting",
        query="What factors should a home hydroponic grower consider when setting up lighting for leafy greens?",
        expected_source="A Guide to Home Hydroponics for Leafy Greens.pdf",
    ),
]
def _chunk_preview(chunk: dict, width: int = 80) -> str:
    """Return the first ``width`` characters of the chunk's content on one line.

    A missing or ``None`` content value yields an empty preview instead of
    raising, so one malformed chunk does not abort the listing for a layer.
    """
    return (chunk.get("content") or "")[:width].replace("\n", " ")


def _similarity_label(chunk: dict) -> str:
    """Format the chunk's ``similarity`` value (default 0) as a percentage."""
    return f"sim={chunk.get('similarity', 0):.2%}"


def _fts_label(chunk: dict) -> str:
    """Format the chunk's ``fts_rank``, falling back to ``similarity``, then 0."""
    return f"rank={chunk.get('fts_rank', chunk.get('similarity', 0))}"


def _hybrid_label(chunk: dict) -> str:
    """Format the hybrid score: ``similarity`` if present, else ``rrf_score``, else 0.

    Float scores are shown as a percentage; anything else (for example the
    integer default) is printed verbatim.
    """
    score = chunk.get("similarity", chunk.get("rrf_score", 0))
    if isinstance(score, float):
        return f"sim={score:.2%}"
    return f"score={score}"


def _print_chunks(chunks: list, score_label) -> None:
    """Print the chunk count, then one ``[n] score | source | "preview..."`` line each.

    ``score_label`` is a callable taking a chunk dict and returning the
    already-formatted score text for that line.
    """
    print(f" Chunks returned: {len(chunks)}")
    for i, chunk in enumerate(chunks, 1):
        src = chunk.get("source", "?")
        print(f' [{i}] {score_label(chunk)} | {src} | "{_chunk_preview(chunk)}..."')


async def _report_layer(header: str, fetch, score_label) -> None:
    """Print ``header``, await ``fetch()`` and list the chunks it returns.

    ``fetch`` is a zero-argument callable returning an awaitable that resolves
    to a list of chunk dicts. Any exception is reported as an ``ERROR`` line
    rather than propagated, so a failing layer never hides the layers after it.
    """
    print(header)
    try:
        chunks = await fetch()
        _print_chunks(chunks, score_label)
    except Exception as e:  # deliberate catch-all: this is a best-effort diagnostic
        print(f" ERROR: {e}")


async def test_case(case: dict) -> None:
    """Run one query through all four retrieval layers and print what each returns.

    Args:
        case: Mapping with the keys ``case_id``, ``query`` and
            ``expected_source``. The expected source is only printed for the
            reader to compare against; it is not checked programmatically.

    Raises:
        KeyError: If ``case`` lacks one of the three required keys. Errors
            raised by the retrieval calls themselves are printed, not raised.
    """
    cid = case["case_id"]
    q = case["query"]
    expected = case["expected_source"]

    rule = "=" * 70
    print(f"\n{rule}")
    print(f" {cid}")
    print(f" Query: {q}")
    print(f" Expected source: {expected}")
    print(rule)

    # Layer 1: dense search at the strict 0.30 threshold.
    await _report_layer(
        "\n [Layer 1] search_knowledge(threshold=0.30)",
        lambda: search_knowledge(q, match_count=7, match_threshold=0.30, query_label="layer1"),
        _similarity_label,
    )

    # Layer 2: dense search at the looser fallback threshold. The value comes
    # from the module-level FALLBACK_THRESHOLD so the header and the call
    # cannot drift apart.
    await _report_layer(
        f"\n [Layer 2] search_knowledge(threshold={FALLBACK_THRESHOLD:.2f}) — fallback",
        lambda: search_knowledge(
            q, match_count=7, match_threshold=FALLBACK_THRESHOLD, query_label="layer2"
        ),
        _similarity_label,
    )

    # Layer 3: full-text search. The query is passed through unchanged here;
    # presumably any simplification happens inside search_knowledge_fts.
    await _report_layer(
        "\n [Layer 3] search_knowledge_fts()",
        lambda: search_knowledge_fts(q, match_count=7),
        _fts_label,
    )

    # Layer 4: full hybrid pipeline. The result is accepted either as a plain
    # list of chunks or as a mapping holding them under "chunks".
    async def _hybrid_chunks() -> list:
        result = await search_knowledge_hybrid(q, top_k=7)
        return result if isinstance(result, list) else result.get("chunks", [])

    await _report_layer(
        "\n [Layer 4] search_knowledge_hybrid() — full pipeline",
        _hybrid_chunks,
        _hybrid_label,
    )

    # Blank line separating this case's report from the next one.
    print()
async def main() -> None:
    """Print the report banner and run every zero-context case in order.

    Cases are awaited one after another (not concurrently) so each case's
    output stays contiguous in the printed report.
    """
    rule = "=" * 70
    print(rule)
    print(" FALLBACK RETRIEVAL DIAGNOSTIC")
    # Derive the count from the list so the banner stays correct when cases
    # are added or removed.
    print(f" Testing {len(ZERO_CONTEXT_CASES)} zero-context RAG cases")
    print(rule)

    for case in ZERO_CONTEXT_CASES:
        await test_case(case)

    print(rule)
    print(" Done")
    print(rule)


if __name__ == "__main__":
    # Script entry point: run the async diagnostic on a fresh event loop.
    asyncio.run(main())