| """Evaluate RAG retrieval quality with golden queries.""" |
| import json |
| import os |
| import pytest |
|
|
# Golden evaluation set for the retrieval tests in this module.
#
# Each case is a dict with:
#   "query"           - the text passed to ``retriever.retrieve``
#   "expected_themes" - substrings looked for (case-insensitively) in the
#                       concatenated retrieved documents
#   "category"        - label copied into the evaluation report
#   "zone_id"         - optional; forwarded to ``retriever.retrieve`` when set
#
# The order of the cases is the order of the rows in the written report.
GOLDEN_QUERIES = [
    {
        "query": "what should workers do in extreme heat",
        "expected_themes": ["shade", "water", "rest", "break"],
        "category": "safety",
    },
    {
        "query": "Jangwani informal settlement heat risk",
        "expected_themes": ["Jangwani", "informal", "tin roof"],
        "zone_id": "DAR-JAN",
        "category": "zone",
    },
    {
        "query": "how does parametric insurance payout automatically",
        "expected_themes": ["parametric", "automatic", "trigger", "payout"],
        "category": "insurance",
    },
    {
        "query": "emergency phone numbers Nairobi",
        "expected_themes": ["Red Cross", "ambulance", "999", "Nairobi"],
        "category": "emergency",
    },
    {
        "query": "critical heat alert actions stop work immediately",
        "expected_themes": ["critical", "stop", "emergency", "medical"],
        "category": "actions",
    },
    {
        "query": "Kibera corrugated tin roofs urban heat",
        "expected_themes": ["Kibera", "tin", "heat"],
        "zone_id": "NBO-KIB",
        "category": "zone",
    },
    {
        "query": "heat stroke dehydration symptoms workers",
        "expected_themes": ["heat", "water", "hydration"],
        "category": "safety",
    },
    {
        "query": "Swahili translation heat warning joto",
        "expected_themes": ["joto", "Swahili", "tahadhari"],
        "category": "language",
    },
    {
        "query": "basis risk what if trigger wrong",
        "expected_themes": ["basis risk", "false"],
        "category": "insurance",
    },
    {
        "query": "warning level heat alert two consecutive days",
        "expected_themes": ["warning", "consecutive", "reduce"],
        "category": "actions",
    },
]
|
|
|
|
@pytest.fixture
def retriever():
    """Provide a ``HybridRetriever`` instance, or skip the requesting test.

    The import and the construction both happen lazily inside the fixture.
    Any ``Exception`` raised by either step is turned into a pytest skip whose
    reason carries the original error text, so tests depending on this fixture
    are reported as skipped rather than errored when the retriever cannot be
    created.
    """
    try:
        from src.explanation.rag_provider import HybridRetriever

        instance = HybridRetriever()
    except Exception as e:
        # pytest.skip raises, so control never falls through to the else branch.
        pytest.skip(f"RAG index not built: {e}")
    else:
        return instance
|
|
|
|
def test_retrieval_returns_results(retriever):
    """Check that every golden query retrieves at least one document.

    Each case in ``GOLDEN_QUERIES`` is retrieved with ``top_k=5`` and its
    optional ``zone_id``; the test fails on the first query that comes back
    empty.
    """
    for golden in GOLDEN_QUERIES:
        query = golden["query"]
        hits = retriever.retrieve(query, zone_id=golden.get("zone_id"), top_k=5)
        assert len(hits) > 0, f"No results for: {query}"
|
|
|
|
def test_retrieval_theme_coverage(retriever):
    """Check that retrieved documents mention the themes each query expects.

    For every case in ``GOLDEN_QUERIES`` the retrieved documents are joined
    into one lower-cased string and each expected theme is looked up as a
    case-insensitive substring. Coverage is the fraction of expected themes
    found, rounded to two decimals.

    A per-query report is written to ``tests/eval_results/rag_eval.json``
    (path relative to the current working directory). The test passes when at
    least 70% of the queries reach a coverage of 0.5 or more.
    """
    report = []
    for golden in GOLDEN_QUERIES:
        hits = retriever.retrieve(golden["query"], zone_id=golden.get("zone_id"), top_k=5)
        # Joining requires the retrieved items to be strings.
        haystack = " ".join(hits).lower()

        wanted = golden["expected_themes"]
        matched = []
        for theme in wanted:
            if theme.lower() in haystack:
                matched.append(theme)
        ratio = len(matched) / len(wanted)

        row = {
            "query": golden["query"],
            "category": golden["category"],
            "themes_expected": wanted,
            "themes_found": matched,
            "coverage": round(ratio, 2),
            "docs_returned": len(hits),
        }
        report.append(row)

    # Persist the report before asserting so it exists even when the test fails.
    os.makedirs("tests/eval_results", exist_ok=True)
    with open("tests/eval_results/rag_eval.json", "w") as f:
        json.dump(report, f, indent=2)

    total = len(report)
    good = len([row for row in report if row["coverage"] >= 0.5])
    assert good >= total * 0.7, f"Only {good}/{total} queries have 50%+ theme coverage"
|
|
|
|
def test_zone_boosting(retriever):
    """Zone-specific queries should rank the matching zone doc higher.

    The same query is retrieved twice with ``top_k=5``: once with
    ``zone_id="DAR-JAN"`` and once without a zone. The rank of a result list
    is the index of its first document that mentions "jangwani" or "dar-jan"
    (case-insensitive). Supplying the zone must not make that rank worse.

    Note: when neither result list mentions the zone, both ranks are
    "not found" and the assertion passes without checking anything.
    """
    query = "heat risk informal settlement"
    docs_with_boost = retriever.retrieve(query, zone_id="DAR-JAN", top_k=5)
    docs_without_boost = retriever.retrieve(query, top_k=5)

    # Use one length-independent sentinel for "zone doc absent". Using
    # len(docs) made the comparison depend on how many documents each call
    # happened to return: a short boosted list without the zone doc could
    # "beat" an unboosted list that did contain it.
    not_found = float("inf")

    def first_zone_rank(docs):
        """Return the index of the first doc mentioning the zone, else ``not_found``."""
        for position, doc in enumerate(docs):
            text = doc.lower()  # documents are expected to be strings
            if "jangwani" in text or "dar-jan" in text:
                return position
        return not_found

    rank_with = first_zone_rank(docs_with_boost)
    rank_without = first_zone_rank(docs_without_boost)
    assert rank_with <= rank_without, "Zone boosting should improve rank of matching zone"
|
|