# climate-risk-engine / tests / eval_rag.py
# jtlevine's picture
# Add LSTM neural model, ERA5 data, FAISS+BM25 RAG, Neon DB, eval suite; de-jargon frontend
# f2b0895
"""Evaluate RAG retrieval quality with golden queries."""
import json
import os
import pytest
# Golden retrieval cases used by the tests in this module.
#
# Every entry carries:
#   "query"           - the text handed to the retriever
#   "expected_themes" - substrings looked for in the retrieved docs
#   "category"        - label copied into the evaluation report
# and optionally:
#   "zone_id"         - forwarded to the retriever as its zone_id argument
GOLDEN_QUERIES = [
    # --- worker safety -----------------------------------------------------
    {
        "query": "what should workers do in extreme heat",
        "expected_themes": ["shade", "water", "rest", "break"],
        "category": "safety",
    },
    # --- zone-specific (Jangwani) -------------------------------------------
    {
        "query": "Jangwani informal settlement heat risk",
        "expected_themes": ["Jangwani", "informal", "tin roof"],
        "zone_id": "DAR-JAN",
        "category": "zone",
    },
    # --- insurance -----------------------------------------------------------
    {
        "query": "how does parametric insurance payout automatically",
        "expected_themes": ["parametric", "automatic", "trigger", "payout"],
        "category": "insurance",
    },
    # --- emergency contacts --------------------------------------------------
    {
        "query": "emergency phone numbers Nairobi",
        "expected_themes": ["Red Cross", "ambulance", "999", "Nairobi"],
        "category": "emergency",
    },
    # --- alert actions (critical) --------------------------------------------
    {
        "query": "critical heat alert actions stop work immediately",
        "expected_themes": ["critical", "stop", "emergency", "medical"],
        "category": "actions",
    },
    # --- zone-specific (Kibera) ----------------------------------------------
    {
        "query": "Kibera corrugated tin roofs urban heat",
        "expected_themes": ["Kibera", "tin", "heat"],
        "zone_id": "NBO-KIB",
        "category": "zone",
    },
    # --- worker safety (symptoms) --------------------------------------------
    {
        "query": "heat stroke dehydration symptoms workers",
        "expected_themes": ["heat", "water", "hydration"],
        "category": "safety",
    },
    # --- language ------------------------------------------------------------
    {
        "query": "Swahili translation heat warning joto",
        "expected_themes": ["joto", "Swahili", "tahadhari"],
        "category": "language",
    },
    # --- insurance (basis risk) ----------------------------------------------
    {
        "query": "basis risk what if trigger wrong",
        "expected_themes": ["basis risk", "false"],
        "category": "insurance",
    },
    # --- alert actions (warning) ---------------------------------------------
    {
        "query": "warning level heat alert two consecutive days",
        "expected_themes": ["warning", "consecutive", "reduce"],
        "category": "actions",
    },
]
@pytest.fixture
def retriever():
    """Provide a ``HybridRetriever`` instance to the tests in this module.

    The import is done lazily inside the fixture so that a failure to import
    or to construct the retriever skips the requesting test (with the error
    text in the skip reason) instead of failing it.
    """
    try:
        from src.explanation.rag_provider import HybridRetriever

        instance = HybridRetriever()
    except Exception as exc:
        # Any import/construction problem is reported as a skip, not a failure.
        pytest.skip(f"RAG index not built: {exc}")
    else:
        return instance
def test_retrieval_returns_results(retriever):
    """Every golden query must yield a non-empty result list.

    Each case in ``GOLDEN_QUERIES`` is sent to ``retriever.retrieve`` with its
    optional ``zone_id`` and ``top_k=5``; the test fails on the first query
    that comes back empty.
    """
    for golden in GOLDEN_QUERIES:
        question = golden["query"]
        zone = golden.get("zone_id")  # None when the case is not zone-specific
        hits = retriever.retrieve(question, zone_id=zone, top_k=5)
        assert len(hits) > 0, f"No results for: {question}"
def test_retrieval_theme_coverage(retriever):
    """Retrieved docs should contain the expected themes.

    For each case in ``GOLDEN_QUERIES`` the retrieved docs are joined into one
    lower-cased string and every expected theme is looked up in it as a
    case-insensitive substring. The per-query coverage is the fraction of
    themes found.

    A JSON report with one entry per query is written to
    ``eval_results/rag_eval.json`` next to this test file, so the location no
    longer depends on the directory pytest is started from. The report is
    written before the final assertion so it is available even when the test
    fails.

    The test passes when at least 70% of the queries reach a coverage of at
    least 50%.
    """
    results = []
    good = 0
    for case in GOLDEN_QUERIES:
        docs = retriever.retrieve(case["query"], zone_id=case.get("zone_id"), top_k=5)
        # Joining with str.join requires the returned docs to be strings.
        combined = " ".join(docs).lower()
        expected = case["expected_themes"]
        themes_found = [t for t in expected if t.lower() in combined]
        coverage = len(themes_found) / len(expected)
        # Decide pass/fail on the exact ratio; the rounded value is only for
        # the report (rounding first could lift e.g. 0.496 up to 0.5).
        if coverage >= 0.5:
            good += 1
        results.append({
            "query": case["query"],
            "category": case["category"],
            "themes_expected": expected,
            "themes_found": themes_found,
            "coverage": round(coverage, 2),
            "docs_returned": len(docs),
        })

    # Anchor the report directory to this file rather than the current
    # working directory.
    out_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "eval_results")
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, "rag_eval.json"), "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    # At least 70% of queries should have at least 50% theme coverage
    assert good >= len(results) * 0.7, f"Only {good}/{len(results)} queries have 50%+ theme coverage"
def test_zone_boosting(retriever):
    """Passing a zone_id must not rank the matching zone doc lower.

    The same query is retrieved twice, once with ``zone_id="DAR-JAN"`` and
    once without, and the position of the first doc mentioning the zone is
    compared between the two result lists.
    """
    query = "heat risk informal settlement"
    zone_markers = ("jangwani", "dar-jan")

    def first_zone_hit(docs):
        # Index of the first doc mentioning the zone; len(docs) if none does.
        for position, text in enumerate(docs):
            lowered = text.lower()
            if any(marker in lowered for marker in zone_markers):
                return position
        return len(docs)

    docs_with_boost = retriever.retrieve(query, zone_id="DAR-JAN", top_k=5)
    docs_without_boost = retriever.retrieve(query, top_k=5)

    rank_with = first_zone_hit(docs_with_boost)
    rank_without = first_zone_hit(docs_without_boost)

    # NOTE: if neither list mentions the zone, both ranks equal the list
    # lengths, so this check also passes when the two lists are equally long.
    assert rank_with <= rank_without, "Zone boosting should improve rank of matching zone"