"""Retrieval evaluation script for MediaStorm RAG. Measures Precision@1, Recall@5, MRR, NDCG@5 against curated ground truth queries. Zero dependencies beyond the project itself. Usage: python eval_retrieval.py [--verbose] """ import asyncio import math import time # --------------------------------------------------------------------------- # Metrics # --------------------------------------------------------------------------- def precision_at_1(retrieved_ids: list[str], expected_ids: set[str]) -> float: """1.0 if top result is relevant, 0.0 otherwise.""" if not retrieved_ids or not expected_ids: return 0.0 return 1.0 if retrieved_ids[0] in expected_ids else 0.0 def recall_at_k(retrieved_ids: list[str], expected_ids: set[str], k: int = 5) -> float: """Fraction of expected docs found in top-k results.""" if not expected_ids: return 1.0 # vacuous truth found = set(retrieved_ids[:k]) & expected_ids return len(found) / len(expected_ids) def mrr(retrieved_ids: list[str], expected_ids: set[str]) -> float: """Mean Reciprocal Rank — 1/rank of first relevant result.""" if not expected_ids: return 0.0 for i, rid in enumerate(retrieved_ids): if rid in expected_ids: return 1.0 / (i + 1) return 0.0 def ndcg_at_k(retrieved_ids: list[str], expected_ids: set[str], k: int = 5) -> float: """Normalized Discounted Cumulative Gain at k.""" if not expected_ids: return 0.0 # DCG dcg = 0.0 for i, rid in enumerate(retrieved_ids[:k]): if rid in expected_ids: dcg += 1.0 / math.log2(i + 2) # i+2 because log2(1)=0 # Ideal DCG (all relevant docs at top) ideal_count = min(len(expected_ids), k) idcg = sum(1.0 / math.log2(i + 2) for i in range(ideal_count)) return dcg / idcg if idcg > 0 else 0.0 # --------------------------------------------------------------------------- # Ground truth queries # --------------------------------------------------------------------------- EVAL_QUERIES = [ # --- GEOGRAPHIC (5) --- { "query": "Stories about the war in Congo", "expected": {"f6cea0f9", "3d1c98e7", "e1018d23", "2f638ea0"}, "category": "geographic", }, { "query": "Documentaries set in Afghanistan", "expected": {"d7982140", "f38a381a", "5e34b1f2", "b3fc2f4f", "dab272b4"}, "category": "geographic", }, { "query": "Stories about East Africa — Kenya, Ethiopia, Somalia", "expected": {"44004365", "b37d6691", "d371bdd4", "f9427ae8", "1d972cf2"}, "category": "geographic", }, { "query": "Stories filmed in Latin America or Mexico", "expected": {"13e631b1", "6ed4cd6a", "7708bf66", "ebe28ea7", "5fa24a64"}, "category": "geographic", }, { "query": "Stories about the Middle East conflict — Israel Palestine", "expected": {"aab689a6"}, "category": "geographic", }, # --- THEMATIC (5) --- { "query": "Stories about PTSD and veterans returning from war", "expected": {"e53b9d54", "b3fc2f4f", "dab272b4", "8f3f7b47", "5936e80e"}, "category": "thematic", }, { "query": "Climate change and environmental destruction", "expected": {"7233cf20", "44004365", "d371bdd4", "deb75fcf", "18b8f8d9", "b6f35a10"}, "category": "thematic", }, { "query": "Child marriage and women's rights", "expected": {"5e34b1f2", "b37d6691", "9b36adb3", "65c1fa57", "ebe28ea7"}, "category": "thematic", }, { "query": "Wildlife conservation and endangered species", "expected": {"f9427ae8", "c6fc31b3", "e1018d23", "39431e99", "b8c0f1a0"}, "category": "thematic", }, { "query": "Immigration and refugee stories", "expected": {"49866bbe", "f4fe3cbf", "6ed4cd6a", "ce125ae0", "6e613637"}, "category": "thematic", }, # --- TEMPORAL (4) --- { "query": "MediaStorm's earliest stories from 2005-2006", "expected": {"5936e80e", "88995eea", "0a2d36a5", "13e631b1", "5fa24a64", "214b44a0", "9e5325aa"}, "category": "temporal", }, { "query": "Recent stories from 2022 to 2025", "expected": {"b8c0f1a0", "609d8d9f", "64b3132a", "f9427ae8", "051af735"}, "category": "temporal", }, { "query": "Stories from the 2008 financial crisis era", "expected": {"5ce6a28d", "b333209b", "732657e6"}, "category": "temporal", }, { "query": "Stories published around 2010", "expected": {"44004365", "575bf728", "9b36adb3", "fbd54b9c", "826d329f"}, "category": "temporal", }, # --- PEOPLE (4) --- { "query": "Stories about Sebastiao Salgado", "expected": {"3fa4c5e5"}, "category": "people", }, { "query": "Stories featuring Don McCullin", "expected": {"3f5a3517"}, "category": "people", }, { "query": "Stories about Ai Weiwei", "expected": {"c346cb01"}, "category": "people", }, { "query": "Stories about Angelina Jolie and humanitarian work", "expected": {"fade2a94"}, "category": "people", }, # --- GENRE/FORMAT (4) --- { "query": "Photo essays in the archive", "expected": {"0a2d36a5", "13e631b1", "34d720e4", "732657e6", "d7982140", "c3d52625", "9e5325aa", "e53b9d54"}, "category": "genre", }, { "query": "Interactive multimedia projects or crisis guides", "expected": {"1815903a", "6b84f13f", "5be0d7ec", "05208857", "aab689a6"}, "category": "genre", }, { "query": "Video documentaries about family and aging", "expected": {"176e4cd9", "7e8268de", "88995eea", "7f8e385f", "4c2f60cf"}, "category": "genre", }, { "query": "Animated or motion design pieces", "expected": {"018cbb6a", "85d5056b", "5ae39bb8"}, "category": "genre", }, # --- AWARDS (4) --- { "query": "Emmy award winning stories", "expected": {"49866bbe", "988d3b60", "732657e6", "9b36adb3", "d266b644", "bac708fd", "dc0749e7", "e4cb243e", "e53b9d54"}, "category": "awards", }, { "query": "World Press Photo winners", "expected": {"44004365", "176e4cd9", "575bf728", "7cc092f6", "87f894da", "127a7e90"}, "category": "awards", }, { "query": "Award-winning stories about the Iraq war", "expected": {"e53b9d54", "5936e80e"}, "category": "awards", }, { "query": "Stories that won at Webby Awards", "expected": {"176e4cd9", "5936e80e", "44004365", "575bf728"}, "category": "awards", }, # --- EDGE CASES: should return NO relevant results (4) --- { "query": "Quantum computing breakthroughs in 2024", "expected": set(), "category": "edge_no_match", }, { "query": "Best Italian pasta recipes from Tuscany", "expected": set(), "category": "edge_no_match", }, { "query": "Taylor Swift concert tour dates", "expected": set(), "category": "edge_no_match", }, { "query": "Stock market trading strategies and cryptocurrency", "expected": set(), "category": "edge_no_match", }, ] # --------------------------------------------------------------------------- # Runner # --------------------------------------------------------------------------- def _resolve_uids(queries: list[dict], all_ids: list[str]) -> list[dict]: """Resolve short UID prefixes to full UUIDs from ChromaDB.""" prefix_map = {} for full_id in all_ids: prefix = full_id.split("-")[0] prefix_map[prefix] = full_id resolved = [] for q in queries: expected = set() for uid in q["expected"]: if uid in prefix_map: expected.add(prefix_map[uid]) else: expected.add(uid) # keep as-is if already full resolved.append({**q, "expected": expected}) return resolved async def run_eval(verbose: bool = False, quiet: bool = False, pipeline: bool = False) -> dict: """Run evaluation and return aggregate metrics. Args: pipeline: If True, run the full pipeline (retriever + Gemini filter). Requires GEMINI_API_KEY. Measures what the user actually sees. """ from mediastorm.config import CHROMADB_PATH, BM25_INDEX_PATH from mediastorm.vectorize.store import VectorStore from mediastorm.vectorize.embedder import Embedder from mediastorm.vectorize.bm25_store import BM25Store from mediastorm.rag.retriever import HybridRetriever from mediastorm.rag.router import QueryRouter store = VectorStore(path=CHROMADB_PATH) embedder = Embedder() bm25 = BM25Store(path=BM25_INDEX_PATH) bm25.load() router = QueryRouter() retriever = HybridRetriever( vector_store=store, bm25_store=bm25, embedder=embedder, router=router, top_k_final=5, ) # Pipeline mode: build link lookup and import generator link_lookup: dict[str, str] = {} if pipeline: from mediastorm.api import _build_link_lookup from mediastorm.rag.generator import generate_response link_lookup = await _build_link_lookup() # Resolve short UIDs to full UUIDs all_ids = store._stories.get(include=[])["ids"] queries = _resolve_uids(EVAL_QUERIES, all_ids) results = [] category_results: dict[str, list] = {} if not quiet: print("=" * 70) print("MediaStorm RAG — Retrieval Evaluation") print("=" * 70) print() for i, q in enumerate(queries): query = q["query"] expected = q["expected"] category = q["category"] start = time.time() retrieval = await retriever.retrieve(query) duration = time.time() - start # Pipeline mode: filter through Gemini (same as /api/search) if pipeline and retrieval.stories: full_text = await generate_response( query, retrieval, link_lookup=link_lookup, ) retrieval_stories = [ s for s in retrieval.stories if link_lookup.get(s["id"], "") and link_lookup[s["id"]] in full_text ] else: retrieval_stories = retrieval.stories retrieved_ids = [s["id"] for s in retrieval_stories] if category == "edge_no_match": success = len(retrieval_stories) == 0 row = { "query": query, "category": category, "success": success, "num_returned": len(retrieval_stories), "expected": [], "duration": duration, } status = "PASS" if success else "FAIL" if verbose and not quiet: print(f" [{status}] Q{i+1}: {query}") if not success: names = [s.get("metadata", {}).get("name", s["id"]) for s in retrieval_stories] print(f" Unexpected results: {names}") else: p1 = precision_at_1(retrieved_ids, expected) r5 = recall_at_k(retrieved_ids, expected, k=5) m = mrr(retrieved_ids, expected) n5 = ndcg_at_k(retrieved_ids, expected, k=5) row = { "query": query, "category": category, "precision_at_1": p1, "recall_at_5": r5, "mrr": m, "ndcg_at_5": n5, "retrieved": retrieved_ids, "expected": list(expected), "missed": list(expected - set(retrieved_ids)), "duration": duration, } if verbose and not quiet: status = "PASS" if r5 > 0 else "MISS" print(f" [{status}] Q{i+1}: {query}") print(f" P@1={p1:.0f} R@5={r5:.2f} MRR={m:.2f} NDCG@5={n5:.2f} ({duration:.1f}s)") if r5 < 1.0: found = set(retrieved_ids) & expected missed_v = expected - set(retrieved_ids) if missed_v: print(f" Missed: {missed_v}") results.append(row) category_results.setdefault(category, []).append(row) # Split into semantic, filter, and edge queries _SEMANTIC_CATS = {"geographic", "thematic", "people"} _FILTER_CATS = {"temporal", "genre", "awards"} semantic = [r for r in results if r["category"] in _SEMANTIC_CATS] filter_q = [r for r in results if r["category"] in _FILTER_CATS] edge = [r for r in results if r["category"] == "edge_no_match"] scored = [r for r in results if r["category"] != "edge_no_match"] def _avg(rows, key): return sum(r[key] for r in rows) / len(rows) if rows else 0.0 edge_pass = sum(1 for r in edge if r["success"]) if not quiet: print() print("-" * 70) print("CORE SEMANTIC SEARCH (people, thematic, geographic)") print("-" * 70) print(f" Precision@1: {_avg(semantic, 'precision_at_1'):.2f} (target ≥ 0.85)") print(f" Recall@5: {_avg(semantic, 'recall_at_5'):.2f} (target ≥ 0.90)") print(f" MRR: {_avg(semantic, 'mrr'):.2f}") print(f" NDCG@5: {_avg(semantic, 'ndcg_at_5'):.2f}") print() print("FILTER QUERIES (temporal, genre, awards)") print("-" * 70) print(f" Precision@1: {_avg(filter_q, 'precision_at_1'):.2f}") print(f" Recall@5: {_avg(filter_q, 'recall_at_5'):.2f}") print(f" MRR: {_avg(filter_q, 'mrr'):.2f}") print() print("EDGE CASES") print("-" * 70) print(f" Correctly rejected: {edge_pass}/{len(edge)}") print() # Per-category breakdown print("PER-CATEGORY BREAKDOWN") print("-" * 70) for cat, rows in category_results.items(): if cat == "edge_no_match": passed = sum(1 for r in rows if r["success"]) print(f" {cat:20s} {passed}/{len(rows)} rejected") else: label = "semantic" if cat in _SEMANTIC_CATS else "filter" cat_r5 = _avg(rows, "recall_at_5") cat_p1 = _avg(rows, "precision_at_1") print(f" {cat:20s} P@1={cat_p1:.2f} R@5={cat_r5:.2f} ({len(rows)} queries) [{label}]") print("=" * 70) return { "semantic_precision_at_1": _avg(semantic, "precision_at_1"), "semantic_recall_at_5": _avg(semantic, "recall_at_5"), "semantic_mrr": _avg(semantic, "mrr"), "semantic_ndcg_at_5": _avg(semantic, "ndcg_at_5"), "filter_precision_at_1": _avg(filter_q, "precision_at_1"), "filter_recall_at_5": _avg(filter_q, "recall_at_5"), "edge_pass_rate": edge_pass / len(edge) if edge else 1.0, "details": results, } if __name__ == "__main__": import sys verbose = "--verbose" in sys.argv or "-v" in sys.argv pipeline = "--pipeline" in sys.argv asyncio.run(run_eval(verbose=verbose, pipeline=pipeline))