remdms Claude Opus 4.6 committed on
Commit
b145b51
·
1 Parent(s): 72296b0

docs: implementation plan for eval CLI

Browse files

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

docs/superpowers/plans/2026-03-31-eval-cli.md ADDED
@@ -0,0 +1,564 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Eval CLI Implementation Plan
2
+
3
+ > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
4
+
5
+ **Goal:** Add a `cli.py eval` command that runs retrieval evaluation, saves results as JSON, and shows diffs against previous runs.
6
+
7
+ **Architecture:** CLI command calls existing `eval_retrieval.run_eval()`, structures results into a JSON file in `data/eval_runs/`, then compares against the most recent previous run to show regressions/improvements. Two new modules: `runner.py` (orchestration + storage) and `display.py` (console formatting).
8
+
9
+ **Tech Stack:** Python, Click (existing CLI), JSON files for storage
10
+
11
+ ---
12
+
13
+ ### Task 1: Runner — run eval and save JSON
14
+
15
+ **Files:**
16
+ - Create: `src/mediastorm/eval/__init__.py`
17
+ - Create: `src/mediastorm/eval/runner.py`
18
+
19
+ - [ ] **Step 1: Create empty `__init__.py`**
20
+
21
+ ```python
22
+ # src/mediastorm/eval/__init__.py
23
+ ```
24
+
25
+ - [ ] **Step 2: Write `runner.py`**
26
+
27
+ ```python
28
+ """Eval runner — orchestrates evaluation and persists results."""
29
+ import asyncio
30
+ import json
31
+ import time
32
+ from datetime import datetime
33
+ from pathlib import Path
34
+
35
+ EVAL_RUNS_DIR = Path("./data/eval_runs")
36
+
37
+ _SEMANTIC_CATS = {"geographic", "thematic", "people"}
38
+ _FILTER_CATS = {"temporal", "genre", "awards"}
39
+
40
+
41
+ def _build_run_data(eval_result: dict) -> dict:
42
+ """Structure raw eval_retrieval output into storable run data."""
43
+ details = eval_result["details"]
44
+ ts = datetime.now()
45
+
46
+ # Per-category aggregates
47
+ categories: dict[str, dict] = {}
48
+ for row in details:
49
+ cat = row["category"]
50
+ if cat not in categories:
51
+ categories[cat] = {"rows": []}
52
+ categories[cat]["rows"].append(row)
53
+
54
+ cat_summary = {}
55
+ for cat, data in categories.items():
56
+ rows = data["rows"]
57
+ if cat == "edge_no_match":
58
+ passed = sum(1 for r in rows if r["success"])
59
+ cat_summary[cat] = {"passed": passed, "total": len(rows)}
60
+ else:
61
+ cat_summary[cat] = {
62
+ "p1": _avg(rows, "precision_at_1"),
63
+ "r5": _avg(rows, "recall_at_5"),
64
+ "mrr": _avg(rows, "mrr"),
65
+ "ndcg5": _avg(rows, "ndcg_at_5"),
66
+ "count": len(rows),
67
+ }
68
+
69
+ # Query-level details
70
+ queries = []
71
+ for row in details:
72
+ if row["category"] == "edge_no_match":
73
+ queries.append({
74
+ "query": row["query"],
75
+ "category": row["category"],
76
+ "success": row["success"],
77
+ "num_returned": row["num_returned"],
78
+ "duration": row["duration"],
79
+ })
80
+ else:
81
+ queries.append({
82
+ "query": row["query"],
83
+ "category": row["category"],
84
+ "p1": row["precision_at_1"],
85
+ "r5": row["recall_at_5"],
86
+ "mrr": row["mrr"],
87
+ "ndcg5": row["ndcg_at_5"],
88
+ "retrieved_ids": row["retrieved"],
89
+ "expected_ids": list(row.get("expected", [])), # empty until eval_retrieval supplies expected IDs (Task 5)
90
+ "missed": list(set(row.get("expected", set())) - set(row["retrieved"])) if row.get("expected") else [],
91
+ "duration": row["duration"],
92
+ })
93
+
94
+ # We need expected IDs in the output — but eval_retrieval doesn't return them
95
+ # in the details dict yet. Task 5 adds them to each row in eval_retrieval.py.
96
+
97
+ return {
98
+ "timestamp": ts.isoformat(timespec="seconds"),
99
+ "aggregates": {
100
+ "semantic_p1": eval_result["semantic_precision_at_1"],
101
+ "semantic_r5": eval_result["semantic_recall_at_5"],
102
+ "semantic_mrr": eval_result["semantic_mrr"],
103
+ "semantic_ndcg5": eval_result["semantic_ndcg_at_5"],
104
+ "filter_p1": eval_result["filter_precision_at_1"],
105
+ "filter_r5": eval_result["filter_recall_at_5"],
106
+ "edge_pass_rate": eval_result["edge_pass_rate"],
107
+ },
108
+ "categories": cat_summary,
109
+ "queries": queries,
110
+ }
111
+
112
+
113
+ def _avg(rows: list[dict], key: str) -> float:
114
+ vals = [r[key] for r in rows if key in r]
115
+ return sum(vals) / len(vals) if vals else 0.0
116
+
117
+
118
+ def save_run(run_data: dict, runs_dir: Path = EVAL_RUNS_DIR) -> Path:
119
+ """Save run data as timestamped JSON. Returns the file path."""
120
+ runs_dir.mkdir(parents=True, exist_ok=True)
121
+ ts = datetime.fromisoformat(run_data["timestamp"])
122
+ filename = ts.strftime("%Y-%m-%d_%H-%M-%S") + ".json"
123
+ path = runs_dir / filename
124
+ path.write_text(json.dumps(run_data, indent=2, ensure_ascii=False))
125
+ return path
126
+
127
+
128
+ def load_previous_run(runs_dir: Path = EVAL_RUNS_DIR) -> dict | None:
129
+ """Load the most recent run JSON, or None if no runs exist."""
130
+ if not runs_dir.exists():
131
+ return None
132
+ files = sorted(runs_dir.glob("*.json"))
133
+ if not files:
134
+ return None
135
+ return json.loads(files[-1].read_text())
136
+
137
+
138
+ def load_all_runs(runs_dir: Path = EVAL_RUNS_DIR) -> list[dict]:
139
+ """Load all run JSONs sorted chronologically."""
140
+ if not runs_dir.exists():
141
+ return []
142
+ files = sorted(runs_dir.glob("*.json"))
143
+ return [json.loads(f.read_text()) for f in files]
144
+ ```
145
+
146
+ - [ ] **Step 3: Verify module imports**
147
+
148
+ Run: `source .venv/bin/activate && python -c "from mediastorm.eval.runner import save_run, load_previous_run, load_all_runs, _build_run_data; print('OK')"`
149
+ Expected: `OK`
150
+
151
+ - [ ] **Step 4: Commit**
152
+
153
+ ```bash
154
+ git add src/mediastorm/eval/__init__.py src/mediastorm/eval/runner.py
155
+ git commit -m "feat(eval): add runner module for eval orchestration and JSON storage"
156
+ ```
157
+
158
+ ---
159
+
160
+ ### Task 2: Display — console formatting
161
+
162
+ **Files:**
163
+ - Create: `src/mediastorm/eval/display.py`
164
+
165
+ - [ ] **Step 1: Write `display.py`**
166
+
167
+ ```python
168
+ """Console display for eval runs — scores, diffs, history."""
169
+
170
+
171
+ def print_scores(run_data: dict) -> None:
172
+ """Print aggregate scores table (same layout as eval_retrieval.py)."""
173
+ agg = run_data["aggregates"]
174
+ cats = run_data["categories"]
175
+
176
+ print()
177
+ print("=" * 60)
178
+ print("MediaStorm RAG — Retrieval Evaluation")
179
+ print("=" * 60)
180
+
181
+ print()
182
+ print("CORE SEMANTIC SEARCH (people, thematic, geographic)")
183
+ print("-" * 60)
184
+ print(f" Precision@1: {agg['semantic_p1']:.2f} (target ≥ 0.85)")
185
+ print(f" Recall@5: {agg['semantic_r5']:.2f} (target ≥ 0.90)")
186
+ print(f" MRR: {agg['semantic_mrr']:.2f}")
187
+ print(f" NDCG@5: {agg['semantic_ndcg5']:.2f}")
188
+
189
+ print()
190
+ print("FILTER QUERIES (temporal, genre, awards)")
191
+ print("-" * 60)
192
+ print(f" Precision@1: {agg['filter_p1']:.2f}")
193
+ print(f" Recall@5: {agg['filter_r5']:.2f}")
194
+
195
+ edge = cats.get("edge_no_match", {})
196
+ passed = edge.get("passed", 0)
197
+ total = edge.get("total", 0)
198
+ print()
199
+ print("EDGE CASES")
200
+ print("-" * 60)
201
+ print(f" Correctly rejected: {passed}/{total}")
202
+
203
+ # Per-category breakdown
204
+ print()
205
+ print("PER-CATEGORY BREAKDOWN")
206
+ print("-" * 60)
207
+ _SEM = {"geographic", "thematic", "people"}
208
+ for cat, data in cats.items():
209
+ if cat == "edge_no_match":
210
+ print(f" {cat:20s} {data['passed']}/{data['total']} rejected")
211
+ else:
212
+ label = "semantic" if cat in _SEM else "filter"
213
+ print(f" {cat:20s} P@1={data['p1']:.2f} R@5={data['r5']:.2f} ({data['count']} queries) [{label}]")
214
+
215
+ print("=" * 60)
216
+
217
+
218
+ def print_verbose(run_data: dict) -> None:
219
+ """Print per-query details before the aggregate scores."""
220
+ print()
221
+ for i, q in enumerate(run_data["queries"]):
222
+ if q["category"] == "edge_no_match":
223
+ status = "PASS" if q["success"] else "FAIL"
224
+ print(f" [{status}] Q{i+1}: {q['query']}")
225
+ if not q["success"]:
226
+ print(f" Returned {q['num_returned']} results (expected 0)")
227
+ else:
228
+ status = "PASS" if q["r5"] > 0 else "MISS"
229
+ print(f" [{status}] Q{i+1}: {q['query']}")
230
+ print(f" P@1={q['p1']:.0f} R@5={q['r5']:.2f} MRR={q['mrr']:.2f} NDCG@5={q['ndcg5']:.2f} ({q['duration']:.1f}s)")
231
+ if q.get("missed"):
232
+ print(f" Missed: {q['missed']}")
233
+
234
+
235
+ def print_diff(current: dict, previous: dict) -> None:
236
+ """Print comparison between two runs — deltas, regressions, improvements."""
237
+ prev_ts = previous["timestamp"][:16].replace("T", " ")
238
+ print()
239
+ print(f"COMPARISON vs {prev_ts}")
240
+ print("-" * 60)
241
+
242
+ cur_agg = current["aggregates"]
243
+ prev_agg = previous["aggregates"]
244
+
245
+ _diff_line("semantic P@1", prev_agg["semantic_p1"], cur_agg["semantic_p1"])
246
+ _diff_line("semantic R@5", prev_agg["semantic_r5"], cur_agg["semantic_r5"])
247
+ _diff_line("filter P@1", prev_agg["filter_p1"], cur_agg["filter_p1"])
248
+ _diff_line("filter R@5", prev_agg["filter_r5"], cur_agg["filter_r5"])
249
+
250
+ # Edge cases
251
+ prev_edge = previous["categories"].get("edge_no_match", {})
252
+ cur_edge = current["categories"].get("edge_no_match", {})
253
+ prev_e = prev_edge.get("passed", 0)
254
+ cur_e = cur_edge.get("passed", 0)
255
+ total_e = cur_edge.get("total", 0)
256
+ delta_e = cur_e - prev_e
257
+ arrow = " ▲" if delta_e > 0 else " ▼" if delta_e < 0 else ""
258
+ print(f" edge rejected: {prev_e}/{total_e} → {cur_e}/{total_e} ({'+' if delta_e >= 0 else ''}{delta_e}){arrow}")
259
+
260
+ # Per-query regressions and improvements
261
+ prev_queries = {q["query"]: q for q in previous["queries"]}
262
+ regressions = []
263
+ improvements = []
264
+
265
+ for q in current["queries"]:
266
+ if q["category"] == "edge_no_match":
267
+ continue
268
+ prev_q = prev_queries.get(q["query"])
269
+ if not prev_q or prev_q["category"] == "edge_no_match":
270
+ continue
271
+ delta = q["r5"] - prev_q["r5"]
272
+ if delta < -0.01:
273
+ regressions.append((q["query"], prev_q["r5"], q["r5"], delta))
274
+ elif delta > 0.01:
275
+ improvements.append((q["query"], prev_q["r5"], q["r5"], delta))
276
+
277
+ if regressions:
278
+ print()
279
+ print(f"REGRESSIONS ({len(regressions)}):")
280
+ for query, old, new, delta in regressions:
281
+ print(f' "{query}"')
282
+ print(f" R@5: {old:.2f} → {new:.2f} ({delta:+.2f}) ▼")
283
+
284
+ if improvements:
285
+ print()
286
+ print(f"IMPROVEMENTS ({len(improvements)}):")
287
+ for query, old, new, delta in improvements:
288
+ print(f' "{query}"')
289
+ print(f" R@5: {old:.2f} → {new:.2f} ({delta:+.2f}) ▲")
290
+
291
+ if not regressions and not improvements:
292
+ print()
293
+ print(" No per-query changes.")
294
+
295
+ print()
296
+
297
+
298
+ def _diff_line(label: str, old: float, new: float) -> None:
299
+ delta = new - old
300
+ if abs(delta) < 0.005:
301
+ arrow = "(=)"
302
+ elif delta > 0:
303
+ arrow = f"(+{delta:.2f}) ▲"
304
+ else:
305
+ arrow = f"({delta:.2f}) ▼ REGRESSION"
306
+ print(f" {label:16s} {old:.2f} → {new:.2f} {arrow}")
307
+
308
+
309
+ def print_history(runs: list[dict]) -> None:
310
+ """Print one-liner per run with trend indicators."""
311
+ if not runs:
312
+ print("No eval runs found.")
313
+ return
314
+
315
+ print()
316
+ print(f"EVAL HISTORY ({len(runs)} runs)")
317
+ print("-" * 60)
318
+
319
+ prev = None
320
+ for run in runs:
321
+ ts = run["timestamp"][:16].replace("T", "_").replace("-", "-").replace(":", "-")
322
+ # Use just date_time for display
323
+ display_ts = run["timestamp"][:16].replace("T", " ")
324
+ agg = run["aggregates"]
325
+ edge = run["categories"].get("edge_no_match", {})
326
+ edge_str = f"{edge.get('passed', 0)}/{edge.get('total', 0)}"
327
+
328
+ line = f" {display_ts} sem_R@5={agg['semantic_r5']:.2f} filt_R@5={agg['filter_r5']:.2f} edge={edge_str}"
329
+
330
+ if prev:
331
+ prev_agg = prev["aggregates"]
332
+ sem_delta = agg["semantic_r5"] - prev_agg["semantic_r5"]
333
+ filt_delta = agg["filter_r5"] - prev_agg["filter_r5"]
334
+ if sem_delta > 0.01 and filt_delta > 0.01:
335
+ line += " ▲ both"
336
+ elif sem_delta > 0.01:
337
+ line += f" ▲ sem +{sem_delta:.2f}"
338
+ elif filt_delta > 0.01:
339
+ line += f" ▲ filt +{filt_delta:.2f}"
340
+ elif sem_delta < -0.01 or filt_delta < -0.01:
341
+ line += " ▼"
342
+
343
+ print(line)
344
+ prev = run
345
+
346
+ print()
347
+ ```
348
+
349
+ - [ ] **Step 2: Verify module imports**
350
+
351
+ Run: `source .venv/bin/activate && python -c "from mediastorm.eval.display import print_scores, print_diff, print_history, print_verbose; print('OK')"`
352
+ Expected: `OK`
353
+
354
+ - [ ] **Step 3: Commit**
355
+
356
+ ```bash
357
+ git add src/mediastorm/eval/display.py
358
+ git commit -m "feat(eval): add display module for console formatting and diffs"
359
+ ```
360
+
361
+ ---
362
+
363
+ ### Task 3: CLI command — wire it all together
364
+
365
+ **Files:**
366
+ - Modify: `cli.py`
367
+
368
+ - [ ] **Step 1: Add eval command to `cli.py`**
369
+
370
+ Add after the `audit` command (before `if __name__`):
371
+
372
+ ```python
373
+ @cli.command(name="eval")
374
+ @click.option("--verbose", "-v", is_flag=True, help="Show per-query details.")
375
+ @click.option("--history", is_flag=True, help="Show history of past runs.")
376
+ def eval_cmd(verbose: bool, history: bool):
377
+ """Run retrieval evaluation and compare to previous run."""
378
+ from mediastorm.eval.runner import (
379
+ _build_run_data, save_run, load_previous_run, load_all_runs,
380
+ )
381
+ from mediastorm.eval.display import (
382
+ print_scores, print_verbose, print_diff, print_history,
383
+ )
384
+
385
+ if history:
386
+ runs = load_all_runs()
387
+ print_history(runs)
388
+ return
389
+
390
+ # Load previous run BEFORE running eval (so the new run isn't compared to itself)
391
+ previous = load_previous_run()
392
+
393
+ # Run evaluation
394
+ from eval_retrieval import run_eval
395
+ click.echo("Running retrieval evaluation...")
396
+ eval_result = asyncio.run(run_eval(verbose=False))
397
+
398
+ # Build and save run data
399
+ run_data = _build_run_data(eval_result)
400
+ path = save_run(run_data)
401
+ click.echo(f"Results saved to {path}")
402
+
403
+ # Display
404
+ if verbose:
405
+ print_verbose(run_data)
406
+ print_scores(run_data)
407
+
408
+ if previous:
409
+ print_diff(run_data, previous)
410
+ else:
411
+ click.echo("\nFirst run — no comparison available.")
412
+ ```
413
+
414
+ - [ ] **Step 2: Test the command runs**
415
+
416
+ Run: `source .venv/bin/activate && python cli.py eval`
417
+ Expected: scores table printed, JSON saved to `data/eval_runs/`, "First run — no comparison available." (Until Task 4 is done, `run_eval()`'s own output is printed as well.)
418
+
419
+ - [ ] **Step 3: Test verbose mode**
420
+
421
+ Run: `source .venv/bin/activate && python cli.py eval --verbose`
422
+ Expected: per-query details printed before scores, plus comparison vs first run
423
+
424
+ - [ ] **Step 4: Test history**
425
+
426
+ Run: `source .venv/bin/activate && python cli.py eval --history`
427
+ Expected: two runs listed with trend indicator
428
+
429
+ - [ ] **Step 5: Commit**
430
+
431
+ ```bash
432
+ git add cli.py
433
+ git commit -m "feat(eval): add cli.py eval command with diff and history"
434
+ ```
435
+
436
+ ---
437
+
438
+ ### Task 4: Suppress eval_retrieval.py stdout
439
+
440
+ **Files:**
441
+ - Modify: `eval_retrieval.py:284-393` (every `print()` call in `run_eval`, from the header block through the aggregates and breakdown)
442
+
443
+ The current `run_eval()` prints directly to stdout. Since `display.py` handles all formatting, we need to suppress the prints when called from the CLI. Add a `quiet` parameter.
444
+
445
+ - [ ] **Step 1: Add `quiet` parameter to `run_eval()`**
446
+
447
+ In `eval_retrieval.py`, change the signature and wrap all `print()` calls:
448
+
449
+ ```python
450
+ async def run_eval(verbose: bool = False, quiet: bool = False) -> dict:
451
+ ```
452
+
453
+ Then wrap every `print(...)` in `run_eval` with `if not quiet:`. This affects:
454
+ - Lines 284-287 (header)
455
+ - Lines 310-314 (edge verbose)
456
+ - Lines 335-340 (scored verbose)
457
+ - Lines 359-393 (aggregates and breakdown)
458
+
459
+ - [ ] **Step 2: Update CLI to use `quiet=True`**
460
+
461
+ In `cli.py`, change the `run_eval` call:
462
+
463
+ ```python
464
+ eval_result = asyncio.run(run_eval(verbose=False, quiet=True))
465
+ ```
466
+
467
+ - [ ] **Step 3: Verify standalone script still works**
468
+
469
+ Run: `source .venv/bin/activate && python eval_retrieval.py --verbose`
470
+ Expected: same output as before (quiet defaults to False)
471
+
472
+ - [ ] **Step 4: Verify CLI suppresses duplicate output**
473
+
474
+ Run: `source .venv/bin/activate && python cli.py eval`
475
+ Expected: only the display module's output, no duplicate headers
476
+
477
+ - [ ] **Step 5: Commit**
478
+
479
+ ```bash
480
+ git add eval_retrieval.py cli.py
481
+ git commit -m "feat(eval): add quiet mode to eval_retrieval to avoid duplicate output"
482
+ ```
483
+
484
+ ---
485
+
486
+ ### Task 5: Enrich run data with expected IDs from ground truth
487
+
488
+ **Files:**
489
+ - Modify: `eval_retrieval.py:298-342` (the per-query loop)
490
+
491
+ Currently `run_eval()` doesn't include `expected` IDs in the returned details. We need them so the verbose output can show missed UIDs and each saved run records its ground truth.
492
+
493
+ - [ ] **Step 1: Add expected IDs to each result row**
494
+
495
+ In `eval_retrieval.py`, inside the per-query loop, add `expected` to each row dict. For non-edge queries (around lines 316-330):
496
+
497
+ ```python
498
+ row = {
499
+ "query": query,
500
+ "category": category,
501
+ "precision_at_1": p1,
502
+ "recall_at_5": r5,
503
+ "mrr": m,
504
+ "ndcg_at_5": n5,
505
+ "retrieved": retrieved_ids,
506
+ "expected": list(expected),
507
+ "missed": list(expected - set(retrieved_ids)),
508
+ "duration": duration,
509
+ }
510
+ ```
511
+
512
+ For edge queries (around lines 302-308):
513
+
514
+ ```python
515
+ row = {
516
+ "query": query,
517
+ "category": category,
518
+ "success": success,
519
+ "num_returned": len(retrieval.stories),
520
+ "expected": [],
521
+ "duration": duration,
522
+ }
523
+ ```
524
+
525
+ - [ ] **Step 2: Update `_build_run_data` in `runner.py` to use the enriched data**
526
+
527
+ In `runner.py`, simplify the query building since expected/missed now come from eval_retrieval:
528
+
529
+ ```python
530
+ for row in details:
531
+ if row["category"] == "edge_no_match":
532
+ queries.append({
533
+ "query": row["query"],
534
+ "category": row["category"],
535
+ "success": row["success"],
536
+ "num_returned": row["num_returned"],
537
+ "duration": row["duration"],
538
+ })
539
+ else:
540
+ queries.append({
541
+ "query": row["query"],
542
+ "category": row["category"],
543
+ "p1": row["precision_at_1"],
544
+ "r5": row["recall_at_5"],
545
+ "mrr": row["mrr"],
546
+ "ndcg5": row["ndcg_at_5"],
547
+ "retrieved_ids": row["retrieved"],
548
+ "expected_ids": row["expected"],
549
+ "missed": row["missed"],
550
+ "duration": row["duration"],
551
+ })
552
+ ```
553
+
554
+ - [ ] **Step 3: Verify**
555
+
556
+ Run: `source .venv/bin/activate && python cli.py eval --verbose`
557
+ Expected: missed UIDs shown in verbose output
558
+
559
+ - [ ] **Step 4: Commit**
560
+
561
+ ```bash
562
+ git add eval_retrieval.py src/mediastorm/eval/runner.py
563
+ git commit -m "feat(eval): include expected IDs and missed UIDs in eval results"
564
+ ```