remdms Claude Sonnet 4.6 committed on
Commit
a36a4f5
·
1 Parent(s): 51fa501

feat(eval): add cli.py eval command with diff and history

Browse files

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. cli.py +41 -0
cli.py CHANGED
@@ -192,5 +192,46 @@ def audit(subset: int, verbose: bool):
192
  asyncio.run(run_audit(subset=subset, verbose=verbose))
193
 
194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  if __name__ == "__main__":
196
  cli()
 
192
  asyncio.run(run_audit(subset=subset, verbose=verbose))
193
 
194
 
195
@cli.command(name="eval")
@click.option("--verbose", "-v", is_flag=True, help="Show per-query details.")
@click.option("--history", is_flag=True, help="Show history of past runs.")
def eval_cmd(verbose: bool, history: bool) -> None:
    """Run retrieval evaluation and compare to previous run."""
    # NOTE: the docstring above doubles as the click help text for this
    # command, so extra detail is kept in comments rather than added there.
    #
    # verbose: when set, per-query details are printed before the scores.
    # history: when set, stored runs are listed and no evaluation is run.

    # Imported inside the command body rather than at module level, so the
    # eval package is only loaded when this command is actually invoked.
    from mediastorm.eval.runner import (
        _build_run_data,
        save_run,
        load_previous_run,
        load_all_runs,
    )
    from mediastorm.eval.display import (
        print_scores,
        print_verbose,
        print_diff,
        print_history,
    )

    # History mode: show the stored runs and stop before any evaluation.
    if history:
        runs = load_all_runs()
        print_history(runs)
        return

    # Load previous run BEFORE running eval (so the new run isn't compared to itself)
    previous = load_previous_run()

    # Run evaluation. run_eval is always called with verbose=False/quiet=True;
    # the --verbose flag only affects the display step further down.
    from eval_retrieval import run_eval

    click.echo("Running retrieval evaluation...")
    eval_result = asyncio.run(run_eval(verbose=False, quiet=True))

    # Build and save run data
    run_data = _build_run_data(eval_result)
    path = save_run(run_data)
    click.echo(f"Results saved to {path}")

    # Display: optional per-query details first, then the scores.
    if verbose:
        print_verbose(run_data)
    print_scores(run_data)

    # Compare against the run loaded before this evaluation, if there was one.
    if previous:
        print_diff(run_data, previous)
    else:
        click.echo("\nFirst run — no comparison available.")
234
+
235
+
236
  if __name__ == "__main__":
237
  cli()