Spaces:
Sleeping
Sleeping
feat(eval): add cli.py eval command with diff and history
Browse files
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
cli.py
CHANGED
|
@@ -192,5 +192,46 @@ def audit(subset: int, verbose: bool):
|
|
| 192 |
asyncio.run(run_audit(subset=subset, verbose=verbose))
|
| 193 |
|
| 194 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
if __name__ == "__main__":
|
| 196 |
cli()
|
|
|
|
| 192 |
asyncio.run(run_audit(subset=subset, verbose=verbose))
|
| 193 |
|
| 194 |
|
| 195 |
+
@cli.command(name="eval")
@click.option("--verbose", "-v", is_flag=True, help="Show per-query details.")
@click.option("--history", is_flag=True, help="Show history of past runs.")
def eval_cmd(verbose: bool, history: bool):
    """Run retrieval evaluation and compare to previous run."""
    # NOTE: click shows the docstring above as this command's --help text, so
    # implementation notes are kept in comments instead of the docstring.
    #
    # verbose: when set, print_verbose() is called before the score summary.
    # history: when set, stored runs are listed and no new evaluation is run.

    # Project helpers are imported inside the command body, not at module top.
    from mediastorm.eval.runner import (
        _build_run_data,
        save_run,
        load_previous_run,
        load_all_runs,
    )
    from mediastorm.eval.display import (
        print_scores,
        print_verbose,
        print_diff,
        print_history,
    )

    # History mode is read-only: show what is stored and stop.
    if history:
        print_history(load_all_runs())
        return

    # Fetch the baseline *before* evaluating and saving, so the run produced
    # below is never compared against itself.
    baseline = load_previous_run()

    from eval_retrieval import run_eval

    click.echo("Running retrieval evaluation...")
    # The evaluator is invoked with verbose off and quiet on; all reporting is
    # done afterwards through the display helpers.
    outcome = asyncio.run(run_eval(verbose=False, quiet=True))

    # Persist the new run first, then report on it.
    current = _build_run_data(outcome)
    saved_to = save_run(current)
    click.echo(f"Results saved to {saved_to}")

    if verbose:
        print_verbose(current)
    print_scores(current)

    # Without a baseline there is nothing to diff against.
    if not baseline:
        click.echo("\nFirst run — no comparison available.")
        return
    print_diff(current, baseline)
|
| 234 |
+
|
| 235 |
+
|
| 236 |
if __name__ == "__main__":
    # Executed as a script: hand control to the `cli` command object.
    cli()
|