"""Command-line entry point for the two-phase evaluation harness. Usage:: # Phase 1 -- runs the model, spends quota; start with a small slice. uv run python -m eval.run_eval predict --dataset sroie --limit 20 # Phase 2 -- offline; recompute metrics and sweep as often as you like. uv run python -m eval.run_eval score --dataset sroie The predict phase caches results under ``eval/cache//`` and is idempotent (already-cached ids are skipped unless ``--overwrite``). The score phase reads that cache and prints the metrics tables and threshold sweep; it never re-runs inference. """ from __future__ import annotations import argparse import logging import sys from pathlib import Path from eval.cache import DEFAULT_CACHE_BASE from eval.predict import run_predict from eval.score import build_report, format_report def _add_common(parser: argparse.ArgumentParser) -> None: parser.add_argument("--dataset", default="sroie", help="Dataset adapter name (default: sroie).") parser.add_argument( "--cache-base", type=Path, default=DEFAULT_CACHE_BASE, help="Root cache directory (default: eval/cache).", ) def _build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(prog="eval.run_eval", description=__doc__) subparsers = parser.add_subparsers(dest="command", required=True) predict = subparsers.add_parser("predict", help="Run the model over a slice and cache results.") _add_common(predict) predict.add_argument( "--limit", type=int, default=20, help="Number of examples to process (the held-out slice size; default: 20).", ) predict.add_argument( "--overwrite", action="store_true", help="Re-process and overwrite already-cached examples.", ) score = subparsers.add_parser("score", help="Compute metrics + sweep from the cache (offline).") _add_common(score) return parser def main(argv: list[str] | None = None) -> int: """Run the CLI. Args: argv: Argument list (defaults to ``sys.argv[1:]``). Returns: Process exit code (0 on success). """ logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s", ) args = _build_parser().parse_args(argv) if args.command == "predict": stats = run_predict( args.dataset, args.limit, cache_base=args.cache_base, overwrite=args.overwrite, ) print( f"\nPredict complete for {stats.dataset}: " f"processed={stats.processed} skipped={stats.skipped} " f"accepted={stats.accepted} review={stats.review} errors={stats.errors} " f"failed={stats.failed}\n" f"Now run: uv run python -m eval.run_eval score --dataset {stats.dataset}" ) return 0 if args.command == "score": report = build_report(args.dataset, cache_base=args.cache_base) print(format_report(report)) return 0 return 1 # pragma: no cover -- argparse enforces a valid subcommand. if __name__ == "__main__": sys.exit(main())