kennethzychew's picture
phase 5: evaluation harness (SROIE)
d2a6765
Raw
History Blame Contribute Delete
3.21 kB
"""Command-line entry point for the two-phase evaluation harness.

Usage::

    # Phase 1 -- runs the model, spends quota; start with a small slice.
    uv run python -m eval.run_eval predict --dataset sroie --limit 20

    # Phase 2 -- offline; recompute metrics and sweep as often as you like.
    uv run python -m eval.run_eval score --dataset sroie

The predict phase caches results under ``eval/cache/<dataset>/`` and is
idempotent (already-cached ids are skipped unless ``--overwrite``). The score
phase reads that cache and prints the metrics tables and threshold sweep; it
never re-runs inference.
"""
from __future__ import annotations
import argparse
import logging
import sys
from pathlib import Path
from eval.cache import DEFAULT_CACHE_BASE
from eval.predict import run_predict
from eval.score import build_report, format_report
def _add_common(parser: argparse.ArgumentParser) -> None:
    """Attach the options shared by every subcommand.

    Args:
        parser: Subcommand parser to extend in place with ``--dataset`` and
            ``--cache-base``.
    """
    dataset_help = "Dataset adapter name (default: sroie)."
    cache_help = "Root cache directory (default: eval/cache)."

    parser.add_argument("--dataset", default="sroie", help=dataset_help)
    # ``type=Path`` converts a value given on the command line; the default is
    # taken from the cache module as-is.
    parser.add_argument("--cache-base", type=Path, default=DEFAULT_CACHE_BASE, help=cache_help)
def _build_parser() -> argparse.ArgumentParser:
    """Build the top-level parser with its ``predict`` and ``score`` subcommands.

    Returns:
        A parser whose parsed namespace carries ``command`` (the chosen
        subcommand), the shared ``dataset`` / ``cache_base`` options and, for
        ``predict`` only, ``limit`` and ``overwrite``.
    """
    parser = argparse.ArgumentParser(
        prog="eval.run_eval",
        description=__doc__,
        # The module docstring contains multi-line command examples; the
        # default formatter would re-wrap them into a single paragraph, so
        # keep the description's line breaks verbatim in ``--help``.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # ``required=True`` makes argparse reject an invocation with no subcommand.
    subparsers = parser.add_subparsers(dest="command", required=True)

    predict = subparsers.add_parser("predict", help="Run the model over a slice and cache results.")
    _add_common(predict)
    predict.add_argument(
        "--limit",
        type=int,
        default=20,
        help="Number of examples to process (the held-out slice size; default: 20).",
    )
    predict.add_argument(
        "--overwrite",
        action="store_true",
        help="Re-process and overwrite already-cached examples.",
    )

    score = subparsers.add_parser("score", help="Compute metrics + sweep from the cache (offline).")
    _add_common(score)

    return parser
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and dispatch to the requested phase.

    ``predict`` calls ``run_predict`` and prints a summary of the returned
    counters together with the follow-up ``score`` command; ``score`` builds
    the report for the dataset and prints its formatted form.

    Args:
        argv: Argument list (defaults to ``sys.argv[1:]``).

    Returns:
        Process exit code (0 on success).
    """
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
        level=logging.INFO,
    )

    options = _build_parser().parse_args(argv)
    command = options.command

    if command == "score":
        rendered = format_report(build_report(options.dataset, cache_base=options.cache_base))
        print(rendered)
        return 0

    if command == "predict":
        stats = run_predict(
            options.dataset,
            options.limit,
            cache_base=options.cache_base,
            overwrite=options.overwrite,
        )
        summary = (
            f"\nPredict complete for {stats.dataset}: "
            f"processed={stats.processed} skipped={stats.skipped} "
            f"accepted={stats.accepted} review={stats.review} errors={stats.errors} "
            f"failed={stats.failed}\n"
            f"Now run: uv run python -m eval.run_eval score --dataset {stats.dataset}"
        )
        print(summary)
        return 0

    return 1  # pragma: no cover -- argparse enforces a valid subcommand.
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit status.
    exit_code = main()
    sys.exit(exit_code)