Spaces:

knzychw
/

document-extract-agent

Running

App Files Files Community

document-extract-agent / eval /run_eval.py

kennethzychew

phase 5: evaluation harness (SROIE)

d2a6765 2 days ago

Raw

History Blame Contribute Delete

3.21 kB

	"""Command-line entry point for the two-phase evaluation harness.

	Usage::

	# Phase 1 -- runs the model, spends quota; start with a small slice.
	uv run python -m eval.run_eval predict --dataset sroie --limit 20

	# Phase 2 -- offline; recompute metrics and sweep as often as you like.
	uv run python -m eval.run_eval score --dataset sroie

	The predict phase caches results under ``eval/cache/<dataset>/`` and is
	idempotent (already-cached ids are skipped unless ``--overwrite``). The score
	phase reads that cache and prints the metrics tables and threshold sweep; it
	never re-runs inference.
	"""

	from __future__ import annotations

	import argparse
	import logging
	import sys
	from pathlib import Path

	from eval.cache import DEFAULT_CACHE_BASE
	from eval.predict import run_predict
	from eval.score import build_report, format_report


	def _add_common(parser: argparse.ArgumentParser) -> None:
	    """Register the options shared by every subcommand on ``parser``.

	    Adds ``--dataset`` (default ``"sroie"``) and ``--cache-base`` (converted to
	    a ``Path``, default ``DEFAULT_CACHE_BASE``).

	    Args:
	        parser: The (sub)parser to extend in place.
	    """
	    # Declared as data so both options are registered through one code path.
	    shared_options = (
	        (
	            "--dataset",
	            {
	                "default": "sroie",
	                "help": "Dataset adapter name (default: sroie).",
	            },
	        ),
	        (
	            "--cache-base",
	            {
	                "type": Path,
	                "default": DEFAULT_CACHE_BASE,
	                "help": "Root cache directory (default: eval/cache).",
	            },
	        ),
	    )
	    for flag, options in shared_options:
	        parser.add_argument(flag, **options)


	def _build_parser() -> argparse.ArgumentParser:
	    """Build the argument parser with the ``predict`` and ``score`` subcommands.

	    Both subcommands receive the options added by ``_add_common``; ``predict``
	    additionally accepts ``--limit`` (int, default 20) and ``--overwrite``.

	    Returns:
	        The configured parser. A subcommand is required and is stored on the
	        parsed namespace as ``command``.
	    """
	    parser = argparse.ArgumentParser(
	        prog="eval.run_eval",
	        description=__doc__,
	        # The module docstring contains a multi-line usage example; the default
	        # formatter re-wraps the description into one paragraph, so keep the
	        # text exactly as written when rendering ``--help``.
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	    )
	    subparsers = parser.add_subparsers(dest="command", required=True)

	    predict = subparsers.add_parser("predict", help="Run the model over a slice and cache results.")
	    _add_common(predict)
	    predict.add_argument(
	        "--limit",
	        type=int,
	        default=20,
	        help="Number of examples to process (the held-out slice size; default: 20).",
	    )
	    predict.add_argument(
	        "--overwrite",
	        action="store_true",
	        help="Re-process and overwrite already-cached examples.",
	    )

	    score = subparsers.add_parser("score", help="Compute metrics + sweep from the cache (offline).")
	    _add_common(score)

	    return parser


	def main(argv: list[str] | None = None) -> int:
	    """Run the CLI.

	    Configures root logging at INFO level, parses ``argv`` and dispatches on
	    the chosen subcommand: ``predict`` calls ``run_predict`` and prints a
	    one-line summary of the returned stats, ``score`` prints the formatted
	    report produced by ``build_report``.

	    Args:
	        argv: Argument list (defaults to ``sys.argv[1:]``).

	    Returns:
	        Process exit code (0 on success).
	    """
	    logging.basicConfig(
	        level=logging.INFO,
	        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
	    )
	    args = _build_parser().parse_args(argv)

	    if args.command == "predict":
	        stats = run_predict(
	            args.dataset,
	            args.limit,
	            cache_base=args.cache_base,
	            overwrite=args.overwrite,
	        )
	        # Summarise the run and point the user at the follow-up scoring step.
	        print(
	            f"\nPredict complete for {stats.dataset}: "
	            f"processed={stats.processed} skipped={stats.skipped} "
	            f"accepted={stats.accepted} review={stats.review} errors={stats.errors} "
	            f"failed={stats.failed}\n"
	            f"Now run: uv run python -m eval.run_eval score --dataset {stats.dataset}"
	        )
	        return 0

	    if args.command == "score":
	        report = build_report(args.dataset, cache_base=args.cache_base)
	        print(format_report(report))
	        return 0

	    return 1  # pragma: no cover -- argparse enforces a valid subcommand.


	if __name__ == "__main__":
	    # Script entry point: hand the CLI's return value to the interpreter as
	    # the process exit status.
	    exit_code = main()
	    sys.exit(exit_code)