"""Command-line entry point for the two-phase evaluation harness.

Usage::

    # Phase 1 -- runs the model, spends quota; start with a small slice.
    uv run python -m eval.run_eval predict --dataset sroie --limit 20

    # Phase 2 -- offline; recompute metrics and sweep as often as you like.
    uv run python -m eval.run_eval score --dataset sroie

The predict phase caches results under ``eval/cache/<dataset>/`` and is
idempotent (already-cached ids are skipped unless ``--overwrite``). The score
phase reads that cache and prints the metrics tables and threshold sweep; it
never re-runs inference.
"""

from __future__ import annotations

import argparse
import logging
import sys
from pathlib import Path

from eval.cache import DEFAULT_CACHE_BASE
from eval.predict import run_predict
from eval.score import build_report, format_report


def _add_common(parser: argparse.ArgumentParser) -> None:
    """Register the options that every subcommand accepts.

    Args:
        parser: The (sub)parser to extend in place with ``--dataset`` and
            ``--cache-base``.
    """
    # Each entry is (flag, keyword arguments for ``add_argument``); the order
    # here is the order the options are registered on the parser.
    shared_options = (
        (
            "--dataset",
            {"default": "sroie", "help": "Dataset adapter name (default: sroie)."},
        ),
        (
            "--cache-base",
            {
                "type": Path,
                "default": DEFAULT_CACHE_BASE,
                "help": "Root cache directory (default: eval/cache).",
            },
        ),
    )
    for flag, settings in shared_options:
        parser.add_argument(flag, **settings)


def _build_parser() -> argparse.ArgumentParser:
    """Build the top-level parser with the ``predict`` and ``score`` subcommands.

    Returns:
        A parser whose parsed namespace carries ``command`` (the chosen
        subcommand, which is required), the shared ``dataset`` and
        ``cache_base`` options and, for ``predict`` only, ``limit`` and
        ``overwrite``.
    """
    parser = argparse.ArgumentParser(
        prog="eval.run_eval",
        description=__doc__,
        # The module docstring contains a hand-formatted usage example
        # (indented commands with comment lines). argparse's default formatter
        # re-wraps the description into one paragraph, which makes ``--help``
        # unreadable; keep the text exactly as written instead.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    predict = subparsers.add_parser("predict", help="Run the model over a slice and cache results.")
    _add_common(predict)
    predict.add_argument(
        "--limit",
        type=int,
        default=20,
        help="Number of examples to process (the held-out slice size; default: 20).",
    )
    predict.add_argument(
        "--overwrite",
        action="store_true",
        help="Re-process and overwrite already-cached examples.",
    )

    # ``score`` only needs the shared options.
    score = subparsers.add_parser("score", help="Compute metrics + sweep from the cache (offline).")
    _add_common(score)

    return parser


def main(argv: list[str] | None = None) -> int:
    """Parse the command line and dispatch to the requested phase.

    Logging is configured at INFO level before the arguments are parsed.
    ``predict`` calls ``run_predict`` and prints a one-off summary of the
    returned stats plus a hint for the follow-up ``score`` command; ``score``
    prints the formatted report built from the cache.

    Args:
        argv: Argument list (defaults to ``sys.argv[1:]``).

    Returns:
        Process exit code (0 on success).
    """
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
        level=logging.INFO,
    )
    options = _build_parser().parse_args(argv)
    command = options.command

    if command == "score":
        print(format_report(build_report(options.dataset, cache_base=options.cache_base)))
        return 0

    if command != "predict":  # pragma: no cover -- argparse enforces a valid subcommand.
        return 1

    stats = run_predict(
        options.dataset,
        options.limit,
        cache_base=options.cache_base,
        overwrite=options.overwrite,
    )
    # Assemble the summary from named parts; the pieces are concatenated in the
    # same order they appear in the printed message.
    headline = f"\nPredict complete for {stats.dataset}: "
    counters = (
        f"processed={stats.processed} skipped={stats.skipped} "
        f"accepted={stats.accepted} review={stats.review} errors={stats.errors} "
        f"failed={stats.failed}\n"
    )
    next_step = f"Now run: uv run python -m eval.run_eval score --dataset {stats.dataset}"
    print(headline + counters + next_step)
    return 0


if __name__ == "__main__":
    # Propagate the CLI's return value as the process exit status.
    exit_code = main()
    sys.exit(exit_code)