"""pdfsys-bench CLI — run the MVP closed loop on a directory of PDFs. Usage:: python -m pdfsys_bench \\ --pdf-dir packages/pdfsys-bench/omnidocbench_100/pdfs \\ --out out/bench_omnidoc100.jsonl \\ --limit 20 Flags exposed here are intentionally minimal — anything more is the job of a proper runner package. This CLI is meant for smoke-testing. """ from __future__ import annotations import argparse import sys from pathlib import Path from .loop import run_loop def build_parser() -> argparse.ArgumentParser: p = argparse.ArgumentParser(prog="pdfsys-bench", description="Run the MVP pdfsys closed loop.") p.add_argument( "--pdf-dir", type=Path, required=True, help="Directory of PDFs to process (recursive).", ) p.add_argument( "--out", type=Path, required=True, help="Output JSONL path (one line per PDF).", ) p.add_argument( "--limit", type=int, default=None, help="Cap the number of PDFs processed. Default: no cap.", ) p.add_argument( "--no-quality", action="store_true", help="Skip the ModernBERT quality scorer (fast smoke test).", ) p.add_argument( "--quality-model", default="HuggingFaceFW/finepdfs_ocr_quality_classifier_eng_Latn", help="HuggingFace repo id for the quality scorer.", ) p.add_argument( "--router-weights", type=Path, default=None, help="Path to xgb_classifier.ubj. Defaults to the package's bundled path.", ) p.add_argument( "--markdown-dir", type=Path, default=None, help="Optional directory to dump per-PDF extracted markdown.", ) p.add_argument( "--ocr-threshold", type=float, default=0.5, help="P(ocr) threshold above which a PDF is routed off the text-ok path.", ) return p def main(argv: list[str] | None = None) -> int: args = build_parser().parse_args(argv) summary = run_loop( pdf_dir=args.pdf_dir, out_path=args.out, limit=args.limit, score_quality=not args.no_quality, router_weights=args.router_weights, quality_model=args.quality_model, markdown_dir=args.markdown_dir, ocr_threshold=args.ocr_threshold, ) print(f"[pdfsys-bench] processed {summary['num_pdfs']} PDFs in {summary['wall_seconds']:.1f}s") print(f"[pdfsys-bench] by_backend: {summary['by_backend']}") print(f"[pdfsys-bench] extracted={summary['num_extracted']} scored={summary['num_scored']} errors={summary['num_errors']}") if summary.get("avg_quality") is not None: print(f"[pdfsys-bench] avg_quality={summary['avg_quality']:.3f}") print(f"[pdfsys-bench] jsonl: {summary['out_path']}") print(f"[pdfsys-bench] summary: {summary['summary_path']}") return 0 if __name__ == "__main__": sys.exit(main())