# Last change by: yin
# Commit: feat(mvp): wire router → mupdf parser → OCR quality scorer closed loop
# Revision: d423504
"""pdfsys-bench CLI — run the MVP closed loop on a directory of PDFs.

Usage::

    python -m pdfsys_bench \\
        --pdf-dir packages/pdfsys-bench/omnidocbench_100/pdfs \\
        --out out/bench_omnidoc100.jsonl \\
        --limit 20

Flags exposed here are intentionally minimal — anything more is the job
of a proper runner package. This CLI is meant for smoke-testing.
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
from .loop import run_loop
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``pdfsys-bench`` command line.

    Two options are mandatory (``--pdf-dir`` and ``--out``); every other
    option has a default. Path-like options are converted to
    :class:`pathlib.Path`, ``--limit`` to ``int`` and ``--ocr-threshold``
    to ``float``.

    Returns:
        A configured :class:`argparse.ArgumentParser`; no arguments are
        parsed here.
    """
    parser = argparse.ArgumentParser(
        prog="pdfsys-bench",
        description="Run the MVP pdfsys closed loop.",
    )

    # One (flag, add_argument options) pair per CLI option. The tuple order
    # is the registration order, which is also the order shown by --help.
    option_table = (
        (
            "--pdf-dir",
            {
                "type": Path,
                "required": True,
                "help": "Directory of PDFs to process (recursive).",
            },
        ),
        (
            "--out",
            {
                "type": Path,
                "required": True,
                "help": "Output JSONL path (one line per PDF).",
            },
        ),
        (
            "--limit",
            {
                "type": int,
                "default": None,
                "help": "Cap the number of PDFs processed. Default: no cap.",
            },
        ),
        (
            "--no-quality",
            {
                "action": "store_true",
                "help": "Skip the ModernBERT quality scorer (fast smoke test).",
            },
        ),
        (
            "--quality-model",
            {
                "default": "HuggingFaceFW/finepdfs_ocr_quality_classifier_eng_Latn",
                "help": "HuggingFace repo id for the quality scorer.",
            },
        ),
        (
            "--router-weights",
            {
                "type": Path,
                "default": None,
                "help": "Path to xgb_classifier.ubj. Defaults to the package's bundled path.",
            },
        ),
        (
            "--markdown-dir",
            {
                "type": Path,
                "default": None,
                "help": "Optional directory to dump per-PDF extracted markdown.",
            },
        ),
        (
            "--ocr-threshold",
            {
                "type": float,
                "default": 0.5,
                "help": "P(ocr) threshold above which a PDF is routed off the text-ok path.",
            },
        ),
    )

    for flag, options in option_table:
        parser.add_argument(flag, **options)

    return parser
def main(argv: list[str] | None = None) -> int:
    """Run the closed loop from the command line and print a short report.

    Args:
        argv: Argument strings to parse. ``None`` lets argparse fall back
            to ``sys.argv[1:]``.

    Returns:
        Always ``0``. The error count from the run is printed but does not
        influence the return value.
    """
    opts = build_parser().parse_args(argv)

    # Translate CLI options into run_loop keyword arguments. The CLI flag is
    # negative (--no-quality) while run_loop takes the positive form.
    loop_kwargs = {
        "pdf_dir": opts.pdf_dir,
        "out_path": opts.out,
        "limit": opts.limit,
        "score_quality": not opts.no_quality,
        "router_weights": opts.router_weights,
        "quality_model": opts.quality_model,
        "markdown_dir": opts.markdown_dir,
        "ocr_threshold": opts.ocr_threshold,
    }
    summary = run_loop(**loop_kwargs)

    # Lines are printed one at a time (not batched) so that a missing summary
    # key still leaves the earlier lines on stdout.
    print(f"[pdfsys-bench] processed {summary['num_pdfs']} PDFs in {summary['wall_seconds']:.1f}s")
    print(f"[pdfsys-bench] by_backend: {summary['by_backend']}")
    print(f"[pdfsys-bench] extracted={summary['num_extracted']} scored={summary['num_scored']} errors={summary['num_errors']}")

    # avg_quality may be absent or None; the line is printed only when a value exists.
    avg_quality = summary.get("avg_quality")
    if avg_quality is not None:
        print(f"[pdfsys-bench] avg_quality={avg_quality:.3f}")

    print(f"[pdfsys-bench] jsonl: {summary['out_path']}")
    print(f"[pdfsys-bench] summary: {summary['summary_path']}")
    return 0
# Script entry point: the integer returned by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())