Spaces:

roger1024
/

DocPipe

Runtime error

DocPipe / packages /pdfsys-bench /src /pdfsys_bench /__main__.py

yin

feat(mvp): wire router → mupdf parser → OCR quality scorer closed loop

d423504 about 2 months ago

2.97 kB

	"""pdfsys-bench CLI — run the MVP closed loop on a directory of PDFs.

	Usage::

	    python -m pdfsys_bench \\
	        --pdf-dir packages/pdfsys-bench/omnidocbench_100/pdfs \\
	        --out out/bench_omnidoc100.jsonl \\
	        --limit 20

	Flags exposed here are intentionally minimal — anything more is the job
	of a proper runner package. This CLI is meant for smoke-testing.
	"""

	from __future__ import annotations

	import argparse
	import sys
	from pathlib import Path

	from .loop import run_loop


	def build_parser() -> argparse.ArgumentParser:
	    """Construct the argument parser for the ``pdfsys-bench`` CLI.

	    Flags are declared in a table and registered in order, so the
	    ``--help`` listing follows the table order.

	    Returns:
	        A parser with two required path options (``--pdf-dir``, ``--out``)
	        and six optional ones: ``--limit``, ``--no-quality``,
	        ``--quality-model``, ``--router-weights``, ``--markdown-dir`` and
	        ``--ocr-threshold``.
	    """
	    # (flag, add_argument keyword arguments), in display order.
	    option_table = (
	        (
	            "--pdf-dir",
	            {
	                "type": Path,
	                "required": True,
	                "help": "Directory of PDFs to process (recursive).",
	            },
	        ),
	        (
	            "--out",
	            {
	                "type": Path,
	                "required": True,
	                "help": "Output JSONL path (one line per PDF).",
	            },
	        ),
	        (
	            "--limit",
	            {
	                "type": int,
	                "default": None,
	                "help": "Cap the number of PDFs processed. Default: no cap.",
	            },
	        ),
	        (
	            "--no-quality",
	            {
	                "action": "store_true",
	                "help": "Skip the ModernBERT quality scorer (fast smoke test).",
	            },
	        ),
	        (
	            "--quality-model",
	            {
	                "default": "HuggingFaceFW/finepdfs_ocr_quality_classifier_eng_Latn",
	                "help": "HuggingFace repo id for the quality scorer.",
	            },
	        ),
	        (
	            "--router-weights",
	            {
	                "type": Path,
	                "default": None,
	                "help": "Path to xgb_classifier.ubj. Defaults to the package's bundled path.",
	            },
	        ),
	        (
	            "--markdown-dir",
	            {
	                "type": Path,
	                "default": None,
	                "help": "Optional directory to dump per-PDF extracted markdown.",
	            },
	        ),
	        (
	            "--ocr-threshold",
	            {
	                "type": float,
	                "default": 0.5,
	                "help": "P(ocr) threshold above which a PDF is routed off the text-ok path.",
	            },
	        ),
	    )

	    parser = argparse.ArgumentParser(
	        prog="pdfsys-bench",
	        description="Run the MVP pdfsys closed loop.",
	    )
	    for flag, options in option_table:
	        parser.add_argument(flag, **options)
	    return parser


	def main(argv: list[str] | None = None) -> int:
	    """Parse CLI arguments, run the closed loop, and print a short report.

	    Args:
	        argv: Argument strings to parse. ``None`` is handed to
	            ``ArgumentParser.parse_args`` unchanged, which then falls back
	            to the process command line.

	    Returns:
	        Always ``0``. The error count reported by ``run_loop`` is printed
	        but is not turned into a non-zero exit status here.
	    """
	    args = build_parser().parse_args(argv)
	    summary = run_loop(
	        pdf_dir=args.pdf_dir,
	        out_path=args.out,
	        limit=args.limit,
	        # The CLI flag is negative (--no-quality); run_loop takes the positive form.
	        score_quality=not args.no_quality,
	        router_weights=args.router_weights,
	        quality_model=args.quality_model,
	        markdown_dir=args.markdown_dir,
	        ocr_threshold=args.ocr_threshold,
	    )

	    print(f"[pdfsys-bench] processed {summary['num_pdfs']} PDFs in {summary['wall_seconds']:.1f}s")
	    print(f"[pdfsys-bench] by_backend: {summary['by_backend']}")
	    print(f"[pdfsys-bench] extracted={summary['num_extracted']} scored={summary['num_scored']} errors={summary['num_errors']}")
	    # avg_quality may be missing or None; skip the line rather than
	    # formatting None with a float spec.
	    if summary.get("avg_quality") is not None:
	        print(f"[pdfsys-bench] avg_quality={summary['avg_quality']:.3f}")
	    print(f"[pdfsys-bench] jsonl: {summary['out_path']}")
	    print(f"[pdfsys-bench] summary: {summary['summary_path']}")
	    return 0


	if __name__ == "__main__":
	    # Script entry point: use main()'s return value as the exit status.
	    exit_code = main()
	    sys.exit(exit_code)