File size: 2,974 Bytes
d423504
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""pdfsys-bench CLI — run the MVP closed loop on a directory of PDFs.

Usage::

    python -m pdfsys_bench \\
        --pdf-dir packages/pdfsys-bench/omnidocbench_100/pdfs \\
        --out out/bench_omnidoc100.jsonl \\
        --limit 20

Flags exposed here are intentionally minimal — anything more is the job
of a proper runner package. This CLI is meant for smoke-testing.
"""

from __future__ import annotations

import argparse
import sys
from pathlib import Path

from .loop import run_loop


def build_parser() -> argparse.ArgumentParser:
    """Construct the argument parser for the ``pdfsys-bench`` command line.

    The options are declared as an ordered table of ``(flag, keyword
    arguments)`` pairs and registered in that order, so ``--help`` lists
    them exactly as they appear below. ``--pdf-dir`` and ``--out`` are
    required; every other option is optional and carries a default.

    Returns:
        A parser whose namespace exposes ``pdf_dir``, ``out``, ``limit``,
        ``no_quality``, ``quality_model``, ``router_weights``,
        ``markdown_dir`` and ``ocr_threshold``.
    """
    option_table = (
        (
            "--pdf-dir",
            {
                "type": Path,
                "required": True,
                "help": "Directory of PDFs to process (recursive).",
            },
        ),
        (
            "--out",
            {
                "type": Path,
                "required": True,
                "help": "Output JSONL path (one line per PDF).",
            },
        ),
        (
            "--limit",
            {
                "type": int,
                "default": None,
                "help": "Cap the number of PDFs processed. Default: no cap.",
            },
        ),
        (
            "--no-quality",
            {
                "action": "store_true",
                "help": "Skip the ModernBERT quality scorer (fast smoke test).",
            },
        ),
        (
            "--quality-model",
            {
                "default": "HuggingFaceFW/finepdfs_ocr_quality_classifier_eng_Latn",
                "help": "HuggingFace repo id for the quality scorer.",
            },
        ),
        (
            "--router-weights",
            {
                "type": Path,
                "default": None,
                "help": "Path to xgb_classifier.ubj. Defaults to the package's bundled path.",
            },
        ),
        (
            "--markdown-dir",
            {
                "type": Path,
                "default": None,
                "help": "Optional directory to dump per-PDF extracted markdown.",
            },
        ),
        (
            "--ocr-threshold",
            {
                "type": float,
                "default": 0.5,
                "help": "P(ocr) threshold above which a PDF is routed off the text-ok path.",
            },
        ),
    )

    parser = argparse.ArgumentParser(
        prog="pdfsys-bench",
        description="Run the MVP pdfsys closed loop.",
    )
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    return parser


def main(argv: list[str] | None = None) -> int:
    """Parse the command line, run the closed loop once, and print a report.

    Args:
        argv: Argument strings to parse. ``None`` is passed straight to
            ``argparse``, which then falls back to ``sys.argv[1:]``.

    Returns:
        ``0`` whenever the function returns normally; the error count
        reported by the loop is printed but does not affect the exit code.
    """
    opts = build_parser().parse_args(argv)

    # Translate CLI option names into the keyword names run_loop expects.
    loop_kwargs = {
        "pdf_dir": opts.pdf_dir,
        "out_path": opts.out,
        "limit": opts.limit,
        "score_quality": not opts.no_quality,
        "router_weights": opts.router_weights,
        "quality_model": opts.quality_model,
        "markdown_dir": opts.markdown_dir,
        "ocr_threshold": opts.ocr_threshold,
    }
    stats = run_loop(**loop_kwargs)

    print(f"[pdfsys-bench] processed {stats['num_pdfs']} PDFs in {stats['wall_seconds']:.1f}s")
    print(f"[pdfsys-bench] by_backend: {stats['by_backend']}")
    print(f"[pdfsys-bench] extracted={stats['num_extracted']} scored={stats['num_scored']} errors={stats['num_errors']}")

    # The average quality line is printed only when a value is present.
    avg_quality = stats.get("avg_quality")
    if avg_quality is not None:
        print(f"[pdfsys-bench] avg_quality={avg_quality:.3f}")

    print(f"[pdfsys-bench] jsonl: {stats['out_path']}")
    print(f"[pdfsys-bench] summary: {stats['summary_path']}")
    return 0


if __name__ == "__main__":
    # Script entry point: hand main()'s return value to the interpreter as
    # the process exit status.
    exit_status = main()
    sys.exit(exit_status)