| """Gradio demo for the pdfsys-mnbvc MVP pipeline. |
| |
| What this demonstrates (matching the code that actually exists in the |
| repo today, not the aspirational PRD): |
| |
| * Stage-A XGBoost router — decides text-ok vs needs-ocr from 124 |
| PyMuPDF-derived features. |
| * MuPDF fast path — extracts Markdown-ready segments when the router |
| picks ``Backend.MUPDF``. Overlaid on the first page as colored bboxes. |
| * ModernBERT OCR quality scorer — optional, heavy (~800 MB download, |
| 3–5 s per doc on CPU). Off by default to keep the demo snappy. |
| |
| PIPELINE / VLM / DEFERRED backends are surfaced through the router |
| decision but are still stubs in ``packages/pdfsys-parser-*``; the UI |
| just reports the routing choice in that case and skips extraction. |
| |
| Runs locally (``python demo/app.py``) and as a Hugging Face Space (see |
| the repo-root ``README.md`` frontmatter and ``demo/README.md``). |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| import os |
| import sys |
| import tempfile |
| import traceback |
| from pathlib import Path |
|
|
| import gradio as gr |
|
|
| |
| |
| |
| _REPO_ROOT = Path(__file__).resolve().parent.parent |
| for pkg in ("pdfsys-core", "pdfsys-router", "pdfsys-parser-mupdf", "pdfsys-bench"): |
| src = _REPO_ROOT / "packages" / pkg / "src" |
| if src.is_dir() and str(src) not in sys.path: |
| sys.path.insert(0, str(src)) |
|
|
| from pipeline import ( |
| PipelineResult, |
| pick_curated_features, |
| render_first_page_with_bboxes, |
| run_pipeline, |
| ) |
|
|
|
|
| |
|
|
| DESCRIPTION = """\ |
| # PDFSystem-MNBVC · Pipeline Demo |
| |
| **FinePDFs-inspired PB-scale PDF → pretraining-data pipeline**, adapted |
| for the Chinese MNBVC corpus. This demo shows the MVP closed loop that |
| is actually implemented in the repo today: |
| |
| **Router (XGBoost, 124 features)** → **MuPDF fast path** → **OCR Quality Scorer (ModernBERT)** |
| |
| The router decides whether a PDF is cheap to parse with PyMuPDF alone, |
| or whether it needs to go to the (still-stubbed) OCR / VLM backends. |
| Roughly 90% of a typical PDF corpus takes the green fast-path lane. |
| """ |
|
|
| PIPELINE_DIAGRAM_MD = """\ |
| ### Pipeline |
| |
| ``` |
| ┌────────────────┐ |
| PDF ───────►│ Stage-A │ XGBoost · ~10 ms/PDF |
| │ Router │ 124 PyMuPDF features |
| └────────┬───────┘ |
| │ ocr_prob |
| ┌─────────────┼─────────────┐ |
| ▼ ▼ ▼ |
| MUPDF PIPELINE VLM / DEFERRED |
| (text-ok) (OCR, stub) (VLM, stub) |
| │ |
| ▼ |
| PyMuPDF blocks ─► Markdown + Segments (with bboxes) |
| │ |
| ▼ |
| ModernBERT-large OCR quality regressor ─► score ∈ [0, 3] |
| ``` |
| |
| **Backend color legend on page preview** |
| |
| - 🟢 `mupdf` — text-ok fast path (implemented) |
| - 🟠 `pipeline` — OCR lane (stub, routing only) |
| - 🟣 `vlm` — VLM lane (stub, routing only) |
| - ⚪ `deferred` — held back until VLM workers online |
| """ |
|
|
|
|
| def _safe(val, default=""): |
| """Coerce NaN / None for Gradio components that don't like them.""" |
| if val is None: |
| return default |
| try: |
| import math |
|
|
| if isinstance(val, float) and math.isnan(val): |
| return default |
| except Exception: |
| pass |
| return val |
|
|
|
|
| |
|
|
|
|
| def process_pdf( |
| pdf_file: str | None, |
| run_quality: bool, |
| ocr_threshold: float, |
| progress: gr.Progress = gr.Progress(), |
| ): |
| """Main Gradio callback. Returns one value per output component.""" |
| empty_segments = [[0, 0, "-", "-", 0, ""]] |
| empty_features = [["(no PDF uploaded)", ""]] |
| empty_summary = "Upload a PDF to get started." |
|
|
| if not pdf_file: |
| return ( |
| empty_summary, |
| "", 0.0, 0, "", 0.0, |
| None, |
| "_No markdown yet._", |
| empty_segments, |
| empty_features, |
| "{}", |
| ) |
|
|
| pdf_path = Path(pdf_file) |
|
|
| try: |
| progress(0.1, desc="Routing (XGBoost)…") |
| result: PipelineResult = run_pipeline( |
| pdf_path, |
| run_quality=run_quality, |
| ocr_threshold=ocr_threshold, |
| ) |
|
|
| progress(0.7, desc="Rendering first page…") |
| preview = render_first_page_with_bboxes(pdf_path, result, page_index=0) |
|
|
| except Exception as e: |
| tb = traceback.format_exc() |
| err_json = {"error": str(e), "traceback": tb.splitlines()[-6:]} |
| return ( |
| f"**Failed:** `{e}`", |
| "", 0.0, 0, "", 0.0, |
| None, |
| f"```\n{tb}\n```", |
| empty_segments, |
| empty_features, |
| json.dumps(err_json, indent=2, ensure_ascii=False), |
| ) |
|
|
| |
| lines = [ |
| f"**File:** `{pdf_path.name}` ({pdf_path.stat().st_size / 1024:.1f} KB)", |
| f"**Routed to:** `{result.backend}` · " |
| f"P(ocr) = **{result.ocr_prob:.3f}** · {result.num_pages} page(s)", |
| ] |
| flags = [] |
| if result.is_form: |
| flags.append("is_form") |
| if result.is_encrypted: |
| flags.append("encrypted") |
| if result.needs_password: |
| flags.append("password-protected") |
| if result.garbled_text_ratio > 0.01: |
| flags.append(f"garbled_text_ratio={result.garbled_text_ratio:.2%}") |
| if flags: |
| lines.append("**Flags:** " + ", ".join(f"`{f}`" for f in flags)) |
| if result.router_error: |
| lines.append(f"**Router error:** `{result.router_error}`") |
| if result.extract_error: |
| lines.append(f"**Extract error:** `{result.extract_error}`") |
| if result.quality_error: |
| lines.append(f"**Quality error:** `{result.quality_error}`") |
|
|
| if result.backend == "mupdf" and not result.extract_error: |
| stats = result.extract_stats |
| lines.append( |
| f"**Extracted:** {stats.get('segment_count', 0)} segments, " |
| f"{stats.get('char_count', 0):,} chars " |
| f"(pages {stats.get('pages_extracted', 0)}/{stats.get('page_count', 0)})" |
| ) |
| else: |
| lines.append( |
| "_MuPDF extraction skipped — backend is not `mupdf`. " |
| "PIPELINE/VLM backends are still stubs in this repo._" |
| ) |
|
|
| if result.quality_score is not None: |
| lines.append( |
| f"**OCR quality:** **{result.quality_score:.2f}** / 3.0 " |
| f"({result.quality_num_tokens} tokens, `{result.quality_model}`)" |
| ) |
|
|
| lines.append( |
| f"**Timing (ms):** router **{result.wall_ms_router:.0f}** · " |
| f"extract **{result.wall_ms_extract:.0f}** · " |
| f"quality **{result.wall_ms_quality:.0f}**" |
| ) |
| summary_md = "\n\n".join(lines) |
|
|
| |
| md_text = result.markdown.strip() or "_No markdown — this PDF was not routed to MuPDF._" |
| if len(md_text) > 20_000: |
| md_text = md_text[:20_000] + "\n\n…\n\n**[truncated for UI — full Markdown in the JSON tab]**" |
|
|
| |
| seg_rows = [ |
| [s["index"], s["page"], s["type"], str(s["bbox_norm"]), s["chars"], s["preview"]] |
| for s in result.segments |
| ] or empty_segments |
|
|
| |
| feat_rows = pick_curated_features(result.router_features) or empty_features |
|
|
| |
| raw = result.to_record() |
| raw["router_features_full"] = result.router_features |
| raw["segments_full"] = result.segments |
| raw_json_str = json.dumps(raw, indent=2, ensure_ascii=False, default=str) |
|
|
| return ( |
| summary_md, |
| result.backend, |
| float(result.ocr_prob) if result.ocr_prob == result.ocr_prob else 0.0, |
| int(result.num_pages), |
| ("-" if result.quality_score is None else f"{result.quality_score:.2f} / 3.0"), |
| float(result.wall_ms_router + result.wall_ms_extract + result.wall_ms_quality), |
| preview, |
| md_text, |
| seg_rows, |
| feat_rows, |
| raw_json_str, |
| ) |
|
|
|
|
| |
|
|
| CSS = """ |
| .small-num input { font-weight: 600; font-size: 1.1rem; } |
| footer { display: none !important; } |
| """ |
|
|
|
|
| def build_demo() -> gr.Blocks: |
| with gr.Blocks(title="PDFSystem-MNBVC Demo") as demo: |
| gr.Markdown(DESCRIPTION) |
|
|
| with gr.Row(): |
| |
| with gr.Column(scale=1, min_width=320): |
| pdf_input = gr.File( |
| label="Upload a PDF", |
| file_types=[".pdf"], |
| type="filepath", |
| ) |
| with gr.Accordion("Options", open=True): |
| ocr_threshold = gr.Slider( |
| 0.0, 1.0, value=0.5, step=0.05, |
| label="OCR probability threshold", |
| info="ocr_prob ≥ threshold ⇒ route off the MuPDF fast path", |
| ) |
| run_quality = gr.Checkbox( |
| label="Run ModernBERT quality scorer", |
| value=False, |
| info="~3–5 s on CPU. First run downloads ~800 MB.", |
| ) |
| run_btn = gr.Button("Run Pipeline", variant="primary", size="lg") |
| gr.Markdown(PIPELINE_DIAGRAM_MD) |
|
|
| |
| with gr.Column(scale=2, min_width=520): |
| summary_md = gr.Markdown( |
| "Upload a PDF and click **Run Pipeline**.", |
| ) |
|
|
| with gr.Row(): |
| backend_out = gr.Textbox( |
| label="Backend", interactive=False, elem_classes=["small-num"] |
| ) |
| ocr_prob_out = gr.Number( |
| label="P(OCR)", interactive=False, precision=3, |
| elem_classes=["small-num"], |
| ) |
| pages_out = gr.Number( |
| label="Pages", interactive=False, |
| elem_classes=["small-num"], |
| ) |
| quality_out = gr.Textbox( |
| label="Quality", interactive=False, |
| elem_classes=["small-num"], |
| ) |
| wall_ms_out = gr.Number( |
| label="Total ms", interactive=False, precision=0, |
| elem_classes=["small-num"], |
| ) |
|
|
| with gr.Tabs(): |
| with gr.Tab("Page preview"): |
| preview_img = gr.Image( |
| label="First page with extracted bboxes", |
| type="pil", |
| interactive=False, |
| height=720, |
| ) |
| with gr.Tab("Markdown"): |
| md_out = gr.Markdown() |
| with gr.Tab("Segments"): |
| seg_df = gr.Dataframe( |
| headers=["idx", "page", "type", "bbox_norm", "chars", "preview"], |
| datatype=["number", "number", "str", "str", "number", "str"], |
| wrap=True, |
| label="Extracted segments (one row per block)", |
| ) |
| with gr.Tab("Router features"): |
| feat_df = gr.Dataframe( |
| headers=["feature", "value"], |
| datatype=["str", "str"], |
| label="Curated subset (full 124-dim vector in Raw JSON)", |
| ) |
| with gr.Tab("Raw JSON"): |
| raw_json = gr.Code(label="All pipeline outputs", language="json") |
|
|
| |
| outputs = [ |
| summary_md, |
| backend_out, ocr_prob_out, pages_out, quality_out, wall_ms_out, |
| preview_img, |
| md_out, |
| seg_df, |
| feat_df, |
| raw_json, |
| ] |
| run_btn.click( |
| process_pdf, |
| inputs=[pdf_input, run_quality, ocr_threshold], |
| outputs=outputs, |
| ) |
| |
| pdf_input.upload( |
| lambda f, t: process_pdf(f, False, t), |
| inputs=[pdf_input, ocr_threshold], |
| outputs=outputs, |
| ) |
|
|
| gr.Markdown( |
| "---\n" |
| "Repo: [pdfsystem_mnbvc](https://github.com/) · " |
| "Architecture: [FinePDFs](https://huggingface.co/datasets/HuggingFaceFW/finepdfs) · " |
| "Router weights: FinePDFs upstream (Apache-2.0) · " |
| "Quality model: `HuggingFaceFW/finepdfs_ocr_quality_classifier_eng_Latn`" |
| ) |
|
|
| return demo |
|
|
|
|
| demo = build_demo() |
|
|
|
|
| if __name__ == "__main__": |
| |
| server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0") |
| server_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860")) |
| demo.queue(max_size=8).launch( |
| server_name=server_name, |
| server_port=server_port, |
| theme=gr.themes.Soft(primary_hue="emerald"), |
| css=CSS, |
| ) |
|
|