DocPipe / demo /app.py
jieluo1024
chore: update Gradio to 6.12.0 and fix HF Spaces compatibility
4345562
"""Gradio demo for the pdfsys-mnbvc MVP pipeline.
What this demonstrates (matching the code that actually exists in the
repo today, not the aspirational PRD):
* Stage-A XGBoost router — decides text-ok vs needs-ocr from 124
PyMuPDF-derived features.
* MuPDF fast path — extracts Markdown-ready segments when the router
picks ``Backend.MUPDF``. Overlaid on the first page as colored bboxes.
* ModernBERT OCR quality scorer — optional, heavy (~800 MB download,
3–5 s per doc on CPU). Off by default to keep the demo snappy.
PIPELINE / VLM / DEFERRED backends are surfaced through the router
decision but are still stubs in ``packages/pdfsys-parser-*``; the UI
just reports the routing choice in that case and skips extraction.
Runs locally (``python demo/app.py``) and as a Hugging Face Space (see
the repo-root ``README.md`` frontmatter and ``demo/README.md``).
"""
from __future__ import annotations
import json
import os
import sys
import tempfile
import traceback
from pathlib import Path
import gradio as gr
# Allow ``python demo/app.py`` without installing the workspace by falling
# back to the in-tree sources. When running under HF Spaces / uv sync the
# packages are already on sys.path and these inserts become no-ops.
_REPO_ROOT = Path(__file__).resolve().parent.parent
for pkg in ("pdfsys-core", "pdfsys-router", "pdfsys-parser-mupdf", "pdfsys-bench"):
src = _REPO_ROOT / "packages" / pkg / "src"
if src.is_dir() and str(src) not in sys.path:
sys.path.insert(0, str(src))
from pipeline import ( # noqa: E402 — must come after sys.path surgery
PipelineResult,
pick_curated_features,
render_first_page_with_bboxes,
run_pipeline,
)
# ------------------------------------------------------------------ constants
DESCRIPTION = """\
# PDFSystem-MNBVC · Pipeline Demo
**FinePDFs-inspired PB-scale PDF → pretraining-data pipeline**, adapted
for the Chinese MNBVC corpus. This demo shows the MVP closed loop that
is actually implemented in the repo today:
**Router (XGBoost, 124 features)** → **MuPDF fast path** → **OCR Quality Scorer (ModernBERT)**
The router decides whether a PDF is cheap to parse with PyMuPDF alone,
or whether it needs to go to the (still-stubbed) OCR / VLM backends.
Roughly 90% of a typical PDF corpus takes the green fast-path lane.
"""
PIPELINE_DIAGRAM_MD = """\
### Pipeline
```
┌────────────────┐
PDF ───────►│ Stage-A │ XGBoost · ~10 ms/PDF
│ Router │ 124 PyMuPDF features
└────────┬───────┘
│ ocr_prob
┌─────────────┼─────────────┐
▼ ▼ ▼
MUPDF PIPELINE VLM / DEFERRED
(text-ok) (OCR, stub) (VLM, stub)
PyMuPDF blocks ─► Markdown + Segments (with bboxes)
ModernBERT-large OCR quality regressor ─► score ∈ [0, 3]
```
**Backend color legend on page preview**
- 🟢 `mupdf` — text-ok fast path (implemented)
- 🟠 `pipeline` — OCR lane (stub, routing only)
- 🟣 `vlm` — VLM lane (stub, routing only)
- ⚪ `deferred` — held back until VLM workers online
"""
def _safe(val, default=""):
"""Coerce NaN / None for Gradio components that don't like them."""
if val is None:
return default
try:
import math
if isinstance(val, float) and math.isnan(val):
return default
except Exception:
pass
return val
# ------------------------------------------------------------------ handlers
def process_pdf(
pdf_file: str | None,
run_quality: bool,
ocr_threshold: float,
progress: gr.Progress = gr.Progress(),
):
"""Main Gradio callback. Returns one value per output component."""
empty_segments = [[0, 0, "-", "-", 0, ""]]
empty_features = [["(no PDF uploaded)", ""]]
empty_summary = "Upload a PDF to get started."
if not pdf_file:
return (
empty_summary,
"", 0.0, 0, "", 0.0,
None,
"_No markdown yet._",
empty_segments,
empty_features,
"{}",
)
pdf_path = Path(pdf_file)
try:
progress(0.1, desc="Routing (XGBoost)…")
result: PipelineResult = run_pipeline(
pdf_path,
run_quality=run_quality,
ocr_threshold=ocr_threshold,
)
progress(0.7, desc="Rendering first page…")
preview = render_first_page_with_bboxes(pdf_path, result, page_index=0)
except Exception as e: # noqa: BLE001
tb = traceback.format_exc()
err_json = {"error": str(e), "traceback": tb.splitlines()[-6:]}
return (
f"**Failed:** `{e}`",
"", 0.0, 0, "", 0.0,
None,
f"```\n{tb}\n```",
empty_segments,
empty_features,
json.dumps(err_json, indent=2, ensure_ascii=False),
)
# ------------------------------------------------------------- summary
lines = [
f"**File:** `{pdf_path.name}` ({pdf_path.stat().st_size / 1024:.1f} KB)",
f"**Routed to:** `{result.backend}`  ·  "
f"P(ocr) = **{result.ocr_prob:.3f}**  ·  {result.num_pages} page(s)",
]
flags = []
if result.is_form:
flags.append("is_form")
if result.is_encrypted:
flags.append("encrypted")
if result.needs_password:
flags.append("password-protected")
if result.garbled_text_ratio > 0.01:
flags.append(f"garbled_text_ratio={result.garbled_text_ratio:.2%}")
if flags:
lines.append("**Flags:** " + ", ".join(f"`{f}`" for f in flags))
if result.router_error:
lines.append(f"**Router error:** `{result.router_error}`")
if result.extract_error:
lines.append(f"**Extract error:** `{result.extract_error}`")
if result.quality_error:
lines.append(f"**Quality error:** `{result.quality_error}`")
if result.backend == "mupdf" and not result.extract_error:
stats = result.extract_stats
lines.append(
f"**Extracted:** {stats.get('segment_count', 0)} segments, "
f"{stats.get('char_count', 0):,} chars "
f"(pages {stats.get('pages_extracted', 0)}/{stats.get('page_count', 0)})"
)
else:
lines.append(
"_MuPDF extraction skipped — backend is not `mupdf`. "
"PIPELINE/VLM backends are still stubs in this repo._"
)
if result.quality_score is not None:
lines.append(
f"**OCR quality:** **{result.quality_score:.2f}** / 3.0 "
f"({result.quality_num_tokens} tokens, `{result.quality_model}`)"
)
lines.append(
f"**Timing (ms):** router **{result.wall_ms_router:.0f}** · "
f"extract **{result.wall_ms_extract:.0f}** · "
f"quality **{result.wall_ms_quality:.0f}**"
)
summary_md = "\n\n".join(lines)
# ------------------------------------------------------------- markdown
md_text = result.markdown.strip() or "_No markdown — this PDF was not routed to MuPDF._"
if len(md_text) > 20_000:
md_text = md_text[:20_000] + "\n\n…\n\n**[truncated for UI — full Markdown in the JSON tab]**"
# ------------------------------------------------------------- segments
seg_rows = [
[s["index"], s["page"], s["type"], str(s["bbox_norm"]), s["chars"], s["preview"]]
for s in result.segments
] or empty_segments
# ------------------------------------------------------------- features
feat_rows = pick_curated_features(result.router_features) or empty_features
# ------------------------------------------------------------- raw JSON
raw = result.to_record()
raw["router_features_full"] = result.router_features
raw["segments_full"] = result.segments
raw_json_str = json.dumps(raw, indent=2, ensure_ascii=False, default=str)
return (
summary_md,
result.backend,
float(result.ocr_prob) if result.ocr_prob == result.ocr_prob else 0.0,
int(result.num_pages),
("-" if result.quality_score is None else f"{result.quality_score:.2f} / 3.0"),
float(result.wall_ms_router + result.wall_ms_extract + result.wall_ms_quality),
preview,
md_text,
seg_rows,
feat_rows,
raw_json_str,
)
# ---------------------------------------------------------------------- UI
CSS = """
.small-num input { font-weight: 600; font-size: 1.1rem; }
footer { display: none !important; }
"""
def build_demo() -> gr.Blocks:
with gr.Blocks(title="PDFSystem-MNBVC Demo") as demo:
gr.Markdown(DESCRIPTION)
with gr.Row():
# -------------------- left column: controls + diagram
with gr.Column(scale=1, min_width=320):
pdf_input = gr.File(
label="Upload a PDF",
file_types=[".pdf"],
type="filepath",
)
with gr.Accordion("Options", open=True):
ocr_threshold = gr.Slider(
0.0, 1.0, value=0.5, step=0.05,
label="OCR probability threshold",
info="ocr_prob ≥ threshold ⇒ route off the MuPDF fast path",
)
run_quality = gr.Checkbox(
label="Run ModernBERT quality scorer",
value=False,
info="~3–5 s on CPU. First run downloads ~800 MB.",
)
run_btn = gr.Button("Run Pipeline", variant="primary", size="lg")
gr.Markdown(PIPELINE_DIAGRAM_MD)
# -------------------- right column: outputs
with gr.Column(scale=2, min_width=520):
summary_md = gr.Markdown(
"Upload a PDF and click **Run Pipeline**.",
)
with gr.Row():
backend_out = gr.Textbox(
label="Backend", interactive=False, elem_classes=["small-num"]
)
ocr_prob_out = gr.Number(
label="P(OCR)", interactive=False, precision=3,
elem_classes=["small-num"],
)
pages_out = gr.Number(
label="Pages", interactive=False,
elem_classes=["small-num"],
)
quality_out = gr.Textbox(
label="Quality", interactive=False,
elem_classes=["small-num"],
)
wall_ms_out = gr.Number(
label="Total ms", interactive=False, precision=0,
elem_classes=["small-num"],
)
with gr.Tabs():
with gr.Tab("Page preview"):
preview_img = gr.Image(
label="First page with extracted bboxes",
type="pil",
interactive=False,
height=720,
)
with gr.Tab("Markdown"):
md_out = gr.Markdown()
with gr.Tab("Segments"):
seg_df = gr.Dataframe(
headers=["idx", "page", "type", "bbox_norm", "chars", "preview"],
datatype=["number", "number", "str", "str", "number", "str"],
wrap=True,
label="Extracted segments (one row per block)",
)
with gr.Tab("Router features"):
feat_df = gr.Dataframe(
headers=["feature", "value"],
datatype=["str", "str"],
label="Curated subset (full 124-dim vector in Raw JSON)",
)
with gr.Tab("Raw JSON"):
raw_json = gr.Code(label="All pipeline outputs", language="json")
# ----------------------------------------------------------- wiring
outputs = [
summary_md,
backend_out, ocr_prob_out, pages_out, quality_out, wall_ms_out,
preview_img,
md_out,
seg_df,
feat_df,
raw_json,
]
run_btn.click(
process_pdf,
inputs=[pdf_input, run_quality, ocr_threshold],
outputs=outputs,
)
# Auto-run on file upload (with quality off for snappiness).
pdf_input.upload(
lambda f, t: process_pdf(f, False, t),
inputs=[pdf_input, ocr_threshold],
outputs=outputs,
)
gr.Markdown(
"---\n"
"Repo: [pdfsystem_mnbvc](https://github.com/) · "
"Architecture: [FinePDFs](https://huggingface.co/datasets/HuggingFaceFW/finepdfs) · "
"Router weights: FinePDFs upstream (Apache-2.0) · "
"Quality model: `HuggingFaceFW/finepdfs_ocr_quality_classifier_eng_Latn`"
)
return demo
demo = build_demo()
if __name__ == "__main__":
# Sensible defaults for both local dev and HF Spaces.
server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
server_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
demo.queue(max_size=8).launch(
server_name=server_name,
server_port=server_port,
theme=gr.themes.Soft(primary_hue="emerald"),
css=CSS,
)