Spaces:

roger1024
/

DocPipe

Runtime error

jieluo1024

chore: update Gradio to 6.12.0 and fix HF Spaces compatibility

4345562 about 1 month ago

13.9 kB

	"""Gradio demo for the pdfsys-mnbvc MVP pipeline.

	What this demonstrates (matching the code that actually exists in the
	repo today, not the aspirational PRD):

	* Stage-A XGBoost router — decides text-ok vs needs-ocr from 124
	PyMuPDF-derived features.
	* MuPDF fast path — extracts Markdown-ready segments when the router
	picks ``Backend.MUPDF``. Overlaid on the first page as colored bboxes.
	* ModernBERT OCR quality scorer — optional, heavy (~800 MB download,
	3–5 s per doc on CPU). Off by default to keep the demo snappy.

	PIPELINE / VLM / DEFERRED backends are surfaced through the router
	decision but are still stubs in ``packages/pdfsys-parser-*``; the UI
	just reports the routing choice in that case and skips extraction.

	Runs locally (``python demo/app.py``) and as a Hugging Face Space (see
	the repo-root ``README.md`` frontmatter and ``demo/README.md``).
	"""

	from __future__ import annotations

	import json
	import os
	import sys
	import tempfile
	import traceback
	from pathlib import Path

	import gradio as gr

	# Allow ``python demo/app.py`` without installing the workspace by falling
	# back to the in-tree sources. When running under HF Spaces / uv sync the
	# packages are already on sys.path and these inserts become no-ops.
	_REPO_ROOT = Path(__file__).resolve().parent.parent
	for pkg in ("pdfsys-core", "pdfsys-router", "pdfsys-parser-mupdf", "pdfsys-bench"):
	    src = _REPO_ROOT.joinpath("packages", pkg, "src")
	    # Skip packages without an in-tree ``src`` dir and entries already on the path.
	    if not src.is_dir() or str(src) in sys.path:
	        continue
	    # Prepend so the in-tree sources are found first.
	    sys.path.insert(0, str(src))

	from pipeline import ( # noqa: E402 — must come after sys.path surgery
	PipelineResult,
	pick_curated_features,
	render_first_page_with_bboxes,
	run_pipeline,
	)


	# ------------------------------------------------------------------ constants

	# Markdown intro rendered at the very top of the page by ``build_demo()``.
	DESCRIPTION = """\
	# PDFSystem-MNBVC · Pipeline Demo

	FinePDFs-inspired PB-scale PDF → pretraining-data pipeline, adapted
	for the Chinese MNBVC corpus. This demo shows the MVP closed loop that
	is actually implemented in the repo today:

	Router (XGBoost, 124 features) → MuPDF fast path → OCR Quality Scorer (ModernBERT)

	The router decides whether a PDF is cheap to parse with PyMuPDF alone,
	or whether it needs to go to the (still-stubbed) OCR / VLM backends.
	Roughly 90% of a typical PDF corpus takes the green fast-path lane.
	"""

	# Markdown shown under the controls in the left column of ``build_demo()``:
	# a fenced text diagram of the pipeline followed by the backend colour legend.
	# NOTE(review): the diagram's column alignment looks collapsed in this copy —
	# confirm the spacing against the repository version before shipping.
	PIPELINE_DIAGRAM_MD = """\
	### Pipeline

	```
	┌────────────────┐
	PDF ───────►│ Stage-A │ XGBoost · ~10 ms/PDF
	│ Router │ 124 PyMuPDF features
	└────────┬───────┘
	│ ocr_prob
	┌─────────────┼─────────────┐
	▼ ▼ ▼
	MUPDF PIPELINE VLM / DEFERRED
	(text-ok) (OCR, stub) (VLM, stub)
	│
	▼
	PyMuPDF blocks ─► Markdown + Segments (with bboxes)
	│
	▼
	ModernBERT-large OCR quality regressor ─► score ∈ [0, 3]
	```

	Backend color legend on page preview

	- 🟢 `mupdf` — text-ok fast path (implemented)
	- 🟠 `pipeline` — OCR lane (stub, routing only)
	- 🟣 `vlm` — VLM lane (stub, routing only)
	- ⚪ `deferred` — held back until VLM workers online
	"""


	def _safe(val, default=""):
	"""Coerce NaN / None for Gradio components that don't like them."""
	if val is None:
	return default
	try:
	import math

	if isinstance(val, float) and math.isnan(val):
	return default
	except Exception:
	pass
	return val


	# ------------------------------------------------------------------ handlers


	def _build_summary_md(pdf_path, result):
	    """Render the Markdown summary shown above the output tabs.

	    Args:
	        pdf_path: ``Path`` of the processed PDF; its name and on-disk size
	            are reported.
	        result: The ``PipelineResult`` returned by ``run_pipeline``.

	    Returns:
	        Markdown text with one paragraph per summary line.
	    """
	    lines = [
	        f"File: `{pdf_path.name}` ({pdf_path.stat().st_size / 1024:.1f} KB)",
	        f"Routed to: `{result.backend}`  ·  "
	        f"P(ocr) = {result.ocr_prob:.3f}  ·  {result.num_pages} page(s)",
	    ]

	    flags = []
	    if result.is_form:
	        flags.append("is_form")
	    if result.is_encrypted:
	        flags.append("encrypted")
	    if result.needs_password:
	        flags.append("password-protected")
	    if result.garbled_text_ratio > 0.01:
	        flags.append(f"garbled_text_ratio={result.garbled_text_ratio:.2%}")
	    if flags:
	        lines.append("Flags: " + ", ".join(f"`{f}`" for f in flags))

	    for label, err in (
	        ("Router error", result.router_error),
	        ("Extract error", result.extract_error),
	        ("Quality error", result.quality_error),
	    ):
	        if err:
	            lines.append(f"{label}: `{err}`")

	    if result.backend != "mupdf":
	        lines.append(
	            "_MuPDF extraction skipped — backend is not `mupdf`. "
	            "PIPELINE/VLM backends are still stubs in this repo._"
	        )
	    elif not result.extract_error:
	        stats = result.extract_stats or {}
	        lines.append(
	            f"Extracted: {stats.get('segment_count', 0)} segments, "
	            f"{stats.get('char_count', 0):,} chars "
	            f"(pages {stats.get('pages_extracted', 0)}/{stats.get('page_count', 0)})"
	        )
	    # A failed MuPDF extraction is already covered by the "Extract error"
	    # line above; claiming the backend "is not mupdf" there would be false.

	    if result.quality_score is not None:
	        lines.append(
	            f"OCR quality: {result.quality_score:.2f} / 3.0 "
	            f"({result.quality_num_tokens} tokens, `{result.quality_model}`)"
	        )

	    lines.append(
	        f"Timing (ms): router {result.wall_ms_router:.0f} · "
	        f"extract {result.wall_ms_extract:.0f} · "
	        f"quality {result.wall_ms_quality:.0f}"
	    )
	    return "\n\n".join(lines)


	def process_pdf(
	    pdf_file: str | None,
	    run_quality: bool,
	    ocr_threshold: float,
	    progress: gr.Progress = gr.Progress(),
	):
	    """Main Gradio callback: run the pipeline on one PDF and format the outputs.

	    Args:
	        pdf_file: Filesystem path of the uploaded PDF; falsy when nothing
	            has been uploaded.
	        run_quality: Forwarded to ``run_pipeline`` as ``run_quality``.
	        ocr_threshold: Forwarded to ``run_pipeline`` as ``ocr_threshold``.
	        progress: Gradio progress tracker used to report stage messages.

	    Returns:
	        An 11-tuple, one value per output component, in this order:
	        summary Markdown, backend, P(OCR), page count, quality text,
	        total wall time in ms, first-page preview image, Markdown text,
	        segment rows, router-feature rows, raw JSON string.
	        Any exception raised while routing/extracting/rendering is caught
	        and reported through the same tuple instead of propagating.
	    """
	    empty_segments = [[0, 0, "-", "-", 0, ""]]
	    empty_features = [["(no PDF uploaded)", ""]]

	    def placeholder(summary, markdown, raw_json):
	        # Shared shape for the "nothing uploaded" and "pipeline failed" cases.
	        return (
	            summary,
	            "", 0.0, 0, "", 0.0,
	            None,
	            markdown,
	            empty_segments,
	            empty_features,
	            raw_json,
	        )

	    if not pdf_file:
	        return placeholder("Upload a PDF to get started.", "_No markdown yet._", "{}")

	    pdf_path = Path(pdf_file)

	    try:
	        progress(0.1, desc="Routing (XGBoost)…")
	        result: PipelineResult = run_pipeline(
	            pdf_path,
	            run_quality=run_quality,
	            ocr_threshold=ocr_threshold,
	        )

	        progress(0.7, desc="Rendering first page…")
	        preview = render_first_page_with_bboxes(pdf_path, result, page_index=0)
	    except Exception as e:  # noqa: BLE001 — UI boundary: show the failure, don't crash
	        tb = traceback.format_exc()
	        err_json = {"error": str(e), "traceback": tb.splitlines()[-6:]}
	        return placeholder(
	            f"Failed: `{e}`",
	            f"```\n{tb}\n```",
	            json.dumps(err_json, indent=2, ensure_ascii=False),
	        )

	    summary_md = _build_summary_md(pdf_path, result)

	    # ------------------------------------------------------------- markdown
	    md_text = (result.markdown or "").strip()
	    if not md_text:
	        if result.backend == "mupdf":
	            # Routed to MuPDF but nothing came back (empty or failed extraction).
	            md_text = "_No markdown — MuPDF extraction produced no text._"
	        else:
	            md_text = "_No markdown — this PDF was not routed to MuPDF._"
	    if len(md_text) > 20_000:
	        md_text = md_text[:20_000] + "\n\n…\n\n[truncated for UI — full Markdown in the JSON tab]"

	    # ------------------------------------------------------------- segments
	    seg_rows = [
	        [s["index"], s["page"], s["type"], str(s["bbox_norm"]), s["chars"], s["preview"]]
	        for s in result.segments
	    ] or empty_segments

	    # ------------------------------------------------------------- features
	    feat_rows = pick_curated_features(result.router_features) or empty_features

	    # ------------------------------------------------------------- raw JSON
	    raw = result.to_record()
	    raw["router_features_full"] = result.router_features
	    raw["segments_full"] = result.segments
	    raw_json_str = json.dumps(raw, indent=2, ensure_ascii=False, default=str)

	    # NaN is the only value that is not equal to itself; show 0.0 for it.
	    ocr_prob = float(result.ocr_prob) if result.ocr_prob == result.ocr_prob else 0.0
	    quality_text = "-" if result.quality_score is None else f"{result.quality_score:.2f} / 3.0"
	    total_ms = float(result.wall_ms_router + result.wall_ms_extract + result.wall_ms_quality)

	    return (
	        summary_md,
	        result.backend,
	        ocr_prob,
	        int(result.num_pages),
	        quality_text,
	        total_ms,
	        preview,
	        md_text,
	        seg_rows,
	        feat_rows,
	        raw_json_str,
	    )


	# ---------------------------------------------------------------------- UI

	# Custom stylesheet handed to ``launch(css=...)`` in the ``__main__`` block:
	# emphasises inputs inside ``.small-num`` components and hides the page footer.
	CSS = """
	.small-num input { font-weight: 600; font-size: 1.1rem; }
	footer { display: none !important; }
	"""


	def build_demo() -> gr.Blocks:
	    """Construct the demo's Blocks layout and hook up its event handlers.

	    The page is a header, a two-column row (controls plus pipeline diagram
	    on the left; summary, stat boxes and result tabs on the right) and a
	    footer with links. Clicking the button and uploading a file both call
	    ``process_pdf``; the upload path always passes ``False`` for the
	    quality-scorer flag.

	    Returns:
	        The assembled ``gr.Blocks`` app (not launched).
	    """

	    def stat_kwargs():
	        # Fresh dict/list per component so nothing is shared between them.
	        return {"interactive": False, "elem_classes": ["small-num"]}

	    with gr.Blocks(title="PDFSystem-MNBVC Demo") as demo:
	        gr.Markdown(DESCRIPTION)

	        with gr.Row():
	            # ---- left: upload, options, run button, diagram
	            with gr.Column(scale=1, min_width=320):
	                upload_box = gr.File(
	                    label="Upload a PDF",
	                    file_types=[".pdf"],
	                    type="filepath",
	                )
	                with gr.Accordion("Options", open=True):
	                    threshold_slider = gr.Slider(
	                        0.0,
	                        1.0,
	                        value=0.5,
	                        step=0.05,
	                        label="OCR probability threshold",
	                        info="ocr_prob ≥ threshold ⇒ route off the MuPDF fast path",
	                    )
	                    quality_toggle = gr.Checkbox(
	                        label="Run ModernBERT quality scorer",
	                        value=False,
	                        info="~3–5 s on CPU. First run downloads ~800 MB.",
	                    )
	                run_button = gr.Button("Run Pipeline", variant="primary", size="lg")
	                gr.Markdown(PIPELINE_DIAGRAM_MD)

	            # ---- right: everything the pipeline reports back
	            with gr.Column(scale=2, min_width=520):
	                summary_box = gr.Markdown(
	                    "Upload a PDF and click Run Pipeline.",
	                )

	                with gr.Row():
	                    backend_box = gr.Textbox(label="Backend", **stat_kwargs())
	                    prob_box = gr.Number(label="P(OCR)", precision=3, **stat_kwargs())
	                    pages_box = gr.Number(label="Pages", **stat_kwargs())
	                    quality_box = gr.Textbox(label="Quality", **stat_kwargs())
	                    total_ms_box = gr.Number(label="Total ms", precision=0, **stat_kwargs())

	                with gr.Tabs():
	                    with gr.Tab("Page preview"):
	                        preview_box = gr.Image(
	                            label="First page with extracted bboxes",
	                            type="pil",
	                            interactive=False,
	                            height=720,
	                        )
	                    with gr.Tab("Markdown"):
	                        markdown_box = gr.Markdown()
	                    with gr.Tab("Segments"):
	                        segments_table = gr.Dataframe(
	                            headers=["idx", "page", "type", "bbox_norm", "chars", "preview"],
	                            datatype=["number", "number", "str", "str", "number", "str"],
	                            wrap=True,
	                            label="Extracted segments (one row per block)",
	                        )
	                    with gr.Tab("Router features"):
	                        features_table = gr.Dataframe(
	                            headers=["feature", "value"],
	                            datatype=["str", "str"],
	                            label="Curated subset (full 124-dim vector in Raw JSON)",
	                        )
	                    with gr.Tab("Raw JSON"):
	                        raw_json_box = gr.Code(label="All pipeline outputs", language="json")

	        # ---- event wiring; order must mirror the tuple process_pdf returns
	        result_components = [
	            summary_box,
	            backend_box,
	            prob_box,
	            pages_box,
	            quality_box,
	            total_ms_box,
	            preview_box,
	            markdown_box,
	            segments_table,
	            features_table,
	            raw_json_box,
	        ]
	        run_button.click(
	            process_pdf,
	            inputs=[upload_box, quality_toggle, threshold_slider],
	            outputs=result_components,
	        )
	        # Uploading a file triggers a run straight away, quality scoring forced off.
	        upload_box.upload(
	            lambda f, t: process_pdf(f, False, t),
	            inputs=[upload_box, threshold_slider],
	            outputs=result_components,
	        )

	        gr.Markdown(
	            "---\n"
	            "Repo: [pdfsystem_mnbvc](https://github.com/) · "
	            "Architecture: [FinePDFs](https://huggingface.co/datasets/HuggingFaceFW/finepdfs) · "
	            "Router weights: FinePDFs upstream (Apache-2.0) · "
	            "Quality model: `HuggingFaceFW/finepdfs_ocr_quality_classifier_eng_Latn`"
	        )

	    return demo


	# Built at import time so the app object exists as a module attribute.
	demo = build_demo()


	if __name__ == "__main__":
	    # Host and port come from the environment when set, otherwise fall back
	    # to all interfaces on 7860.
	    server_name = os.getenv("GRADIO_SERVER_NAME", "0.0.0.0")
	    server_port = int(os.getenv("GRADIO_SERVER_PORT", "7860"))
	    demo.queue(max_size=8).launch(
	        server_name=server_name,
	        server_port=server_port,
	        theme=gr.themes.Soft(primary_hue="emerald"),
	        css=CSS,
	    )