adelevett's picture
polygon layout rendering
8504c7e
"""
app.py – Marker + PP-DocLayout V3 + Qwen3.5 via HF Inference
==============================================================
HuggingFace Space entry point.
Pipeline per request
--------------------
1. layout_profiler.profile_layout() CPU – PyMuPDF font/geometry pass
2. Marker PdfConverter GPU – layout + OCR + structure
   ├─ PP-DocLayout V3 (optional checkbox) GPU – replaces Surya layout
   └─ Qwen3.5 use_llm (optional checkbox) API – HF inference router
3. Aside overlay CPU – colour aside regions on
page-image using x-regime
Aside handling
--------------
Marker has no BlockTypes.Aside. Rather than modifying Marker internals we:
  • Run layout_profiler to get the x-regime (body lane boundaries per page).
  • After conversion, render a bbox overlay image where regions whose
    centre-x falls outside the body lane are coloured orange (aside) instead
    of blue (body).
  • The markdown output is untouched — the visual overlay is the aside signal.
Credit guard
------------
The HF_TOKEN env var is set as a Space secret. When the $2 free-tier credit
is exhausted the router returns HTTP 402. We catch it once, flip a module-level
flag, and disable the LLM checkbox with an explanatory banner.
"""
from __future__ import annotations
import subprocess
import sys
# requirements.txt is evaluated by the Space's Dockerfile before the repository
# directory is mounted, so the local Marker fork in ./marker cannot be listed
# there. Install it here instead, before anything imports `marker`.
# A non-zero pip exit status raises CalledProcessError and aborts startup.
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "./marker", "--quiet"],
)
import os
import io
import logging
import tempfile
from typing import Optional
import gradio as gr
import spaces
from PIL import Image, ImageDraw
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# ── Credit state ─────────────────────────────────────────────────────────────
# Set to True by process_pdf() when a conversion error looks like an HTTP 402
# from the inference router; while set, process_pdf() silently drops the LLM
# option and prepends a warning to the status text.
_credits_exhausted: bool = False
# ── HF Space / model config ──────────────────────────────────────────────────
# API token read from the environment; empty string when the variable is unset.
HF_TOKEN: str = os.environ.get("HF_TOKEN", "")
# Model id and OpenAI-compatible base URL handed to Marker's LLM service config
# in _run_marker().
QWEN_MODEL: str = "Qwen/Qwen3.5-122B-A10B:cheapest"
HF_BASE_URL: str = "https://router.huggingface.co/v1"
# Aside overlay colours (RGBA)
BODY_COLOUR = (59, 130, 246, 160) # blue
ASIDE_COLOUR = (249, 115, 22, 160) # orange
LABEL_COLOUR = (255, 255, 255, 220) # white text bg
# Hub repo id of the PP-DocLayout V3 weights fetched by _predownload_models().
PP_MODEL_NAME = "PaddlePaddle/PP-DocLayoutV3_safetensors"
# ══════════════════════════════════════════════════════════════════════════════
# Startup pre-download (runs on CPU at Space launch, before any GPU request)
# ══════════════════════════════════════════════════════════════════════════════
#
# ZeroGPU only gates GPU compute β€” disk I/O runs freely at module load time.
# Pre-downloading here means the @spaces.GPU function never waits for network.
def _predownload_models() -> None:
    """
    Warm the model caches at import time, outside any GPU slot.

    Two independent, best-effort steps:

    1. Build Marker's model dict via ``create_model_dict()`` and keep it in the
       module global ``_marker_models`` (``None`` if that fails).
    2. Fetch the PP-DocLayout V3 snapshot named by ``PP_MODEL_NAME`` from the
       Hub, skipping msgpack / Flax / TF weight files.

    Any exception is logged as a warning and swallowed: a failed pre-download
    only means the first GPU request has to fetch the weights on demand.
    """
    global _marker_models

    # Step 1 - Marker models (~2-3 GB per the original author's note).
    marker_models = None
    try:
        logger.info("Pre-downloading Marker models")
        from marker.models import create_model_dict

        marker_models = create_model_dict()
        logger.info("Marker models ready.")
    except Exception as exc:
        logger.warning("Marker model pre-download failed: %s", exc)
        marker_models = None
    _marker_models = marker_models

    # Step 2 - PP-DocLayout V3 (~400 MB per the original author's note).
    try:
        logger.info("Pre-downloading PP-DocLayout V3")
        from huggingface_hub import snapshot_download

        skipped_weight_formats = ["*.msgpack", "flax_model*", "tf_model*"]
        snapshot_download(
            repo_id=PP_MODEL_NAME,
            repo_type="model",
            ignore_patterns=skipped_weight_formats,
        )
        logger.info("PP-DocLayout V3 ready.")
    except Exception as exc:
        logger.warning("PP-DocLayout V3 pre-download failed: %s", exc)
# Shared Marker model dict, read by _run_marker(). It must be initialised
# BEFORE the call below: _predownload_models() assigns the loaded models to this
# global, so resetting it afterwards would throw them away.
_marker_models = None # populated by _predownload_models() at startup
# Runs at import time (module load), not per request.
_predownload_models()
# ══════════════════════════════════════════════════════════════════════════════
# Overlay helpers
# ══════════════════════════════════════════════════════════════════════════════
# Dash pattern for x-regime lines (on, off) in pixels
_DASH = (6, 4)
def _draw_dashed_vline(draw, x: int, height: int, colour, dash=_DASH):
"""Draw a vertical dashed line at pixel x."""
on, off = dash
y = 0
while y < height:
draw.line([(x, y), (x, min(y + on, height))], fill=colour, width=2)
y += on + off
def _draw_bbox_overlay(
    page_image: Image.Image,
    page_boxes: list[dict],  # [{"bbox": [x0,y0,x1,y1], "label": str}] in PDF points
    body_x_span,  # layout_profiler.XSpan | None (also in PDF points)
) -> Image.Image:
    """
    Paint block rectangles and body-lane guides over a page image.

    Box coordinates and the span's ``x_min`` / ``x_max`` are used directly as
    pixel coordinates (truncated with ``int``), so the caller is expected to
    supply a page rendered at 72 dpi, where 1 PDF point == 1 pixel.

    Visual language:
        Blue fill + outline    -> body block  (centre-x inside the body lane,
                                  or no lane information available)
        Orange fill + outline  -> aside block (centre-x outside the body lane)
        Green dashed vlines    -> body lane left/right edges

    Returns a new RGB image; ``page_image`` itself is not modified.
    """
    # Everything is drawn on a transparent layer and composited at the end so
    # the page stays visible through the semi-transparent fills.
    layer = Image.new("RGBA", page_image.size, (0, 0, 0, 0))
    pen = ImageDraw.Draw(layer, "RGBA")
    lane_known = body_x_span is not None

    # Blocks
    for entry in page_boxes:
        left, top, right, bottom = entry["bbox"]
        rect = [int(left), int(top), int(right), int(bottom)]
        centre_x = (left + right) / 2.0

        tint = BODY_COLOUR
        if lane_known and (
            centre_x < body_x_span.x_min or centre_x > body_x_span.x_max
        ):
            tint = ASIDE_COLOUR

        pen.rectangle(rect, fill=tint)
        # Same hue, fully opaque, for a crisp 2 px border.
        pen.rectangle(rect, outline=tint[:3] + (255,), width=2)

        caption = entry.get("label", "")
        if caption:
            # Rough width estimate: ~7 px per character plus padding.
            tag_width = len(caption) * 7 + 4
            pen.rectangle(
                [rect[0], rect[1], rect[0] + tag_width, rect[1] + 14],
                fill=LABEL_COLOUR,
            )
            pen.text((rect[0] + 2, rect[1] + 1), caption, fill=(30, 30, 30))

    # Body-lane guide lines
    if lane_known:
        guide_colour = (34, 197, 94, 230)  # green, semi-opaque
        page_height = page_image.height
        for lane_edge in (body_x_span.x_min, body_x_span.x_max):
            _draw_dashed_vline(pen, int(lane_edge), page_height, guide_colour)

    base = page_image.convert("RGBA")
    return Image.alpha_composite(base, layer).convert("RGB")
# ══════════════════════════════════════════════════════════════════════════════
# Marker + PP-DocLayout GPU pipeline
# ══════════════════════════════════════════════════════════════════════════════
@spaces.GPU(duration=120)
def _run_marker(
    pdf_path: str,
    use_pp_layout: bool,
    use_llm: bool,
) -> tuple[str, Optional[object]]:
    """
    GPU-decorated function: run the Marker conversion for one PDF.

    Args:
        pdf_path: Path of the PDF to convert.
        use_pp_layout: Build the converter with ``make_pp_converter`` (from
            ``pp_doc_layout_builder``) instead of the stock ``PdfConverter``.
        use_llm: Enable Marker's ``use_llm`` pass, pointed at ``HF_BASE_URL``
            with ``HF_TOKEN`` and ``QWEN_MODEL``.

    Returns:
        ``(markdown_text, document)``. ``document`` is whatever the converter
        exposes as ``converter.document`` after the run, or ``None`` when the
        converter has no such attribute (the stock ``PdfConverter`` case, per
        the original author's note). It is NOT Marker's rendered output.

    Marker API notes carried over from the original author:
        - text_from_rendered returns (text, metadata, images) -- 3-tuple
        - LLM config keys: openai_api_key, openai_base_url, openai_model
        - llm_service must be passed as kwarg to PdfConverter
        - ConfigParser required to wire processors/renderer/llm_service
    """
    from marker.config.parser import ConfigParser
    from marker.converters.pdf import PdfConverter
    from marker.models import create_model_dict  # fallback if predownload failed
    from marker.output import text_from_rendered

    config: dict = {"output_format": "markdown"}
    if use_llm:
        config.update({
            "use_llm": True,
            "llm_service": "marker.services.openai.OpenAIService",
            "openai_base_url": HF_BASE_URL,
            "openai_api_key": HF_TOKEN,
            "openai_model": QWEN_MODEL,
        })
    config_parser = ConfigParser(config)

    # Reuse the models loaded at startup; only load here if that step failed.
    models = _marker_models if _marker_models is not None else create_model_dict()
    converter_kwargs = dict(
        config=config_parser.generate_config_dict(),
        artifact_dict=models,
        processor_list=config_parser.get_processors(),
        renderer=config_parser.get_renderer(),
        llm_service=config_parser.get_llm_service(),
    )

    if use_pp_layout:
        # Per the original author: make_pp_converter returns a PdfConverter
        # subclass instance that swaps in PPDocLayoutBuilder and stores the
        # built Document on self.document.
        from pp_doc_layout_builder import make_pp_converter

        converter = make_pp_converter(**converter_kwargs)
    else:
        converter = PdfConverter(**converter_kwargs)

    rendered = converter(pdf_path)
    # text_from_rendered returns (text, metadata, images) -- must unpack
    text, _metadata, _images = text_from_rendered(rendered)

    # Only converters that stash their Document expose this attribute; fall
    # back to None so callers can skip the bbox overlay.
    document = getattr(converter, "document", None)
    n_pages = len(getattr(document, "pages", []))
    logger.info(
        "_run_marker: document has %d pages (use_pp_layout=%s)",
        n_pages,
        use_pp_layout,
    )
    return text, document
# ══════════════════════════════════════════════════════════════════════════════
# Main processing function
# ══════════════════════════════════════════════════════════════════════════════
def _render_first_page(pdf_path: str) -> Image.Image:
    """Rasterise page 1 of ``pdf_path`` at 72 dpi and return it as an RGB image."""
    import fitz  # PyMuPDF, imported lazily so a missing install only costs the overlay

    # The context manager closes the document even when the PDF has no pages
    # or rasterisation raises.
    with fitz.open(pdf_path) as doc:
        # Identity matrix == 72 dpi, i.e. 1 PDF point per pixel.
        pix = doc[0].get_pixmap(matrix=fitz.Matrix(1.0, 1.0))
        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)


def _first_page_boxes(document) -> list[dict]:
    """
    Collect ``{"bbox": [x0, y0, x1, y1], "label": str}`` for the first page.

    Only the block ids listed in ``page.structure`` are visited (the original
    author notes these are the layout-level blocks; ``page.children`` would add
    every line and span). Blocks without a usable polygon are skipped. Returns
    an empty list when ``document`` is ``None`` or has no pages.
    """
    if document is None:
        return []
    pages = getattr(document, "pages", [])
    if not pages:
        return []

    page0 = pages[0]
    boxes: list[dict] = []
    for block_id in getattr(page0, "structure", []) or []:
        block = page0.get_block(block_id)
        if block is None:
            continue
        poly_box = getattr(block, "polygon", None)
        if poly_box is None:
            continue
        # poly_box.polygon is expected to be [[x, y], ...]
        poly_points = getattr(poly_box, "polygon", None)
        if not poly_points or not isinstance(poly_points, list):
            continue
        block_type = getattr(block, "block_type", None)
        try:
            xs = [p[0] for p in poly_points]
            ys = [p[1] for p in poly_points]
            label = block_type.value if hasattr(block_type, "value") else str(block_type)
            boxes.append({
                "bbox": [min(xs), min(ys), max(xs), max(ys)],
                "label": label,
            })
        except Exception as exc:
            # Deliberately broad: one malformed block must not cost the overlay.
            logger.warning("Skipping block polygon: %s", exc)
    return boxes


def process_pdf(
    pdf_file,
    use_pp_layout: bool,
    use_llm: bool,
) -> tuple[str, Optional[Image.Image], str]:
    """
    Run the full pipeline for one uploaded PDF.

    Args:
        pdf_file: Path string, or an object with a ``.name`` path attribute;
            ``None`` when nothing was uploaded.
        use_pp_layout: Use the PP-DocLayout converter instead of stock Marker.
        use_llm: Request Marker's LLM pass. Ignored (with a status warning)
            once ``_credits_exhausted`` is set; rejected if ``HF_TOKEN`` is empty.

    Returns:
        ``(markdown, overlay_image, status)``. On any failure ``markdown`` is
        ``""`` and ``overlay_image`` is ``None``; ``status`` explains why.
        ``overlay_image`` is also ``None`` when only the overlay step fails.
    """
    global _credits_exhausted

    if pdf_file is None:
        return "", None, "⚠️ Please upload a PDF."

    status_note = ""
    if use_llm and _credits_exhausted:
        use_llm = False
        status_note = "⚠️ LLM disabled — HF inference credits exhausted.\n"
    if use_llm and not HF_TOKEN:
        return "", None, "⚠️ HF_TOKEN not set in Space secrets. Cannot use LLM."

    pdf_path = pdf_file.name if hasattr(pdf_file, "name") else str(pdf_file)

    # ── Stage 1: layout_profiler (CPU) ────────────────────────────────────
    # Best effort: without a profile every overlay block is drawn as "body".
    try:
        from layout_profiler import profile_layout

        lp = profile_layout(pdf_path)
        profiler_summary = lp.summary()
    except Exception as exc:
        logger.warning("layout_profiler failed: %s", exc)
        lp = None
        profiler_summary = f"layout_profiler unavailable: {exc}"

    # ── Stage 2: Marker GPU pass ──────────────────────────────────────────
    try:
        markdown, document = _run_marker(pdf_path, use_pp_layout, use_llm)
    except Exception as exc:
        # Treat "402" as credit exhaustion only when the LLM was actually in
        # use; otherwise an unrelated error that merely contains "402" (a file
        # name, a byte offset, ...) would disable the LLM for the whole session.
        if use_llm and "402" in str(exc):
            _credits_exhausted = True
            return "", None, "⚠️ HF inference credits exhausted. LLM disabled for this session."
        logger.exception("Marker conversion failed")
        return "", None, f"❌ Conversion error: {exc}"

    # ── Stage 3: Aside overlay (CPU) ──────────────────────────────────────
    # The page is rendered at 72 dpi and block coordinates are used as-is
    # (treated as PDF points, 1 pt == 1 px). Any failure here only drops the
    # overlay; the markdown is still returned.
    overlay_image: Optional[Image.Image] = None
    try:
        page_img = _render_first_page(pdf_path)
        body_span = lp.body_regime.get_span(0) if lp is not None else None
        page_boxes = _first_page_boxes(document)
        logger.info("Overlay: %d structure blocks from document.pages[0]", len(page_boxes))
        overlay_image = _draw_bbox_overlay(page_img, page_boxes, body_span)
    except Exception as exc:
        logger.warning("Overlay rendering failed: %s", exc)

    # ── Compose status message ────────────────────────────────────────────
    mode_parts = [
        "PP-DocLayout V3" if use_pp_layout else "Surya (default)",
        "Qwen3.5 LLM" if use_llm else "no LLM",
    ]
    status = status_note + f"✅ Converted with {' + '.join(mode_parts)}\n\n{profiler_summary}"
    return markdown, overlay_image, status
# ══════════════════════════════════════════════════════════════════════════════
# Gradio UI
# ══════════════════════════════════════════════════════════════════════════════
def _llm_checkbox_label() -> str:
    """
    Return the label text for the LLM checkbox.

    Reads the module-level ``_credits_exhausted`` flag at call time: the
    "credits exhausted" variant is returned once the flag is set, the
    "uses $2 free credit" variant otherwise.
    """
    if _credits_exhausted:
        return "Use Qwen3.5 via HF Inference (⚠️ credits exhausted)"
    return "Use Qwen3.5 via HF Inference (uses $2 free credit)"
# Gradio app: upload + options on the left, page-1 overlay on the right,
# converted markdown and status text underneath.
# NOTE(review): the nesting of the Row/Column containers below is the most
# plausible reading of the layout - confirm against the rendered UI.
with gr.Blocks(title="Marker + PP-DocLayout V3") as demo:
    gr.Markdown(
        """
        # 📄 Marker PDF Converter
        ### with PP-DocLayout V3 & Qwen3.5
        Upload a PDF and choose your processing options.
        - **PP-DocLayout V3**: Replaces Surya layout detection with PaddlePaddle's
        RT-DETR instance segmentation model for higher bbox accuracy.
        - **Qwen3.5 LLM**: Enables Marker's `use_llm` pass for table cleanup,
        equation rendering, and reading-order correction via the HF inference router.
        The first page bbox overlay colours body regions **blue** and margin/aside
        regions **orange**, derived from font-geometric x-regime analysis.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="Upload PDF",
                file_types=[".pdf"],
                type="filepath",
            )
            with gr.Row():
                use_pp = gr.Checkbox(label="Use PP-DocLayout V3", value=True)
                use_llm = gr.Checkbox(label="Use Qwen3.5 (HF Inference)", value=False)
            run_btn = gr.Button("Convert", variant="primary")
        with gr.Column(scale=2):
            overlay_out = gr.Image(label="Page 1 — bbox overlay (blue=body, orange=aside)")

    with gr.Row():
        markdown_out = gr.Markdown(label="Converted Markdown")
    status_out = gr.Textbox(label="Status / Layout Profile", lines=12, interactive=False)

    # process_pdf returns (markdown, overlay_image, status) in this order.
    run_btn.click(
        fn=process_pdf,
        inputs=[pdf_input, use_pp, use_llm],
        outputs=[markdown_out, overlay_out, status_out],
    )

    gr.Markdown(
        """
        ---
        **Notes**
        - LLM calls use your Space's `HF_TOKEN` free inference credit (~$2).
        Once exhausted, the LLM option is automatically disabled for the session.
        - Processing time: ~15-30s for a 10-page PDF without LLM; ~60s with LLM.
        - Overlay shows page 1 only. Aside detection uses PyMuPDF font-geometry
        (no extra GPU cost).
        """
    )

if __name__ == "__main__":
    demo.launch()