Spaces:

adelevett
/

docling_pp_layout_demo

Running on Zero

File size: 5,175 Bytes

# ---------------------------------------------------------------------------
# Force-upgrade transformers to >=5.1.0 before any other import.
#
# Why: PP-DocLayoutV3's custom model classes (PPDocLayoutV3ImageProcessor,
# PPDocLayoutV3ForObjectDetection) were added to the transformers library in
# version 5.1.0.  docling-ibm-models caps transformers<5.0.0 (conservative
# pinning), so pip resolves transformers ~4.x at build time.  We upgrade it
# here at runtime, before any docling/transformers import, so the correct
# classes are available.  docling-ibm-models' usage (AutoModel, pipeline API)
# remains compatible with transformers 5.x.
# ---------------------------------------------------------------------------
import subprocess
import sys

# Run pip through the interpreter executing this script so the upgrade lands
# in the same environment the imports below are resolved from.  A non-zero
# pip exit status raises CalledProcessError and aborts start-up.
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "transformers>=5.1.0", "--quiet"]
)

# `spaces` MUST be imported before any package that touches CUDA (torch,
# transformers, docling …).  ZeroGPU intercepts the CUDA initialisation; if
# anything else triggers it first the import raises RuntimeError.
import spaces  # noqa: E402

# ---------------------------------------------------------------------------
# Plugin registration
# ---------------------------------------------------------------------------
# docling-pp-doc-layout requires Python >=3.12 on PyPI, but the code itself
# is compatible with Python 3.10 (all annotations are guarded by
# `from __future__ import annotations`).  Instead of installing the package,
# we bundle the source directly and register the model with docling's factory
# by monkey-patching BaseFactory.load_from_plugins so that every new
# LayoutFactory instance automatically includes PPDocLayoutV3Model.
from docling.models.factories.base_factory import BaseFactory
from docling.models.factories.layout_factory import LayoutFactory
from docling_pp_doc_layout.model import PPDocLayoutV3Model

# Keep a reference to the unpatched loader; the wrapper below delegates to it.
_orig_load = BaseFactory.load_from_plugins


def _load_with_pp_doc_layout(
    self, plugin_name=None, allow_external_plugins=False
):
    """Run the original plugin loader, then add PPDocLayoutV3Model.

    The call is forwarded unchanged to the saved ``load_from_plugins``.
    Afterwards, only ``LayoutFactory`` instances get the bundled
    ``PPDocLayoutV3Model`` registered; every other factory is left exactly
    as the original loader produced it.  Nothing is returned.
    """
    _orig_load(
        self,
        plugin_name=plugin_name,
        allow_external_plugins=allow_external_plugins,
    )

    # Guard clause: non-layout factories need no extra registration.
    if not isinstance(self, LayoutFactory):
        return

    try:
        self.register(
            PPDocLayoutV3Model,
            "docling-pp-doc-layout",
            "docling_pp_doc_layout.model",
        )
    except ValueError:
        # Deliberate best-effort: a ValueError here is treated as "model
        # already registered" and ignored.
        pass


# Install the wrapper on the base class so it applies to every factory.
BaseFactory.load_from_plugins = _load_with_pp_doc_layout

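# ---------------------------------------------------------------------------
# Application imports (placed after `spaces` and the plugin patch above)
# ---------------------------------------------------------------------------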
import gradio as gr
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_pp_doc_layout.options import PPDocLayoutV3Options

# Global initialisation — pipeline is constructed lazily on the first
# convert() call, which happens inside @spaces.GPU, so decide_device()
# correctly resolves "cuda:0" when the H200 is allocated.
# Layout stage settings for the PP-DocLayoutV3 model.
_layout_options = PPDocLayoutV3Options(batch_size=2, confidence_threshold=0.5)
pipeline_options = PdfPipelineOptions(layout_options=_layout_options)

# Only PDF input is given a custom pipeline configuration.
_pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(
    format_options={InputFormat.PDF: _pdf_format_option}
)


@spaces.GPU(duration=120)
def infer_layout(file_path: str | None):
    """Convert an uploaded document and summarise its items as JSON.

    Args:
        file_path: Path of the uploaded file, or ``None``/empty when nothing
            was uploaded.

    Returns:
        A ``(payload, download_path)`` pair:

        * ``({"error": "No file uploaded"}, None)`` when ``file_path`` is
          falsy;
        * ``(items, path)`` on success, where ``items`` is a list of
          ``{"type": ..., "content": ...}`` dicts (one per document item)
          and ``path`` names a temporary ``.json`` file holding the same
          data;
        * ``({"runtime_exception": str(exc)}, None)`` when anything raises.

    The function never raises: every exception is reported in the payload
    so the UI can display it.
    """
    import json
    import os
    import tempfile

    if not file_path:
        return {"error": "No file uploaded"}, None
    try:
        result = converter.convert(file_path)
        structured_data = [
            {
                "type": type(item).__name__,
                "content": getattr(item, "text", "No text mapping"),
            }
            for item, _level in result.document.iterate_items()
        ]

        # delete=False: the file must outlive this call so Gradio can serve
        # it as a download.
        tmp = tempfile.NamedTemporaryFile(
            mode="w", suffix=".json", delete=False, encoding="utf-8"
        )
        try:
            # The context manager guarantees the handle is closed even if
            # serialisation fails part-way through.
            with tmp:
                json.dump(structured_data, tmp, ensure_ascii=False, indent=2)
        except Exception:
            # Do not leave a truncated file behind; the outer handler
            # reports the failure to the caller.
            os.unlink(tmp.name)
            raise
        return structured_data, tmp.name
    except Exception as e:
        return {"runtime_exception": str(e)}, None


with gr.Blocks(title="PP-DocLayoutV3 Empirical Parser") as interface:
    # Components are created in display order: intro text, the input/output
    # row, then the (initially hidden) download button and the run button.
    _intro_md = (
        "## Layout Detection Inference\n"
        "Upload a PDF to parse structural components through the "
        "PaddlePaddle PP-DocLayoutV3 model."
    )
    gr.Markdown(_intro_md)

    with gr.Row():
        pdf_input = gr.File(label="Source Document", file_types=[".pdf"])
        json_output = gr.JSON(label="Structured Extraction Matrix")

    download_btn = gr.DownloadButton(label="Download JSON", visible=False)
    execute_btn = gr.Button("Run Layout Detection")

    def run_and_reveal(file_path):
        """Run inference and show the download button only if a file exists.

        Returns the JSON payload from ``infer_layout`` together with a
        ``DownloadButton`` update whose visibility reflects whether a
        download path was produced.
        """
        extraction, json_path = infer_layout(file_path)
        has_download = json_path is not None
        button_update = gr.DownloadButton(
            value=json_path, visible=has_download
        )
        return extraction, button_update

    execute_btn.click(
        fn=run_and_reveal,
        inputs=pdf_input,
        outputs=[json_output, download_btn],
    )

# Launch the Gradio app only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    interface.launch()