# ---------------------------------------------------------------------------
# Upgrade transformers to >=5.1.0 before anything else gets imported.
#
# Rationale: the custom PP-DocLayoutV3 classes (PPDocLayoutV3ImageProcessor,
# PPDocLayoutV3ForObjectDetection) only exist in transformers >= 5.1.0, while
# docling-ibm-models conservatively pins transformers < 5.0.0, so the resolver
# installs a 4.x build. Upgrading here at runtime — ahead of every
# docling/transformers import — makes the required classes available.
# docling-ibm-models itself (AutoModel, pipeline API) still works on 5.x.
# ---------------------------------------------------------------------------
import subprocess
import sys

subprocess.check_call(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "transformers>=5.1.0",
        "--quiet",
    ]
)

# `spaces` has to be imported ahead of every CUDA-touching package (torch,
# transformers, docling …): ZeroGPU hooks CUDA initialisation, and letting
# another package initialise CUDA first raises RuntimeError on import.
import spaces  # noqa: E402

# ---------------------------------------------------------------------------
# Plugin registration
# ---------------------------------------------------------------------------
# The PyPI package docling-pp-doc-layout declares Python >=3.12, yet its code
# runs fine on 3.10 (every annotation sits behind
# `from __future__ import annotations`). Rather than pip-installing it, the
# source is bundled here and the model is wired into docling's factory by
# monkey-patching BaseFactory.load_from_plugins, so each freshly constructed
# LayoutFactory automatically carries PPDocLayoutV3Model.
import functools

from docling.models.factories.base_factory import BaseFactory
from docling.models.factories.layout_factory import LayoutFactory
from docling_pp_doc_layout.model import PPDocLayoutV3Model

# Handle on the unpatched method so the wrapper below can delegate to it.
_orig_load = BaseFactory.load_from_plugins


# functools.wraps keeps the patched method's name/docstring/signature metadata
# identical to the original, so introspection of the factory stays honest.
@functools.wraps(_orig_load)
def _load_with_pp_doc_layout(
    self, plugin_name=None, allow_external_plugins=False
):
    """Run the stock plugin discovery, then force-register PPDocLayoutV3Model.

    Only LayoutFactory instances receive the extra registration; every other
    factory behaves exactly as before. The wrapped method's return value is
    passed through (the previous wrapper silently dropped it).
    """
    result = _orig_load(
        self,
        plugin_name=plugin_name,
        allow_external_plugins=allow_external_plugins,
    )
    if isinstance(self, LayoutFactory):
        try:
            self.register(
                PPDocLayoutV3Model,
                "docling-pp-doc-layout",
                "docling_pp_doc_layout.model",
            )
        except ValueError:
            pass  # already registered on a previous factory creation
    return result


BaseFactory.load_from_plugins = _load_with_pp_doc_layout
# ---------------------------------------------------------------------------

import gradio as gr
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_pp_doc_layout.options import PPDocLayoutV3Options

# Global initialisation — pipeline is constructed lazily on the first
# convert() call, which happens inside @spaces.GPU, so decide_device()
# correctly resolves "cuda:0" when the H200 is allocated.
pipeline_options = PdfPipelineOptions(
    layout_options=PPDocLayoutV3Options(
        batch_size=2,
        confidence_threshold=0.5,
    )
)
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)


@spaces.GPU(duration=120)
def infer_layout(file_path: str | None):
    """Convert *file_path* with docling and return (data, download_path).

    Returns:
        tuple: ``(structured_data, json_path)`` on success, where
        ``structured_data`` is a list of ``{"type", "content"}`` dicts and
        ``json_path`` points at a temp file holding the same data; on any
        failure, ``({"error" | "runtime_exception": ...}, None)``.
    """
    if not file_path:
        return {"error": "No file uploaded"}, None
    try:
        result = converter.convert(file_path)
        structured_data = [
            {
                "type": type(item).__name__,
                "content": getattr(item, "text", "No text mapping"),
            }
            for item, _level in result.document.iterate_items()
        ]
        # Write to a temp file so Gradio can serve it as a download.
        # delete=False is deliberate: Gradio reads the file after we return.
        import json
        import tempfile

        tmp = tempfile.NamedTemporaryFile(
            mode="w", suffix=".json", delete=False, encoding="utf-8"
        )
        try:
            json.dump(structured_data, tmp, ensure_ascii=False, indent=2)
        finally:
            # Close even if serialization raises, so the handle never leaks.
            tmp.close()
        return structured_data, tmp.name
    except Exception as e:
        # Surface the failure in the UI instead of crashing the worker.
        return {"runtime_exception": str(e)}, None


with gr.Blocks(title="PP-DocLayoutV3 Empirical Parser") as interface:
    gr.Markdown(
        "## Layout Detection Inference\n"
        "Upload a PDF to parse structural components through the "
        "PaddlePaddle PP-DocLayoutV3 model."
    )
    with gr.Row():
        pdf_input = gr.File(label="Source Document", file_types=[".pdf"])
        json_output = gr.JSON(label="Structured Extraction Matrix")
    download_btn = gr.DownloadButton(label="Download JSON", visible=False)
    execute_btn = gr.Button("Run Layout Detection")

    def run_and_reveal(file_path):
        """Run inference, then reveal the download button only on success."""
        data, path = infer_layout(file_path)
        return data, gr.DownloadButton(value=path, visible=path is not None)

    execute_btn.click(
        fn=run_and_reveal,
        inputs=pdf_input,
        outputs=[json_output, download_btn],
    )


if __name__ == "__main__":
    interface.launch()