Spaces:

adelevett
/

docling_pp_layout_demo

Running on Zero

App Files Files Community

adelevett committed on Mar 7

Commit

d7c9ee5

verified ·

1 Parent(s): dbe48bf

Upload 9 files

Browse files

Files changed (8) hide show

app.py +38 -0
docling_pp_doc_layout/__init__.py +3 -0
docling_pp_doc_layout/label_mapping.py +34 -0
docling_pp_doc_layout/model.py +225 -0
docling_pp_doc_layout/options.py +110 -0
docling_pp_doc_layout/plugin.py +12 -0
docling_pp_doc_layout/py.typed +0 -0
requirements.txt +5 -1

app.py CHANGED Viewed

@@ -1,3 +1,41 @@
 import gradio as gr
 import spaces
 from docling.datamodel.base_models import InputFormat

+# ---------------------------------------------------------------------------
+# Plugin registration
+# ---------------------------------------------------------------------------
+# docling-pp-doc-layout requires Python >=3.12 on PyPI, but the code itself
+# is compatible with Python 3.10 (all annotations are guarded by
+# `from __future__ import annotations`).  Instead of installing the package,
+# we bundle the source directly and register the model with docling's factory
+# by monkey-patching BaseFactory.load_from_plugins so that every new
+# LayoutFactory instance automatically includes PPDocLayoutV3Model.
+from docling.models.factories.base_factory import BaseFactory
+from docling.models.factories.layout_factory import LayoutFactory
+from docling_pp_doc_layout.model import PPDocLayoutV3Model
+_orig_load = BaseFactory.load_from_plugins
+def _load_with_pp_doc_layout(
+    self, plugin_name=None, allow_external_plugins=False
+):
+    _orig_load(
+        self,
+        plugin_name=plugin_name,
+        allow_external_plugins=allow_external_plugins,
+    )
+    if isinstance(self, LayoutFactory):
+        try:
+            self.register(
+                PPDocLayoutV3Model,
+                "docling-pp-doc-layout",
+                "docling_pp_doc_layout.model",
+            )
+        except ValueError:
+            pass  # already registered on a previous factory creation
+BaseFactory.load_from_plugins = _load_with_pp_doc_layout
+# ---------------------------------------------------------------------------
 import gradio as gr
 import spaces
 from docling.datamodel.base_models import InputFormat

docling_pp_doc_layout/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ """A Docling plugin for document layout detection with the PaddlePaddle PP-DocLayout-V3 model."""
2	+
3	+ __version__ = "0.1.0"

docling_pp_doc_layout/label_mapping.py ADDED Viewed

	@@ -0,0 +1,34 @@

+"""Mapping from PP-DocLayout-V3 label names to docling DocItemLabel values.
+Every label produced here must exist in
+``docling.utils.layout_postprocessor.LayoutPostprocessor.CONFIDENCE_THRESHOLDS``
+so that the postprocessor can apply confidence filtering without a ``KeyError``.
+"""
+from __future__ import annotations
+from docling_core.types.doc import DocItemLabel
+# Lookup table: PP-DocLayout-V3 class name (string key) -> docling DocItemLabel.
+# The mapping is many-to-one: several PP-DocLayout-V3 classes collapse onto the
+# same docling label (for example "chart", "image" and "seal" all become PICTURE).
+LABEL_MAP: dict[str, DocItemLabel] = {
+    "abstract": DocItemLabel.TEXT,
+    "algorithm": DocItemLabel.CODE,  # algorithm listings are treated as code
+    "aside_text": DocItemLabel.TEXT,
+    "chart": DocItemLabel.PICTURE,
+    "content": DocItemLabel.TEXT,
+    "doc_title": DocItemLabel.TITLE,
+    "figure_title": DocItemLabel.CAPTION,
+    "footer": DocItemLabel.PAGE_FOOTER,
+    "footnote": DocItemLabel.FOOTNOTE,
+    "formula": DocItemLabel.FORMULA,
+    "formula_number": DocItemLabel.TEXT,  # equation numbers are kept as plain text
+    "header": DocItemLabel.PAGE_HEADER,
+    "image": DocItemLabel.PICTURE,
+    "number": DocItemLabel.TEXT,
+    "paragraph_title": DocItemLabel.SECTION_HEADER,
+    "reference": DocItemLabel.TEXT,
+    "reference_content": DocItemLabel.TEXT,
+    "seal": DocItemLabel.PICTURE,
+    "table": DocItemLabel.TABLE,
+    "text": DocItemLabel.TEXT,
+    "vision_footnote": DocItemLabel.FOOTNOTE,
+}

docling_pp_doc_layout/model.py ADDED Viewed

	@@ -0,0 +1,225 @@

+"""PP-DocLayout-V3 layout model for the docling standard pipeline.
+Runs PaddlePaddle PP-DocLayout-V3 locally via HuggingFace ``transformers``
+to detect document layout elements and returns ``LayoutPrediction`` objects
+that docling merges with its standard-pipeline output.
+"""
+from __future__ import annotations
+import logging
+import warnings
+from typing import TYPE_CHECKING
+import numpy as np
+import torch
+from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
+from docling.models.base_layout_model import BaseLayoutModel
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.layout_postprocessor import LayoutPostprocessor
+from docling.utils.profiling import TimeRecorder
+from docling_core.types.doc import DocItemLabel
+from transformers import AutoImageProcessor, AutoModelForObjectDetection
+from docling_pp_doc_layout.label_mapping import LABEL_MAP
+from docling_pp_doc_layout.options import PPDocLayoutV3Options
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+    from pathlib import Path
+    from docling.datamodel.accelerator_options import AcceleratorOptions
+    from docling.datamodel.document import ConversionResult
+    from docling.datamodel.pipeline_options import BaseLayoutOptions
+    from PIL import Image
+logger = logging.getLogger(__name__)
+class PPDocLayoutV3Model(BaseLayoutModel):
+    """Layout engine using PP-DocLayout-V3 via HuggingFace transformers."""
+    def __init__(
+        self,
+        artifacts_path: Path | None,
+        accelerator_options: AcceleratorOptions,
+        options: PPDocLayoutV3Options,
+        *,
+        enable_remote_services: bool = False,  # noqa: ARG002
+    ) -> None:
+        self.options = options
+        self.artifacts_path = artifacts_path
+        self.accelerator_options = accelerator_options
+        self._device = decide_device(accelerator_options.device)
+        logger.info(
+            "Loading PP-DocLayout-V3 model %s on device=%s",
+            options.model_name,
+            self._device,
+        )
+        self._image_processor = AutoImageProcessor.from_pretrained(
+            options.model_name,
+        )
+        self._model = AutoModelForObjectDetection.from_pretrained(
+            options.model_name,
+        ).to(self._device)
+        self._model.eval()
+        self._id2label: dict[int, str] = self._model.config.id2label
+        logger.info("PP-DocLayout-V3 model loaded successfully")
+    @classmethod
+    def get_options_type(cls) -> type[BaseLayoutOptions]:
+        """Return the options class for this layout model."""
+        return PPDocLayoutV3Options
+    def _run_inference(
+        self,
+        images: list[Image.Image],
+    ) -> list[list[dict]]:
+        """Run PP-DocLayout-V3 on a batch of PIL images.
+        Returns a list (per image) of lists of detection dicts with keys
+        ``label``, ``confidence``, ``l``, ``t``, ``r``, ``b``.
+        """
+        inputs = self._image_processor(images=images, return_tensors="pt")
+        inputs = {k: v.to(self._device) for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = self._model(**inputs)
+        target_sizes = [img.size[::-1] for img in images]  # (height, width)
+        results = self._image_processor.post_process_object_detection(
+            outputs,
+            target_sizes=target_sizes,
+            threshold=self.options.confidence_threshold,
+        )
+        batch_detections: list[list[dict]] = []
+        for result in results:
+            detections: list[dict] = []
+            polys = result.get("polygons") or result.get("polygon_points")
+            if polys is None:
+                polys = [None] * len(result["scores"])
+            for score, label_id, box, poly in zip(
+                result["scores"],
+                result["labels"],
+                result["boxes"],
+                polys,
+                strict=True,
+            ):
+                raw_label = self._id2label.get(label_id.item(), "text")
+                doc_label = LABEL_MAP.get(raw_label, DocItemLabel.TEXT)
+                if poly is not None and len(poly) > 0:
+                    # Flatten or handle nested points to extract min/max
+                    if isinstance(poly[0], int | float):
+                        xs = poly[0::2]
+                        ys = poly[1::2]
+                    else:
+                        xs = [pt[0] for pt in poly]
+                        ys = [pt[1] for pt in poly]
+                    x_min, x_max = min(xs), max(xs)
+                    y_min, y_max = min(ys), max(ys)
+                else:
+                    x_min, y_min, x_max, y_max = box.tolist()
+                detections.append({
+                    "label": doc_label,
+                    "confidence": score.item(),
+                    "l": x_min,
+                    "t": y_min,
+                    "r": x_max,
+                    "b": y_max,
+                })
+            batch_detections.append(detections)
+        return batch_detections
+    def predict_layout(
+        self,
+        conv_res: ConversionResult,
+        pages: Sequence[Page],
+    ) -> Sequence[LayoutPrediction]:
+        """Detect layout regions for a batch of document pages."""
+        pages = list(pages)
+        valid_pages: list[Page] = []
+        valid_images: list[Image.Image] = []
+        is_page_valid: list[bool] = []
+        for page in pages:
+            if page._backend is None or not page._backend.is_valid():  # noqa: SLF001
+                is_page_valid.append(False)
+                continue
+            if page.size is None:
+                is_page_valid.append(False)
+                continue
+            page_image = page.get_image(scale=1.0)
+            if page_image is None:
+                is_page_valid.append(False)
+                continue
+            valid_pages.append(page)
+            valid_images.append(page_image)
+            is_page_valid.append(True)
+        batch_detections: list[list[dict]] = []
+        if valid_images:
+            with TimeRecorder(conv_res, "layout"):
+                bs = self.options.batch_size
+                for i in range(0, len(valid_images), bs):
+                    batch = valid_images[i : i + bs]
+                    batch_detections.extend(self._run_inference(batch))
+        layout_predictions: list[LayoutPrediction] = []
+        valid_idx = 0
+        for idx, page in enumerate(pages):
+            if not is_page_valid[idx]:
+                existing = page.predictions.layout or LayoutPrediction()
+                layout_predictions.append(existing)
+                continue
+            detections = batch_detections[valid_idx]
+            valid_idx += 1
+            clusters: list[Cluster] = []
+            for ix, det in enumerate(detections):
+                cluster = Cluster(
+                    id=ix,
+                    label=det["label"],
+                    confidence=det["confidence"],
+                    bbox=BoundingBox(
+                        l=det["l"],
+                        t=det["t"],
+                        r=det["r"],
+                        b=det["b"],
+                    ),
+                    cells=[],
+                )
+                clusters.append(cluster)
+            processed_clusters, processed_cells = LayoutPostprocessor(page, clusters, self.options).postprocess()
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore",
+                    "Mean of empty slice|invalid value encountered in scalar divide",
+                    RuntimeWarning,
+                    "numpy",
+                )
+                conv_res.confidence.pages[page.page_no].layout_score = float(
+                    np.mean([c.confidence for c in processed_clusters])
+                )
+                conv_res.confidence.pages[page.page_no].ocr_score = float(
+                    np.mean([c.confidence for c in processed_cells if c.from_ocr])
+                )
+            prediction = LayoutPrediction(clusters=processed_clusters)
+            layout_predictions.append(prediction)
+        return layout_predictions

docling_pp_doc_layout/options.py ADDED Viewed

	@@ -0,0 +1,110 @@

+"""Configuration model for the PP-DocLayout-V3 layout engine."""
+from __future__ import annotations
+import os
+from typing import Annotated, ClassVar, Literal
+from docling.datamodel.pipeline_options import LayoutOptions
+from pydantic import ConfigDict, Field
+def _parse_bool(value: str) -> bool:
+    """Parse a string environment variable value as a boolean.
+    Args:
+        value: The string to parse.  Case-insensitive ``"true"``, ``"1"``,
+            and ``"yes"`` are truthy; everything else is falsy.
+    Returns:
+        ``True`` if *value* is a recognised truthy string, ``False`` otherwise.
+    """
+    return value.lower() in ("true", "1", "yes")
+class PPDocLayoutV3Options(LayoutOptions):
+    """Options for the PP-DocLayout-V3 layout detection engine.
+    Uses a HuggingFace-hosted PP-DocLayout-V3 model to detect document
+    layout elements (text, tables, figures, headers, etc.) in page images.
+    All options fall back to environment variables when not set explicitly,
+    allowing configuration without code changes (e.g. in Docker / Compose
+    deployments).
+    The fallbacks are implemented as ``default_factory`` callables, so the
+    environment is read when a default is needed, not once at import time.
+    Attributes:
+        model_name: HuggingFace model repository ID.
+            Falls back to the ``PP_DOC_LAYOUT_MODEL_NAME`` env var.
+        confidence_threshold: Minimum confidence score for detections.
+            Falls back to the ``PP_DOC_LAYOUT_CONFIDENCE_THRESHOLD`` env var.
+        batch_size: Number of pages per inference batch.
+            Falls back to the ``PP_DOC_LAYOUT_BATCH_SIZE`` env var.
+        create_orphan_clusters: Create clusters for orphaned elements.
+            Falls back to the ``PP_DOC_LAYOUT_CREATE_ORPHAN_CLUSTERS`` env var.
+        keep_empty_clusters: Retain empty clusters in results.
+            Falls back to the ``PP_DOC_LAYOUT_KEEP_EMPTY_CLUSTERS`` env var.
+        skip_cell_assignment: Skip table-cell assignment during layout analysis.
+            Falls back to the ``PP_DOC_LAYOUT_SKIP_CELL_ASSIGNMENT`` env var.
+    """
+    # Class-level tag for this options type.
+    # NOTE(review): presumably the key docling uses to select this engine — confirm.
+    kind: ClassVar[Literal["ppdoclayout-v3"]] = "ppdoclayout-v3"
+    # Each field keeps its constraints/description in the Annotated metadata and
+    # supplies the environment-backed default through a separate Field(...).
+    model_name: Annotated[
+        str,
+        Field(description="HuggingFace model repository ID for PP-DocLayout-V3."),
+    ] = Field(
+        default_factory=lambda: os.environ.get(
+            "PP_DOC_LAYOUT_MODEL_NAME",
+            "PaddlePaddle/PP-DocLayoutV3_safetensors",
+        )
+    )
+    # float() raises ValueError if the env var holds a non-numeric string.
+    # NOTE(review): confirm whether the ge/le bounds are enforced on the
+    # env-derived default; no validate_default setting is visible here.
+    confidence_threshold: Annotated[
+        float,
+        Field(
+            ge=0.0,
+            le=1.0,
+            description="Minimum confidence score to keep a detection.",
+        ),
+    ] = Field(default_factory=lambda: float(os.environ.get("PP_DOC_LAYOUT_CONFIDENCE_THRESHOLD", "0.5")))
+    # int() raises ValueError if the env var is not an integer literal.
+    batch_size: Annotated[
+        int,
+        Field(
+            gt=0,
+            description="Batch size for layout inference.",
+        ),
+    ] = Field(default_factory=lambda: int(os.environ.get("PP_DOC_LAYOUT_BATCH_SIZE", "8")))
+    # Override inherited boolean fields to add environment-variable support.
+    # Without the env var, the defaults are: true, false, false (in field order).
+    create_orphan_clusters: Annotated[
+        bool,
+        Field(
+            description=(
+                "Create clusters for orphaned elements not assigned to any structure. "
+                "Falls back to PP_DOC_LAYOUT_CREATE_ORPHAN_CLUSTERS env var."
+            )
+        ),
+    ] = Field(default_factory=lambda: _parse_bool(os.environ.get("PP_DOC_LAYOUT_CREATE_ORPHAN_CLUSTERS", "true")))
+    keep_empty_clusters: Annotated[
+        bool,
+        Field(
+            description=(
+                "Retain empty clusters in layout analysis results. "
+                "Falls back to PP_DOC_LAYOUT_KEEP_EMPTY_CLUSTERS env var."
+            )
+        ),
+    ] = Field(default_factory=lambda: _parse_bool(os.environ.get("PP_DOC_LAYOUT_KEEP_EMPTY_CLUSTERS", "false")))
+    skip_cell_assignment: Annotated[
+        bool,
+        Field(
+            description=(
+                "Skip assignment of cells to table structures during layout analysis. "
+                "Falls back to PP_DOC_LAYOUT_SKIP_CELL_ASSIGNMENT env var."
+            )
+        ),
+    ] = Field(default_factory=lambda: _parse_bool(os.environ.get("PP_DOC_LAYOUT_SKIP_CELL_ASSIGNMENT", "false")))
+    # Unknown option names are rejected instead of being silently ignored.
+    model_config = ConfigDict(extra="forbid")

docling_pp_doc_layout/plugin.py ADDED Viewed

	@@ -0,0 +1,12 @@

+"""Docling plugin entry point registering the PP-DocLayout-V3 layout engine."""
+from __future__ import annotations
+from typing import Any
+from docling_pp_doc_layout.model import PPDocLayoutV3Model
+def layout_engines() -> dict[str, Any]:
+    """Return layout engine classes provided by this plugin."""
+    return {"layout_engines": [PPDocLayoutV3Model]}

docling_pp_doc_layout/py.typed ADDED Viewed

File without changes

requirements.txt CHANGED Viewed

@@ -1,2 +1,6 @@
-docling-pp-doc-layout
 spaces

+# docling-pp-doc-layout is bundled as a local package (docling_pp_doc_layout/)
+# because its PyPI releases require Python >=3.12 and ZeroGPU runs Python 3.10.
+# Its dependencies are listed here directly instead.
+docling>=2.73
+transformers>=5.1.0
 spaces