"""PP-DocLayout-V3 layout model for the docling standard pipeline. Runs PaddlePaddle PP-DocLayout-V3 locally via HuggingFace ``transformers`` to detect document layout elements and returns ``LayoutPrediction`` objects that docling merges with its standard-pipeline output. """ from __future__ import annotations import logging import warnings from typing import TYPE_CHECKING import numpy as np import torch from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page from docling.models.base_layout_model import BaseLayoutModel from docling.utils.accelerator_utils import decide_device from docling.utils.layout_postprocessor import LayoutPostprocessor from docling.utils.profiling import TimeRecorder from docling_core.types.doc import DocItemLabel from transformers import AutoImageProcessor, AutoModelForObjectDetection from docling_pp_doc_layout.label_mapping import LABEL_MAP from docling_pp_doc_layout.options import PPDocLayoutV3Options if TYPE_CHECKING: from collections.abc import Sequence from pathlib import Path from docling.datamodel.accelerator_options import AcceleratorOptions from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import BaseLayoutOptions from PIL import Image logger = logging.getLogger(__name__) class PPDocLayoutV3Model(BaseLayoutModel): """Layout engine using PP-DocLayout-V3 via HuggingFace transformers.""" def __init__( self, artifacts_path: Path | None, accelerator_options: AcceleratorOptions, options: PPDocLayoutV3Options, *, enable_remote_services: bool = False, # noqa: ARG002 ) -> None: self.options = options self.artifacts_path = artifacts_path self.accelerator_options = accelerator_options self._device = decide_device(accelerator_options.device) logger.info( "Loading PP-DocLayout-V3 model %s on device=%s", options.model_name, self._device, ) self._image_processor = AutoImageProcessor.from_pretrained( options.model_name, ) self._model = AutoModelForObjectDetection.from_pretrained( options.model_name, ).to(self._device) self._model.eval() self._id2label: dict[int, str] = self._model.config.id2label logger.info("PP-DocLayout-V3 model loaded successfully") @classmethod def get_options_type(cls) -> type[BaseLayoutOptions]: """Return the options class for this layout model.""" return PPDocLayoutV3Options def _run_inference( self, images: list[Image.Image], ) -> list[list[dict]]: """Run PP-DocLayout-V3 on a batch of PIL images. Returns a list (per image) of lists of detection dicts with keys ``label``, ``confidence``, ``l``, ``t``, ``r``, ``b``. """ inputs = self._image_processor(images=images, return_tensors="pt") inputs = {k: v.to(self._device) for k, v in inputs.items()} with torch.no_grad(): outputs = self._model(**inputs) target_sizes = [img.size[::-1] for img in images] # (height, width) results = self._image_processor.post_process_object_detection( outputs, target_sizes=target_sizes, threshold=self.options.confidence_threshold, ) batch_detections: list[list[dict]] = [] for result in results: detections: list[dict] = [] polys = result.get("polygons") or result.get("polygon_points") if polys is None: polys = [None] * len(result["scores"]) for score, label_id, box, poly in zip( result["scores"], result["labels"], result["boxes"], polys, strict=True, ): raw_label = self._id2label.get(label_id.item(), "text") doc_label = LABEL_MAP.get(raw_label, DocItemLabel.TEXT) if poly is not None and len(poly) > 0: # Flatten or handle nested points to extract min/max if isinstance(poly[0], int | float): xs = poly[0::2] ys = poly[1::2] else: xs = [pt[0] for pt in poly] ys = [pt[1] for pt in poly] x_min, x_max = min(xs), max(xs) y_min, y_max = min(ys), max(ys) else: x_min, y_min, x_max, y_max = box.tolist() detections.append({ "label": doc_label, "confidence": score.item(), "l": x_min, "t": y_min, "r": x_max, "b": y_max, }) batch_detections.append(detections) return batch_detections def predict_layout( self, conv_res: ConversionResult, pages: Sequence[Page], ) -> Sequence[LayoutPrediction]: """Detect layout regions for a batch of document pages.""" pages = list(pages) valid_pages: list[Page] = [] valid_images: list[Image.Image] = [] is_page_valid: list[bool] = [] for page in pages: if page._backend is None or not page._backend.is_valid(): # noqa: SLF001 is_page_valid.append(False) continue if page.size is None: is_page_valid.append(False) continue page_image = page.get_image(scale=1.0) if page_image is None: is_page_valid.append(False) continue valid_pages.append(page) valid_images.append(page_image) is_page_valid.append(True) batch_detections: list[list[dict]] = [] if valid_images: with TimeRecorder(conv_res, "layout"): bs = self.options.batch_size for i in range(0, len(valid_images), bs): batch = valid_images[i : i + bs] batch_detections.extend(self._run_inference(batch)) layout_predictions: list[LayoutPrediction] = [] valid_idx = 0 for idx, page in enumerate(pages): if not is_page_valid[idx]: existing = page.predictions.layout or LayoutPrediction() layout_predictions.append(existing) continue detections = batch_detections[valid_idx] valid_idx += 1 clusters: list[Cluster] = [] for ix, det in enumerate(detections): cluster = Cluster( id=ix, label=det["label"], confidence=det["confidence"], bbox=BoundingBox( l=det["l"], t=det["t"], r=det["r"], b=det["b"], ), cells=[], ) clusters.append(cluster) processed_clusters, processed_cells = LayoutPostprocessor(page, clusters, self.options).postprocess() with warnings.catch_warnings(): warnings.filterwarnings( "ignore", "Mean of empty slice|invalid value encountered in scalar divide", RuntimeWarning, "numpy", ) conv_res.confidence.pages[page.page_no].layout_score = float( np.mean([c.confidence for c in processed_clusters]) ) conv_res.confidence.pages[page.page_no].ocr_score = float( np.mean([c.confidence for c in processed_cells if c.from_ocr]) ) prediction = LayoutPrediction(clusters=processed_clusters) layout_predictions.append(prediction) return layout_predictions