# Source: HuggingFace upload by adelevett ("Upload 9 files", commit d7c9ee5, verified)
"""PP-DocLayout-V3 layout model for the docling standard pipeline.
Runs PaddlePaddle PP-DocLayout-V3 locally via HuggingFace ``transformers``
to detect document layout elements and returns ``LayoutPrediction`` objects
that docling merges with its standard-pipeline output.
"""
from __future__ import annotations
import logging
import warnings
from typing import TYPE_CHECKING
import numpy as np
import torch
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
from docling.models.base_layout_model import BaseLayoutModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.layout_postprocessor import LayoutPostprocessor
from docling.utils.profiling import TimeRecorder
from docling_core.types.doc import DocItemLabel
from transformers import AutoImageProcessor, AutoModelForObjectDetection
from docling_pp_doc_layout.label_mapping import LABEL_MAP
from docling_pp_doc_layout.options import PPDocLayoutV3Options
if TYPE_CHECKING:
from collections.abc import Sequence
from pathlib import Path
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import BaseLayoutOptions
from PIL import Image
# Module-level logger, namespaced under the package's logging hierarchy.
logger = logging.getLogger(__name__)
class PPDocLayoutV3Model(BaseLayoutModel):
    """Layout engine using PP-DocLayout-V3 via HuggingFace transformers.

    Loads the detection model once at construction time, then batches page
    images through it in :meth:`predict_layout`, converting raw detections
    into docling ``Cluster``/``LayoutPrediction`` objects.
    """

    def __init__(
        self,
        artifacts_path: Path | None,
        accelerator_options: AcceleratorOptions,
        options: PPDocLayoutV3Options,
        *,
        enable_remote_services: bool = False,  # noqa: ARG002
    ) -> None:
        """Load the PP-DocLayout-V3 model and image processor.

        Args:
            artifacts_path: Optional local artifacts directory (stored, not
                otherwise used here).
            accelerator_options: Device selection settings.
            options: Model name, confidence threshold and batch size.
            enable_remote_services: Accepted for interface compatibility;
                ignored by this model.
        """
        self.options = options
        self.artifacts_path = artifacts_path
        self.accelerator_options = accelerator_options
        self._device = decide_device(accelerator_options.device)
        logger.info(
            "Loading PP-DocLayout-V3 model %s on device=%s",
            options.model_name,
            self._device,
        )
        self._image_processor = AutoImageProcessor.from_pretrained(options.model_name)
        model = AutoModelForObjectDetection.from_pretrained(options.model_name)
        self._model = model.to(self._device)
        self._model.eval()
        # id -> raw PP-DocLayout label string, as exported in the model config.
        self._id2label: dict[int, str] = self._model.config.id2label
        logger.info("PP-DocLayout-V3 model loaded successfully")

    @classmethod
    def get_options_type(cls) -> type[BaseLayoutOptions]:
        """Return the options class for this layout model."""
        return PPDocLayoutV3Options

    def _run_inference(
        self,
        images: list[Image.Image],
    ) -> list[list[dict]]:
        """Run PP-DocLayout-V3 on a batch of PIL images.

        Returns one list per input image; each entry is a detection dict with
        keys ``label``, ``confidence``, ``l``, ``t``, ``r``, ``b``.
        """
        encoded = self._image_processor(images=images, return_tensors="pt")
        encoded = {name: tensor.to(self._device) for name, tensor in encoded.items()}
        with torch.no_grad():
            raw_outputs = self._model(**encoded)
        # PIL reports (width, height); the post-processor wants (height, width).
        sizes = [image.size[::-1] for image in images]
        processed = self._image_processor.post_process_object_detection(
            raw_outputs,
            target_sizes=sizes,
            threshold=self.options.confidence_threshold,
        )
        all_detections: list[list[dict]] = []
        for page_result in processed:
            # Some processor versions expose polygons under different keys;
            # fall back to axis-aligned boxes when neither is present.
            polygons = page_result.get("polygons") or page_result.get("polygon_points")
            if polygons is None:
                polygons = [None] * len(page_result["scores"])
            page_detections: list[dict] = []
            rows = zip(
                page_result["scores"],
                page_result["labels"],
                page_result["boxes"],
                polygons,
                strict=True,
            )
            for score, label_id, box, polygon in rows:
                raw_label = self._id2label.get(label_id.item(), "text")
                mapped_label = LABEL_MAP.get(raw_label, DocItemLabel.TEXT)
                if polygon is None or len(polygon) == 0:
                    x0, y0, x1, y1 = box.tolist()
                else:
                    if isinstance(polygon[0], int | float):
                        # Flat coordinate list: [x0, y0, x1, y1, ...].
                        xs = polygon[0::2]
                        ys = polygon[1::2]
                    else:
                        # Nested point list: [(x, y), ...].
                        xs = [point[0] for point in polygon]
                        ys = [point[1] for point in polygon]
                    x0, x1 = min(xs), max(xs)
                    y0, y1 = min(ys), max(ys)
                page_detections.append({
                    "label": mapped_label,
                    "confidence": score.item(),
                    "l": x0,
                    "t": y0,
                    "r": x1,
                    "b": y1,
                })
            all_detections.append(page_detections)
        return all_detections

    def predict_layout(
        self,
        conv_res: ConversionResult,
        pages: Sequence[Page],
    ) -> Sequence[LayoutPrediction]:
        """Detect layout regions for a batch of document pages."""
        page_list = list(pages)
        # Collect renderable pages; pages with a dead backend, missing size,
        # or no image keep their existing (or empty) prediction untouched.
        usable: list[bool] = []
        images_to_run: list[Image.Image] = []
        for page in page_list:
            image = None
            if (
                page._backend is not None  # noqa: SLF001
                and page._backend.is_valid()  # noqa: SLF001
                and page.size is not None
            ):
                image = page.get_image(scale=1.0)
            if image is None:
                usable.append(False)
            else:
                images_to_run.append(image)
                usable.append(True)
        detection_batches: list[list[dict]] = []
        if images_to_run:
            with TimeRecorder(conv_res, "layout"):
                step = self.options.batch_size
                for start in range(0, len(images_to_run), step):
                    chunk = images_to_run[start : start + step]
                    detection_batches.extend(self._run_inference(chunk))
        predictions: list[LayoutPrediction] = []
        cursor = 0
        for page, ok in zip(page_list, usable, strict=True):
            if not ok:
                predictions.append(page.predictions.layout or LayoutPrediction())
                continue
            page_detections = detection_batches[cursor]
            cursor += 1
            clusters = [
                Cluster(
                    id=index,
                    label=det["label"],
                    confidence=det["confidence"],
                    bbox=BoundingBox(
                        l=det["l"],
                        t=det["t"],
                        r=det["r"],
                        b=det["b"],
                    ),
                    cells=[],
                )
                for index, det in enumerate(page_detections)
            ]
            final_clusters, final_cells = LayoutPostprocessor(
                page, clusters, self.options
            ).postprocess()
            # Empty cluster/cell lists make np.mean emit NaN with a
            # RuntimeWarning; suppress the warning and keep the NaN score.
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    "Mean of empty slice|invalid value encountered in scalar divide",
                    RuntimeWarning,
                    "numpy",
                )
                page_confidence = conv_res.confidence.pages[page.page_no]
                page_confidence.layout_score = float(
                    np.mean([c.confidence for c in final_clusters])
                )
                page_confidence.ocr_score = float(
                    np.mean([c.confidence for c in final_cells if c.from_ocr])
                )
            predictions.append(LayoutPrediction(clusters=final_clusters))
        return predictions