adelevett committed on
Commit
d7c9ee5
·
verified ·
1 Parent(s): dbe48bf

Upload 9 files

Browse files
app.py CHANGED
@@ -1,3 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import spaces
3
  from docling.datamodel.base_models import InputFormat
 
1
+ # ---------------------------------------------------------------------------
2
+ # Plugin registration
3
+ # ---------------------------------------------------------------------------
4
+ # docling-pp-doc-layout requires Python >=3.12 on PyPI, but the code itself
5
+ # is compatible with Python 3.10 (all annotations are guarded by
6
+ # `from __future__ import annotations`). Instead of installing the package,
7
+ # we bundle the source directly and register the model with docling's factory
8
+ # by monkey-patching BaseFactory.load_from_plugins so that every new
9
+ # LayoutFactory instance automatically includes PPDocLayoutV3Model.
10
+ from docling.models.factories.base_factory import BaseFactory
11
+ from docling.models.factories.layout_factory import LayoutFactory
12
+ from docling_pp_doc_layout.model import PPDocLayoutV3Model
13
+
14
+ _orig_load = BaseFactory.load_from_plugins
15
+
16
+
17
def _load_with_pp_doc_layout(
    self, plugin_name=None, allow_external_plugins=False
):
    """Run the original plugin loader, then add PP-DocLayout-V3 to layout factories.

    Intended as a drop-in replacement for ``BaseFactory.load_from_plugins``:
    both arguments are forwarded unchanged to the saved original
    implementation before the bundled model is registered.
    """
    _orig_load(
        self,
        plugin_name=plugin_name,
        allow_external_plugins=allow_external_plugins,
    )

    # Factories of any other kind are left exactly as the original loader
    # built them.
    if not isinstance(self, LayoutFactory):
        return

    registration = (
        PPDocLayoutV3Model,
        "docling-pp-doc-layout",
        "docling_pp_doc_layout.model",
    )
    try:
        self.register(*registration)
    except ValueError:
        # already registered on a previous factory creation
        return
34
+
35
+
36
+ BaseFactory.load_from_plugins = _load_with_pp_doc_layout
37
+
38
+ # ---------------------------------------------------------------------------
39
  import gradio as gr
40
  import spaces
41
  from docling.datamodel.base_models import InputFormat
docling_pp_doc_layout/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """A Docling plugin for PaddlePaddle PP-DocLayout-V3 model document layout detection."""
2
+
3
+ __version__ = "0.1.0"
docling_pp_doc_layout/label_mapping.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Mapping from PP-DocLayout-V3 label names to docling DocItemLabel values.
2
+
3
+ Every label produced here must exist in
4
+ ``docling.utils.layout_postprocessor.LayoutPostprocessor.CONFIDENCE_THRESHOLDS``
5
+ so that the postprocessor can apply confidence filtering without a ``KeyError``.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from docling_core.types.doc import DocItemLabel
11
+
12
# Lookup table: raw PP-DocLayout-V3 class name -> docling ``DocItemLabel``.
# Several model classes share one docling label (e.g. "chart", "image" and
# "seal" all become PICTURE), so the mapping is many-to-one.
LABEL_MAP: dict[str, DocItemLabel] = {
    "abstract": DocItemLabel.TEXT,
    "algorithm": DocItemLabel.CODE,
    "aside_text": DocItemLabel.TEXT,
    "chart": DocItemLabel.PICTURE,
    "content": DocItemLabel.TEXT,
    "doc_title": DocItemLabel.TITLE,
    "figure_title": DocItemLabel.CAPTION,
    "footer": DocItemLabel.PAGE_FOOTER,
    "footnote": DocItemLabel.FOOTNOTE,
    "formula": DocItemLabel.FORMULA,
    "formula_number": DocItemLabel.TEXT,
    "header": DocItemLabel.PAGE_HEADER,
    "image": DocItemLabel.PICTURE,
    "number": DocItemLabel.TEXT,
    "paragraph_title": DocItemLabel.SECTION_HEADER,
    "reference": DocItemLabel.TEXT,
    "reference_content": DocItemLabel.TEXT,
    "seal": DocItemLabel.PICTURE,
    "table": DocItemLabel.TABLE,
    "text": DocItemLabel.TEXT,
    "vision_footnote": DocItemLabel.FOOTNOTE,
}
docling_pp_doc_layout/model.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PP-DocLayout-V3 layout model for the docling standard pipeline.
2
+
3
+ Runs PaddlePaddle PP-DocLayout-V3 locally via HuggingFace ``transformers``
4
+ to detect document layout elements and returns ``LayoutPrediction`` objects
5
+ that docling merges with its standard-pipeline output.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ import warnings
12
+ from typing import TYPE_CHECKING
13
+
14
+ import numpy as np
15
+ import torch
16
+ from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
17
+ from docling.models.base_layout_model import BaseLayoutModel
18
+ from docling.utils.accelerator_utils import decide_device
19
+ from docling.utils.layout_postprocessor import LayoutPostprocessor
20
+ from docling.utils.profiling import TimeRecorder
21
+ from docling_core.types.doc import DocItemLabel
22
+ from transformers import AutoImageProcessor, AutoModelForObjectDetection
23
+
24
+ from docling_pp_doc_layout.label_mapping import LABEL_MAP
25
+ from docling_pp_doc_layout.options import PPDocLayoutV3Options
26
+
27
+ if TYPE_CHECKING:
28
+ from collections.abc import Sequence
29
+ from pathlib import Path
30
+
31
+ from docling.datamodel.accelerator_options import AcceleratorOptions
32
+ from docling.datamodel.document import ConversionResult
33
+ from docling.datamodel.pipeline_options import BaseLayoutOptions
34
+ from PIL import Image
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+
39
class PPDocLayoutV3Model(BaseLayoutModel):
    """Layout engine using PP-DocLayout-V3 via HuggingFace transformers.

    The image processor and detection model are loaded once in ``__init__``
    and reused by every ``predict_layout`` call.
    """

    # Result keys under which the post-processor may expose polygon outlines,
    # in lookup order.
    _POLYGON_KEYS = ("polygons", "polygon_points")

    def __init__(
        self,
        artifacts_path: Path | None,
        accelerator_options: AcceleratorOptions,
        options: PPDocLayoutV3Options,
        *,
        enable_remote_services: bool = False,  # noqa: ARG002
    ) -> None:
        """Load the processor and model named by ``options.model_name``.

        Args:
            artifacts_path: Stored on the instance; the weights are resolved
                from ``options.model_name`` regardless of this value.
            accelerator_options: Its ``device`` field is passed to
                ``decide_device`` to pick the torch device.
            options: Engine configuration (model name, confidence threshold,
                batch size and post-processing flags).
            enable_remote_services: Unused by this engine.
        """
        self.options = options
        self.artifacts_path = artifacts_path
        self.accelerator_options = accelerator_options

        self._device = decide_device(accelerator_options.device)
        logger.info(
            "Loading PP-DocLayout-V3 model %s on device=%s",
            options.model_name,
            self._device,
        )

        self._image_processor = AutoImageProcessor.from_pretrained(
            options.model_name,
        )
        self._model = AutoModelForObjectDetection.from_pretrained(
            options.model_name,
        ).to(self._device)
        self._model.eval()

        self._id2label: dict[int, str] = self._model.config.id2label
        logger.info("PP-DocLayout-V3 model loaded successfully")

    @classmethod
    def get_options_type(cls) -> type[BaseLayoutOptions]:
        """Return the options class for this layout model."""
        return PPDocLayoutV3Options

    @staticmethod
    def _polygon_bounds(poly: object) -> tuple[float, float, float, float] | None:
        """Return ``(x_min, y_min, x_max, y_max)`` for a polygon, or ``None``.

        Accepts flat ``[x0, y0, x1, y1, ...]`` coordinates as well as nested
        ``[[x0, y0], ...]`` points, supplied as lists, NumPy arrays or torch
        tensors. The result always consists of plain Python floats.

        Returns ``None`` when *poly* is ``None``, holds fewer than two
        coordinates, or cannot be converted to a numeric array, so the caller
        can fall back to the detector's own box.
        """
        if poly is None:
            return None
        if isinstance(poly, torch.Tensor):
            # Tensors may live on an accelerator; NumPy needs host memory.
            poly = poly.detach().cpu().numpy()
        try:
            coords = np.asarray(poly, dtype=float)
        except (TypeError, ValueError):
            return None
        if coords.size < 2:
            return None

        if coords.ndim >= 2 and coords.shape[-1] >= 2:
            # Nested points: first column is x, second is y.
            xs = coords[..., 0].ravel()
            ys = coords[..., 1].ravel()
        else:
            # Flat layout: x and y alternate.
            flat = coords.ravel()
            xs = flat[0::2]
            ys = flat[1::2]
        return float(xs.min()), float(ys.min()), float(xs.max()), float(ys.max())

    def _run_inference(
        self,
        images: list[Image.Image],
    ) -> list[list[dict]]:
        """Run PP-DocLayout-V3 on a batch of PIL images.

        Returns a list (per image) of lists of detection dicts with keys
        ``label``, ``confidence``, ``l``, ``t``, ``r``, ``b``. Coordinates are
        in pixels of the corresponding input image.
        """
        inputs = self._image_processor(images=images, return_tensors="pt")
        inputs = {k: v.to(self._device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self._model(**inputs)

        target_sizes = [img.size[::-1] for img in images]  # (height, width)
        results = self._image_processor.post_process_object_detection(
            outputs,
            target_sizes=target_sizes,
            threshold=self.options.confidence_threshold,
        )

        batch_detections: list[list[dict]] = []
        for result in results:
            detections: list[dict] = []

            # Pick the first polygon container that is present and non-empty.
            # Explicit None/len checks are used instead of ``a or b`` because
            # the truth value of a multi-element tensor/array is ambiguous
            # and raises.
            polys = None
            for key in self._POLYGON_KEYS:
                candidate = result.get(key)
                if candidate is not None and len(candidate) > 0:
                    polys = candidate
                    break
            if polys is None:
                polys = [None] * len(result["scores"])

            for score, label_id, box, poly in zip(
                result["scores"],
                result["labels"],
                result["boxes"],
                polys,
                strict=True,
            ):
                raw_label = self._id2label.get(label_id.item(), "text")
                doc_label = LABEL_MAP.get(raw_label, DocItemLabel.TEXT)

                # Prefer the polygon's extent; fall back to the regressed box
                # when no usable polygon is available.
                bounds = self._polygon_bounds(poly)
                if bounds is None:
                    bounds = tuple(box.tolist())
                x_min, y_min, x_max, y_max = bounds

                detections.append({
                    "label": doc_label,
                    "confidence": score.item(),
                    "l": x_min,
                    "t": y_min,
                    "r": x_max,
                    "b": y_max,
                })
            batch_detections.append(detections)

        return batch_detections

    def predict_layout(
        self,
        conv_res: ConversionResult,
        pages: Sequence[Page],
    ) -> Sequence[LayoutPrediction]:
        """Detect layout regions for a batch of document pages.

        Pages without a valid backend, size or image are skipped for
        inference and keep their existing layout prediction (or receive an
        empty one). The returned sequence has one entry per input page, in
        input order.

        Side effects: records timing under ``"layout"`` and writes
        ``layout_score`` / ``ocr_score`` into ``conv_res.confidence.pages``
        for every page that was run through the model.
        """
        pages = list(pages)

        valid_images: list[Image.Image] = []
        is_page_valid: list[bool] = []

        for page in pages:
            if page._backend is None or not page._backend.is_valid():  # noqa: SLF001
                is_page_valid.append(False)
                continue
            if page.size is None:
                is_page_valid.append(False)
                continue
            page_image = page.get_image(scale=1.0)
            if page_image is None:
                is_page_valid.append(False)
                continue

            valid_images.append(page_image)
            is_page_valid.append(True)

        batch_detections: list[list[dict]] = []
        if valid_images:
            with TimeRecorder(conv_res, "layout"):
                # A step of 0 would make range() raise; clamp defensively in
                # case the option was populated without validation.
                bs = max(1, int(self.options.batch_size))
                for i in range(0, len(valid_images), bs):
                    batch = valid_images[i : i + bs]
                    batch_detections.extend(self._run_inference(batch))

        layout_predictions: list[LayoutPrediction] = []
        valid_idx = 0  # index into batch_detections, advanced per valid page

        for idx, page in enumerate(pages):
            if not is_page_valid[idx]:
                existing = page.predictions.layout or LayoutPrediction()
                layout_predictions.append(existing)
                continue

            detections = batch_detections[valid_idx]
            valid_idx += 1

            clusters: list[Cluster] = [
                Cluster(
                    id=ix,
                    label=det["label"],
                    confidence=det["confidence"],
                    bbox=BoundingBox(
                        l=det["l"],
                        t=det["t"],
                        r=det["r"],
                        b=det["b"],
                    ),
                    cells=[],
                )
                for ix, det in enumerate(detections)
            ]

            processed_clusters, processed_cells = LayoutPostprocessor(page, clusters, self.options).postprocess()

            # np.mean over an empty list yields NaN and emits RuntimeWarnings;
            # the NaN is kept as the score, only the warnings are silenced.
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    "Mean of empty slice|invalid value encountered in scalar divide",
                    RuntimeWarning,
                    "numpy",
                )
                conv_res.confidence.pages[page.page_no].layout_score = float(
                    np.mean([c.confidence for c in processed_clusters])
                )
                conv_res.confidence.pages[page.page_no].ocr_score = float(
                    np.mean([c.confidence for c in processed_cells if c.from_ocr])
                )

            prediction = LayoutPrediction(clusters=processed_clusters)
            layout_predictions.append(prediction)

        return layout_predictions
docling_pp_doc_layout/options.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration model for the PP-DocLayout-V3 layout engine."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from typing import Annotated, ClassVar, Literal
7
+
8
+ from docling.datamodel.pipeline_options import LayoutOptions
9
+ from pydantic import ConfigDict, Field
10
+
11
+
12
+ def _parse_bool(value: str) -> bool:
13
+ """Parse a string environment variable value as a boolean.
14
+
15
+ Args:
16
+ value: The string to parse. Case-insensitive ``"true"``, ``"1"``,
17
+ and ``"yes"`` are truthy; everything else is falsy.
18
+
19
+ Returns:
20
+ ``True`` if *value* is a recognised truthy string, ``False`` otherwise.
21
+ """
22
+ return value.lower() in ("true", "1", "yes")
23
+
24
+
25
class PPDocLayoutV3Options(LayoutOptions):
    """Options for the PP-DocLayout-V3 layout detection engine.

    Uses a HuggingFace-hosted PP-DocLayout-V3 model to detect document
    layout elements (text, tables, figures, headers, etc.) in page images.

    All options fall back to environment variables when not set explicitly,
    allowing configuration without code changes (e.g. in Docker / Compose
    deployments). The numeric fields validate their environment-derived
    defaults against the same bounds as explicitly passed values, so an
    out-of-range environment value is rejected at construction time.

    Attributes:
        model_name: HuggingFace model repository ID.
            Falls back to the ``PP_DOC_LAYOUT_MODEL_NAME`` env var.
        confidence_threshold: Minimum confidence score for detections.
            Falls back to the ``PP_DOC_LAYOUT_CONFIDENCE_THRESHOLD`` env var.
        batch_size: Number of pages per inference batch.
            Falls back to the ``PP_DOC_LAYOUT_BATCH_SIZE`` env var.
        create_orphan_clusters: Create clusters for orphaned elements.
            Falls back to the ``PP_DOC_LAYOUT_CREATE_ORPHAN_CLUSTERS`` env var.
        keep_empty_clusters: Retain empty clusters in results.
            Falls back to the ``PP_DOC_LAYOUT_KEEP_EMPTY_CLUSTERS`` env var.
        skip_cell_assignment: Skip table-cell assignment during layout analysis.
            Falls back to the ``PP_DOC_LAYOUT_SKIP_CELL_ASSIGNMENT`` env var.
    """

    kind: ClassVar[Literal["ppdoclayout-v3"]] = "ppdoclayout-v3"

    model_name: Annotated[
        str,
        Field(description="HuggingFace model repository ID for PP-DocLayout-V3."),
    ] = Field(
        default_factory=lambda: os.environ.get(
            "PP_DOC_LAYOUT_MODEL_NAME",
            "PaddlePaddle/PP-DocLayoutV3_safetensors",
        )
    )

    # validate_default=True: defaults produced by default_factory are not
    # validated by pydantic unless requested, which would let an env value
    # such as 5 slip past the ge/le bounds.
    confidence_threshold: Annotated[
        float,
        Field(
            ge=0.0,
            le=1.0,
            description="Minimum confidence score to keep a detection.",
        ),
    ] = Field(
        default_factory=lambda: float(os.environ.get("PP_DOC_LAYOUT_CONFIDENCE_THRESHOLD", "0.5")),
        validate_default=True,
    )

    # validate_default=True for the same reason: a batch size of 0 from the
    # environment must be rejected by the gt=0 constraint.
    batch_size: Annotated[
        int,
        Field(
            gt=0,
            description="Batch size for layout inference.",
        ),
    ] = Field(
        default_factory=lambda: int(os.environ.get("PP_DOC_LAYOUT_BATCH_SIZE", "8")),
        validate_default=True,
    )

    # Override inherited boolean fields to add environment-variable support.
    create_orphan_clusters: Annotated[
        bool,
        Field(
            description=(
                "Create clusters for orphaned elements not assigned to any structure. "
                "Falls back to PP_DOC_LAYOUT_CREATE_ORPHAN_CLUSTERS env var."
            )
        ),
    ] = Field(default_factory=lambda: _parse_bool(os.environ.get("PP_DOC_LAYOUT_CREATE_ORPHAN_CLUSTERS", "true")))

    keep_empty_clusters: Annotated[
        bool,
        Field(
            description=(
                "Retain empty clusters in layout analysis results. "
                "Falls back to PP_DOC_LAYOUT_KEEP_EMPTY_CLUSTERS env var."
            )
        ),
    ] = Field(default_factory=lambda: _parse_bool(os.environ.get("PP_DOC_LAYOUT_KEEP_EMPTY_CLUSTERS", "false")))

    skip_cell_assignment: Annotated[
        bool,
        Field(
            description=(
                "Skip assignment of cells to table structures during layout analysis. "
                "Falls back to PP_DOC_LAYOUT_SKIP_CELL_ASSIGNMENT env var."
            )
        ),
    ] = Field(default_factory=lambda: _parse_bool(os.environ.get("PP_DOC_LAYOUT_SKIP_CELL_ASSIGNMENT", "false")))

    model_config = ConfigDict(extra="forbid")
docling_pp_doc_layout/plugin.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Docling plugin entry point registering the PP-DocLayout-V3 layout engine."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from docling_pp_doc_layout.model import PPDocLayoutV3Model
8
+
9
+
10
def layout_engines() -> dict[str, Any]:
    """Describe the layout engines this plugin contributes.

    Returns:
        A mapping with a single ``"layout_engines"`` key whose value is the
        list of engine classes, currently only ``PPDocLayoutV3Model``.
    """
    engines = [PPDocLayoutV3Model]
    return {"layout_engines": engines}
docling_pp_doc_layout/py.typed ADDED
File without changes
requirements.txt CHANGED
@@ -1,2 +1,6 @@
1
- docling-pp-doc-layout
 
 
 
 
2
  spaces
 
1
+ # docling-pp-doc-layout is bundled as a local package (docling_pp_doc_layout/)
2
+ # because its PyPI releases require Python >=3.12 and ZeroGPU runs Python 3.10.
3
+ # Its dependencies are listed here directly instead.
4
+ docling>=2.73
5
+ transformers>=5.1.0
6
  spaces