adelevett's picture
Upload 9 files
d7c9ee5 verified
"""Configuration model for the PP-DocLayout-V3 layout engine."""
from __future__ import annotations
import os
from typing import Annotated, ClassVar, Literal
from docling.datamodel.pipeline_options import LayoutOptions
from pydantic import ConfigDict, Field
def _parse_bool(value: str) -> bool:
    """Parse a string environment variable value as a boolean.

    Surrounding whitespace is ignored, so values such as ``" true "`` or
    ``"yes\\n"`` (easy to produce from env files and shell substitutions)
    are recognised rather than silently treated as falsy.

    Args:
        value: The string to parse. After stripping whitespace, the
            case-insensitive values ``"true"``, ``"1"``, and ``"yes"`` are
            truthy; everything else (including the empty string) is falsy.

    Returns:
        ``True`` if *value* is a recognised truthy string, ``False`` otherwise.
    """
    return value.strip().lower() in ("true", "1", "yes")
class PPDocLayoutV3Options(LayoutOptions):
    """Options for the PP-DocLayout-V3 layout detection engine.

    Uses a HuggingFace-hosted PP-DocLayout-V3 model to detect document
    layout elements (text, tables, figures, headers, etc.) in page images.

    Every field declared here falls back to an environment variable when it
    is not set explicitly, allowing configuration without code changes
    (e.g. in Docker / Compose deployments). The environment is read through
    a ``default_factory``, i.e. when an instance is created without a value
    for that field, not at import time.

    The numeric fields set ``validate_default=True`` so that values coming
    from the environment are checked against the same bounds as values
    passed explicitly. Unknown keyword arguments are rejected
    (``extra="forbid"``).

    Attributes:
        model_name: HuggingFace model repository ID.
            Falls back to the ``PP_DOC_LAYOUT_MODEL_NAME`` env var, then to
            ``"PaddlePaddle/PP-DocLayoutV3_safetensors"``.
        confidence_threshold: Minimum confidence score for detections,
            constrained to ``0.0 <= value <= 1.0``.
            Falls back to the ``PP_DOC_LAYOUT_CONFIDENCE_THRESHOLD`` env var,
            then to ``0.5``.
        batch_size: Number of pages per inference batch; must be ``> 0``.
            Falls back to the ``PP_DOC_LAYOUT_BATCH_SIZE`` env var, then
            to ``8``.
        create_orphan_clusters: Create clusters for orphaned elements.
            Falls back to the ``PP_DOC_LAYOUT_CREATE_ORPHAN_CLUSTERS`` env
            var, then to ``True``.
        keep_empty_clusters: Retain empty clusters in results.
            Falls back to the ``PP_DOC_LAYOUT_KEEP_EMPTY_CLUSTERS`` env var,
            then to ``False``.
        skip_cell_assignment: Skip table-cell assignment during layout
            analysis. Falls back to the ``PP_DOC_LAYOUT_SKIP_CELL_ASSIGNMENT``
            env var, then to ``False``.

    Raises:
        ValueError: On instantiation, if a numeric environment variable
            cannot be converted by ``float()`` / ``int()``, or (as a pydantic
            validation error) if the resulting value violates the field's
            bounds.
    """

    kind: ClassVar[Literal["ppdoclayout-v3"]] = "ppdoclayout-v3"

    model_name: Annotated[
        str,
        Field(description="HuggingFace model repository ID for PP-DocLayout-V3."),
    ] = Field(
        default_factory=lambda: os.environ.get(
            "PP_DOC_LAYOUT_MODEL_NAME",
            "PaddlePaddle/PP-DocLayoutV3_safetensors",
        )
    )

    confidence_threshold: Annotated[
        float,
        Field(
            ge=0.0,
            le=1.0,
            description="Minimum confidence score to keep a detection.",
        ),
    ] = Field(
        default_factory=lambda: float(
            os.environ.get("PP_DOC_LAYOUT_CONFIDENCE_THRESHOLD", "0.5")
        ),
        # Pydantic skips validation of defaults unless asked; without this an
        # env value such as "5" would bypass the ge/le bounds above.
        validate_default=True,
    )

    batch_size: Annotated[
        int,
        Field(
            gt=0,
            description="Batch size for layout inference.",
        ),
    ] = Field(
        default_factory=lambda: int(
            os.environ.get("PP_DOC_LAYOUT_BATCH_SIZE", "8")
        ),
        # Same reasoning as confidence_threshold: enforce gt=0 on env values.
        validate_default=True,
    )

    # Override inherited boolean fields to add environment-variable support.
    create_orphan_clusters: Annotated[
        bool,
        Field(
            description=(
                "Create clusters for orphaned elements not assigned to any structure. "
                "Falls back to PP_DOC_LAYOUT_CREATE_ORPHAN_CLUSTERS env var."
            )
        ),
    ] = Field(
        default_factory=lambda: _parse_bool(
            os.environ.get("PP_DOC_LAYOUT_CREATE_ORPHAN_CLUSTERS", "true")
        )
    )

    keep_empty_clusters: Annotated[
        bool,
        Field(
            description=(
                "Retain empty clusters in layout analysis results. "
                "Falls back to PP_DOC_LAYOUT_KEEP_EMPTY_CLUSTERS env var."
            )
        ),
    ] = Field(
        default_factory=lambda: _parse_bool(
            os.environ.get("PP_DOC_LAYOUT_KEEP_EMPTY_CLUSTERS", "false")
        )
    )

    skip_cell_assignment: Annotated[
        bool,
        Field(
            description=(
                "Skip assignment of cells to table structures during layout analysis. "
                "Falls back to PP_DOC_LAYOUT_SKIP_CELL_ASSIGNMENT env var."
            )
        ),
    ] = Field(
        default_factory=lambda: _parse_bool(
            os.environ.get("PP_DOC_LAYOUT_SKIP_CELL_ASSIGNMENT", "false")
        )
    )

    # Reject unknown option names instead of silently ignoring typos.
    model_config = ConfigDict(extra="forbid")