Pier-Jean's picture
Initial deploy: Docling Studio (local mode, port 7860)
5539271
"""Pydantic schemas — API request/response DTOs.
All responses use camelCase serialization to match the existing frontend contract
(originally served by the Spring Boot backend).
"""
from __future__ import annotations
from datetime import datetime
from pydantic import AliasChoices, BaseModel, ConfigDict, Field, field_validator
def _to_camel(name: str) -> str:
parts = name.split("_")
return parts[0] + "".join(w.capitalize() for w in parts[1:])
class _CamelModel(BaseModel):
"""Base model that serializes field names to camelCase."""
model_config = ConfigDict(
alias_generator=_to_camel,
populate_by_name=True,
serialize_by_alias=True,
)
class DocumentResponse(_CamelModel):
id: str
filename: str
status: str = "uploaded" # Document status (always "uploaded" for now)
content_type: str | None = None
file_size: int | None = None
page_count: int | None = None
created_at: str | datetime
class AnalysisResponse(_CamelModel):
id: str
document_id: str = ""
document_filename: str | None = None
status: str
content_markdown: str | None = None
content_html: str | None = None
pages_json: str | None = None
chunks_json: str | None = None
has_document_json: bool = False
error_message: str | None = None
started_at: str | datetime | None = None
completed_at: str | datetime | None = None
created_at: str | datetime
class PipelineOptionsRequest(BaseModel):
"""Docling pipeline configuration options."""
model_config = ConfigDict(populate_by_name=True)
do_ocr: bool = Field(default=True, validation_alias=AliasChoices("do_ocr", "doOcr"))
do_table_structure: bool = Field(
default=True, validation_alias=AliasChoices("do_table_structure", "doTableStructure")
)
table_mode: str = Field(
default="accurate", validation_alias=AliasChoices("table_mode", "tableMode")
)
do_code_enrichment: bool = Field(
default=False, validation_alias=AliasChoices("do_code_enrichment", "doCodeEnrichment")
)
do_formula_enrichment: bool = Field(
default=False, validation_alias=AliasChoices("do_formula_enrichment", "doFormulaEnrichment")
)
do_picture_classification: bool = Field(
default=False,
validation_alias=AliasChoices("do_picture_classification", "doPictureClassification"),
)
do_picture_description: bool = Field(
default=False,
validation_alias=AliasChoices("do_picture_description", "doPictureDescription"),
)
generate_picture_images: bool = Field(
default=False,
validation_alias=AliasChoices("generate_picture_images", "generatePictureImages"),
)
generate_page_images: bool = Field(
default=False, validation_alias=AliasChoices("generate_page_images", "generatePageImages")
)
images_scale: float = Field(
default=1.0, validation_alias=AliasChoices("images_scale", "imagesScale")
)
@field_validator("table_mode")
@classmethod
def validate_table_mode(cls, v: str) -> str:
if v not in ("accurate", "fast"):
raise ValueError('table_mode must be "accurate" or "fast"')
return v
@field_validator("images_scale")
@classmethod
def validate_images_scale(cls, v: float) -> float:
if v <= 0 or v > 10:
raise ValueError("images_scale must be between 0 (exclusive) and 10")
return v
class ChunkingOptionsRequest(BaseModel):
"""Docling chunking configuration options."""
model_config = ConfigDict(populate_by_name=True)
chunker_type: str = Field(
default="hybrid", validation_alias=AliasChoices("chunker_type", "chunkerType")
)
max_tokens: int = Field(default=512, validation_alias=AliasChoices("max_tokens", "maxTokens"))
merge_peers: bool = Field(
default=True, validation_alias=AliasChoices("merge_peers", "mergePeers")
)
repeat_table_header: bool = Field(
default=True, validation_alias=AliasChoices("repeat_table_header", "repeatTableHeader")
)
@field_validator("chunker_type")
@classmethod
def validate_chunker_type(cls, v: str) -> str:
if v not in ("hybrid", "hierarchical"):
raise ValueError('chunker_type must be "hybrid" or "hierarchical"')
return v
@field_validator("max_tokens")
@classmethod
def validate_max_tokens(cls, v: int) -> int:
if v < 64 or v > 8192:
raise ValueError("max_tokens must be between 64 and 8192")
return v
class ChunkBboxResponse(_CamelModel):
page: int
bbox: list[float]
class ChunkResponse(_CamelModel):
text: str
headings: list[str] = []
source_page: int | None = None
token_count: int = 0
bboxes: list[ChunkBboxResponse] = []
class CreateAnalysisRequest(BaseModel):
documentId: str = Field(validation_alias=AliasChoices("documentId", "document_id"))
pipelineOptions: PipelineOptionsRequest | None = Field(
default=None, validation_alias=AliasChoices("pipelineOptions", "pipeline_options")
)
chunkingOptions: ChunkingOptionsRequest | None = Field(
default=None, validation_alias=AliasChoices("chunkingOptions", "chunking_options")
)
class RechunkRequest(BaseModel):
chunkingOptions: ChunkingOptionsRequest = Field(
validation_alias=AliasChoices("chunkingOptions", "chunking_options")
)