Spaces:
Running
Running
File size: 5,433 Bytes
"""Pydantic schemas — API request/response DTOs.
All responses use camelCase serialization to match the existing frontend contract
(originally served by the Spring Boot backend).
"""
from __future__ import annotations
from datetime import datetime
from pydantic import AliasChoices, BaseModel, ConfigDict, Field, field_validator
def _to_camel(name: str) -> str:
parts = name.split("_")
return parts[0] + "".join(w.capitalize() for w in parts[1:])
class _CamelModel(BaseModel):
    """Base model that serializes field names to camelCase."""

    # Aliases are generated from the snake_case field names by _to_camel.
    # populate_by_name keeps the Python field names usable as input keys,
    # and serialize_by_alias makes serialization emit the generated aliases.
    model_config = ConfigDict(
        serialize_by_alias=True,
        populate_by_name=True,
        alias_generator=_to_camel,
    )
class DocumentResponse(_CamelModel):
    # Response DTO describing a single document.
    # Field names are aliased to camelCase by the _CamelModel configuration.

    id: str
    filename: str

    # Document status (always "uploaded" for now).
    status: str = "uploaded"

    # Optional metadata; each of these defaults to None when not supplied.
    content_type: str | None = None
    file_size: int | None = None
    page_count: int | None = None

    # Required; accepts either a string or a datetime.
    created_at: str | datetime
class AnalysisResponse(_CamelModel):
    # Response DTO describing a single analysis.
    # Field names are aliased to camelCase by the _CamelModel configuration.

    # Identity and link back to the analysed document.
    id: str
    document_id: str = ""
    document_filename: str | None = None

    # Required status string; no set of allowed values is enforced here.
    status: str

    # Analysis outputs, all optional and defaulting to None.
    # NOTE(review): the *_json fields are typed as plain strings; presumably
    # they hold JSON-encoded text — confirm against the producer.
    content_markdown: str | None = None
    content_html: str | None = None
    pages_json: str | None = None
    chunks_json: str | None = None
    has_document_json: bool = False

    # Optional failure description.
    error_message: str | None = None

    # Timestamps accept either a string or a datetime; only created_at is required.
    started_at: str | datetime | None = None
    completed_at: str | datetime | None = None
    created_at: str | datetime
class PipelineOptionsRequest(BaseModel):
    """Docling pipeline configuration options."""

    # Every field is optional and accepts both its snake_case name and its
    # camelCase spelling on input (see the AliasChoices on each Field).
    model_config = ConfigDict(populate_by_name=True)

    do_ocr: bool = Field(default=True, validation_alias=AliasChoices("do_ocr", "doOcr"))
    do_table_structure: bool = Field(
        default=True, validation_alias=AliasChoices("do_table_structure", "doTableStructure")
    )
    # Restricted to "accurate" or "fast" by validate_table_mode below.
    table_mode: str = Field(
        default="accurate", validation_alias=AliasChoices("table_mode", "tableMode")
    )
    do_code_enrichment: bool = Field(
        default=False, validation_alias=AliasChoices("do_code_enrichment", "doCodeEnrichment")
    )
    do_formula_enrichment: bool = Field(
        default=False, validation_alias=AliasChoices("do_formula_enrichment", "doFormulaEnrichment")
    )
    do_picture_classification: bool = Field(
        default=False,
        validation_alias=AliasChoices("do_picture_classification", "doPictureClassification"),
    )
    do_picture_description: bool = Field(
        default=False,
        validation_alias=AliasChoices("do_picture_description", "doPictureDescription"),
    )
    generate_picture_images: bool = Field(
        default=False,
        validation_alias=AliasChoices("generate_picture_images", "generatePictureImages"),
    )
    generate_page_images: bool = Field(
        default=False, validation_alias=AliasChoices("generate_page_images", "generatePageImages")
    )
    # Restricted to the range (0, 10] by validate_images_scale below.
    images_scale: float = Field(
        default=1.0, validation_alias=AliasChoices("images_scale", "imagesScale")
    )

    @field_validator("table_mode")
    @classmethod
    def validate_table_mode(cls, v: str) -> str:
        """Accept only the two supported table modes.

        Returns:
            ``v`` unchanged when it is valid.

        Raises:
            ValueError: if ``v`` is neither ``"accurate"`` nor ``"fast"``.
        """
        if v not in ("accurate", "fast"):
            raise ValueError('table_mode must be "accurate" or "fast"')
        return v

    @field_validator("images_scale")
    @classmethod
    def validate_images_scale(cls, v: float) -> float:
        """Require ``0 < images_scale <= 10``.

        Returns:
            ``v`` unchanged when it is valid.

        Raises:
            ValueError: if ``v`` is not a number in the range (0, 10],
                including when it is NaN.
        """
        # Expressed as a positive range check on purpose: every comparison
        # against NaN is False, so a test of the form ``v <= 0 or v > 10``
        # would let NaN through, whereas ``not (0 < v <= 10)`` rejects it.
        if not (0 < v <= 10):
            raise ValueError("images_scale must be between 0 (exclusive) and 10")
        return v
class ChunkingOptionsRequest(BaseModel):
    """Docling chunking configuration options."""

    # Every field is optional and accepts both its snake_case name and its
    # camelCase spelling on input (see the AliasChoices on each Field).
    model_config = ConfigDict(populate_by_name=True)

    # Restricted to "hybrid" or "hierarchical" by validate_chunker_type.
    chunker_type: str = Field(
        default="hybrid",
        validation_alias=AliasChoices("chunker_type", "chunkerType"),
    )
    # Restricted to 64..8192 inclusive by validate_max_tokens.
    max_tokens: int = Field(
        default=512,
        validation_alias=AliasChoices("max_tokens", "maxTokens"),
    )
    merge_peers: bool = Field(
        default=True,
        validation_alias=AliasChoices("merge_peers", "mergePeers"),
    )
    repeat_table_header: bool = Field(
        default=True,
        validation_alias=AliasChoices("repeat_table_header", "repeatTableHeader"),
    )

    @field_validator("chunker_type")
    @classmethod
    def validate_chunker_type(cls, v: str) -> str:
        """Return ``v`` if it names a supported chunker, else raise ValueError."""
        if v in ("hybrid", "hierarchical"):
            return v
        raise ValueError('chunker_type must be "hybrid" or "hierarchical"')

    @field_validator("max_tokens")
    @classmethod
    def validate_max_tokens(cls, v: int) -> int:
        """Return ``v`` if it lies in 64..8192 inclusive, else raise ValueError."""
        if 64 <= v <= 8192:
            return v
        raise ValueError("max_tokens must be between 64 and 8192")
class ChunkBboxResponse(_CamelModel):
    # Response DTO pairing a page number with a bounding box.

    page: int

    # NOTE(review): the length and coordinate convention of this list are not
    # enforced here — confirm the expected layout against the producer.
    bbox: list[float]
class ChunkResponse(_CamelModel):
    # Response DTO for one chunk; only `text` is required.
    # Field names are aliased to camelCase by the _CamelModel configuration.

    text: str

    # NOTE(review): the literal [] defaults below rely on pydantic copying
    # mutable defaults per instance — confirm for the pinned pydantic version.
    headings: list[str] = []

    source_page: int | None = None
    token_count: int = 0

    bboxes: list[ChunkBboxResponse] = []
class CreateAnalysisRequest(BaseModel):
    # Request body for creating an analysis. Each field accepts both its
    # camelCase name and the snake_case spelling listed in its AliasChoices.

    # Required.
    documentId: str = Field(
        validation_alias=AliasChoices("documentId", "document_id"),
    )

    # Optional option groups; both default to None when omitted.
    pipelineOptions: PipelineOptionsRequest | None = Field(
        default=None,
        validation_alias=AliasChoices("pipelineOptions", "pipeline_options"),
    )
    chunkingOptions: ChunkingOptionsRequest | None = Field(
        default=None,
        validation_alias=AliasChoices("chunkingOptions", "chunking_options"),
    )
class RechunkRequest(BaseModel):
    # Request body for re-chunking. The single field is required and accepts
    # either "chunkingOptions" or "chunking_options" as its input key.

    chunkingOptions: ChunkingOptionsRequest = Field(
        validation_alias=AliasChoices("chunkingOptions", "chunking_options"),
    )
|