docling-studio / document-parser /domain /value_objects.py
Pier-Jean's picture
Initial deploy: Docling Studio (local mode, port 7860)
5539271
"""Domain value objects — pure data structures for document conversion.
These types define the contract between the domain and infrastructure layers.
They have ZERO external dependencies (no docling, no HTTP, no DB).
"""
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass
class PageElement:
    """One element found on a page.

    Plain data holder: it carries no behavior and performs no validation.
    """

    # Category label of the element (free-form string; the accepted values
    # are not defined in this module).
    type: str
    # Bounding box as a list of floats.
    # NOTE(review): coordinate order and origin are not stated here —
    # presumably [left, top, right, bottom]; confirm against the producer.
    bbox: list[float]
    # Text carried by the element.
    content: str
    # Level of the element; 0 when not supplied.
    # NOTE(review): presumably a heading depth — confirm.
    level: int = 0
@dataclass
class PageDetail:
    """Description of one page: its number, its size and its elements."""

    # Number identifying the page.
    # NOTE(review): whether numbering starts at 0 or 1 is not visible here.
    page_number: int
    # Page dimensions (units are not specified in this module).
    width: float
    height: float
    # Elements belonging to the page; every instance gets its own fresh
    # empty list when none is passed in.
    elements: list[PageElement] = field(default_factory=list)
@dataclass
class ConversionOptions:
    """Settings that can be supplied for a document conversion.

    Every field has a default, so ``ConversionOptions()`` is the baseline
    configuration that :meth:`is_default` compares against.
    """

    # --- Processing switches -------------------------------------------
    # OCR switch; on by default.
    do_ocr: bool = True
    # Table-structure switch; on by default.
    do_table_structure: bool = True
    # Table mode name; "accurate" by default (other accepted values are not
    # listed in this module).
    table_mode: str = "accurate"

    # --- Optional enrichments (all off by default) ---------------------
    do_code_enrichment: bool = False
    do_formula_enrichment: bool = False
    do_picture_classification: bool = False
    do_picture_description: bool = False

    # --- Image generation (off by default) -----------------------------
    generate_picture_images: bool = False
    generate_page_images: bool = False
    # Scale factor for images; 1.0 by default.
    images_scale: float = 1.0

    def is_default(self) -> bool:
        """Report whether every option still holds its default value."""
        baseline = ConversionOptions()
        return self == baseline
@dataclass
class ConversionResult:
    """Outcome of a document conversion.

    The first four fields are required; the remaining two are optional.
    """

    # Number of pages reported for the document.
    page_count: int
    # Document content rendered as Markdown and as HTML.
    content_markdown: str
    content_html: str
    # Per-page details.
    pages: list[PageDetail]
    # Count of skipped items; 0 when not supplied.
    # NOTE(review): what qualifies as "skipped" is decided by the producer.
    skipped_items: int = 0
    # Document serialized as a JSON string, or None when not provided.
    document_json: str | None = None
@dataclass
class ChunkingOptions:
    """Settings that can be supplied for chunking.

    Every field has a default, so ``ChunkingOptions()`` is the baseline
    configuration that :meth:`is_default` compares against.
    """

    # Chunker to use: "hybrid", "hierarchical", "page"
    chunker_type: str = "hybrid"
    # Token limit; 512 by default.
    max_tokens: int = 512
    # Peer-merging switch; on by default.
    merge_peers: bool = True
    # Table-header repetition switch; on by default.
    repeat_table_header: bool = True

    def is_default(self) -> bool:
        """Report whether every option still holds its default value."""
        baseline = ChunkingOptions()
        return self == baseline
@dataclass
class ChunkBbox:
    """A bounding box paired with the page it belongs to."""

    # Page the box refers to.
    # NOTE(review): whether this is 0- or 1-based is not visible here.
    page: int
    # Box coordinates: [left, top, right, bottom] in TOPLEFT origin
    bbox: list[float]
@dataclass
class ChunkResult:
    """One chunk of text together with its optional metadata.

    Only ``text`` is required; every other field has a default.
    """

    # Text of the chunk.
    text: str
    # Headings attached to the chunk; a fresh empty list per instance.
    headings: list[str] = field(default_factory=list)
    # Page the chunk comes from, or None when not set.
    source_page: int | None = None
    # Token count of the chunk; 0 when not supplied.
    token_count: int = 0
    # Bounding boxes attached to the chunk; a fresh empty list per instance.
    bboxes: list[ChunkBbox] = field(default_factory=list)