Spaces:
Running
Running
File size: 1,965 Bytes
"""Domain value objects — pure data structures for document conversion.
These types define the contract between the domain and infrastructure layers.
They have ZERO external dependencies (no docling, no HTTP, no DB).
"""
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass
class PageElement:
    """One element found on a page.

    Attributes:
        type: Label for the kind of element (the set of valid labels is not
            defined in this module).
        bbox: Bounding box as a list of floats.
            NOTE(review): coordinate order/origin is not stated here —
            presumably the same layout as ``ChunkBbox.bbox``; confirm.
        content: Content of the element as a string.
        level: Integer level, 0 when not supplied.
            NOTE(review): presumably a heading/nesting depth — confirm.
    """

    # Field order is part of the positional constructor signature.
    type: str
    bbox: list[float]
    content: str
    level: int = 0
@dataclass
class PageDetail:
    """Dimensions of a single page together with its elements.

    Attributes:
        page_number: Number identifying the page.
            NOTE(review): 0- vs 1-based numbering is not visible here.
        width: Page width (units are not stated in this module).
        height: Page height (units are not stated in this module).
        elements: ``PageElement`` objects belonging to the page. Defaults to
            a fresh empty list for every instance.
    """

    page_number: int
    width: float
    height: float
    # default_factory gives each instance its own list instead of a shared one.
    elements: list[PageElement] = field(default_factory=list)
@dataclass
class ConversionOptions:
    """Options for a document conversion.

    Every field has a default, so ``ConversionOptions()`` is the default
    configuration. How each option is interpreted is decided by the code
    that consumes this object, not by this module.
    """

    # Field order is part of the positional constructor signature.
    do_ocr: bool = True
    do_table_structure: bool = True
    table_mode: str = "accurate"
    do_code_enrichment: bool = False
    do_formula_enrichment: bool = False
    do_picture_classification: bool = False
    do_picture_description: bool = False
    generate_picture_images: bool = False
    generate_page_images: bool = False
    images_scale: float = 1.0

    def is_default(self) -> bool:
        """Return True if all options match their defaults.

        Uses the dataclass-generated ``__eq__`` against a freshly built
        ``ConversionOptions()``. That comparison also requires both objects
        to be of the same class, so an instance of a subclass is reported
        as non-default.
        """
        return self == ConversionOptions()
@dataclass
class ConversionResult:
    """Output of a document conversion.

    Attributes:
        page_count: Number of pages reported for the document.
        content_markdown: Document content as a Markdown string.
        content_html: Document content as an HTML string.
        pages: One ``PageDetail`` per entry; required, no default.
        skipped_items: Count of skipped items, 0 when not supplied.
            NOTE(review): what counts as "skipped" is decided by the
            producer of this object — not visible here.
        document_json: Optional JSON string for the document, ``None`` when
            not supplied.
    """

    page_count: int
    content_markdown: str
    content_html: str
    pages: list[PageDetail]
    skipped_items: int = 0
    document_json: str | None = None
@dataclass
class ChunkingOptions:
    """Options for splitting a document into chunks.

    Every field has a default, so ``ChunkingOptions()`` is the default
    configuration. The values are not validated here.
    """

    chunker_type: str = "hybrid"  # "hybrid", "hierarchical", "page"
    max_tokens: int = 512
    merge_peers: bool = True
    repeat_table_header: bool = True

    def is_default(self) -> bool:
        """Return True if all options match their defaults.

        Uses the dataclass-generated ``__eq__`` against a freshly built
        ``ChunkingOptions()``. That comparison also requires both objects
        to be of the same class, so an instance of a subclass is reported
        as non-default.
        """
        return self == ChunkingOptions()
@dataclass
class ChunkBbox:
    """A bounding box tied to a page.

    Attributes:
        page: Page the box belongs to.
            NOTE(review): 0- vs 1-based numbering is not visible here.
        bbox: Box coordinates; see the field comment for the layout.
    """

    page: int
    bbox: list[float]  # [left, top, right, bottom] in TOPLEFT origin
@dataclass
class ChunkResult:
    """A single chunk of document text with optional metadata.

    Attributes:
        text: The chunk's text; the only required field.
        headings: Heading strings associated with the chunk. Defaults to a
            fresh empty list for every instance.
        source_page: Page associated with the chunk, or ``None`` when not
            supplied.
        token_count: Token count for the chunk, 0 when not supplied.
            NOTE(review): which tokenizer defines the count is not visible
            here.
        bboxes: ``ChunkBbox`` entries for the chunk. Defaults to a fresh
            empty list for every instance.
    """

    text: str
    # default_factory gives each instance its own list instead of a shared one.
    headings: list[str] = field(default_factory=list)
    source_page: int | None = None
    token_count: int = 0
    bboxes: list[ChunkBbox] = field(default_factory=list)
|