| from enum import Enum |
| from typing import TYPE_CHECKING, Dict, List, Optional, Union |
|
|
| from docling_core.types.doc import ( |
| BoundingBox, |
| DocItemLabel, |
| NodeItem, |
| PictureDataType, |
| Size, |
| TableCell, |
| ) |
| from docling_core.types.io import ( |
| DocumentStream, |
| ) |
| from PIL.Image import Image |
| from pydantic import BaseModel, ConfigDict |
|
|
| if TYPE_CHECKING: |
| from docling.backend.pdf_backend import PdfPageBackend |
|
|
|
|
class ConversionStatus(str, Enum):
    """Status values for a conversion.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (e.g. ``ConversionStatus.SUCCESS == "success"``).
    """

    PENDING = "pending"
    STARTED = "started"
    FAILURE = "failure"
    SUCCESS = "success"
    PARTIAL_SUCCESS = "partial_success"
    SKIPPED = "skipped"
|
|
|
|
class InputFormat(str, Enum):
    """A document format supported by document backend parsers."""

    DOCX = "docx"
    PPTX = "pptx"
    HTML = "html"
    XML_PUBMED = "xml_pubmed"
    IMAGE = "image"
    PDF = "pdf"
    ASCIIDOC = "asciidoc"
    MD = "md"
    XLSX = "xlsx"
    XML_USPTO = "xml_uspto"
    JSON_DOCLING = "json_docling"
|
|
|
|
class OutputFormat(str, Enum):
    """Output format identifiers.

    Members also subclass ``str``; note that ``MARKDOWN`` carries the short
    value ``"md"`` rather than the member name in lower case.
    """

    MARKDOWN = "md"
    JSON = "json"
    HTML = "html"
    TEXT = "text"
    DOCTAGS = "doctags"
|
|
|
|
# File extensions (lower case, without the leading dot) listed per input format.
# "xml" is listed for both XML_PUBMED and XML_USPTO, so an extension alone
# does not always identify a single format.
FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
    InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
    InputFormat.PDF: ["pdf"],
    InputFormat.MD: ["md"],
    InputFormat.HTML: ["html", "htm", "xhtml"],
    InputFormat.XML_PUBMED: ["xml", "nxml"],
    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
    InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
    InputFormat.XLSX: ["xlsx"],
    InputFormat.XML_USPTO: ["xml", "txt"],
    InputFormat.JSON_DOCLING: ["json"],
}
|
|
# MIME types listed per input format. "application/xml" is listed for both
# XML_PUBMED and XML_USPTO, so a MIME type alone does not always identify a
# single format.
FormatToMimeType: Dict[InputFormat, List[str]] = {
    InputFormat.DOCX: [
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
    ],
    InputFormat.PPTX: [
        "application/vnd.openxmlformats-officedocument.presentationml.template",
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ],
    InputFormat.HTML: ["text/html", "application/xhtml+xml"],
    InputFormat.XML_PUBMED: ["application/xml"],
    # NOTE(review): "image/gif" is accepted here although FormatToExtensions
    # lists no "gif" extension for IMAGE -- confirm this is intended.
    InputFormat.IMAGE: [
        "image/png",
        "image/jpeg",
        "image/tiff",
        "image/gif",
        "image/bmp",
    ],
    InputFormat.PDF: ["application/pdf"],
    InputFormat.ASCIIDOC: ["text/asciidoc"],
    InputFormat.MD: ["text/markdown", "text/x-markdown"],
    InputFormat.XLSX: [
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ],
    InputFormat.XML_USPTO: ["application/xml", "text/plain"],
    InputFormat.JSON_DOCLING: ["application/json"],
}
|
|
# Reverse lookup of FormatToMimeType: MIME type -> every input format that
# lists it, in FormatToMimeType iteration order. A MIME type shared by several
# formats (e.g. "application/xml") therefore maps to more than one format.
MimeTypeToFormat: Dict[str, List[InputFormat]] = {
    mime: [fmt for fmt, fmt_mimes in FormatToMimeType.items() if mime in fmt_mimes]
    for mime_list in FormatToMimeType.values()
    for mime in mime_list
}
|
|
|
|
class DocInputType(str, Enum):
    """Kind of document input: a path or a stream.

    Members also subclass ``str`` and compare equal to their string values.
    """

    PATH = "path"
    STREAM = "stream"
|
|
|
|
class DoclingComponentType(str, Enum):
    """Component categories; used as the ``component_type`` of ``ErrorItem``.

    Members also subclass ``str`` and compare equal to their string values.
    """

    DOCUMENT_BACKEND = "document_backend"
    MODEL = "model"
    DOC_ASSEMBLER = "doc_assembler"
    USER_INPUT = "user_input"
|
|
|
|
class ErrorItem(BaseModel):
    """An error record: a component type, a module name and a message.

    All three fields are required.
    """

    component_type: DoclingComponentType
    module_name: str
    error_message: str
|
|
|
|
class Cell(BaseModel):
    """A piece of text with an integer id and a bounding box.

    All three fields are required.
    """

    id: int
    text: str
    bbox: BoundingBox
|
|
|
|
class OcrCell(Cell):
    """A ``Cell`` that additionally carries a required confidence value."""

    confidence: float
|
|
|
|
class Cluster(BaseModel):
    """A labelled bounding box that groups cells and may nest other clusters.

    ``id``, ``label`` and ``bbox`` are required; the remaining fields have
    defaults.
    """

    id: int
    label: DocItemLabel
    bbox: BoundingBox
    # Defaults to 1.0 when no confidence is supplied.
    confidence: float = 1.0
    cells: List[Cell] = []
    # Self-referential: clusters nested inside this one.
    children: List["Cluster"] = []
|
|
|
|
class BasePageElement(BaseModel):
    """Common fields of the page element models defined in this module.

    ``label``, ``id``, ``page_no`` and ``cluster`` are required; ``text`` is
    optional and defaults to ``None``.
    """

    label: DocItemLabel
    id: int
    page_no: int
    cluster: Cluster
    text: Optional[str] = None
|
|
|
|
class LayoutPrediction(BaseModel):
    """Layout prediction result: a list of clusters (empty by default)."""

    clusters: List[Cluster] = []
|
|
|
|
class ContainerElement(BasePageElement):
    """Page element that adds no fields of its own to ``BasePageElement``."""
|
|
|
|
class Table(BasePageElement):
    """Page element for a table: a token sequence, dimensions and cells.

    ``otsl_seq`` and ``table_cells`` are required; ``num_rows`` and
    ``num_cols`` default to 0.
    """

    otsl_seq: List[str]
    num_rows: int = 0
    num_cols: int = 0
    table_cells: List[TableCell]
|
|
|
|
class TableStructurePrediction(BaseModel):
    """Table structure prediction result: ``Table`` objects keyed by an int."""

    # NOTE(review): the key is presumably the id of the table's element or
    # cluster -- confirm against the code that fills this map.
    table_map: Dict[int, Table] = {}
|
|
|
|
class TextElement(BasePageElement):
    """Page element whose ``text`` is mandatory.

    Redeclares ``text`` as a required ``str``; on ``BasePageElement`` it is
    optional and defaults to ``None``.
    """

    text: str
|
|
|
|
class FigureElement(BasePageElement):
    """Page element for a figure, with optional annotation/classification data.

    Every field added here has a default, so only the ``BasePageElement``
    fields are required.
    """

    annotations: List[PictureDataType] = []
    provenance: Optional[str] = None
    predicted_class: Optional[str] = None
    confidence: Optional[float] = None
|
|
|
|
class FigureClassificationPrediction(BaseModel):
    """Figure classification result: a count plus figures keyed by an int."""

    # Stored independently of figure_map; nothing in this model keeps the two
    # in sync.
    figure_count: int = 0
    figure_map: Dict[int, FigureElement] = {}
|
|
|
|
class EquationPrediction(BaseModel):
    """Equation prediction result: a count plus text elements keyed by an int."""

    # Stored independently of equation_map; nothing in this model keeps the
    # two in sync.
    equation_count: int = 0
    equation_map: Dict[int, TextElement] = {}
|
|
|
|
class PagePredictions(BaseModel):
    """Bundle of the per-page prediction results.

    Each entry is optional and defaults to ``None``.
    """

    layout: Optional[LayoutPrediction] = None
    tablestructure: Optional[TableStructurePrediction] = None
    figures_classification: Optional[FigureClassificationPrediction] = None
    equations_prediction: Optional[EquationPrediction] = None
|
|
|
|
# Union of the concrete page element models; used as the item type of the
# lists in AssembledUnit.
PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
|
|
|
|
class AssembledUnit(BaseModel):
    """Three lists of page elements: ``elements``, ``body`` and ``headers``.

    All three default to an empty list.
    """

    elements: List[PageElement] = []
    body: List[PageElement] = []
    headers: List[PageElement] = []
|
|
|
|
class ItemAndImageEnrichmentElement(BaseModel):
    """Pairs a document ``NodeItem`` with a PIL image.

    Both fields are required.
    """

    # PIL's Image is not a pydantic type, so arbitrary types must be allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    item: NodeItem
    image: Image
|
|
|
|
class Page(BaseModel):
    """A single page: its number, size, cells, predictions and page images.

    Only ``page_no`` is required. Page images are obtained through
    ``get_image`` / ``image`` and are cached per scale factor.
    """

    # PIL's Image is not a pydantic type, so arbitrary types must be allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_no: int
    size: Optional[Size] = None
    cells: List[Cell] = []
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None

    # Underscore-prefixed names are private attributes in pydantic, not model
    # fields.
    # Backend that renders page images; None when no backend is attached.
    _backend: Optional["PdfPageBackend"] = None
    # Scale used by the `image` property.
    _default_image_scale: float = 1.0
    # Full-page images keyed by the scale they were rendered at.
    _image_cache: Dict[float, Image] = {}

    def get_image(
        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
    ) -> Optional[Image]:
        """Return the page image at ``scale``, optionally cropped to ``cropbox``.

        Args:
            scale: Scale factor passed to the backend; also the cache key.
            cropbox: Region to return; ``None`` requests the full page.

        Returns:
            The requested image. When no backend is attached, whatever is
            cached for ``scale`` is returned (``None`` if nothing is cached).

        Raises:
            AssertionError: If a cached full-page image has to be cropped
                while ``size`` is ``None``.
        """
        if self._backend is None:
            # NOTE: cropbox is not applied on this path; the cached full-page
            # image (if any) is returned as-is.
            return self._image_cache.get(scale)

        if scale not in self._image_cache:
            if cropbox is not None:
                # Nothing cached for this scale: let the backend produce the
                # crop directly. The result is not stored in the cache.
                return self._backend.get_page_image(scale=scale, cropbox=cropbox)
            self._image_cache[scale] = self._backend.get_page_image(scale=scale)

        page_image = self._image_cache[scale]
        if cropbox is None:
            return page_image

        # Crop the cached full-page image: convert the box to a top-left
        # origin using the page height, then scale it like the image.
        assert self.size is not None
        crop_region = (
            cropbox.to_top_left_origin(page_height=self.size.height)
            .scaled(scale=scale)
            .as_tuple()
        )
        return page_image.crop(crop_region)

    @property
    def image(self) -> Optional[Image]:
        """Full-page image at the default image scale (see ``get_image``)."""
        return self.get_image(scale=self._default_image_scale)
|
|