"""Document OCR via MiniCPM-V on Modal; LiteParse reserved for layout/format.""" from typing import Any, Dict, Optional from models.ocr import MiniCPMVOCR from utils.liteparse_parser import ( extract_document_structured_ocr, file_to_ocr_image_bytes, preview_page_base64, ) from utils.ocr_structure import structured_to_plain_text class ChartExtractorService: def __init__(self, ocr: MiniCPMVOCR): self._ocr = ocr def extract_structured( self, file_bytes: bytes, filename: Optional[str] = None ) -> Dict[str, Any]: structured = extract_document_structured_ocr(file_bytes, filename, self._ocr) structured["text"] = structured_to_plain_text(structured) return structured def extract_text(self, file_bytes: bytes, filename: Optional[str] = None) -> str: return self.extract_structured(file_bytes, filename)["text"] def extract_tables(self, file_bytes: bytes, filename: Optional[str] = None) -> str: image_bytes = file_to_ocr_image_bytes(file_bytes, filename) return self._ocr.extract_tables(image_bytes) def extract_chart(self, file_bytes: bytes, filename: Optional[str] = None) -> str: image_bytes = file_to_ocr_image_bytes(file_bytes, filename) return self._ocr.describe_chart(image_bytes) def preview_page( self, file_bytes: bytes, filename: Optional[str] = None, page_num: int = 1, ) -> Optional[str]: return preview_page_base64(file_bytes, page_num=page_num, filename=filename)