Spaces:
Sleeping
Sleeping
| """Document OCR via MiniCPM-V on Modal; LiteParse reserved for layout/format.""" | |
| from typing import Any, Dict, Optional | |
| from models.ocr import MiniCPMVOCR | |
| from utils.liteparse_parser import ( | |
| extract_document_structured_ocr, | |
| file_to_ocr_image_bytes, | |
| preview_page_base64, | |
| ) | |
| from utils.ocr_structure import structured_to_plain_text | |
class ChartExtractorService:
    """Thin service layer that routes document bytes to the OCR backend.

    Every public method takes the raw file bytes plus an optional filename
    and delegates to a helper from ``utils.liteparse_parser`` and/or to the
    OCR object supplied at construction time.
    """

    def __init__(self, ocr: MiniCPMVOCR):
        """Keep a reference to the OCR backend used by the extraction methods."""
        self._ocr = ocr

    def extract_structured(
        self, file_bytes: bytes, filename: Optional[str] = None
    ) -> Dict[str, Any]:
        """Return the structured OCR result with a ``"text"`` entry attached.

        The mapping returned by ``extract_document_structured_ocr`` is
        updated in place: its ``"text"`` key is set to whatever
        ``structured_to_plain_text`` produces for that same mapping.
        """
        result = extract_document_structured_ocr(file_bytes, filename, self._ocr)
        plain_text = structured_to_plain_text(result)
        result["text"] = plain_text
        return result

    def extract_text(self, file_bytes: bytes, filename: Optional[str] = None) -> str:
        """Return only the ``"text"`` entry of :meth:`extract_structured`."""
        document = self.extract_structured(file_bytes, filename)
        return document["text"]

    def extract_tables(self, file_bytes: bytes, filename: Optional[str] = None) -> str:
        """Convert the file to OCR image bytes and ask the backend for its tables."""
        ocr_input = file_to_ocr_image_bytes(file_bytes, filename)
        return self._ocr.extract_tables(ocr_input)

    def extract_chart(self, file_bytes: bytes, filename: Optional[str] = None) -> str:
        """Convert the file to OCR image bytes and ask the backend to describe the chart."""
        ocr_input = file_to_ocr_image_bytes(file_bytes, filename)
        return self._ocr.describe_chart(ocr_input)

    def preview_page(
        self,
        file_bytes: bytes,
        filename: Optional[str] = None,
        page_num: int = 1,
    ) -> Optional[str]:
        """Return the result of ``preview_page_base64`` for the requested page.

        ``page_num`` defaults to 1. The result may be ``None`` per the return
        annotation; the conditions for that are decided by the helper.
        """
        preview = preview_page_base64(
            file_bytes,
            page_num=page_num,
            filename=filename,
        )
        return preview