File size: 1,552 Bytes
7248d39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
"""Document OCR via MiniCPM-V on Modal; LiteParse reserved for layout/format."""

from typing import Any, Dict, Optional

from models.ocr import MiniCPMVOCR
from utils.liteparse_parser import (
    extract_document_structured_ocr,
    file_to_ocr_image_bytes,
    preview_page_base64,
)
from utils.ocr_structure import structured_to_plain_text


class ChartExtractorService:
    """Thin facade over the LiteParse helpers and an injected OCR backend.

    Every method takes the raw bytes of an uploaded file plus an optional
    filename and delegates the real work to either the helpers imported
    from ``utils.liteparse_parser`` / ``utils.ocr_structure`` or to the
    OCR object supplied at construction time.
    """

    def __init__(self, ocr: MiniCPMVOCR):
        """Keep a reference to the OCR backend used by all extraction calls."""
        self._ocr = ocr

    def extract_structured(
        self, file_bytes: bytes, filename: Optional[str] = None
    ) -> Dict[str, Any]:
        """Return the structured OCR result with a ``"text"`` entry added.

        The mapping produced by ``extract_document_structured_ocr`` is
        updated in place: its ``"text"`` key is set to whatever
        ``structured_to_plain_text`` returns for that same mapping.
        """
        document = extract_document_structured_ocr(file_bytes, filename, self._ocr)
        plain_text = structured_to_plain_text(document)
        document["text"] = plain_text
        return document

    def extract_text(self, file_bytes: bytes, filename: Optional[str] = None) -> str:
        """Return only the ``"text"`` entry of the structured extraction."""
        document = self.extract_structured(file_bytes, filename)
        return document["text"]

    def extract_tables(self, file_bytes: bytes, filename: Optional[str] = None) -> str:
        """Convert the file to OCR image bytes and ask the backend for tables."""
        ocr_input = file_to_ocr_image_bytes(file_bytes, filename)
        return self._ocr.extract_tables(ocr_input)

    def extract_chart(self, file_bytes: bytes, filename: Optional[str] = None) -> str:
        """Convert the file to OCR image bytes and ask the backend to describe the chart."""
        ocr_input = file_to_ocr_image_bytes(file_bytes, filename)
        return self._ocr.describe_chart(ocr_input)

    def preview_page(
        self,
        file_bytes: bytes,
        filename: Optional[str] = None,
        page_num: int = 1,
    ) -> Optional[str]:
        """Return the result of ``preview_page_base64`` for the requested page.

        ``page_num`` defaults to 1 and is forwarded unchanged; the return
        value (possibly ``None``) is passed through as-is.
        """
        preview = preview_page_base64(
            file_bytes,
            page_num=page_num,
            filename=filename,
        )
        return preview