Spaces:

build-small-hackathon
/

FinSightAI

Sleeping

FinSightAI / backend / services / chart_extractor.py

start

7248d39 16 days ago

1.55 kB

	"""Document OCR via MiniCPM-V on Modal; LiteParse reserved for layout/format."""

	from typing import Any, Dict, Optional

	from models.ocr import MiniCPMVOCR
	from utils.liteparse_parser import (
	extract_document_structured_ocr,
	file_to_ocr_image_bytes,
	preview_page_base64,
	)
	from utils.ocr_structure import structured_to_plain_text


	class ChartExtractorService:
	    """Facade that routes document bytes to an OCR backend and parsing helpers.

	    Every public method takes a document's raw bytes plus an optional
	    filename and delegates either to the injected OCR object or to the
	    helper functions imported from ``utils.liteparse_parser`` and
	    ``utils.ocr_structure``.
	    """

	    def __init__(self, ocr: MiniCPMVOCR):
	        """Keep a reference to the OCR backend used by the extraction methods."""
	        self._ocr = ocr

	    def extract_structured(
	        self, file_bytes: bytes, filename: Optional[str] = None
	    ) -> Dict[str, Any]:
	        """Run structured OCR and attach a plain-text rendering of the result.

	        The mapping produced by ``extract_document_structured_ocr`` is
	        updated in place: its ``"text"`` key is set to whatever
	        ``structured_to_plain_text`` returns for that same mapping, and the
	        mapping itself is then returned.
	        """
	        ocr_result = extract_document_structured_ocr(file_bytes, filename, self._ocr)
	        plain_text = structured_to_plain_text(ocr_result)
	        ocr_result["text"] = plain_text
	        return ocr_result

	    def extract_text(self, file_bytes: bytes, filename: Optional[str] = None) -> str:
	        """Return only the ``"text"`` entry built by ``extract_structured``."""
	        structured = self.extract_structured(file_bytes, filename)
	        return structured["text"]

	    def extract_tables(self, file_bytes: bytes, filename: Optional[str] = None) -> str:
	        """Convert the file to OCR image bytes and pass them to the backend's ``extract_tables``."""
	        ocr_input = file_to_ocr_image_bytes(file_bytes, filename)
	        backend = self._ocr
	        return backend.extract_tables(ocr_input)

	    def extract_chart(self, file_bytes: bytes, filename: Optional[str] = None) -> str:
	        """Convert the file to OCR image bytes and pass them to the backend's ``describe_chart``."""
	        ocr_input = file_to_ocr_image_bytes(file_bytes, filename)
	        backend = self._ocr
	        return backend.describe_chart(ocr_input)

	    def preview_page(
	        self,
	        file_bytes: bytes,
	        filename: Optional[str] = None,
	        page_num: int = 1,
	    ) -> Optional[str]:
	        """Delegate to ``preview_page_base64`` for the requested page.

	        ``page_num`` defaults to 1. The helper's return value is passed
	        through unchanged; per the annotation it is a string or ``None``.
	        """
	        preview = preview_page_base64(file_bytes, filename=filename, page_num=page_num)
	        return preview