from typing import Any, Dict, List

from mistralai.client import Mistral

from app.core.logger import logger
from app.core.config import settings
from app.core.exceptions import OCRProcessingError


class OCRService:
    """Encapsulates document-to-markdown conversion via Mistral OCR."""

    def __init__(self, client: Mistral) -> None:
        """Store the client and read the OCR options from settings.

        Args:
            client: Mistral client used to issue the OCR requests.
        """
        self._client = client
        self._model = settings.MISTRAL_OCR_MODEL
        self._table_format = settings.MISTRAL_TABLE_FORMAT

    async def _fetch_pages(self, document_url: str) -> List[Any]:
        """Run the OCR request and return the non-empty list of response pages.

        Shared by the public entry points so the request payload, the error
        wrapping and the empty-result check stay identical for both.

        Args:
            document_url: URL of the document, sent as a ``document_url``
                payload.

        Returns:
            The ``pages`` attribute of the OCR response (never empty).

        Raises:
            OCRProcessingError: If the OCR call raises or yields no pages.
        """
        try:
            response = await self._client.ocr.process_async(
                model=self._model,
                document={
                    "type": "document_url",
                    "document_url": document_url,
                },
                table_format=self._table_format,
            )
        except Exception as exc:
            # Log the traceback here, then surface a domain error to callers
            # while keeping the original exception chained as the cause.
            logger.exception("Mistral OCR call failed")
            raise OCRProcessingError(f"OCR processing failed: {exc}") from exc

        # A missing or None ``pages`` attribute is treated like an empty result.
        pages = getattr(response, "pages", None) or []
        if not pages:
            raise OCRProcessingError("OCR returned no pages for the given document.")
        return pages

    async def document_to_markdown(self, document_url: str) -> str:
        """Convert a remote document (PDF / image) to markdown.

        Args:
            document_url: Public URL of the document.

        Returns:
            Concatenated markdown content (pages joined by blank lines).
            Pages without markdown are skipped.

        Raises:
            OCRProcessingError: If the OCR call fails or returns no pages.
        """
        logger.info("Starting OCR for document: {}", document_url)
        pages = await self._fetch_pages(document_url)

        markdown = "\n\n".join(
            page.markdown for page in pages if getattr(page, "markdown", None)
        )
        logger.info("OCR succeeded: {} pages, {} chars", len(pages), len(markdown))
        return markdown

    async def document_to_structured(self, document_url: str) -> Dict[str, Any]:
        """Convert a document and return per-page structure alongside merged markdown.

        Useful when callers need page-level metadata (page index, individual
        markdown).

        Args:
            document_url: Public URL of the document.

        Returns:
            A dict with three keys: ``page_count`` (number of pages),
            ``pages`` (one ``{"index", "markdown"}`` dict per page, where
            ``index`` is the zero-based position in the response and
            ``markdown`` is ``""`` when the page has none), and ``markdown``
            (the non-empty page markdowns joined by blank lines).

        Raises:
            OCRProcessingError: If the OCR call fails or returns no pages.
        """
        logger.info("Starting structured OCR for document: {}", document_url)
        raw_pages = await self._fetch_pages(document_url)

        # Normalise a missing or None markdown to "" so every entry has a string.
        pages: List[Dict[str, Any]] = [
            {"index": idx, "markdown": getattr(page, "markdown", "") or ""}
            for idx, page in enumerate(raw_pages)
        ]
        # Empty pages stay in ``pages`` but are left out of the merged text.
        markdown = "\n\n".join(
            entry["markdown"] for entry in pages if entry["markdown"]
        )
        logger.info(
            "Structured OCR succeeded: {} pages, {} chars", len(pages), len(markdown)
        )
        return {
            "page_count": len(pages),
            "pages": pages,
            "markdown": markdown,
        }