Spaces:
Sleeping
Sleeping
| from typing import Any, Dict, List | |
| from mistralai.client import Mistral | |
| from app.core.logger import logger | |
| from app.core.config import settings | |
| from app.core.exceptions import OCRProcessingError | |
class OCRService:
    """Encapsulates document-to-markdown conversion via Mistral OCR."""

    def __init__(self, client: Mistral) -> None:
        """Store the client and read the OCR options from application settings.

        Args:
            client: Mistral client whose ``ocr.process_async`` is awaited
                for every conversion.
        """
        self._client = client
        self._model = settings.MISTRAL_OCR_MODEL
        self._table_format = settings.MISTRAL_TABLE_FORMAT

    async def _fetch_pages(self, document_url: str) -> List[Any]:
        """Run the OCR request and return the non-empty list of result pages.

        Shared by both public entry points so the request payload, the error
        wrapping and the "no pages" check live in exactly one place.

        Args:
            document_url: URL of the document, sent as a ``document_url``
                payload.

        Returns:
            The ``pages`` attribute of the OCR response (never empty).

        Raises:
            OCRProcessingError: If the OCR call raises, or if the response
                has no ``pages`` attribute / an empty or ``None`` page list.
        """
        try:
            response = await self._client.ocr.process_async(
                model=self._model,
                document={
                    "type": "document_url",
                    "document_url": document_url,
                },
                table_format=self._table_format,
            )
        except Exception as exc:
            # Service boundary: log the traceback once, then surface a single
            # domain error type so callers need not know the client's errors.
            logger.exception("Mistral OCR call failed")
            raise OCRProcessingError(f"OCR processing failed: {exc}") from exc

        # Tolerate a response without ``pages`` or with ``pages=None``.
        pages = getattr(response, "pages", None) or []
        if not pages:
            raise OCRProcessingError("OCR returned no pages for the given document.")
        return pages

    async def document_to_markdown(self, document_url: str) -> str:
        """Convert a remote document (PDF / image) to markdown.

        Args:
            document_url: Public URL of the document.

        Returns:
            Concatenated markdown content (pages joined by blank lines).
            Pages whose markdown is missing or empty are skipped, so the
            result is an empty string when no page carries any markdown.

        Raises:
            OCRProcessingError: If the OCR call fails or returns no pages.
        """
        logger.info("Starting OCR for document: {}", document_url)
        pages = await self._fetch_pages(document_url)

        markdown = "\n\n".join(
            page.markdown for page in pages if getattr(page, "markdown", None)
        )
        logger.info("OCR succeeded: {} pages, {} chars", len(pages), len(markdown))
        return markdown

    async def document_to_structured(self, document_url: str) -> Dict[str, Any]:
        """Convert a document and return per-page structure alongside merged markdown.

        Useful when callers need page-level metadata (page index, individual markdown).

        Args:
            document_url: Public URL of the document.

        Returns:
            A dict with three keys:
              * ``"page_count"``: number of pages in the OCR response.
              * ``"pages"``: one ``{"index": int, "markdown": str}`` entry per
                page, in response order; ``index`` is the zero-based position
                in the response and ``markdown`` is ``""`` when the page has
                none.
              * ``"markdown"``: the non-empty page markdowns joined by blank
                lines.

        Raises:
            OCRProcessingError: If the OCR call fails or returns no pages.
        """
        logger.info("Starting structured OCR for document: {}", document_url)
        raw_pages = await self._fetch_pages(document_url)

        # Normalise missing / None markdown to "" so every entry has a string.
        pages: List[Dict[str, Any]] = [
            {"index": idx, "markdown": getattr(page, "markdown", "") or ""}
            for idx, page in enumerate(raw_pages)
        ]
        # Empty pages stay in ``pages`` but are left out of the merged text.
        markdown = "\n\n".join(
            entry["markdown"] for entry in pages if entry["markdown"]
        )
        logger.info(
            "Structured OCR succeeded: {} pages, {} chars", len(pages), len(markdown)
        )
        return {
            "page_count": len(pages),
            "pages": pages,
            "markdown": markdown,
        }