Spaces:
Sleeping
Sleeping
File size: 3,219 Bytes
4ccde7a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 | from typing import Any, Dict, List
from mistralai.client import Mistral
from app.core.logger import logger
from app.core.config import settings
from app.core.exceptions import OCRProcessingError
class OCRService:
    """Encapsulates document-to-markdown conversion via Mistral OCR.

    Both public methods issue the same OCR request through the injected
    client and differ only in how the returned pages are shaped for the
    caller. The request, error wrapping and empty-result check live in
    ``_fetch_pages`` so the two entry points cannot drift apart.
    """

    def __init__(self, client: Mistral) -> None:
        """Store the client and read the OCR model / table format from settings.

        Args:
            client: Mistral client whose ``ocr.process_async`` is awaited for
                every conversion.
        """
        self._client = client
        self._model = settings.MISTRAL_OCR_MODEL
        self._table_format = settings.MISTRAL_TABLE_FORMAT

    async def _fetch_pages(self, document_url: str) -> List[Any]:
        """Run the OCR request for *document_url* and return its pages.

        Args:
            document_url: URL passed to the OCR endpoint as a ``document_url``
                document.

        Returns:
            The non-empty list of page objects found on the response.

        Raises:
            OCRProcessingError: If the OCR call raises, or the response has no
                pages (missing, ``None`` or empty ``pages`` attribute).
        """
        try:
            response = await self._client.ocr.process_async(
                model=self._model,
                document={
                    "type": "document_url",
                    "document_url": document_url,
                },
                table_format=self._table_format,
            )
        except Exception as exc:
            # Log the traceback here, then surface a single domain error type
            # so callers do not need to know about client-specific exceptions.
            logger.exception("Mistral OCR call failed")
            raise OCRProcessingError(f"OCR processing failed: {exc}") from exc

        # Materialize so the emptiness check and later len() calls are valid
        # for any iterable `pages` value, not only lists.
        pages = list(getattr(response, "pages", None) or [])
        if not pages:
            raise OCRProcessingError("OCR returned no pages for the given document.")
        return pages

    async def document_to_markdown(self, document_url: str) -> str:
        """Convert a remote document (PDF / image) to markdown.

        Args:
            document_url: Public URL of the document.

        Returns:
            Concatenated markdown content (pages joined by blank lines).
            Pages whose markdown is missing or empty are skipped, so the
            result may be an empty string even when pages were returned.

        Raises:
            OCRProcessingError: If the OCR call fails or returns no pages.
        """
        logger.info("Starting OCR for document: {}", document_url)
        pages = await self._fetch_pages(document_url)

        markdown = "\n\n".join(
            page.markdown for page in pages if getattr(page, "markdown", None)
        )
        logger.info("OCR succeeded: {} pages, {} chars", len(pages), len(markdown))
        return markdown

    async def document_to_structured(self, document_url: str) -> Dict[str, Any]:
        """Convert a document and return per-page structure alongside merged markdown.

        Useful when callers need page-level metadata (page index, individual
        markdown).

        Args:
            document_url: Public URL of the document.

        Returns:
            A dict with three keys:
            ``page_count`` (number of pages returned),
            ``pages`` (list of ``{"index", "markdown"}`` dicts, one per page,
            where ``index`` is the page's position in the response and
            ``markdown`` is ``""`` when the page has none), and
            ``markdown`` (non-empty page markdown joined by blank lines).

        Raises:
            OCRProcessingError: If the OCR call fails or returns no pages.
        """
        logger.info("Starting structured OCR for document: {}", document_url)
        raw_pages = await self._fetch_pages(document_url)

        pages: List[Dict[str, Any]] = [
            {"index": idx, "markdown": getattr(page, "markdown", "") or ""}
            for idx, page in enumerate(raw_pages)
        ]
        # Empty pages stay in `pages` (so indices line up with the response)
        # but are left out of the merged text to avoid stray blank gaps.
        merged = "\n\n".join(entry["markdown"] for entry in pages if entry["markdown"])

        logger.info(
            "Structured OCR succeeded: {} pages, {} chars", len(pages), len(merged)
        )
        return {
            "page_count": len(pages),
            "pages": pages,
            "markdown": merged,
        }
|