# Deep Chavda
# feat: initial release — PDF to Markdown MCP server
# 4ccde7a
from typing import Any, Dict, List
from mistralai.client import Mistral
from app.core.logger import logger
from app.core.config import settings
from app.core.exceptions import OCRProcessingError
class OCRService:
    """Encapsulates document-to-markdown conversion via Mistral OCR.

    The OCR model name and table format are read from ``settings`` once, at
    construction time. Each public method issues exactly one OCR request per
    call and maps every failure to ``OCRProcessingError``.
    """

    def __init__(self, client: Mistral) -> None:
        """Store the client and snapshot the OCR-related settings.

        Args:
            client: Mistral client whose ``ocr.process_async`` is used for
                every request made by this service.
        """
        self._client = client
        self._model = settings.MISTRAL_OCR_MODEL
        self._table_format = settings.MISTRAL_TABLE_FORMAT

    @staticmethod
    def _page_markdown(page: Any) -> str:
        """Return the page's markdown, or ``""`` when it is missing or empty."""
        return getattr(page, "markdown", None) or ""

    async def _fetch_pages(self, document_url: str) -> List[Any]:
        """Run one OCR request and return its non-empty list of pages.

        Shared by both public methods so that request construction and error
        mapping live in a single place.

        Args:
            document_url: URL of the document to process.

        Returns:
            The pages from the OCR response, in response order.

        Raises:
            OCRProcessingError: If the URL is empty, the OCR call raises, or
                the response contains no pages.
        """
        if not document_url:
            # Fail fast: an empty URL cannot identify a document, so skip the
            # network round trip and report the same error type callers
            # already handle.
            raise OCRProcessingError("OCR processing failed: document_url is empty.")

        try:
            response = await self._client.ocr.process_async(
                model=self._model,
                document={
                    "type": "document_url",
                    "document_url": document_url,
                },
                table_format=self._table_format,
            )
        except Exception as exc:
            # Boundary with the external client: log the traceback, then
            # re-raise as the service's own error type with the cause chained.
            logger.exception("Mistral OCR call failed")
            raise OCRProcessingError(f"OCR processing failed: {exc}") from exc

        # ``pages`` may be absent or None on the response; treat both as empty.
        pages = list(getattr(response, "pages", None) or [])
        if not pages:
            raise OCRProcessingError("OCR returned no pages for the given document.")
        return pages

    async def document_to_markdown(self, document_url: str) -> str:
        """Convert a remote document (PDF / image) to markdown.

        Args:
            document_url: Public URL of the document.

        Returns:
            Concatenated markdown content (pages joined by blank lines).
            Pages without markdown are skipped.

        Raises:
            OCRProcessingError: If the OCR call fails or returns no pages.
        """
        logger.info("Starting OCR for document: {}", document_url)
        pages = await self._fetch_pages(document_url)

        page_texts = (self._page_markdown(page) for page in pages)
        markdown = "\n\n".join(text for text in page_texts if text)

        logger.info("OCR succeeded: {} pages, {} chars", len(pages), len(markdown))
        return markdown

    async def document_to_structured(self, document_url: str) -> Dict[str, Any]:
        """Convert a document and return per-page structure alongside merged markdown.

        Useful when callers need page-level metadata (page index, individual
        markdown).

        Args:
            document_url: Public URL of the document.

        Returns:
            A dict with three keys:
            ``page_count`` (number of pages in the response),
            ``pages`` (one ``{"index", "markdown"}`` dict per page, where
            ``index`` is the page's 0-based position in the response and
            ``markdown`` is ``""`` for pages without content), and
            ``markdown`` (non-empty page markdown joined by blank lines).

        Raises:
            OCRProcessingError: If the OCR call fails or returns no pages.
        """
        logger.info("Starting structured OCR for document: {}", document_url)
        ocr_pages = await self._fetch_pages(document_url)

        pages: List[Dict[str, Any]] = [
            {"index": idx, "markdown": self._page_markdown(page)}
            for idx, page in enumerate(ocr_pages)
        ]
        # Empty pages stay in ``pages`` (so indices remain contiguous) but are
        # left out of the merged text to avoid runs of blank separators.
        markdown = "\n\n".join(entry["markdown"] for entry in pages if entry["markdown"])

        logger.info(
            "Structured OCR succeeded: {} pages, {} chars", len(pages), len(markdown)
        )
        return {
            "page_count": len(pages),
            "pages": pages,
            "markdown": markdown,
        }