Spaces:

itachi023
/

Markdown-Layout-Extractor

Sleeping

Markdown-Layout-Extractor / app /services /ocr_service.py

Deep Chavda

feat: initial release — PDF to Markdown MCP server

4ccde7a 9 days ago

3.22 kB

	from typing import Any, Dict, List

	from mistralai.client import Mistral

	from app.core.logger import logger
	from app.core.config import settings
	from app.core.exceptions import OCRProcessingError


	class OCRService:
	    """Encapsulates document-to-markdown conversion via Mistral OCR.

	    Both public methods submit the document URL through the injected
	    client's ``ocr.process_async`` coroutine and differ only in how the
	    returned pages are shaped for the caller.
	    """

	    def __init__(self, client: Mistral) -> None:
	        """Store the OCR client and read the OCR options from settings.

	        Args:
	            client: Client whose ``ocr.process_async`` coroutine is awaited
	                for every conversion.
	        """
	        self._client = client
	        self._model = settings.MISTRAL_OCR_MODEL
	        self._table_format = settings.MISTRAL_TABLE_FORMAT

	    async def _run_ocr(self, document_url: str) -> List[Any]:
	        """Submit *document_url* for OCR and return the response's pages.

	        Single place for the request payload and the error contract, so the
	        public methods cannot drift apart.

	        Args:
	            document_url: URL of the document handed to the OCR call.

	        Returns:
	            The non-empty list of page objects found on the response.

	        Raises:
	            OCRProcessingError: If the OCR call raises, or if the response
	                carries no pages.
	        """
	        try:
	            response = await self._client.ocr.process_async(
	                model=self._model,
	                document={
	                    "type": "document_url",
	                    "document_url": document_url,
	                },
	                table_format=self._table_format,
	            )
	        except Exception as exc:
	            # Service boundary: any client failure is logged with its
	            # traceback and re-raised as the domain error, keeping the cause.
	            logger.exception("Mistral OCR call failed")
	            raise OCRProcessingError(f"OCR processing failed: {exc}") from exc

	        # A missing or None ``pages`` attribute is treated like an empty result.
	        pages = getattr(response, "pages", None) or []
	        if not pages:
	            raise OCRProcessingError("OCR returned no pages for the given document.")
	        return list(pages)

	    async def document_to_markdown(self, document_url: str) -> str:
	        """Convert a remote document (PDF / image) to markdown.

	        Args:
	            document_url: Public URL of the document.

	        Returns:
	            Concatenated markdown content (pages joined by blank lines).
	            Pages without markdown are skipped.

	        Raises:
	            OCRProcessingError: If the OCR call fails or returns no pages.
	        """
	        logger.info("Starting OCR for document: {}", document_url)
	        pages = await self._run_ocr(document_url)

	        markdown = "\n\n".join(
	            page.markdown for page in pages if getattr(page, "markdown", None)
	        )
	        logger.info("OCR succeeded: {} pages, {} chars", len(pages), len(markdown))
	        return markdown

	    async def document_to_structured(self, document_url: str) -> Dict[str, Any]:
	        """Convert a document and return per-page structure alongside merged markdown.

	        Useful when callers need page-level metadata (page index, individual markdown).

	        Args:
	            document_url: Public URL of the document.

	        Returns:
	            A dict with three keys: ``page_count`` (number of pages),
	            ``pages`` (one ``{"index", "markdown"}`` dict per page, where
	            ``index`` is the 0-based position in the OCR response and
	            ``markdown`` is an empty string when the page has none), and
	            ``markdown`` (the non-empty page texts joined by blank lines).

	        Raises:
	            OCRProcessingError: If the OCR call fails or returns no pages.
	        """
	        logger.info("Starting structured OCR for document: {}", document_url)
	        ocr_pages = await self._run_ocr(document_url)

	        # Every page is kept, even an empty one, so page_count and the
	        # indices reflect what the OCR response contained.
	        pages: List[Dict[str, Any]] = [
	            {"index": idx, "markdown": getattr(page, "markdown", "") or ""}
	            for idx, page in enumerate(ocr_pages)
	        ]
	        merged = "\n\n".join(
	            entry["markdown"] for entry in pages if entry["markdown"]
	        )
	        logger.info(
	            "Structured OCR succeeded: {} pages, {} chars", len(pages), len(merged)
	        )
	        return {
	            "page_count": len(pages),
	            "pages": pages,
	            "markdown": merged,
	        }