File size: 3,219 Bytes
4ccde7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from typing import Any, Dict, List

from mistralai.client import Mistral

from app.core.logger import logger
from app.core.config import settings
from app.core.exceptions import OCRProcessingError


class OCRService:
    """Encapsulates document-to-markdown conversion via Mistral OCR.

    Both public methods issue the same OCR request; the request itself and
    the handling of failed or empty results live in ``_fetch_pages`` so the
    two entry points cannot drift apart.
    """

    def __init__(self, client: Mistral) -> None:
        """Store the client and read the OCR model / table format from settings."""
        self._client = client
        self._model = settings.MISTRAL_OCR_MODEL
        self._table_format = settings.MISTRAL_TABLE_FORMAT

    async def _fetch_pages(self, document_url: str) -> List[Any]:
        """Run the OCR request for ``document_url`` and return its pages.

        Args:
            document_url: URL of the document, passed to the OCR endpoint as a
                ``document_url`` payload.

        Returns:
            The ``pages`` collection of the OCR response (never empty).

        Raises:
            OCRProcessingError: If the OCR call raises, or if the response has
                no ``pages`` attribute or an empty one.
        """
        # Keep the try body limited to the remote call so that only client
        # failures are reported as "OCR call failed".
        try:
            response = await self._client.ocr.process_async(
                model=self._model,
                document={
                    "type": "document_url",
                    "document_url": document_url,
                },
                table_format=self._table_format,
            )
        except Exception as exc:
            logger.exception("Mistral OCR call failed")
            raise OCRProcessingError(f"OCR processing failed: {exc}") from exc

        # A missing attribute and a None/empty value are treated the same way.
        pages = getattr(response, "pages", None) or []
        if not pages:
            raise OCRProcessingError("OCR returned no pages for the given document.")
        return pages

    async def document_to_markdown(self, document_url: str) -> str:
        """Convert a remote document (PDF / image) to markdown.

        Args:
            document_url: Public URL of the document.

        Returns:
            Concatenated markdown content (pages joined by blank lines).
            Pages whose markdown is missing or empty are skipped, so the
            result may be an empty string even when pages were returned.

        Raises:
            OCRProcessingError: If the OCR call fails or returns no pages.
        """
        logger.info("Starting OCR for document: {}", document_url)
        pages = await self._fetch_pages(document_url)

        markdown = "\n\n".join(
            page.markdown for page in pages if getattr(page, "markdown", None)
        )
        logger.info("OCR succeeded: {} pages, {} chars", len(pages), len(markdown))
        return markdown

    async def document_to_structured(self, document_url: str) -> Dict[str, Any]:
        """Convert a document and return per-page structure alongside merged markdown.

        Useful when callers need page-level metadata (page index, individual markdown).

        Args:
            document_url: Public URL of the document.

        Returns:
            A dict with three keys:
            ``page_count`` (number of pages returned),
            ``pages`` (one ``{"index", "markdown"}`` dict per page, where
            ``index`` is the zero-based position in the response and
            ``markdown`` is ``""`` when the page has none), and
            ``markdown`` (non-empty page markdown joined by blank lines).

        Raises:
            OCRProcessingError: If the OCR call fails or returns no pages.
        """
        logger.info("Starting structured OCR for document: {}", document_url)
        raw_pages = await self._fetch_pages(document_url)

        # Normalise missing/None markdown to "" once, then derive both views.
        page_markdown = [getattr(page, "markdown", "") or "" for page in raw_pages]

        return {
            "page_count": len(page_markdown),
            "pages": [
                {"index": idx, "markdown": md}
                for idx, md in enumerate(page_markdown)
            ],
            "markdown": "\n\n".join(md for md in page_markdown if md),
        }