File size: 2,208 Bytes
4ccde7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from typing import Any, Dict

from mcp.server.fastmcp import Context, FastMCP

from app.core.logger import logger
from app.services.ocr_service import OCRService
from app.utils.validators import validate_document_url
from app.core.exceptions import InvalidDocumentURLError, OCRProcessingError


def register_markdown_tools(mcp: FastMCP) -> None:
    """Register the Markdown OCR tools on ``mcp``.

    Two async tools are declared here and decorated with ``mcp.tool()``:

    * ``pdf_to_markdown`` -- returns whatever
      ``OCRService.document_to_markdown`` yields, or an ``"Error: ..."``
      string when URL validation or OCR fails.
    * ``pdf_to_structured_markdown`` -- returns whatever
      ``OCRService.document_to_structured`` yields, or an error dict with
      empty page data when URL validation or OCR fails.

    Only ``InvalidDocumentURLError`` (validation) and ``OCRProcessingError``
    (OCR call) are handled; any other exception propagates to the caller.

    Args:
        mcp: Server instance whose ``tool()`` decorator is applied to the
            two tool coroutines.
    """

    def _ocr_for(ctx: Context) -> OCRService:
        # The Mistral client is read from the request's lifespan context.
        return OCRService(client=ctx.request_context.lifespan_context.mistral)

    def _structured_failure(message: str) -> Dict[str, Any]:
        # Error payload keeps the success keys, but with empty values.
        return {"error": message, "page_count": 0, "pages": [], "markdown": ""}

    # NOTE(review): FastMCP presumably publishes each tool's docstring as its
    # description, so the two tool docstrings below are kept verbatim.

    @mcp.tool()
    async def pdf_to_markdown(document_url: str, ctx: Context) -> str:
        """Convert a PDF or document from a URL to Markdown using Mistral OCR.

        Args:
            document_url: Publicly accessible URL of the PDF / document.

        Returns:
            Markdown string (all pages concatenated).
        """
        # Step 1: reject bad URLs before any OCR client is touched.
        try:
            checked_url = validate_document_url(document_url)
        except InvalidDocumentURLError as err:
            logger.warning("Invalid URL rejected: {}", err)
            return f"Error: {err}"

        # Step 2: run OCR; failures are reported as text, not raised.
        ocr = _ocr_for(ctx)
        try:
            markdown = await ocr.document_to_markdown(checked_url)
        except OCRProcessingError as err:
            logger.error("OCR failed for {}: {}", checked_url, err)
            return f"Error: {err}"
        return markdown

    @mcp.tool()
    async def pdf_to_structured_markdown(
        document_url: str, ctx: Context
    ) -> Dict[str, Any]:
        """Convert a document to per-page structured markdown.

        Returns:
            Dict with keys: page_count (int), pages (list of {index, markdown}),
            markdown (str, merged).
        """
        # Step 1: reject bad URLs before any OCR client is touched.
        try:
            checked_url = validate_document_url(document_url)
        except InvalidDocumentURLError as err:
            logger.warning("Invalid URL rejected: {}", err)
            return _structured_failure(str(err))

        # Step 2: run OCR; failures come back as an error payload.
        ocr = _ocr_for(ctx)
        try:
            structured = await ocr.document_to_structured(checked_url)
        except OCRProcessingError as err:
            logger.error("Structured OCR failed for {}: {}", checked_url, err)
            return _structured_failure(str(err))
        return structured