Spaces:
Sleeping
Sleeping
| from typing import Any, Dict | |
| from mcp.server.fastmcp import Context, FastMCP | |
| from app.core.logger import logger | |
| from app.services.ocr_service import OCRService | |
| from app.utils.validators import validate_document_url | |
| from app.core.exceptions import InvalidDocumentURLError, OCRProcessingError | |
def register_markdown_tools(mcp: FastMCP) -> None:
    """Attach markdown-extraction tools to the given FastMCP server.

    Registers two async tools on ``mcp`` through its ``tool()`` decorator:
    ``pdf_to_markdown`` and ``pdf_to_structured_markdown``. Both tools report
    failures in their return value instead of raising.

    Args:
        mcp: Server instance the tools are registered on.
    """

    # Without the decorator the coroutine would only be a local name that is
    # dropped when this function returns, i.e. nothing would be registered.
    @mcp.tool()
    async def pdf_to_markdown(document_url: str, ctx: Context) -> str:
        """Convert a PDF or document from a URL to Markdown using Mistral OCR.

        Args:
            document_url: Publicly accessible URL of the PDF / document.

        Returns:
            Markdown string (all pages concatenated). If the URL is rejected
            or OCR processing fails, a string of the form "Error: <message>".
        """
        try:
            url = validate_document_url(document_url)
        except InvalidDocumentURLError as exc:
            logger.warning("Invalid URL rejected: {}", exc)
            return f"Error: {exc}"

        # The OCR client is read from the lifespan context of the current
        # request rather than being created per call.
        service = OCRService(client=ctx.request_context.lifespan_context.mistral)
        try:
            return await service.document_to_markdown(url)
        except OCRProcessingError as exc:
            logger.error("OCR failed for {}: {}", url, exc)
            return f"Error: {exc}"

    @mcp.tool()
    async def pdf_to_structured_markdown(
        document_url: str, ctx: Context
    ) -> Dict[str, Any]:
        """Convert a document to per-page structured markdown.

        Args:
            document_url: Publicly accessible URL of the PDF / document.

        Returns:
            Dict with keys: page_count (int), pages (list of {index, markdown}),
            markdown (str, merged). If the URL is rejected or OCR processing
            fails, the dict instead carries an "error" message together with
            page_count 0, an empty pages list and an empty markdown string.
        """
        try:
            url = validate_document_url(document_url)
        except InvalidDocumentURLError as exc:
            logger.warning("Invalid URL rejected: {}", exc)
            return {"error": str(exc), "page_count": 0, "pages": [], "markdown": ""}

        # Same client lookup as pdf_to_markdown: taken from the request's
        # lifespan context.
        service = OCRService(client=ctx.request_context.lifespan_context.mistral)
        try:
            return await service.document_to_structured(url)
        except OCRProcessingError as exc:
            logger.error("Structured OCR failed for {}: {}", url, exc)
            return {"error": str(exc), "page_count": 0, "pages": [], "markdown": ""}