Spaces:

itachi023
/

Markdown-Layout-Extractor

Sleeping

Markdown-Layout-Extractor / app / tools / markdown_tools.py

Deep Chavda

feat: initial release — PDF to Markdown MCP server

4ccde7a 9 days ago

2.21 kB

	from typing import Any, Dict

	from mcp.server.fastmcp import Context, FastMCP

	from app.core.logger import logger
	from app.services.ocr_service import OCRService
	from app.utils.validators import validate_document_url
	from app.core.exceptions import InvalidDocumentURLError, OCRProcessingError


	def register_markdown_tools(mcp: FastMCP) -> None:
	    """Register the document-to-Markdown tools on the given FastMCP server.

	    Two coroutine tools are defined as closures and attached through
	    ``mcp.tool()``: ``pdf_to_markdown`` and ``pdf_to_structured_markdown``.
	    Each one validates the incoming URL with ``validate_document_url``,
	    builds an ``OCRService`` around the ``mistral`` client stored on the
	    request's lifespan context, and converts the two handled exception
	    types into a return value instead of letting them propagate.

	    Args:
	        mcp: Server instance whose ``tool()`` decorator registers the tools.
	    """
	    # NOTE(review): FastMCP presumably publishes each tool's docstring as its
	    # description, so the tool docstrings below keep their original wording
	    # -- confirm before rephrasing them.

	    def _service_for(ctx: Context) -> OCRService:
	        # Wrap the client held on the lifespan context in a fresh service.
	        return OCRService(client=ctx.request_context.lifespan_context.mistral)

	    def _structured_failure(problem: Exception) -> Dict[str, Any]:
	        # Same keys as the documented structured result, plus an "error" key.
	        return {
	            "error": str(problem),
	            "page_count": 0,
	            "pages": [],
	            "markdown": "",
	        }

	    @mcp.tool()
	    async def pdf_to_markdown(document_url: str, ctx: Context) -> str:
	        """Convert a PDF or document from a URL to Markdown using Mistral OCR.

	        Args:
	            document_url: Publicly accessible URL of the PDF / document.

	        Returns:
	            Markdown string (all pages concatenated).
	        """
	        # Reject bad URLs first; the failure is reported as a string.
	        try:
	            url = validate_document_url(document_url)
	        except InvalidDocumentURLError as bad_url:
	            logger.warning("Invalid URL rejected: {}", bad_url)
	            return f"Error: {bad_url}"

	        ocr = _service_for(ctx)

	        try:
	            markdown = await ocr.document_to_markdown(url)
	        except OCRProcessingError as failure:
	            logger.error("OCR failed for {}: {}", url, failure)
	            return f"Error: {failure}"
	        return markdown

	    @mcp.tool()
	    async def pdf_to_structured_markdown(
	        document_url: str, ctx: Context
	    ) -> Dict[str, Any]:
	        """Convert a document to per-page structured markdown.

	        Returns:
	            Dict with keys: page_count (int), pages (list of {index, markdown}),
	            markdown (str, merged).
	        """
	        # Reject bad URLs first; the failure is reported as an empty result
	        # dict carrying an "error" key.
	        try:
	            url = validate_document_url(document_url)
	        except InvalidDocumentURLError as bad_url:
	            logger.warning("Invalid URL rejected: {}", bad_url)
	            return _structured_failure(bad_url)

	        ocr = _service_for(ctx)

	        try:
	            structured = await ocr.document_to_structured(url)
	        except OCRProcessingError as failure:
	            logger.error("Structured OCR failed for {}: {}", url, failure)
	            return _structured_failure(failure)
	        return structured