""" Markdown Format Handler Parses Markdown files and extracts text with source line mapping. Supports syntax highlighting for code blocks. Usage: from potato.format_handlers.markdown_handler import MarkdownHandler handler = MarkdownHandler() output = handler.extract("document.md", { "highlight_code": True, "gfm": True, # GitHub Flavored Markdown }) """ from typing import Dict, List, Any, Optional from pathlib import Path import html import logging import re import uuid from .base import BaseFormatHandler, FormatOutput from .coordinate_mapping import CoordinateMapper, CodeCoordinate, DocumentCoordinate logger = logging.getLogger(__name__) # Check if dependencies are available try: import mistune MISTUNE_AVAILABLE = True except ImportError: MISTUNE_AVAILABLE = False mistune = None try: from pygments import highlight from pygments.lexers import get_lexer_by_name, guess_lexer from pygments.formatters import HtmlFormatter PYGMENTS_AVAILABLE = True except ImportError: PYGMENTS_AVAILABLE = False class MarkdownHandler(BaseFormatHandler): """ Handler for Markdown files. Uses mistune for parsing and Pygments for syntax highlighting. Maintains line/column coordinate mappings. """ format_name = "markdown" supported_extensions = [".md", ".markdown", ".mdown", ".mkd"] description = "Markdown parsing with line/column mapping and syntax highlighting" requires_dependencies = ["mistune"] def get_default_options(self) -> Dict[str, Any]: """Get default extraction options.""" return { "highlight_code": True, "gfm": True, # GitHub Flavored Markdown "include_raw_blocks": False, "preserve_line_breaks": True, } def extract( self, file_path: str, options: Optional[Dict[str, Any]] = None ) -> FormatOutput: """ Parse and render a Markdown file. 
Args: file_path: Path to the Markdown file options: Extraction options: - highlight_code: Syntax highlight code blocks - gfm: Use GitHub Flavored Markdown extensions - preserve_line_breaks: Keep original line structure Returns: FormatOutput with text, rendered HTML, and coordinate mappings """ if not MISTUNE_AVAILABLE: raise ImportError( "mistune is required for Markdown extraction. " "Install with: pip install mistune" ) opts = self.merge_options(options) # Read source file path = Path(file_path) source_text = path.read_text(encoding="utf-8") # Build line index for coordinate mapping line_offsets = self._build_line_index(source_text) # Parse and render mapper = CoordinateMapper() rendered_html = self._render_markdown(source_text, opts) # Build coordinate mappings for each line for line_num, (start, end) in enumerate(line_offsets, start=1): mapper.add_mapping( start, end, CodeCoordinate(line=line_num, column=1) ) metadata = { "format": "markdown", "source_file": str(file_path), "line_count": len(line_offsets), "char_count": len(source_text), "headings": self._extract_headings(source_text), } coord_dict = mapper.to_dict() coord_dict["get_coords_for_range"] = mapper.get_coords_for_range return FormatOutput( text=source_text, rendered_html=rendered_html, coordinate_map=coord_dict, metadata=metadata, format_name=self.format_name, source_path=str(file_path), ) def _build_line_index(self, text: str) -> List[tuple]: """ Build an index of line start/end offsets. Returns: List of (start, end) tuples for each line """ lines = [] start = 0 for line in text.split("\n"): end = start + len(line) lines.append((start, end)) start = end + 1 # +1 for newline return lines def _render_markdown(self, text: str, opts: Dict[str, Any]) -> str: """ Render Markdown to HTML using mistune. 
""" # Create custom renderer with code highlighting if opts.get("highlight_code") and PYGMENTS_AVAILABLE: renderer = HighlightRenderer() else: renderer = None # Configure mistune if opts.get("gfm"): # Use plugins for GFM features md = mistune.create_markdown( renderer=renderer, plugins=['strikethrough', 'table', 'task_lists'] ) else: md = mistune.create_markdown(renderer=renderer) html_content = md(text) # Wrap in container return f'
{escaped}\n'
try:
if info:
lexer = get_lexer_by_name(info, stripall=True)
else:
lexer = guess_lexer(code)
except Exception:
# Fall back to plain text
escaped = html.escape(code)
return f'{escaped}\n'
return highlight(code, lexer, self.formatter)
def codespan(self, text: str) -> str:
    """
    Render an inline code span as an HTML ``<code>`` element.

    Args:
        text: Content of the inline code span as handed to the renderer.

    Returns:
        The HTML-escaped text wrapped in ``<code>...</code>``.
    """
    # Escape before wrapping so characters such as ``<`` and ``&`` inside
    # the span are displayed literally rather than parsed as markup.
    # NOTE(review): if the installed mistune version already escapes the
    # text before calling the renderer, this would double-escape - confirm.
    escaped = html.escape(text)
    return f'<code>{escaped}</code>'