""" Source Code Format Handler Parses source code files with syntax highlighting and line/column coordinate mapping for code annotation tasks. Usage: from potato.format_handlers.code_handler import CodeHandler handler = CodeHandler() output = handler.extract("script.py", { "show_line_numbers": True, "highlight_syntax": True, }) """ from typing import Dict, List, Any, Optional from pathlib import Path import html import logging import re from .base import BaseFormatHandler, FormatOutput from .coordinate_mapping import CoordinateMapper, CodeCoordinate logger = logging.getLogger(__name__) # Check if Pygments is available try: from pygments import highlight from pygments.lexers import get_lexer_by_name, get_lexer_for_filename, guess_lexer from pygments.formatters import HtmlFormatter from pygments.token import Token PYGMENTS_AVAILABLE = True except ImportError: PYGMENTS_AVAILABLE = False # Common source code extensions and their languages LANGUAGE_MAP = { ".py": "python", ".js": "javascript", ".jsx": "jsx", ".ts": "typescript", ".tsx": "tsx", ".java": "java", ".c": "c", ".cpp": "cpp", ".h": "c", ".hpp": "cpp", ".cs": "csharp", ".go": "go", ".rs": "rust", ".rb": "ruby", ".php": "php", ".swift": "swift", ".kt": "kotlin", ".scala": "scala", ".r": "r", ".R": "r", ".sql": "sql", ".sh": "bash", ".bash": "bash", ".zsh": "zsh", ".ps1": "powershell", ".yaml": "yaml", ".yml": "yaml", ".json": "json", ".xml": "xml", ".html": "html", ".css": "css", ".scss": "scss", ".less": "less", ".lua": "lua", ".pl": "perl", ".m": "matlab", ".jl": "julia", ".hs": "haskell", ".ml": "ocaml", ".ex": "elixir", ".exs": "elixir", ".erl": "erlang", ".clj": "clojure", ".lisp": "lisp", ".vim": "vim", ".dockerfile": "docker", ".tf": "terraform", ".proto": "protobuf", ".graphql": "graphql", } class CodeHandler(BaseFormatHandler): """ Handler for source code files. Provides syntax highlighting via Pygments and line/column coordinate mapping for code annotation. """ format_name = "code" supported_extensions = list(LANGUAGE_MAP.keys()) description = "Source code with syntax highlighting and line/column mapping" requires_dependencies = ["pygments"] def get_default_options(self) -> Dict[str, Any]: """Get default extraction options.""" return { "highlight_syntax": True, "show_line_numbers": True, "language": None, # Auto-detect from extension "tab_size": 4, "max_lines": None, "start_line": 1, "extract_structure": True, # Extract function/class names } def extract( self, file_path: str, options: Optional[Dict[str, Any]] = None ) -> FormatOutput: """ Parse and render a source code file. Args: file_path: Path to the source code file options: Extraction options: - highlight_syntax: Apply syntax highlighting - show_line_numbers: Include line numbers in output - language: Override language detection - tab_size: Spaces per tab for rendering - max_lines: Limit number of lines - extract_structure: Extract function/class definitions Returns: FormatOutput with code text, highlighted HTML, and coordinates """ opts = self.merge_options(options) path = Path(file_path) # Read source file source_text = path.read_text(encoding="utf-8") # Expand tabs if needed if opts.get("tab_size"): source_text = source_text.expandtabs(opts["tab_size"]) # Build line index and coordinates lines = source_text.split("\n") mapper = CoordinateMapper() line_offsets = [] current_offset = 0 # Apply line limits start_line = opts.get("start_line", 1) - 1 # Convert to 0-indexed max_lines = opts.get("max_lines") end_line = min(len(lines), start_line + max_lines) if max_lines else len(lines) # Build coordinate mappings for each line for line_num, line in enumerate(lines): line_start = current_offset line_end = current_offset + len(line) line_offsets.append((line_start, line_end)) # Only map lines within our range if start_line <= line_num < end_line: mapper.add_mapping( line_start, line_end, CodeCoordinate( line=line_num + 1, # 1-indexed column=1, ) ) current_offset = line_end + 1 # +1 for newline # Extract text for the requested range if start_line > 0 or max_lines: display_lines = lines[start_line:end_line] display_text = "\n".join(display_lines) else: display_text = source_text # Detect language language = opts.get("language") if not language: ext = path.suffix.lower() language = LANGUAGE_MAP.get(ext, "text") # Render HTML if opts.get("highlight_syntax") and PYGMENTS_AVAILABLE: rendered_html = self._render_highlighted( display_text, language, opts, start_line + 1 ) else: rendered_html = self._render_plain( display_text, opts, start_line + 1 ) # Extract code structure structure = [] if opts.get("extract_structure"): structure = self._extract_structure(source_text, language) metadata = { "format": "code", "source_file": str(file_path), "language": language, "line_count": len(lines), "char_count": len(source_text), "displayed_lines": (start_line + 1, end_line), "structure": structure, } coord_dict = mapper.to_dict() coord_dict["get_coords_for_range"] = mapper.get_coords_for_range return FormatOutput( text=display_text, rendered_html=rendered_html, coordinate_map=coord_dict, metadata=metadata, format_name=self.format_name, source_path=str(file_path), ) def _render_highlighted( self, code: str, language: str, opts: Dict[str, Any], start_line: int ) -> str: """ Render code with Pygments syntax highlighting. """ try: lexer = get_lexer_by_name(language, stripall=False) except Exception: try: lexer = guess_lexer(code) except Exception: lexer = get_lexer_by_name("text") # Configure formatter formatter_opts = { "cssclass": "code-highlight", "linenos": opts.get("show_line_numbers", True), "linenostart": start_line, "lineanchors": "line", "anchorlinenos": True, } if opts.get("show_line_numbers"): formatter_opts["linenos"] = "table" formatter = HtmlFormatter(**formatter_opts) highlighted = highlight(code, lexer, formatter) # Wrap in container return f'
{highlighted}
' def _render_plain( self, code: str, opts: Dict[str, Any], start_line: int ) -> str: """ Render code as plain text with optional line numbers. """ lines = code.split("\n") html_parts = [] html_parts.append('
') html_parts.append('') for i, line in enumerate(lines, start=start_line): escaped_line = html.escape(line) or " " if opts.get("show_line_numbers"): html_parts.append( f'' f'' f'' f'' ) else: html_parts.append( f'' f'' f'' ) html_parts.append('
{i}{escaped_line}
{escaped_line}
') html_parts.append('
') return "\n".join(html_parts) def _extract_structure( self, code: str, language: str ) -> List[Dict[str, Any]]: """ Extract code structure (functions, classes) using pattern matching. This is a simplified extraction that works for common languages. For production use, consider using language-specific parsers. """ structure = [] # Pattern definitions for common languages patterns = { "python": { "function": r"^\s*(?:async\s+)?def\s+(\w+)\s*\(", "class": r"^\s*class\s+(\w+)\s*[:\(]", }, "javascript": { "function": r"(?:function\s+(\w+)|(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s*)?\([^)]*\)\s*=>|(?:const|let|var)\s+(\w+)\s*=\s*function)", "class": r"class\s+(\w+)", }, "java": { "function": r"(?:public|private|protected|static|\s)+[\w<>\[\]]+\s+(\w+)\s*\([^\)]*\)\s*(?:throws\s+[\w,\s]+)?\s*\{", "class": r"(?:public|private|protected)?\s*class\s+(\w+)", }, "go": { "function": r"func\s+(?:\([^)]+\)\s+)?(\w+)\s*\(", "class": r"type\s+(\w+)\s+struct", }, "rust": { "function": r"(?:pub\s+)?fn\s+(\w+)", "class": r"(?:pub\s+)?struct\s+(\w+)", }, } # Get patterns for this language (or use generic) lang_patterns = patterns.get(language, {}) lines = code.split("\n") for line_num, line in enumerate(lines, start=1): # Check for functions if "function" in lang_patterns: match = re.search(lang_patterns["function"], line) if match: name = next((g for g in match.groups() if g), None) if name: structure.append({ "type": "function", "name": name, "line": line_num, }) # Check for classes/structs if "class" in lang_patterns: match = re.search(lang_patterns["class"], line) if match: name = match.group(1) structure.append({ "type": "class", "name": name, "line": line_num, }) return structure def get_language(self, file_path: str) -> str: """ Detect the programming language from a file path. Args: file_path: Path to the source file Returns: Language identifier """ ext = Path(file_path).suffix.lower() return LANGUAGE_MAP.get(ext, "text")