"""HTML file processor.""" import os import logging from typing import Dict, Any from .base import BaseProcessor from ..result import ConversionResult from ..exceptions import ConversionError, FileNotFoundError # Configure logging logger = logging.getLogger(__name__) class HTMLProcessor(BaseProcessor): """Processor for HTML files using markdownify for conversion.""" def can_process(self, file_path: str) -> bool: """Check if this processor can handle the given file. Args: file_path: Path to the file to check Returns: True if this processor can handle the file """ if not os.path.exists(file_path): return False # Check file extension - ensure file_path is a string file_path_str = str(file_path) _, ext = os.path.splitext(file_path_str.lower()) return ext in ['.html', '.htm'] def process(self, file_path: str) -> ConversionResult: """Process the HTML file and return a conversion result. Args: file_path: Path to the HTML file to process Returns: ConversionResult containing the processed content Raises: FileNotFoundError: If the file doesn't exist ConversionError: If processing fails """ if not os.path.exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") try: try: from markdownify import markdownify as md except ImportError: raise ConversionError("markdownify is required for HTML processing. Install it with: pip install markdownify") metadata = self.get_metadata(file_path) with open(file_path, 'r', encoding='utf-8') as f: html_content = f.read() content = md(html_content, heading_style="ATX") return ConversionResult(content, metadata) except Exception as e: if isinstance(e, (FileNotFoundError, ConversionError)): raise raise ConversionError(f"Failed to process HTML file {file_path}: {str(e)}")