docling-processor / docstrange /processors /html_processor.py
arjunbhargav212's picture
Upload 63 files
5b14aa2 verified
"""HTML file processor."""
import os
import logging
from typing import Dict, Any
from .base import BaseProcessor
from ..result import ConversionResult
from ..exceptions import ConversionError, FileNotFoundError
# Module-level logger for this processor
logger = logging.getLogger(__name__)
class HTMLProcessor(BaseProcessor):
    """Processor for HTML files using markdownify for conversion."""

    # Lower-case file extensions (with leading dot) this processor accepts.
    _HTML_EXTENSIONS = ('.html', '.htm')

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if the path is an existing regular file whose extension is
            ``.html`` or ``.htm`` (case-insensitive), False otherwise
        """
        # isfile (rather than exists) rejects directories that merely carry
        # an HTML-looking name; they could never be read as a document.
        if not os.path.isfile(file_path):
            return False

        # str() so that path-like objects are accepted as well as strings.
        _, ext = os.path.splitext(str(file_path))
        return ext.lower() in self._HTML_EXTENSIONS

    def process(self, file_path: str) -> ConversionResult:
        """Process the HTML file and return a conversion result.

        The file is read as UTF-8 text and converted to Markdown with
        ATX-style (``#``) headings.

        Args:
            file_path: Path to the HTML file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            ConversionError: If markdownify is not installed or processing fails
        """
        # NOTE: FileNotFoundError here is the package's own exception imported
        # from ..exceptions, not the builtin of the same name.
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # markdownify is an optional dependency, so it is imported lazily and
        # a missing install is reported as a ConversionError.
        try:
            from markdownify import markdownify as md
        except ImportError as exc:
            raise ConversionError(
                "markdownify is required for HTML processing. "
                "Install it with: pip install markdownify"
            ) from exc

        try:
            # get_metadata is not defined in this class; presumably inherited
            # from BaseProcessor -- confirm against the base class.
            metadata = self.get_metadata(file_path)

            # utf-8-sig reads plain UTF-8 unchanged but drops a leading
            # byte-order mark, which would otherwise end up in the output.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                html_content = f.read()

            content = md(html_content, heading_style="ATX")
            return ConversionResult(content, metadata)
        except (FileNotFoundError, ConversionError):
            # Already one of the package's own error types: pass through as is.
            raise
        except Exception as e:
            # Wrap anything else, keeping the original exception as the cause.
            raise ConversionError(f"Failed to process HTML file {file_path}: {str(e)}") from e