arjunbhargav212's picture
Upload 63 files
5b14aa2 verified
"""Base processor class for document conversion."""
import os
import logging
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional
from ..result import ConversionResult
from docstrange.config import InternalConfig
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
class BaseProcessor(ABC):
    """Abstract foundation shared by all document processors.

    Subclasses must implement :meth:`can_process` and :meth:`process`.
    This base class stores the common conversion options and provides a
    best-effort :meth:`get_metadata` helper.
    """

    def __init__(
        self,
        preserve_layout: bool = True,
        include_images: bool = False,
        ocr_enabled: bool = True,
        use_markdownify: bool = InternalConfig.use_markdownify,
    ):
        """Store the conversion options on the instance.

        Args:
            preserve_layout: Whether to preserve document layout
            include_images: Whether to include images in output
            ocr_enabled: Whether to enable OCR for image processing
            use_markdownify: Whether to use markdownify for HTML->Markdown
                conversion. The default is read from
                ``InternalConfig.use_markdownify`` once, when the class
                body is executed.
        """
        self.preserve_layout = preserve_layout
        self.include_images = include_images
        self.ocr_enabled = ocr_enabled
        self.use_markdownify = use_markdownify

    @abstractmethod
    def can_process(self, file_path: str) -> bool:
        """Report whether this processor is able to handle a file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if this processor can handle the file
        """

    @abstractmethod
    def process(self, file_path: str) -> ConversionResult:
        """Convert the file and hand back the outcome.

        Args:
            file_path: Path to the file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            ConversionError: If processing fails
        """

    def get_metadata(self, file_path: str) -> Dict[str, Any]:
        """Collect descriptive metadata for a file on a best-effort basis.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary holding the processor class name and the
            ``preserve_layout`` / ``include_images`` / ``ocr_enabled``
            flags. When the file can be inspected it additionally holds
            ``file_size``, ``file_extension`` (lower-cased) and
            ``file_name``. If inspection raises, a warning is logged and
            only the processor fields are returned.
        """

        def processor_fields() -> Dict[str, Any]:
            # Fields describing this processor; present in both outcomes.
            return {
                "processor": self.__class__.__name__,
                "preserve_layout": self.preserve_layout,
                "include_images": self.include_images,
                "ocr_enabled": self.ocr_enabled,
            }

        try:
            size_in_bytes = os.stat(file_path).st_size
            # splitext/basename are always fed a plain string.
            path_text = str(file_path)
            file_fields = {
                "file_size": size_in_bytes,
                "file_extension": os.path.splitext(path_text)[1].lower(),
                "file_name": os.path.basename(path_text),
            }
            # File fields come first in the resulting dictionary.
            return {**file_fields, **processor_fields()}
        except Exception as e:
            # Deliberately broad: metadata is optional, so any failure
            # degrades to the processor-only dictionary.
            logger.warning(f"Failed to get metadata for {file_path}: {e}")
            return processor_fields()