Spaces:
Sleeping
Sleeping
| """Base processor class for document conversion.""" | |
| import os | |
| import logging | |
| from abc import ABC, abstractmethod | |
| from typing import Any, Dict, Optional | |
| from ..result import ConversionResult | |
| from docstrange.config import InternalConfig | |
| logger = logging.getLogger(__name__) | |
class BaseProcessor(ABC):
    """Base class for all document processors.

    Stores the conversion options shared by every processor and provides a
    best-effort :meth:`get_metadata` helper. Concrete processors must
    implement :meth:`can_process` and :meth:`process`.
    """

    def __init__(
        self,
        preserve_layout: bool = True,
        include_images: bool = False,
        ocr_enabled: bool = True,
        use_markdownify: bool = InternalConfig.use_markdownify,
    ):
        """Initialize the processor.

        Args:
            preserve_layout: Whether to preserve document layout
            include_images: Whether to include images in output
            ocr_enabled: Whether to enable OCR for image processing
            use_markdownify: Whether to use markdownify for HTML->Markdown conversion
        """
        self.preserve_layout = preserve_layout
        self.include_images = include_images
        self.ocr_enabled = ocr_enabled
        self.use_markdownify = use_markdownify

    @abstractmethod
    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Must be implemented by subclasses.

        Args:
            file_path: Path to the file to check

        Returns:
            True if this processor can handle the file
        """

    @abstractmethod
    def process(self, file_path: str) -> ConversionResult:
        """Process the file and return a conversion result.

        Must be implemented by subclasses.

        Args:
            file_path: Path to the file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            ConversionError: If processing fails
        """

    def get_metadata(self, file_path: str) -> Dict[str, Any]:
        """Get metadata about the file.

        This is best-effort: if the file cannot be inspected, a warning is
        logged and only the processor-level entries are returned.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary containing the processor name and its options and,
            when the file could be stat'ed, its size, lower-cased extension
            and base name.
        """
        # Entries that do not depend on the file; built once so the success
        # and fallback results cannot drift apart.
        processor_info = {
            "processor": self.__class__.__name__,
            "preserve_layout": self.preserve_layout,
            "include_images": self.include_images,
            "ocr_enabled": self.ocr_enabled,
        }
        try:
            file_stat = os.stat(file_path)
            # Ensure file_path is a string for splitext / basename.
            file_path_str = str(file_path)
            file_info = {
                "file_size": file_stat.st_size,
                "file_extension": os.path.splitext(file_path_str)[1].lower(),
                "file_name": os.path.basename(file_path_str),
            }
        except Exception as e:
            # Deliberately broad: metadata must never make a conversion fail.
            logger.warning("Failed to get metadata for %s: %s", file_path, e)
            return processor_info
        # File entries first, matching the key order callers have always seen.
        return {**file_info, **processor_info}