Spaces:
Sleeping
Sleeping
File size: 2,997 Bytes
"""Base processor class for document conversion."""
import os
import logging
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional
from ..result import ConversionResult
from docstrange.config import InternalConfig
logger = logging.getLogger(__name__)
class BaseProcessor(ABC):
    """Abstract base for document processors.

    Stores the conversion options shared by every processor and provides a
    best-effort file metadata helper. Concrete subclasses must implement
    ``can_process`` and ``process``.
    """

    def __init__(self, preserve_layout: bool = True, include_images: bool = False, ocr_enabled: bool = True, use_markdownify: bool = InternalConfig.use_markdownify):
        """Store the conversion options on the instance.

        Args:
            preserve_layout: Whether to preserve document layout
            include_images: Whether to include images in output
            ocr_enabled: Whether to enable OCR for image processing
            use_markdownify: Whether to use markdownify for HTML->Markdown
                conversion (defaults to ``InternalConfig.use_markdownify``)
        """
        self.use_markdownify = use_markdownify
        self.ocr_enabled = ocr_enabled
        self.include_images = include_images
        self.preserve_layout = preserve_layout

    @abstractmethod
    def can_process(self, file_path: str) -> bool:
        """Report whether this processor is able to handle a file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if this processor can handle the file
        """

    @abstractmethod
    def process(self, file_path: str) -> ConversionResult:
        """Convert a file and return the outcome.

        Args:
            file_path: Path to the file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            ConversionError: If processing fails
        """

    def get_metadata(self, file_path: str) -> Dict[str, Any]:
        """Collect metadata describing a file and this processor's settings.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary with ``file_size``, ``file_extension`` (lower-cased),
            ``file_name``, ``processor`` (the class name) and the
            ``preserve_layout`` / ``include_images`` / ``ocr_enabled`` flags.
            If anything fails while gathering it, a warning is logged and
            only the processor name and the three flags are returned.
        """
        try:
            size_in_bytes = os.stat(file_path).st_size
            # Work on a plain string so splitext/basename return str values.
            path_text = str(file_path)
            _, suffix = os.path.splitext(path_text)
            metadata: Dict[str, Any] = {
                "file_size": size_in_bytes,
                "file_extension": suffix.lower(),
                "file_name": os.path.basename(path_text),
            }
            # Processor settings go after the file fields, keeping key order.
            metadata.update(
                processor=self.__class__.__name__,
                preserve_layout=self.preserve_layout,
                include_images=self.include_images,
                ocr_enabled=self.ocr_enabled,
            )
            return metadata
        except Exception as e:
            # Best effort: report the problem and fall back to settings only.
            logger.warning(f"Failed to get metadata for {file_path}: {e}")
            return dict(
                processor=self.__class__.__name__,
                preserve_layout=self.preserve_layout,
                include_images=self.include_images,
                ocr_enabled=self.ocr_enabled,
            )