Spaces:

arjunbhargav212
/

docling-processor

Running

File size: 4,311 Bytes

5b14aa2

"""Image file processor with OCR capabilities."""

import os
import logging
from typing import Dict, Any

from .base import BaseProcessor
from ..result import ConversionResult
from ..exceptions import ConversionError, FileNotFoundError
from ..pipeline.ocr_service import OCRServiceFactory

# Module-level logger named after this module; no handlers or levels are set here.
logger = logging.getLogger(__name__)


class ImageProcessor(BaseProcessor):
    """Processor for image files (JPG, PNG, etc.) with OCR capabilities.

    Text is extracted by an OCR service that is either injected through the
    constructor or created lazily via ``OCRServiceFactory`` the first time it
    is actually needed.
    """

    # Lower-case file extensions (including the leading dot) accepted by
    # ``can_process``. Built once instead of on every call.
    _IMAGE_EXTENSIONS = frozenset(
        {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif'}
    )

    def __init__(self, preserve_layout: bool = True, include_images: bool = False, ocr_enabled: bool = True, use_markdownify: bool = None, ocr_service=None):
        """Initialise the processor.

        Args:
            preserve_layout: Forwarded to ``BaseProcessor``; when true,
                ``process`` asks the OCR service for layout-aware text.
            include_images: Forwarded to ``BaseProcessor``.
            ocr_enabled: Forwarded to ``BaseProcessor``; when false,
                ``process`` returns empty content.
            use_markdownify: Forwarded to ``BaseProcessor``.
            ocr_service: Optional pre-built OCR service. When ``None`` a
                service is created on first use.
        """
        super().__init__(preserve_layout, include_images, ocr_enabled, use_markdownify)
        self._ocr_service = ocr_service

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if the file exists and has a supported image extension
        """
        if not os.path.exists(file_path):
            return False

        # str() keeps this working for path-like objects as well as strings;
        # lower-casing makes the extension match case-insensitive.
        _, ext = os.path.splitext(str(file_path).lower())
        return ext in self._IMAGE_EXTENSIONS

    def _get_ocr_service(self):
        """Return the OCR service, creating and caching it on first use."""
        if self._ocr_service is None:
            self._ocr_service = OCRServiceFactory.create_service()
        return self._ocr_service

    def process(self, file_path: str) -> ConversionResult:
        """Process image file with OCR capabilities.

        Args:
            file_path: Path to the image file

        Returns:
            ConversionResult with extracted content (empty when OCR is
            disabled) and metadata describing how the file was processed

        Raises:
            ConversionError: If the file does not exist or any step of the
                extraction fails. The original exception is attached as
                ``__cause__``.
        """
        try:
            # NOTE: FileNotFoundError here is the project's exception imported
            # at module level, not the builtin. It is raised inside the ``try``
            # and therefore reaches callers wrapped in ConversionError.
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"Image file not found: {file_path}")

            logger.info(f"Processing image file: {file_path}")

            if not self.ocr_enabled:
                # Do not build the OCR service at all when it will not be
                # used: creating it can be expensive and can fail.
                logger.warning("OCR is disabled, returning empty content")
                extracted_text = ""
            else:
                ocr_service = self._get_ocr_service()
                if self.preserve_layout:
                    logger.info("Extracting text with layout awareness")
                    extracted_text = ocr_service.extract_text_with_layout(file_path)
                else:
                    logger.info("Extracting text without layout awareness")
                    extracted_text = ocr_service.extract_text(file_path)

            result = ConversionResult(
                content=extracted_text,
                metadata={
                    'file_path': file_path,
                    'file_type': 'image',
                    'ocr_enabled': self.ocr_enabled,
                    'preserve_layout': self.preserve_layout
                }
            )

            logger.info(f"Image processing completed. Extracted {len(extracted_text)} characters")
            return result

        except Exception as e:
            logger.error(f"Failed to process image file {file_path}: {e}")
            # Chain explicitly so the root cause stays visible in tracebacks.
            raise ConversionError(f"Image processing failed: {e}") from e

    @staticmethod
    def predownload_ocr_models():
        """Pre-download OCR models by running a dummy prediction.

        Runs layout-aware OCR on a blank 100x100 white PNG written to a
        temporary file. This is best-effort: any failure is logged and
        swallowed, never raised. The temporary file is removed whether or
        not the OCR call succeeds.
        """
        try:
            # PIL and tempfile are only needed for this warm-up, so they are
            # imported lazily here.
            from PIL import Image
            import tempfile

            # Use the factory imported at module level so this warm-up goes
            # through the same service factory as ``_get_ocr_service``.
            ocr_service = OCRServiceFactory.create_service()

            tmp_path = None
            try:
                # Reserve a named file, then close the handle before writing
                # and reading it by path; an open handle blocks reopening and
                # deletion on Windows.
                with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
                    tmp_path = tmp.name
                Image.new('RGB', (100, 100), color='white').save(tmp_path)
                ocr_service.extract_text_with_layout(tmp_path)
            finally:
                # Always clean up, even when saving or OCR raised.
                if tmp_path is not None and os.path.exists(tmp_path):
                    os.unlink(tmp_path)

            logger.info("OCR models pre-downloaded and cached.")
        except Exception as e:
            logger.error(f"Failed to pre-download OCR models: {e}")