docling-processor / docstrange /processors /image_processor.py
arjunbhargav212's picture
Upload 63 files
5b14aa2 verified
"""Image file processor with OCR capabilities."""
import logging
import os
from typing import Any, Dict, Optional

from .base import BaseProcessor
from ..result import ConversionResult
from ..exceptions import ConversionError, FileNotFoundError
from ..pipeline.ocr_service import OCRServiceFactory
# Module-level logger named after this module; no handlers or levels are set here.
logger = logging.getLogger(__name__)
class ImageProcessor(BaseProcessor):
    """Processor for image files (JPG, PNG, etc.) with OCR capabilities.

    Text extraction is delegated to an OCR service. The service is either
    injected through the constructor or created lazily via
    ``OCRServiceFactory.create_service()`` the first time it is needed.
    """

    # Lower-cased extensions (with leading dot) accepted by can_process().
    _IMAGE_EXTENSIONS = frozenset(
        {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif'}
    )

    def __init__(
        self,
        preserve_layout: bool = True,
        include_images: bool = False,
        ocr_enabled: bool = True,
        use_markdownify: Optional[bool] = None,
        ocr_service=None,
    ):
        """Initialise the processor.

        Args:
            preserve_layout: Forwarded to ``BaseProcessor``; when true,
                ``process`` uses the layout-aware OCR call.
            include_images: Forwarded to ``BaseProcessor``.
            ocr_enabled: Forwarded to ``BaseProcessor``; when false,
                ``process`` returns empty content without running OCR.
            use_markdownify: Forwarded to ``BaseProcessor``.
            ocr_service: Optional pre-built OCR service. When ``None``, a
                service is created on first use by ``_get_ocr_service``.
        """
        # NOTE(review): process() reads self.ocr_enabled / self.preserve_layout,
        # which are presumably set by BaseProcessor.__init__ -- confirm.
        super().__init__(preserve_layout, include_images, ocr_enabled, use_markdownify)
        self._ocr_service = ocr_service

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if the path exists and its extension (case-insensitive) is
            one of the supported image extensions
        """
        if not os.path.exists(file_path):
            return False

        # Normalise to a lower-cased string so path-like inputs and
        # upper-case extensions (e.g. ".PNG") are handled.
        _, ext = os.path.splitext(str(file_path).lower())
        return ext in self._IMAGE_EXTENSIONS

    def _get_ocr_service(self):
        """Return the OCR service, creating and caching it on first use."""
        if self._ocr_service is None:
            self._ocr_service = OCRServiceFactory.create_service()
        return self._ocr_service

    def process(self, file_path: str) -> ConversionResult:
        """Process image file with OCR capabilities.

        Args:
            file_path: Path to the image file

        Returns:
            ConversionResult with the extracted text as content and metadata
            describing the file path, file type and OCR settings. The content
            is an empty string when OCR is disabled.

        Raises:
            ConversionError: If the file does not exist or anything fails
                during processing. The original exception is attached as
                ``__cause__``.
        """
        try:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"Image file not found: {file_path}")

            logger.info(f"Processing image file: {file_path}")

            if not self.ocr_enabled:
                # Do not build the OCR service at all when OCR is off: it is
                # not needed to return empty content.
                logger.warning("OCR is disabled, returning empty content")
                extracted_text = ""
            else:
                ocr_service = self._get_ocr_service()
                if self.preserve_layout:
                    logger.info("Extracting text with layout awareness")
                    extracted_text = ocr_service.extract_text_with_layout(file_path)
                else:
                    logger.info("Extracting text without layout awareness")
                    extracted_text = ocr_service.extract_text(file_path)

            result = ConversionResult(
                content=extracted_text,
                metadata={
                    'file_path': file_path,
                    'file_type': 'image',
                    'ocr_enabled': self.ocr_enabled,
                    'preserve_layout': self.preserve_layout,
                },
            )

            logger.info(f"Image processing completed. Extracted {len(extracted_text)} characters")
            return result

        except Exception as e:
            # Every failure (including the missing-file check above) is
            # reported to callers as ConversionError; chain the cause so the
            # original traceback is not lost.
            logger.error(f"Failed to process image file {file_path}: {e}")
            raise ConversionError(f"Image processing failed: {e}") from e

    @staticmethod
    def predownload_ocr_models():
        """Pre-download OCR models by running a dummy prediction.

        Creates an OCR service and runs layout-aware extraction on a blank
        100x100 white PNG written to a temporary directory. This is
        best-effort: any failure is logged and not raised.
        """
        try:
            # Imported lazily so Pillow is only required when this helper runs.
            from PIL import Image
            import tempfile

            # Use the same factory as process() (imported at module level).
            ocr_service = OCRServiceFactory.create_service()

            # The temporary directory is removed on exit from the ``with``
            # block, so the dummy image is cleaned up even if OCR raises.
            with tempfile.TemporaryDirectory() as tmp_dir:
                image_path = os.path.join(tmp_dir, 'blank.png')
                Image.new('RGB', (100, 100), color='white').save(image_path)
                ocr_service.extract_text_with_layout(image_path)

            logger.info("OCR models pre-downloaded and cached.")
        except Exception as e:
            logger.error(f"Failed to pre-download OCR models: {e}")