"""Image file processor with OCR capabilities."""
import os
import logging
from typing import Dict, Any
from .base import BaseProcessor
from ..result import ConversionResult
from ..exceptions import ConversionError, FileNotFoundError
from ..pipeline.ocr_service import OCRServiceFactory
# Module-level logger named after this module; no handlers or levels are set here.
logger = logging.getLogger(__name__)
class ImageProcessor(BaseProcessor):
    """Processor for image files (JPG, PNG, etc.) with OCR capabilities.

    Text is extracted by an OCR service that is either injected through the
    constructor or created on first use via ``OCRServiceFactory``.
    """

    # Lower-case extensions (with leading dot) accepted by ``can_process``.
    _IMAGE_EXTENSIONS = frozenset(
        {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif'}
    )

    def __init__(
        self,
        preserve_layout: bool = True,
        include_images: bool = False,
        ocr_enabled: bool = True,
        use_markdownify: bool = None,
        ocr_service=None,
    ):
        """Initialise the processor.

        Args:
            preserve_layout: Forwarded to ``BaseProcessor``; when true,
                ``process`` uses the layout-aware OCR call.
            include_images: Forwarded to ``BaseProcessor``.
            ocr_enabled: Forwarded to ``BaseProcessor``; when false,
                ``process`` returns empty content.
            use_markdownify: Forwarded to ``BaseProcessor``.
            ocr_service: Optional pre-built OCR service. When ``None`` a
                service is created lazily by ``_get_ocr_service``.
        """
        # NOTE(review): ``process`` reads ``self.ocr_enabled`` and
        # ``self.preserve_layout``; presumably BaseProcessor.__init__ stores
        # them under those names -- confirm against the base class.
        super().__init__(preserve_layout, include_images, ocr_enabled, use_markdownify)
        self._ocr_service = ocr_service

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if the path exists and has a supported image extension
            (compared case-insensitively), False otherwise.
        """
        if not os.path.exists(file_path):
            return False

        # str() lets path-like objects through; lower-casing makes the
        # extension comparison case-insensitive.
        _, ext = os.path.splitext(str(file_path).lower())
        return ext in self._IMAGE_EXTENSIONS

    def _get_ocr_service(self):
        """Return the OCR service, creating and caching it on first use."""
        if self._ocr_service is None:
            self._ocr_service = OCRServiceFactory.create_service()
        return self._ocr_service

    def process(self, file_path: str) -> ConversionResult:
        """Process image file with OCR capabilities.

        Args:
            file_path: Path to the image file

        Returns:
            ConversionResult with the extracted text as content (empty when
            OCR is disabled) and metadata describing the run.

        Raises:
            ConversionError: On any failure, including a missing file; the
                original exception is attached as ``__cause__``.
        """
        try:
            # This is the project's FileNotFoundError imported at module
            # level, not the builtin. It is caught by the handler below and
            # surfaces to callers as ConversionError.
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"Image file not found: {file_path}")

            logger.info("Processing image file: %s", file_path)

            if not self.ocr_enabled:
                # Skip creating the OCR service entirely: nothing will use it.
                logger.warning("OCR is disabled, returning empty content")
                extracted_text = ""
            else:
                ocr_service = self._get_ocr_service()
                if self.preserve_layout:
                    logger.info("Extracting text with layout awareness")
                    extracted_text = ocr_service.extract_text_with_layout(file_path)
                else:
                    logger.info("Extracting text without layout awareness")
                    extracted_text = ocr_service.extract_text(file_path)

            result = ConversionResult(
                content=extracted_text,
                metadata={
                    'file_path': file_path,
                    'file_type': 'image',
                    'ocr_enabled': self.ocr_enabled,
                    'preserve_layout': self.preserve_layout,
                },
            )

            logger.info(
                "Image processing completed. Extracted %s characters",
                len(extracted_text),
            )
            return result

        except Exception as e:
            logger.error("Failed to process image file %s: %s", file_path, e)
            # Chain explicitly so the root cause stays attached to the error.
            raise ConversionError(f"Image processing failed: {e}") from e

    @staticmethod
    def predownload_ocr_models():
        """Pre-download OCR models by running a dummy prediction.

        Runs the layout-aware OCR call once on a blank 100x100 white PNG;
        presumably this makes the service fetch and cache its models.
        Best-effort: any failure is logged and swallowed, never raised.
        """
        try:
            # Imported here rather than at module level, so PIL is only
            # needed when this method actually runs.
            import tempfile
            from PIL import Image

            # Use the same factory as _get_ocr_service so the service warmed
            # up here is the one process() will later use.
            ocr_service = OCRServiceFactory.create_service()

            # The directory, and the image inside it, is removed on exit even
            # when saving or OCR raises, so no temp file is left behind.
            with tempfile.TemporaryDirectory() as tmp_dir:
                image_path = os.path.join(tmp_dir, 'blank.png')
                Image.new('RGB', (100, 100), color='white').save(image_path)
                ocr_service.extract_text_with_layout(image_path)

            logger.info("OCR models pre-downloaded and cached.")
        except Exception as e:
            logger.error("Failed to pre-download OCR models: %s", e)