Spaces:

arjunbhargav212
/

docling-processor

Running

App Files Files Community

docling-processor / docstrange / processors / image_processor.py

arjunbhargav212

Upload 63 files

5b14aa2 verified 27 days ago

raw

history blame contribute delete

4.31 kB

	"""Image file processor with OCR capabilities."""

	import os
	import logging
	from typing import Dict, Any

	from .base import BaseProcessor
	from ..result import ConversionResult
	from ..exceptions import ConversionError, FileNotFoundError
	from ..pipeline.ocr_service import OCRServiceFactory

	# Module-level logger named after this module. Nothing is configured here:
	# no handlers or levels are set in this file.
	logger = logging.getLogger(__name__)


	class ImageProcessor(BaseProcessor):
	    """Processor for image files (JPG, PNG, etc.) with OCR capabilities.

	    Text is extracted through an OCR service. The service is either
	    injected through the constructor or created lazily with
	    ``OCRServiceFactory.create_service()`` the first time OCR is needed.
	    """

	    # Lower-case extensions (with leading dot) accepted by can_process().
	    # Built once at class creation instead of on every call.
	    _IMAGE_EXTENSIONS = frozenset(
	        {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif'}
	    )

	    def __init__(self, preserve_layout: bool = True, include_images: bool = False, ocr_enabled: bool = True, use_markdownify: bool = None, ocr_service=None):
	        """Initialise the processor.

	        Args:
	            preserve_layout: Forwarded to ``BaseProcessor``; when true,
	                ``process`` uses the layout-aware OCR call.
	            include_images: Forwarded to ``BaseProcessor``.
	            ocr_enabled: Forwarded to ``BaseProcessor``; when false,
	                ``process`` returns empty content without running OCR.
	            use_markdownify: Forwarded to ``BaseProcessor``.
	            ocr_service: Optional pre-built OCR service. When ``None``, one
	                is created on first use.
	        """
	        super().__init__(preserve_layout, include_images, ocr_enabled, use_markdownify)
	        self._ocr_service = ocr_service

	    def can_process(self, file_path: str) -> bool:
	        """Check if this processor can handle the given file.

	        Args:
	            file_path: Path to the file to check

	        Returns:
	            True if the path exists and has a supported image extension
	            (compared case-insensitively), False otherwise.
	        """
	        if not os.path.exists(file_path):
	            return False

	        # str() lets path-like objects through; lower() makes ".PNG" match.
	        _, ext = os.path.splitext(str(file_path).lower())
	        return ext in self._IMAGE_EXTENSIONS

	    def _get_ocr_service(self):
	        """Return the OCR service, creating and caching it on first use."""
	        if self._ocr_service is None:
	            self._ocr_service = OCRServiceFactory.create_service()
	        return self._ocr_service

	    def process(self, file_path: str) -> ConversionResult:
	        """Process image file with OCR capabilities.

	        Args:
	            file_path: Path to the image file

	        Returns:
	            ConversionResult whose content is the OCR text (an empty string
	            when OCR is disabled) and whose metadata records the file path,
	            the file type and the OCR/layout settings used.

	        Raises:
	            ConversionError: For any failure, including a missing file. The
	                original exception is attached as ``__cause__``.
	        """
	        try:
	            if not os.path.exists(file_path):
	                raise FileNotFoundError(f"Image file not found: {file_path}")

	            logger.info("Processing image file: %s", file_path)

	            if not self.ocr_enabled:
	                # Do not touch the OCR service at all here: creating it can
	                # be expensive or fail, and its output would not be used.
	                logger.warning("OCR is disabled, returning empty content")
	                extracted_text = ""
	            else:
	                ocr_service = self._get_ocr_service()
	                if self.preserve_layout:
	                    logger.info("Extracting text with layout awareness")
	                    extracted_text = ocr_service.extract_text_with_layout(file_path)
	                else:
	                    logger.info("Extracting text without layout awareness")
	                    extracted_text = ocr_service.extract_text(file_path)

	            result = ConversionResult(
	                content=extracted_text,
	                metadata={
	                    'file_path': file_path,
	                    'file_type': 'image',
	                    'ocr_enabled': self.ocr_enabled,
	                    'preserve_layout': self.preserve_layout,
	                },
	            )

	            logger.info(
	                "Image processing completed. Extracted %s characters",
	                len(extracted_text),
	            )
	            return result

	        except Exception as e:
	            logger.error("Failed to process image file %s: %s", file_path, e)
	            # Chain the cause so the original traceback is not lost.
	            raise ConversionError(f"Image processing failed: {e}") from e

	    @staticmethod
	    def predownload_ocr_models():
	        """Pre-download OCR models by running a dummy prediction.

	        Runs layout-aware OCR on a blank 100x100 white PNG written to a
	        temporary file. This is best-effort: any failure is logged and
	        swallowed, and the temporary file is removed in every case.
	        """
	        tmp_path = None
	        try:
	            # Local imports: only this warm-up path needs them.
	            import tempfile
	            from PIL import Image

	            # Use the module-level factory, i.e. the same one process()
	            # relies on, so the models warmed up are the ones used later.
	            ocr_service = OCRServiceFactory.create_service()

	            # Reserve a file name, then close the handle before writing:
	            # an open NamedTemporaryFile cannot be reopened by name on
	            # Windows.
	            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
	                tmp_path = tmp.name

	            Image.new('RGB', (100, 100), color='white').save(tmp_path)
	            ocr_service.extract_text_with_layout(tmp_path)
	            logger.info("OCR models pre-downloaded and cached.")
	        except Exception as e:
	            logger.error("Failed to pre-download OCR models: %s", e)
	        finally:
	            # Clean up even when OCR or image creation failed.
	            if tmp_path is not None:
	                try:
	                    os.unlink(tmp_path)
	                except OSError as cleanup_error:
	                    logger.debug(
	                        "Could not remove temporary file %s: %s",
	                        tmp_path,
	                        cleanup_error,
	                    )