Spaces:

arjunbhargav212
/

docling-processor

Running

App Files Files Community

docling-processor / docstrange /processors /pdf_processor.py

arjunbhargav212

Upload 63 files

5b14aa2 verified about 1 month ago

raw

history blame contribute delete

5.62 kB

	"""PDF file processor with OCR support for scanned PDFs."""

	import os
	import logging
	import tempfile
	from typing import Dict, Any, List, Tuple

	from .base import BaseProcessor
	from .image_processor import ImageProcessor
	from ..result import ConversionResult
	from ..exceptions import ConversionError, FileNotFoundError
	from ..config import InternalConfig
	from ..pipeline.ocr_service import OCRServiceFactory, NeuralOCRService

	# Module-level logger named after this module; no handlers or levels are configured here
	logger = logging.getLogger(__name__)


	class PDFProcessor(BaseProcessor):
	    """Processor for PDF files using PDF-to-image conversion with OCR.

	    Each page is rendered to an image with ``pdf2image`` and passed to a
	    shared ``ImageProcessor``. The per-page text is joined into a single
	    document with a ``## Page N`` heading in front of every non-empty page.
	    """

	    def __init__(
	        self,
	        preserve_layout: bool = True,
	        include_images: bool = False,
	        ocr_enabled: bool = True,
	        use_markdownify: bool = None,
	    ):
	        """Set up the processor and the image processor used for each page.

	        Args:
	            preserve_layout: Forwarded to the base class and the image processor.
	            include_images: Forwarded to the base class and the image processor.
	            ocr_enabled: Forwarded to the base class and the image processor.
	            use_markdownify: Forwarded to the base class and the image
	                processor; ``None`` is passed through unchanged.
	        """
	        super().__init__(preserve_layout, include_images, ocr_enabled, use_markdownify)
	        # One OCR service instance is created here and reused for every page
	        shared_ocr_service = NeuralOCRService()
	        self._image_processor = ImageProcessor(
	            preserve_layout=preserve_layout,
	            include_images=include_images,
	            ocr_enabled=ocr_enabled,
	            use_markdownify=use_markdownify,
	            ocr_service=shared_ocr_service,
	        )

	    def can_process(self, file_path: str) -> bool:
	        """Check if this processor can handle the given file.

	        Args:
	            file_path: Path to the file to check

	        Returns:
	            True if the path exists and its extension is ``.pdf``
	            (compared case-insensitively)
	        """
	        if not os.path.exists(file_path):
	            return False

	        # str() keeps the extension check working for non-string path objects
	        _, ext = os.path.splitext(str(file_path).lower())
	        return ext == '.pdf'

	    def process(self, file_path: str) -> ConversionResult:
	        """Process PDF file with OCR capabilities.

	        Args:
	            file_path: Path to the PDF file

	        Returns:
	            ConversionResult with extracted content

	        Raises:
	            ConversionError: On any failure, including a missing file. The
	                underlying exception is chained as ``__cause__``.
	        """
	        # The flag is only logged below; the OCR path is taken regardless of its value.
	        try:
	            pdf_to_image_enabled = InternalConfig.pdf_to_image_enabled
	        except AttributeError:
	            # Fallback if the setting is not defined on the config
	            pdf_to_image_enabled = True
	            logger.warning("InternalConfig not available, defaulting to pdf_to_image_enabled = True")

	        try:
	            if not os.path.exists(file_path):
	                raise FileNotFoundError(f"PDF file not found: {file_path}")

	            logger.info("Processing PDF file: %s", file_path)
	            logger.info("pdf_to_image_enabled = %s", pdf_to_image_enabled)

	            # Always use OCR-based processing (pdf2image + OCR)
	            logger.info("Using OCR-based PDF processing with pdf2image")
	            return self._process_with_ocr(file_path)

	        except Exception as e:
	            # Callers see ConversionError for every failure; chaining keeps the root cause.
	            logger.error("Failed to process PDF file %s: %s", file_path, e)
	            raise ConversionError(f"PDF processing failed: {e}") from e

	    def _process_with_ocr(self, file_path: str) -> ConversionResult:
	        """Process PDF using OCR after converting pages to images.

	        Args:
	            file_path: Path to the PDF file

	        Returns:
	            ConversionResult whose metadata records the file path, the file
	            type, the number of rendered pages and the extraction method.

	        Raises:
	            ConversionError: If ``pdf2image`` cannot be imported, or if
	                rendering or OCR of any page fails.
	        """
	        # Only the pdf2image import is guarded here, so an ImportError raised
	        # later (e.g. by an OCR dependency) is not misreported as a missing pdf2image.
	        try:
	            from pdf2image import convert_from_path
	        except ImportError as e:
	            logger.error("pdf2image not available. Please install it: pip install pdf2image")
	            raise ConversionError("pdf2image is required for PDF processing") from e

	        try:
	            # Get DPI from config, falling back to 300 when the setting is absent
	            dpi = getattr(InternalConfig, 'pdf_image_dpi', 300)

	            # Convert PDF pages to images using pdf2image
	            images = convert_from_path(file_path, dpi=dpi)
	            page_count = len(images)
	            all_content = []

	            for page_num, image in enumerate(images, start=1):
	                page_content = self._ocr_page_image(image)
	                # Pages that yield only whitespace are left out of the output
	                if page_content.strip():
	                    all_content.append(f"## Page {page_num}\n\n{page_content}")

	            content = "\n\n".join(all_content) if all_content else "No content extracted from PDF"

	            return ConversionResult(
	                content=content,
	                metadata={
	                    'file_path': file_path,
	                    'file_type': 'pdf',
	                    'pages': page_count,
	                    'extraction_method': 'ocr',
	                },
	            )

	        except Exception as e:
	            logger.error("OCR-based PDF processing failed: %s", e)
	            raise ConversionError(f"OCR-based PDF processing failed: {e}") from e

	    def _ocr_page_image(self, image: Any) -> str:
	        """Write one rendered page to a temporary PNG, process it and return its text.

	        Args:
	            image: Rendered page object exposing ``save(path, format)``.

	        Returns:
	            The content reported by the image processor, or an empty string
	            when it reports no content.
	        """
	        # Reserve a path and close the handle before the image is written to it
	        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
	            temp_image_path = tmp.name

	        try:
	            # Saving happens inside the try so a failed save still removes the file
	            image.save(temp_image_path, 'PNG')
	            page_result = self._image_processor.process(temp_image_path)
	            return page_result.content or ""
	        finally:
	            # A cleanup failure must not mask the real error or fail the conversion
	            try:
	                os.unlink(temp_image_path)
	            except OSError as cleanup_error:
	                logger.warning(
	                    "Could not remove temporary page image %s: %s",
	                    temp_image_path,
	                    cleanup_error,
	                )

	    @staticmethod
	    def predownload_ocr_models() -> None:
	        """Pre-download OCR models by delegating to ``ImageProcessor``.

	        Best-effort: a failure is logged and not re-raised.
	        """
	        try:
	            # Use ImageProcessor's predownload method
	            ImageProcessor.predownload_ocr_models()
	        except Exception as e:
	            logger.error("Failed to pre-download OCR models: %s", e)