"""PDF file processor with OCR support for scanned PDFs.""" import os import logging import tempfile from typing import Dict, Any, List, Tuple from .base import BaseProcessor from .image_processor import ImageProcessor from ..result import ConversionResult from ..exceptions import ConversionError, FileNotFoundError from ..config import InternalConfig from ..pipeline.ocr_service import OCRServiceFactory, NeuralOCRService # Configure logging logger = logging.getLogger(__name__) class PDFProcessor(BaseProcessor): """Processor for PDF files using PDF-to-image conversion with OCR.""" def __init__(self, preserve_layout: bool = True, include_images: bool = False, ocr_enabled: bool = True, use_markdownify: bool = None): super().__init__(preserve_layout, include_images, ocr_enabled, use_markdownify) # Create a shared OCR service instance for all pages shared_ocr_service = NeuralOCRService() self._image_processor = ImageProcessor( preserve_layout=preserve_layout, include_images=include_images, ocr_enabled=ocr_enabled, use_markdownify=use_markdownify, ocr_service=shared_ocr_service ) def can_process(self, file_path: str) -> bool: """Check if this processor can handle the given file. Args: file_path: Path to the file to check Returns: True if this processor can handle the file """ if not os.path.exists(file_path): return False # Check file extension - ensure file_path is a string file_path_str = str(file_path) _, ext = os.path.splitext(file_path_str.lower()) return ext == '.pdf' def process(self, file_path: str) -> ConversionResult: """Process PDF file with OCR capabilities. Args: file_path: Path to the PDF file Returns: ConversionResult with extracted content """ try: from ..config import InternalConfig pdf_to_image_enabled = InternalConfig.pdf_to_image_enabled except (ImportError, AttributeError): # Fallback if config is not available pdf_to_image_enabled = True logger.warning("InternalConfig not available, defaulting to pdf_to_image_enabled = True") try: if not os.path.exists(file_path): raise FileNotFoundError(f"PDF file not found: {file_path}") logger.info(f"Processing PDF file: {file_path}") logger.info(f"pdf_to_image_enabled = {pdf_to_image_enabled}") # Always use OCR-based processing (pdf2image + OCR) logger.info("Using OCR-based PDF processing with pdf2image") return self._process_with_ocr(file_path) except Exception as e: logger.error(f"Failed to process PDF file {file_path}: {e}") raise ConversionError(f"PDF processing failed: {e}") def _process_with_ocr(self, file_path: str) -> ConversionResult: """Process PDF using OCR after converting pages to images.""" try: from pdf2image import convert_from_path from ..config import InternalConfig # Get DPI from config dpi = getattr(InternalConfig, 'pdf_image_dpi', 300) # Convert PDF pages to images using pdf2image images = convert_from_path(file_path, dpi=dpi) page_count = len(images) all_content = [] for page_num, image in enumerate(images): # Save to temporary file for OCR processing with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp: image.save(tmp.name, 'PNG') temp_image_path = tmp.name try: # Process the page image page_result = self._image_processor.process(temp_image_path) page_content = page_result.content if page_content.strip(): all_content.append(f"## Page {page_num + 1}\n\n{page_content}") finally: # Clean up temporary file os.unlink(temp_image_path) content = "\n\n".join(all_content) if all_content else "No content extracted from PDF" return ConversionResult( content=content, metadata={ 'file_path': file_path, 'file_type': 'pdf', 'pages': page_count, 'extraction_method': 'ocr' } ) except ImportError: logger.error("pdf2image not available. Please install it: pip install pdf2image") raise ConversionError("pdf2image is required for PDF processing") except Exception as e: logger.error(f"OCR-based PDF processing failed: {e}") raise ConversionError(f"OCR-based PDF processing failed: {e}") @staticmethod def predownload_ocr_models(): """Pre-download OCR models by running a dummy prediction.""" try: # Use ImageProcessor's predownload method ImageProcessor.predownload_ocr_models() except Exception as e: logger.error(f"Failed to pre-download OCR models: {e}")