File size: 5,615 Bytes
"""PDF file processor with OCR support for scanned PDFs."""
import os
import logging
import tempfile
from typing import Dict, Any, List, Tuple
from .base import BaseProcessor
from .image_processor import ImageProcessor
from ..result import ConversionResult
from ..exceptions import ConversionError, FileNotFoundError
from ..config import InternalConfig
from ..pipeline.ocr_service import OCRServiceFactory, NeuralOCRService
# Configure logging
logger = logging.getLogger(__name__)
class PDFProcessor(BaseProcessor):
    """Processor for PDF files using PDF-to-image conversion with OCR.

    Every page is rendered to an image with ``pdf2image``, written to a
    temporary PNG file and passed to an ``ImageProcessor``. The non-empty
    page results are joined into a single ``ConversionResult``.
    """

    def __init__(self, preserve_layout: bool = True, include_images: bool = False, ocr_enabled: bool = True, use_markdownify: bool = None):
        """Set up the processor and the image processor used for each page.

        All four options are forwarded unchanged to ``BaseProcessor`` and to
        the internal ``ImageProcessor``.

        Args:
            preserve_layout: Forwarded to the base class and image processor.
            include_images: Forwarded to the base class and image processor.
            ocr_enabled: Forwarded to the base class and image processor.
            use_markdownify: Forwarded to the base class and image processor;
                defaults to ``None``.
        """
        super().__init__(preserve_layout, include_images, ocr_enabled, use_markdownify)
        # A single OCR service instance is created here and handed to the
        # image processor so that every page goes through the same instance.
        shared_ocr_service = NeuralOCRService()
        self._image_processor = ImageProcessor(
            preserve_layout=preserve_layout,
            include_images=include_images,
            ocr_enabled=ocr_enabled,
            use_markdownify=use_markdownify,
            ocr_service=shared_ocr_service
        )

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if the path exists and its extension is ``.pdf``
            (compared case-insensitively), False otherwise.
        """
        if not os.path.exists(file_path):
            return False
        # Check file extension - ensure file_path is a string
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())
        return ext == '.pdf'

    def process(self, file_path: str) -> ConversionResult:
        """Process PDF file with OCR capabilities.

        Args:
            file_path: Path to the PDF file

        Returns:
            ConversionResult with extracted content

        Raises:
            ConversionError: If the file does not exist or processing fails.
        """
        try:
            from ..config import InternalConfig
            pdf_to_image_enabled = InternalConfig.pdf_to_image_enabled
        except (ImportError, AttributeError):
            # Fallback if config is not available
            pdf_to_image_enabled = True
            logger.warning("InternalConfig not available, defaulting to pdf_to_image_enabled = True")

        try:
            if not os.path.exists(file_path):
                # This is the FileNotFoundError imported from ..exceptions,
                # not the builtin of the same name.
                raise FileNotFoundError(f"PDF file not found: {file_path}")
            logger.info(f"Processing PDF file: {file_path}")
            # The flag is only reported here; it does not select a code path.
            logger.info(f"pdf_to_image_enabled = {pdf_to_image_enabled}")
            # Always use OCR-based processing (pdf2image + OCR)
            logger.info("Using OCR-based PDF processing with pdf2image")
            return self._process_with_ocr(file_path)
        except ConversionError:
            # Already logged and wrapped where it was raised; wrapping it a
            # second time would only nest the message and duplicate the log.
            raise
        except Exception as e:
            logger.error(f"Failed to process PDF file {file_path}: {e}")
            # Chain the cause so the original traceback is preserved.
            raise ConversionError(f"PDF processing failed: {e}") from e

    def _process_with_ocr(self, file_path: str) -> ConversionResult:
        """Process PDF using OCR after converting pages to images.

        Args:
            file_path: Path to the PDF file

        Returns:
            ConversionResult whose content holds one ``## Page N`` section per
            page that produced non-blank text, or a placeholder message when
            no page produced any. Metadata records the file path, file type,
            total page count and extraction method.

        Raises:
            ConversionError: If pdf2image cannot be imported, or if rendering
                or OCR of any page fails.
        """
        # Only the pdf2image import is guarded by the "please install"
        # message, so an ImportError raised later while processing a page is
        # not misreported as a missing pdf2image.
        try:
            from pdf2image import convert_from_path
        except ImportError as e:
            logger.error("pdf2image not available. Please install it: pip install pdf2image")
            raise ConversionError("pdf2image is required for PDF processing") from e

        try:
            # Get DPI from config
            dpi = getattr(InternalConfig, 'pdf_image_dpi', 300)
            # Convert PDF pages to images using pdf2image
            images = convert_from_path(file_path, dpi=dpi)
            page_count = len(images)
            all_content = []
            for page_num, image in enumerate(images, start=1):
                # Reserve a temporary file name and close the descriptor right
                # away; the image is then written to that path by name.
                fd, temp_image_path = tempfile.mkstemp(suffix='.png')
                os.close(fd)
                try:
                    # Saving happens inside the try so the temporary file is
                    # removed even when writing the image fails.
                    image.save(temp_image_path, 'PNG')
                    # Process the page image
                    page_result = self._image_processor.process(temp_image_path)
                    page_content = page_result.content
                    if page_content.strip():
                        all_content.append(f"## Page {page_num}\n\n{page_content}")
                finally:
                    # Clean up temporary file; a cleanup failure must not hide
                    # an error raised while processing the page.
                    try:
                        os.unlink(temp_image_path)
                    except OSError as cleanup_error:
                        logger.warning(f"Could not remove temporary file {temp_image_path}: {cleanup_error}")
            content = "\n\n".join(all_content) if all_content else "No content extracted from PDF"
            return ConversionResult(
                content=content,
                metadata={
                    'file_path': file_path,
                    'file_type': 'pdf',
                    'pages': page_count,
                    'extraction_method': 'ocr'
                }
            )
        except Exception as e:
            logger.error(f"OCR-based PDF processing failed: {e}")
            # Chain the cause so the original traceback is preserved.
            raise ConversionError(f"OCR-based PDF processing failed: {e}") from e

    @staticmethod
    def predownload_ocr_models():
        """Pre-download OCR models by delegating to ``ImageProcessor``.

        Best-effort: any exception is logged and not re-raised.
        """
        try:
            # Use ImageProcessor's predownload method
            ImageProcessor.predownload_ocr_models()
        except Exception as e:
            logger.error(f"Failed to pre-download OCR models: {e}")