"""PDF file processor with OCR support for scanned PDFs."""

import os
import logging
import tempfile
from typing import Dict, Any, List, Tuple

from .base import BaseProcessor
from .image_processor import ImageProcessor
from ..result import ConversionResult
from ..exceptions import ConversionError, FileNotFoundError
from ..config import InternalConfig
from ..pipeline.ocr_service import OCRServiceFactory, NeuralOCRService

# Module-level logger (handlers and levels are left to the application)
logger = logging.getLogger(__name__)


class PDFProcessor(BaseProcessor):
    """Processor for PDF files using PDF-to-image conversion with OCR.

    Every page is rendered to an image with ``pdf2image``, written to a
    temporary PNG file and handed to a single ``ImageProcessor`` instance.
    The per-page text is joined under ``## Page N`` headings.
    """

    def __init__(
        self,
        preserve_layout: bool = True,
        include_images: bool = False,
        ocr_enabled: bool = True,
        use_markdownify: bool = None,
    ):
        """Create the processor and the image processor used for each page.

        Args:
            preserve_layout: Forwarded to the base class and ``ImageProcessor``.
            include_images: Forwarded to the base class and ``ImageProcessor``.
            ocr_enabled: Forwarded to the base class and ``ImageProcessor``.
            use_markdownify: Forwarded to the base class and ``ImageProcessor``.
        """
        super().__init__(preserve_layout, include_images, ocr_enabled, use_markdownify)
        # One OCR service is created here and reused for every page, because
        # the same ImageProcessor instance handles all page images.
        shared_ocr_service = NeuralOCRService()
        self._image_processor = ImageProcessor(
            preserve_layout=preserve_layout,
            include_images=include_images,
            ocr_enabled=ocr_enabled,
            use_markdownify=use_markdownify,
            ocr_service=shared_ocr_service,
        )

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if ``file_path`` is an existing regular file whose extension
            is ``.pdf`` (case-insensitive), False otherwise.
        """
        # isfile() rather than exists(): a directory that happens to be named
        # "something.pdf" cannot be converted.
        if not os.path.isfile(file_path):
            return False

        # str() so that path-like objects can be lower-cased as well.
        _, ext = os.path.splitext(str(file_path).lower())
        return ext == '.pdf'

    def process(self, file_path: str) -> ConversionResult:
        """Process PDF file with OCR capabilities.

        Args:
            file_path: Path to the PDF file

        Returns:
            ConversionResult with extracted content

        Raises:
            ConversionError: If the file does not exist or any step of the
                conversion fails. The original exception is attached as
                ``__cause__``.
        """
        try:
            pdf_to_image_enabled = InternalConfig.pdf_to_image_enabled
        except AttributeError:
            # Fallback if the setting is not available
            pdf_to_image_enabled = True
            logger.warning("InternalConfig not available, defaulting to pdf_to_image_enabled = True")

        try:
            if not os.path.exists(file_path):
                # This is the package's own FileNotFoundError (imported from
                # ..exceptions); it is wrapped into ConversionError below.
                raise FileNotFoundError(f"PDF file not found: {file_path}")

            logger.info("Processing PDF file: %s", file_path)
            # The flag is only reported; the OCR path is always taken.
            logger.info("pdf_to_image_enabled = %s", pdf_to_image_enabled)

            # Always use OCR-based processing (pdf2image + OCR)
            logger.info("Using OCR-based PDF processing with pdf2image")
            return self._process_with_ocr(file_path)

        except Exception as e:
            logger.error("Failed to process PDF file %s: %s", file_path, e)
            # Chain the cause so the original traceback is not lost.
            raise ConversionError(f"PDF processing failed: {e}") from e

    def _process_with_ocr(self, file_path: str) -> ConversionResult:
        """Process PDF using OCR after converting pages to images.

        Args:
            file_path: Path to the PDF file

        Returns:
            ConversionResult whose content is the non-empty page texts joined
            by blank lines, and whose metadata records the path, file type,
            page count and extraction method.

        Raises:
            ConversionError: If ``pdf2image`` cannot be imported, or if
                rendering or OCR of any page fails.
        """
        # Import separately so that only a missing pdf2image produces the
        # "pdf2image is required" message; import errors raised later (for
        # example inside the image processor) are reported as OCR failures.
        try:
            from pdf2image import convert_from_path
        except ImportError as e:
            logger.error("pdf2image not available. Please install it: pip install pdf2image")
            raise ConversionError("pdf2image is required for PDF processing") from e

        try:
            # Get DPI from config
            dpi = getattr(InternalConfig, 'pdf_image_dpi', 300)

            # Convert PDF pages to images using pdf2image
            images = convert_from_path(file_path, dpi=dpi)
            page_count = len(images)
            all_content = []

            for page_number, image in enumerate(images, start=1):
                page_content = self._extract_page_content(image)
                # Pages without any text are left out of the output.
                if page_content and page_content.strip():
                    all_content.append(f"## Page {page_number}\n\n{page_content}")

            content = "\n\n".join(all_content) if all_content else "No content extracted from PDF"

            return ConversionResult(
                content=content,
                metadata={
                    'file_path': file_path,
                    'file_type': 'pdf',
                    'pages': page_count,
                    'extraction_method': 'ocr',
                },
            )

        except Exception as e:
            logger.error("OCR-based PDF processing failed: %s", e)
            raise ConversionError(f"OCR-based PDF processing failed: {e}") from e

    def _extract_page_content(self, image):
        """Save one page image to a temporary PNG and return its processed content."""
        # mkstemp + close: the handle is closed before the image is written,
        # and the path exists before the try block so cleanup always runs.
        fd, temp_image_path = tempfile.mkstemp(suffix='.png')
        os.close(fd)
        try:
            image.save(temp_image_path, 'PNG')
            return self._image_processor.process(temp_image_path).content
        finally:
            self._remove_temp_file(temp_image_path)

    @staticmethod
    def _remove_temp_file(path: str) -> None:
        """Delete a temporary file, logging instead of raising if removal fails."""
        try:
            os.unlink(path)
        except OSError as e:
            # A cleanup failure must not hide the real error or discard
            # content that was extracted successfully.
            logger.warning("Could not remove temporary file %s: %s", path, e)

    @staticmethod
    def predownload_ocr_models():
        """Pre-download OCR models by delegating to ``ImageProcessor``.

        Failures are logged and not re-raised.
        """
        try:
            # Use ImageProcessor's predownload method
            ImageProcessor.predownload_ocr_models()
        except Exception as e:
            logger.error("Failed to pre-download OCR models: %s", e)