"""
Text extraction utilities for PDF and images.
Supports both digital PDFs and scanned documents (OCR).
"""

import pdfplumber
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
from pathlib import Path
from typing import Dict, Tuple
import logging

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


def extract_text_from_pdf(file_path: Path) -> Tuple[str, Dict]:
    """
    Extract text from PDF using pdfplumber (for digital PDFs).

    If pdfplumber yields no non-whitespace text, the call is delegated to
    extract_text_from_pdf_ocr and that function's result is returned as-is.

    Args:
        file_path: Path to the PDF file.

    Returns:
        (raw_text, metadata). For the pdfplumber path, metadata holds
        "page_count", "extraction_method" ("pdfplumber") and
        "confidence_score" (1.0 when more than 50 characters were
        extracted, otherwise 0.5).

    Raises:
        Exception: Any error raised while reading the PDF is logged and
            re-raised unchanged. Errors from the OCR fallback propagate
            from extract_text_from_pdf_ocr, which logs them itself.
    """
    # Only the pdfplumber work is guarded here, so an OCR failure in the
    # fallback below is not logged a second time as a "PDF extraction" error.
    try:
        with pdfplumber.open(str(file_path)) as pdf:
            page_count = len(pdf.pages)
            # Pages that yield no text (falsy result) are skipped.
            extracted = (page.extract_text() for page in pdf.pages)
            text_pages = [text for text in extracted if text]
    except Exception as e:
        logger.exception("PDF extraction failed: %s", e)
        raise

    raw_text = "\n\n".join(text_pages)

    # If no text extracted, it might be a scanned PDF
    if not raw_text.strip():
        logger.info("No text found with pdfplumber, trying OCR...")
        return extract_text_from_pdf_ocr(file_path)

    metadata = {
        "page_count": page_count,
        "extraction_method": "pdfplumber",
        "confidence_score": 1.0 if len(raw_text) > 50 else 0.5,
    }

    return raw_text, metadata


def extract_text_from_pdf_ocr(file_path: Path) -> Tuple[str, Dict]:
    """
    Extract text from scanned PDF using OCR (PyMuPDF + Tesseract).

    Each page is rendered to an image at 300 DPI and passed to Tesseract;
    the per-page results are joined with blank lines.

    Args:
        file_path: Path to the PDF file.

    Returns:
        (raw_text, metadata), where metadata holds "page_count",
        "extraction_method" ("tesseract_ocr") and a fixed
        "confidence_score" of 0.7.

    Raises:
        Exception: Any error from opening, rendering or OCR is logged and
            re-raised unchanged.
    """
    try:
        text_pages = []
        # The context manager closes the document even if rendering or OCR
        # fails part-way through the pages.
        with fitz.open(str(file_path)) as doc:
            page_count = len(doc)

            for page in doc:
                # Convert page to image
                # NOTE(review): "RGB" assumes the default pixmap has three
                # channels and no alpha — confirm if get_pixmap args change.
                pix = page.get_pixmap(dpi=300)
                img = Image.frombytes(
                    "RGB", (pix.width, pix.height), pix.samples
                )

                # OCR
                text_pages.append(pytesseract.image_to_string(img))

        raw_text = "\n\n".join(text_pages)

        metadata = {
            "page_count": page_count,
            "extraction_method": "tesseract_ocr",
            "confidence_score": 0.7,  # OCR typically less confident
        }

        return raw_text, metadata

    except Exception as e:
        logger.exception("OCR extraction failed: %s", e)
        raise


def extract_text_from_image(file_path: Path) -> Tuple[str, Dict]:
    """
    Extract text from image using OCR (Tesseract).

    Args:
        file_path: Path to the image file.

    Returns:
        (raw_text, metadata), where metadata holds "page_count" (always 1),
        "extraction_method" ("tesseract_ocr") and a fixed
        "confidence_score" of 0.7.

    Raises:
        Exception: Any error from opening the image or running OCR is
            logged and re-raised unchanged.
    """
    try:
        # The context manager releases the underlying file handle once OCR
        # is done, including when OCR raises.
        with Image.open(str(file_path)) as img:
            raw_text = pytesseract.image_to_string(img)

        metadata = {
            "page_count": 1,
            "extraction_method": "tesseract_ocr",
            "confidence_score": 0.7,
        }

        return raw_text, metadata

    except Exception as e:
        logger.exception("Image OCR failed: %s", e)
        raise


def extract_text(file_path: Path, mime_type: str) -> Tuple[str, Dict]:
    """
    Dispatch a document to the extractor that matches its MIME type.

    PDFs go to extract_text_from_pdf; PNG and JPEG images go to
    extract_text_from_image. The MIME type is compared exactly as given.

    Args:
        file_path: Location of the document on disk.
        mime_type: MIME type reported for the document.

    Returns:
        The (raw_text, metadata_dict) pair produced by the chosen extractor.

    Raises:
        ValueError: If mime_type is not one of the supported types.
    """
    image_mime_types = ("image/png", "image/jpeg", "image/jpg")

    # Pick the extractor first, then make a single call at the end.
    if mime_type == "application/pdf":
        extractor = extract_text_from_pdf
    elif mime_type in image_mime_types:
        extractor = extract_text_from_image
    else:
        raise ValueError(f"Unsupported file type: {mime_type}")

    return extractor(file_path)