"""
PDF Processor - Convert PDFs to images and extract text.

This module works INDEPENDENTLY of embedding and vector storage.
Use it if you just need PDF → images conversion.

Features:
- Batch processing to save memory
- Text extraction with surrogate character handling
- Configurable DPI and quality settings
"""

import gc
import logging
import re
from pathlib import Path
from typing import Any, Dict, Generator, List, Optional, Tuple

from PIL import Image

logger = logging.getLogger(__name__)


class PDFProcessor:
    """
    Process PDFs into images and text for visual retrieval.

    Works independently - no embedding or storage dependencies.

    Args:
        dpi: DPI for image conversion (higher = better quality)
        output_format: Image format (RGB, L, etc.)
        page_batch_size: Pages per batch for memory efficiency

    Example:
        >>> processor = PDFProcessor(dpi=140)
        >>>
        >>> # Convert single PDF
        >>> images, texts = processor.process_pdf(Path("report.pdf"))
        >>>
        >>> # Stream large PDFs
        >>> for images, texts, start_page in processor.stream_pdf(Path("large.pdf"), batch_size=10):
        ...     # Process each batch
        ...     pass
    """

    def __init__(
        self,
        dpi: int = 140,
        output_format: str = "RGB",
        page_batch_size: int = 50,
    ):
        self.dpi = dpi
        self.output_format = output_format
        self.page_batch_size = page_batch_size

        # PDF deps are optional: we only require them when calling PDF-specific methods.
        # This keeps the class usable for helper utilities like `resize_for_colpali()`
        # even in minimal installs.
        self._pdf_deps_available = True
        try:
            import pdf2image  # noqa: F401
            import pypdf  # noqa: F401
        except Exception:
            self._pdf_deps_available = False

    def _require_pdf_deps(self) -> None:
        if not self._pdf_deps_available:
            raise ImportError(
                "PDF processing requires `pdf2image` and `pypdf`.\n"
                'Install with: pip install "visual-rag-toolkit[pdf]"'
            )

    def process_pdf(
        self,
        pdf_path: Path,
        dpi: Optional[int] = None,
    ) -> Tuple[List[Image.Image], List[str]]:
        """
        Convert PDF to images and extract text.

        Args:
            pdf_path: Path to PDF file
            dpi: Override default DPI

        Returns:
            Tuple of (list of images, list of page texts)
        """
        self._require_pdf_deps()
        from pdf2image import convert_from_path
        from pypdf import PdfReader

        dpi = dpi or self.dpi
        pdf_path = Path(pdf_path)

        logger.info(f"📄 Processing PDF: {pdf_path.name}")

        # Extract text
        reader = PdfReader(str(pdf_path))
        total_pages = len(reader.pages)

        page_texts = []
        for page in reader.pages:
            text = page.extract_text() or ""
            # Handle surrogate characters
            text = self._sanitize_text(text)
            page_texts.append(text)

        # Convert to images in batches
        all_images = []
        for start_page in range(1, total_pages + 1, self.page_batch_size):
            end_page = min(start_page + self.page_batch_size - 1, total_pages)

            batch_images = convert_from_path(
                str(pdf_path),
                dpi=dpi,
                fmt=self.output_format.lower(),
                first_page=start_page,
                last_page=end_page,
            )

            all_images.extend(batch_images)

            del batch_images
            gc.collect()

        assert len(all_images) == len(
            page_texts
        ), f"Mismatch: {len(all_images)} images vs {len(page_texts)} texts"

        logger.info(f"✅ Processed {len(all_images)} pages")
        return all_images, page_texts

    def stream_pdf(
        self,
        pdf_path: Path,
        batch_size: int = 10,
        dpi: Optional[int] = None,
    ) -> Generator[Tuple[List[Image.Image], List[str], int], None, None]:
        """
        Stream PDF processing for large files.

        Yields batches of (images, texts, start_page) without loading
        entire PDF into memory.

        Args:
            pdf_path: Path to PDF file
            batch_size: Pages per batch
            dpi: Override default DPI

        Yields:
            Tuple of (batch_images, batch_texts, start_page_number)
        """
        self._require_pdf_deps()
        from pdf2image import convert_from_path
        from pypdf import PdfReader

        dpi = dpi or self.dpi
        pdf_path = Path(pdf_path)

        reader = PdfReader(str(pdf_path))
        total_pages = len(reader.pages)

        logger.info(f"📄 Streaming PDF: {pdf_path.name} ({total_pages} pages)")

        for start_idx in range(0, total_pages, batch_size):
            end_idx = min(start_idx + batch_size, total_pages)

            # Extract text for batch
            batch_texts = []
            for page_idx in range(start_idx, end_idx):
                text = reader.pages[page_idx].extract_text() or ""
                text = self._sanitize_text(text)
                batch_texts.append(text)

            # Convert images for batch
            batch_images = convert_from_path(
                str(pdf_path),
                dpi=dpi,
                fmt=self.output_format.lower(),
                first_page=start_idx + 1,  # 1-indexed
                last_page=end_idx,
            )

            yield batch_images, batch_texts, start_idx + 1

            del batch_images
            gc.collect()

    def get_page_count(self, pdf_path: Path) -> int:
        """Get number of pages in PDF without loading images."""
        self._require_pdf_deps()
        from pypdf import PdfReader

        reader = PdfReader(str(pdf_path))
        return len(reader.pages)

    def resize_for_colpali(
        self,
        image: Image.Image,
        max_edge: int = 2048,
        tile_size: int = 512,
    ) -> Tuple[Image.Image, int, int]:
        """
        Resize image following ColPali/Idefics3 processor logic.

        Resizes to fit within tile grid without black padding.

        Args:
            image: PIL Image
            max_edge: Maximum edge length
            tile_size: Size of each tile

        Returns:
            Tuple of (resized_image, tile_rows, tile_cols)
        """
        # Ensure consistent mode for downstream processors (and predictable tests)
        if image.mode != "RGB":
            image = image.convert("RGB")

        w, h = image.size

        # Step 1: Resize so longest edge = max_edge
        if w > h:
            new_w = max_edge
            new_h = int(h * (max_edge / w))
        else:
            new_h = max_edge
            new_w = int(w * (max_edge / h))

        # Step 2: Calculate tile grid
        tile_cols = (new_w + tile_size - 1) // tile_size
        tile_rows = (new_h + tile_size - 1) // tile_size

        # Step 3: Calculate exact dimensions for tiles
        final_w = tile_cols * tile_size
        final_h = tile_rows * tile_size

        # Step 4: Scale to fit within tile grid
        scale_w = final_w / w
        scale_h = final_h / h
        scale = min(scale_w, scale_h)

        scaled_w = int(w * scale)
        scaled_h = int(h * scale)

        resized = image.resize((scaled_w, scaled_h), Image.LANCZOS)

        # Center on white canvas if needed
        if scaled_w != final_w or scaled_h != final_h:
            canvas = Image.new("RGB", (final_w, final_h), (255, 255, 255))
            offset_x = (final_w - scaled_w) // 2
            offset_y = (final_h - scaled_h) // 2
            canvas.paste(resized, (offset_x, offset_y))
            resized = canvas

        return resized, tile_rows, tile_cols

    def _sanitize_text(self, text: str) -> str:
        """Remove invalid Unicode characters (surrogates) from text."""
        if not text:
            return ""

        # Remove surrogate characters (U+D800-U+DFFF)
        return text.encode("utf-8", errors="surrogatepass").decode("utf-8", errors="ignore")

    def extract_metadata_from_filename(
        self,
        filename: str,
        mapping: Optional[Dict[str, Dict[str, Any]]] = None,
    ) -> Dict[str, Any]:
        """
        Extract metadata from PDF filename.

        Uses mapping if provided, otherwise falls back to pattern matching.

        Args:
            filename: PDF filename (with or without .pdf extension)
            mapping: Optional mapping dict {filename: metadata}

        Returns:
            Metadata dict with year, source, district, etc.
        """
        # Remove extension
        stem = Path(filename).stem
        stem_lower = stem.lower().strip()

        # Try mapping first
        if mapping:
            if stem_lower in mapping:
                return mapping[stem_lower].copy()

            # Try without .pdf
            stem_no_ext = stem_lower.replace(".pdf", "")
            if stem_no_ext in mapping:
                return mapping[stem_no_ext].copy()

        # Fallback: pattern matching
        metadata = {"filename": filename}

        # Extract year
        year_match = re.search(r"(20\d{2})", stem)
        if year_match:
            metadata["year"] = int(year_match.group(1))

        # Detect source type
        if "consolidated" in stem_lower or ("annual" in stem_lower and "oag" in stem_lower):
            metadata["source"] = "Consolidated"
        elif "dlg" in stem_lower or "district local government" in stem_lower:
            metadata["source"] = "Local Government"
            # Try to extract district name
            district_match = re.search(r"([a-z]+)\s+(?:dlg|district local government)", stem_lower)
            if district_match:
                metadata["district"] = district_match.group(1).title()
        elif "hospital" in stem_lower or "referral" in stem_lower:
            metadata["source"] = "Hospital"
        elif "ministry" in stem_lower:
            metadata["source"] = "Ministry"
        elif "project" in stem_lower:
            metadata["source"] = "Project"
        else:
            metadata["source"] = "Unknown"

        return metadata