"""PDF text extractor using PyMuPDF (lightweight alternative to Docling)."""

from datetime import datetime
from pathlib import Path
from typing import Optional

try:
    import fitz  # PyMuPDF
    PYMUPDF_AVAILABLE = True
except ImportError:
    PYMUPDF_AVAILABLE = False


class PDFExtractor:
    """Extracts text from PDF documents using PyMuPDF.

    Every successful extraction is also written to ``<output_dir>/<stem>.txt``,
    where ``<stem>`` is the PDF's file name without its extension. Two PDFs
    that share a stem therefore overwrite each other's text file.
    """

    def __init__(self, output_dir: Optional[Path] = None):
        """Initialize the extractor.

        Args:
            output_dir: Directory to store extracted text files. Defaults to
                ``data/extracted`` (relative to the current working
                directory). The directory is created if it does not exist.
        """
        self.output_dir = output_dir or Path("data/extracted")
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _failure(message: str) -> dict:
        """Build the result dict returned for any failed extraction."""
        return {
            'success': False,
            'text': '',
            'page_count': 0,
            'error': message
        }

    def extract_text(self, pdf_path: Path) -> dict:
        """Extract text from a PDF file.

        Pages whose text is empty or whitespace-only are skipped; every other
        page is prefixed with a ``--- Page N ---`` header (1-based) and the
        pages are joined with blank lines. The combined text is also saved
        as a UTF-8 file in ``self.output_dir``.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Dict with 'success', 'text', 'page_count', and 'error' keys.
            On success 'error' is None. On failure 'success' is False,
            'text' is empty, 'page_count' is 0 and 'error' holds a message;
            errors raised while opening, reading or saving are reported this
            way instead of being propagated.
        """
        pdf_path = Path(pdf_path).resolve()

        if not PYMUPDF_AVAILABLE:
            return self._failure('PyMuPDF not installed')

        if not pdf_path.exists():
            return self._failure(f'File not found: {pdf_path}')

        try:
            doc = fitz.open(pdf_path)
            try:
                text_parts = []
                for page_num, page in enumerate(doc, start=1):
                    page_text = page.get_text()
                    if page_text.strip():
                        text_parts.append(
                            f"--- Page {page_num} ---\n{page_text}"
                        )
                # Counts every page, including the blank ones skipped above.
                page_count = len(doc)
            finally:
                # Release the document even when a page fails to extract;
                # otherwise the handle would stay open on the error path.
                doc.close()

            full_text = "\n\n".join(text_parts)

            # Save extracted text
            txt_path = self.output_dir / f"{pdf_path.stem}.txt"
            txt_path.write_text(full_text, encoding='utf-8')
        except Exception as e:
            # Deliberate best-effort boundary: callers inspect the result
            # dict rather than handling exceptions.
            return self._failure(str(e))

        return {
            'success': True,
            'text': full_text,
            'page_count': page_count,
            'error': None
        }

    def extract_batch(self, pdf_paths: list) -> list:
        """Extract text from multiple PDFs.

        Args:
            pdf_paths: List of PDF file paths.

        Returns:
            List of extraction results, one per input path and in the same
            order. A failure for one file is recorded in its own result and
            does not stop the remaining files from being processed.
        """
        return [self.extract_text(pdf_path) for pdf_path in pdf_paths]