import fitz  # PyMuPDF
import re
import os
from typing import Dict, List, Optional


def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract clean text from PDF file

    Each page is read with PyMuPDF, cleaned with clean_extracted_text and
    the non-empty pages are joined with a "--- PAGE BREAK ---" marker.
    The joined text is then passed through post_process_text.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        str: Extracted and cleaned text

    Raises:
        RuntimeError: If the path is empty or missing, the PDF cannot be
            opened or processed, or no page yields any text
    """
    if not pdf_path or not os.path.exists(pdf_path):
        raise RuntimeError("PDF file not found or path is invalid")

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        raise RuntimeError(f"Failed to open PDF: {str(e)}") from e

    # Cleaned text of every page that actually produced text, in page order
    page_chunks = []

    try:
        total_pages = doc.page_count
        print(f"📄 Processing {total_pages} pages...")

        for page_num in range(total_pages):
            try:
                text = doc[page_num].get_text("text")

                if text.strip():
                    page_chunks.append(clean_extracted_text(text))

                print(f"✅ Page {page_num + 1} processed")

            except Exception as e:
                # Best effort: a single bad page must not abort the document
                print(f"⚠️ Error processing page {page_num + 1}: {e}")

    except Exception as e:
        raise RuntimeError(f"Error during text extraction: {str(e)}") from e

    finally:
        doc.close()

    # Joining (instead of appending a separator after each non-last page)
    # keeps the marker strictly *between* pages, so empty or failed
    # trailing pages no longer leave a dangling "--- PAGE BREAK ---".
    full_text = "\n\n--- PAGE BREAK ---\n\n".join(page_chunks)

    if not full_text.strip():
        raise RuntimeError(
            "No text found in PDF. The file may contain only images or be corrupted.")

    return post_process_text(full_text)


def extract_text_with_metadata(pdf_path: str) -> Dict:
    """
    Extract text with additional metadata and document info

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        dict: Complete extraction results with metadata, with the keys
            'full_text'  - post-processed text of all non-empty pages,
                           joined with a "--- PAGE BREAK ---" marker
            'page_texts' - one cleaned string per page ("" for pages
                           without text or pages that failed)
            'page_count' - number of pages reported by the document
            'metadata'   - document metadata passed through clean_metadata
            'file_info'  - {'file_path': ..., 'file_size': ...}

    Raises:
        RuntimeError: If the path is empty or missing, or the PDF cannot
            be opened or processed
    """
    if not pdf_path or not os.path.exists(pdf_path):
        raise RuntimeError("PDF file not found or path is invalid")

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        raise RuntimeError(f"Failed to open PDF: {str(e)}") from e

    page_texts = []

    try:
        total_pages = doc.page_count
        print(f"📄 Processing {total_pages} pages with metadata...")

        # Extract metadata
        metadata = doc.metadata

        # Process each page
        for page_num in range(total_pages):
            page_text = ""
            try:
                text = doc[page_num].get_text("text")

                if text.strip():
                    page_text = clean_extracted_text(text)

                print(f"✅ Page {page_num + 1} processed")

            except Exception as e:
                # Best effort: a single bad page must not abort the document
                print(f"⚠️ Error processing page {page_num + 1}: {e}")

            # Exactly one entry per page keeps indexes aligned with pages
            page_texts.append(page_text)

        # Joining only the non-empty pages keeps the marker strictly
        # *between* pages, so empty or failed trailing pages no longer
        # leave a dangling "--- PAGE BREAK ---" at the end of the text.
        full_text = "\n\n--- PAGE BREAK ---\n\n".join(
            chunk for chunk in page_texts if chunk)

        return {
            'full_text': post_process_text(full_text),
            'page_texts': page_texts,
            'page_count': total_pages,
            'metadata': clean_metadata(metadata),
            'file_info': {
                'file_path': pdf_path,
                'file_size': os.path.getsize(pdf_path) if os.path.exists(pdf_path) else 0
            }
        }

    except Exception as e:
        raise RuntimeError(f"Error during extraction with metadata: {str(e)}") from e

    finally:
        doc.close()


def clean_extracted_text(text: str) -> str:
    """
    Clean raw extracted text from PDF artifacts

    Steps, in order: normalize line endings to '\\n', drop form feeds,
    re-join words hyphenated across a line break, collapse runs of
    spaces/tabs, trim spaces around newlines, and squeeze three or more
    consecutive newlines down to two.

    Args:
        text (str): Raw text from PDF

    Returns:
        str: Cleaned text ("" for empty input)
    """
    if not text:
        return ""

    try:
        # Normalize line endings FIRST: every rule below anchors on '\n',
        # so '\r\n' / bare '\r' input would otherwise keep trailing spaces
        # and hyphenated line breaks.
        text = re.sub(r'\r\n?', '\n', text)

        # Remove form feed characters
        text = text.replace('\f', '')

        # Fix hyphenated words broken across lines
        text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)

        # Normalize whitespace
        text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces to single
        text = re.sub(r'\n[ \t]+', '\n', text)  # Spaces after newlines
        text = re.sub(r'[ \t]+\n', '\n', text)  # Spaces before newlines

        # Remove excessive blank lines
        text = re.sub(r'\n{3,}', '\n\n', text)

        return text.strip()

    except Exception as e:
        # Best effort: fall back to the text as cleaned so far
        print(f"Warning: Error cleaning text: {e}")
        return text.strip() if text else ""


def post_process_text(text: str) -> str:
    """
    Final post-processing of extracted text

    Replaces typographic characters (curly quotes, dashes, ellipsis,
    non-breaking space, Unicode line/paragraph separators) with plain
    equivalents, drops lines that consist of a single letter or of a
    1-3 digit number, and squeezes runs of blank lines.

    Args:
        text (str): Text to post-process

    Returns:
        str: Final processed text ("" for empty input)
    """
    if not text:
        return ""

    try:
        # Fix common character encoding issues.
        # The keys are written as escapes on purpose: literal curly quotes
        # are easily flattened to ASCII by editors/tools, which silently
        # turns these entries into no-ops.
        replacements = {
            '\u2018': "'",  # Left single smart quote
            '\u2019': "'",  # Right single smart quote
            '\u201c': '"',  # Left double smart quote
            '\u201d': '"',  # Right double smart quote
            '\u2013': '-',  # En dash
            '\u2014': '--',  # Em dash
            '\u2026': '...',  # Ellipsis
            '\u00a0': ' ',  # Non-breaking space
            '\u2028': '\n',  # Line separator
            '\u2029': '\n\n',  # Paragraph separator
        }

        # One pass over the text instead of one .replace() per character
        text = text.translate(str.maketrans(replacements))

        # Remove isolated single characters (OCR artifacts)
        text = re.sub(r'\n[a-zA-Z]\n', '\n', text)

        # Remove standalone numbers (likely page numbers)
        text = re.sub(r'\n\s*\d{1,3}\s*\n', '\n', text)

        # Final whitespace cleanup
        text = re.sub(r'\n{3,}', '\n\n', text)

        return text.strip()

    except Exception as e:
        # Best effort: fall back to the text as processed so far
        print(f"Warning: Error in post-processing: {e}")
        return text.strip() if text else ""


def clean_metadata(metadata: dict) -> dict:
    """
    Clean and structure PDF metadata

    Only a fixed set of fields is kept. Each kept value is a non-empty
    string, stripped of surrounding whitespace and different from the
    placeholder 'Unknown'; it is stored under a human-readable label.

    Args:
        metadata (dict): Raw metadata from PDF

    Returns:
        dict: Cleaned metadata keyed by display label; {} when the input
            is empty or cleaning fails
    """
    if not metadata:
        return {}

    # (raw metadata key, display label) pairs, in output order
    labelled_fields = (
        ('title', 'Title'),
        ('author', 'Author'),
        ('subject', 'Subject'),
        ('creator', 'Creator'),
        ('producer', 'Producer'),
        ('creationDate', 'Creation Date'),
        ('modDate', 'Modification Date'),
    )

    try:
        result = {}

        for raw_key, label in labelled_fields:
            raw_value = metadata.get(raw_key, '')

            # Skip missing, empty and non-string entries
            if not raw_value or not isinstance(raw_value, str):
                continue

            stripped = raw_value.strip()
            if not stripped or stripped == 'Unknown':
                continue

            result[label] = stripped

        return result

    except Exception as e:
        print(f"Warning: Error cleaning metadata: {e}")
        return {}


def validate_pdf(pdf_path: str) -> bool:
    """
    Validate if the file is a readable PDF

    A file is considered valid when it exists, its name ends with
    '.pdf' (case-insensitive), PyMuPDF can open it and it reports at
    least one page. Any error along the way yields False.

    Args:
        pdf_path (str): Path to PDF file

    Returns:
        bool: True if valid PDF, False otherwise
    """
    try:
        if not pdf_path or not os.path.exists(pdf_path):
            return False

        # Check file extension
        if not pdf_path.lower().endswith('.pdf'):
            return False

        # Try to open with PyMuPDF
        doc = fitz.open(pdf_path)
        try:
            # Check if document has pages
            return doc.page_count > 0
        finally:
            # Always release the handle, even if page_count raises
            doc.close()

    except Exception:
        return False


def get_pdf_info(pdf_path: str) -> dict:
    """
    Get basic information about PDF without extracting text

    Args:
        pdf_path (str): Path to PDF file

    Returns:
        dict: Basic PDF information with the keys 'page_count',
            'file_size', 'is_encrypted' and 'metadata'; on failure a
            dict with the single key 'error' describing the problem
    """
    try:
        if not validate_pdf(pdf_path):
            return {'error': 'Invalid PDF file'}

        doc = fitz.open(pdf_path)
        try:
            return {
                'page_count': doc.page_count,
                'file_size': os.path.getsize(pdf_path),
                'is_encrypted': doc.needs_pass,
                'metadata': clean_metadata(doc.metadata)
            }
        finally:
            # Always release the handle, even if gathering the info fails
            doc.close()

    except Exception as e:
        return {'error': f'Error getting PDF info: {str(e)}'}


def extract_images_info(pdf_path: str) -> List[dict]:
    """
    Extract information about images in the PDF

    Args:
        pdf_path (str): Path to PDF file

    Returns:
        list: List of image information dictionaries, one per image
            reference, with the keys 'page' (1-based page number),
            'index' (position within that page's image list), 'width'
            and 'height'. Empty when the file is not a valid PDF or an
            error occurs.
    """
    try:
        if not validate_pdf(pdf_path):
            return []

        doc = fitz.open(pdf_path)
        try:
            images_info = []

            for page_num in range(doc.page_count):
                image_list = doc[page_num].get_images()

                for img_index, img in enumerate(image_list):
                    # Width and height are read from positions 2 and 3 of
                    # each image entry; shorter entries yield None.
                    images_info.append({
                        'page': page_num + 1,
                        'index': img_index,
                        'width': img[2] if len(img) > 2 else None,
                        'height': img[3] if len(img) > 3 else None,
                    })

            return images_info
        finally:
            # Always release the handle, even if a page fails to load
            doc.close()

    except Exception as e:
        print(f"Warning: Error extracting image info: {e}")
        return []

# Test functionality


def test_pdf_reader():
    """
    Manual smoke test for the PDF reader

    Looks for 'sample.pdf' in the current working directory. When it is
    present, validates it, prints its page count and a preview of the
    extracted text. All outcomes are reported via print; nothing is
    returned or raised.
    """
    print("=== PDF Reader Test ===")

    # This would need an actual PDF file to test
    test_pdf = "sample.pdf"  # Replace with actual PDF path

    try:
        if not os.path.exists(test_pdf):
            print("No test PDF found. Create a 'sample.pdf' to test.")
            return

        print(f"Testing with: {test_pdf}")

        # Step 1: validation
        is_valid = validate_pdf(test_pdf)
        print(f"Valid PDF: {is_valid}")
        if not is_valid:
            return

        # Step 2: basic info
        info = get_pdf_info(test_pdf)
        print(f"Pages: {info.get('page_count', 'Unknown')}")

        # Step 3: text extraction
        text = extract_text_from_pdf(test_pdf)
        print(f"Extracted {len(text)} characters")
        print(f"First 100 chars: {text[:100]}...")

    except Exception as e:
        print(f"Test failed: {e}")


# Run the manual smoke test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_pdf_reader()