"""Document parsing utilities for various file formats."""

from pathlib import Path

from src.utils.common import normalize_text
from src.utils.logging import print_log


def load_pdf(file_path: Path) -> str:
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Location of the PDF file.

    Returns:
        The text of all pages joined with newlines, with leading and
        trailing whitespace stripped. An empty string if there are no pages.

    Raises:
        ImportError: If the optional ``pypdf`` dependency cannot be imported.
    """
    # Imported lazily so the module stays usable without the optional dependency.
    try:
        import pypdf
    except ImportError as err:
        # Chain the original failure so the root cause stays in the traceback.
        raise ImportError(
            "pypdf is required for PDF files. Install with: pip install pypdf"
        ) from err

    reader = pypdf.PdfReader(str(file_path))
    # One join instead of repeated ``+=``; ``or ""`` keeps a page that yields
    # no text (None or empty) from breaking the concatenation.
    return "\n".join(page.extract_text() or "" for page in reader.pages).strip()


def load_docx(file_path: Path) -> str:
    """Extract the paragraph text of a DOCX file.

    Args:
        file_path: Location of the DOCX file.

    Returns:
        The text of each entry in the document's ``paragraphs``, joined with
        newlines. No stripping is applied.

    Raises:
        ImportError: If the optional ``python-docx`` dependency cannot be
            imported.
    """
    # Imported lazily so the module stays usable without the optional dependency.
    try:
        import docx
    except ImportError as err:
        # Chain the original failure so the root cause stays in the traceback.
        raise ImportError(
            "python-docx is required for DOCX files. Install with: pip install python-docx"
        ) from err

    document = docx.Document(str(file_path))
    return "\n".join(para.text for para in document.paragraphs)


def load_txt(file_path: Path) -> str:
    """Read a UTF-8 text file and return its full contents.

    Args:
        file_path: Location of the text file.

    Returns:
        The decoded file contents, without a leading byte-order mark.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading byte-order
    # mark, so the marker does not end up as U+FEFF at the start of the text.
    with open(file_path, encoding="utf-8-sig") as f:
        return f.read()


def load_document(file_path: Path) -> tuple[str | None, dict | None]:
    """Load document (PDF, DOCX, TXT), normalize text, and return (text, metadata).

    Returns (None, None) for unsupported or failed files.
    """
    ext = file_path.suffix.lower()

    try:
        if ext == ".pdf":
            text = load_pdf(file_path)
        elif ext == ".docx":
            text = load_docx(file_path)
        elif ext == ".txt":
            text = load_txt(file_path)
        else:
            return None, None

        text = normalize_text(text)
        if not text:
            return None, None

        metadata = {
            "source_file": str(file_path),
            "file_name": file_path.name,
            "file_type": ext[1:],
        }
        return text, metadata

    except Exception as e:
        print_log(f"        [Error] Failed to load {file_path.name}: {e}")
        return None, None