"""
document_loader.py
------------------
Handles loading and extracting text from different file types.

Supported formats:
  - .txt  (plain text)
  - .pdf  (PDF documents)
  - .csv  (comma-separated values)
  - .docx (Microsoft Word documents)

Each loader returns a list of LangChain Document objects.
A Document has two fields:
  - page_content : the extracted text
  - metadata     : a dict with extra info like the source file name
"""

import os
from langchain_core.documents import Document

# ── helpers ──────────────────────────────────────────────────────────────────

def _make_doc(text: str, source: str) -> Document:
    """Build a Document holding `text`, tagged with the `source` it came from."""
    metadata = {"source": source}
    return Document(page_content=text, metadata=metadata)


# ── per-format loaders ────────────────────────────────────────────────────────

def load_txt(file_path: str) -> list[Document]:
    """
    Load a plain-text file and return it as a single Document.

    The file is decoded as UTF-8. A leading byte-order mark (BOM), which some
    editors write at the start of UTF-8 files, is stripped so that it does
    not leak into the extracted text. Bytes that cannot be decoded are
    dropped instead of raising.

    Parameters
    ----------
    file_path : str
        Path to the text file on disk.

    Returns
    -------
    list[Document]
        A one-element list whose Document holds the whole file content.
    """
    # "utf-8-sig" decodes exactly like "utf-8" but consumes a leading BOM
    # when one is present; files without a BOM are read unchanged.
    with open(file_path, "r", encoding="utf-8-sig", errors="ignore") as f:
        text = f.read()
    return [_make_doc(text, file_path)]


def load_pdf(file_path: str) -> list[Document]:
    """
    Load a PDF file, producing one Document per non-blank page.

    Every Document carries the source path and the 1-based page number in
    its metadata so that an exact page can be cited later. Pages whose
    extracted text is empty or whitespace-only are left out.
    Requires: pypdf
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        raise ImportError("pypdf is required for PDF support. Run: pip install pypdf")

    pages = PdfReader(file_path).pages
    extracted = []
    for number, pdf_page in enumerate(pages, start=1):
        # extract_text() may yield nothing; normalise that to an empty string
        content = pdf_page.extract_text() or ""
        if not content.strip():
            continue                            # blank page: nothing to index
        page_meta = {"source": file_path, "page": number}
        extracted.append(Document(page_content=content, metadata=page_meta))
    return extracted


def load_csv(file_path: str) -> list[Document]:
    """
    Load a CSV file.
    Each row is turned into a readable 'key: value' string and stored as
    one Document so every row is individually searchable.
    Requires: pandas

    Parameters
    ----------
    file_path : str
        Path to the CSV file on disk.

    Returns
    -------
    list[Document]
        One Document per data row. Metadata holds the source path and the
        1-based position of the row in the file (header excluded).
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas is required for CSV support. Run: pip install pandas")

    df = pd.read_csv(file_path)
    columns = list(df.columns)
    documents = []
    # itertuples() keeps each column's own dtype. iterrows() would build one
    # Series per row and upcast mixed numeric columns, so an integer cell in
    # a row that also has a float would be rendered as "1.0" instead of "1".
    # name=None yields plain tuples, which also avoids namedtuple field
    # renaming for column names that are not valid identifiers.
    rows = df.itertuples(index=False, name=None)
    for row_num, values in enumerate(rows, start=1):
        # Build a human-readable string from each row
        row_text = "\n".join(f"{col}: {val}" for col, val in zip(columns, values))
        doc = Document(
            page_content=row_text,
            metadata={"source": file_path, "row": row_num},
        )
        documents.append(doc)
    return documents


def load_docx(file_path: str) -> list[Document]:
    """
    Load a Microsoft Word (.docx) file, one Document per non-empty paragraph.

    Paragraph text is stripped of surrounding whitespace. The metadata of
    each Document records the source path and the paragraph's 1-based
    position in the file (empty paragraphs are counted but not emitted).
    Requires: python-docx
    """
    try:
        from docx import Document as WordDocument
    except ImportError:
        raise ImportError(
            "python-docx is required for DOCX support. Run: pip install python-docx"
        )

    paragraphs = WordDocument(file_path).paragraphs
    # Pair every paragraph's stripped text with its 1-based position
    numbered = (
        (number, paragraph.text.strip())
        for number, paragraph in enumerate(paragraphs, start=1)
    )
    return [
        Document(
            page_content=stripped,
            metadata={"source": file_path, "paragraph": number},
        )
        for number, stripped in numbered
        if stripped                             # drop empty paragraphs
    ]


# ── main entry point ──────────────────────────────────────────────────────────

def load_document(file_path: str) -> list[Document]:
    """
    Pick a loader based on the file extension and run it.

    The extension is matched case-insensitively. On success a short
    confirmation line is printed to stdout.

    Parameters
    ----------
    file_path : str
        Full path to the file on disk.

    Returns
    -------
    list[Document]
        A non-empty list of LangChain Document objects with extracted text.

    Raises
    ------
    FileNotFoundError – if nothing exists at `file_path`.
    ValueError        – if the file type is not supported, or the loader
                        produced no documents.
    Exception         – whatever the selected loader raises on failure.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    _, suffix = os.path.splitext(file_path)
    extension = suffix.lower()

    # Dispatch table: extension -> loader function
    loaders = {
        ".txt":  load_txt,
        ".pdf":  load_pdf,
        ".csv":  load_csv,
        ".docx": load_docx,
    }

    loader = loaders.get(extension)
    if loader is None:
        supported = ', '.join(loaders)
        raise ValueError(
            f"Unsupported file type: '{extension}'. "
            f"Supported types: {supported}"
        )

    documents = loader(file_path)
    if not documents:
        raise ValueError(f"No readable text found in: {file_path}")

    file_name = os.path.basename(file_path)
    print(f"  OK: Loaded {len(documents)} chunk(s) from '{file_name}'")
    return documents