"""
utils.py
--------
Shared helper functions used across the project.

Keeping utilities here avoids repeating the same logic in multiple files
and makes it easier to improve or test each helper independently.
"""

import os
import shutil
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# ── configuration ─────────────────────────────────────────────────────────────

# Persistent upload folder: an "uploads" directory located one level above
# the directory that contains this module.
UPLOADS_DIR = os.path.join(os.path.dirname(__file__), os.pardir, "uploads")

# Text-splitting parameters, both measured in characters (not tokens or words).
#   CHUNK_SIZE    – upper bound on the length of a single chunk
#   CHUNK_OVERLAP – number of characters repeated at the start of the next
#                   chunk, so a sentence cut at a boundary still appears
#                   whole in at least one of the two neighbouring chunks
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200


# ── file helpers ──────────────────────────────────────────────────────────────

def save_uploaded_file(tmp_path: str) -> str:
    """
    Copy a Gradio-uploaded temp file to our persistent uploads/ folder.

    Gradio saves uploaded files to a temporary location that may be cleaned
    up between sessions. This function copies the file to uploads/ so it
    stays available. The uploads/ folder is created on first use.

    The destination keeps the source's base name, so a later upload with the
    same file name overwrites the earlier copy.

    Parameters
    ----------
    tmp_path : str – the temporary path Gradio gives us

    Returns
    -------
    str – the new permanent path inside uploads/

    Raises
    ------
    OSError – if the folder cannot be created or the file cannot be copied
              (e.g. tmp_path does not exist or is not readable)
    """
    os.makedirs(UPLOADS_DIR, exist_ok=True)
    filename = os.path.basename(tmp_path)
    dest_path = os.path.join(UPLOADS_DIR, filename)
    try:
        # copy2 also carries over file metadata such as modification time.
        shutil.copy2(tmp_path, dest_path)
    except shutil.SameFileError:
        # tmp_path already *is* the file inside uploads/ (for example when a
        # previously saved upload is processed again). There is nothing to
        # copy, and the file is where the caller wants it, so this counts
        # as success instead of an error.
        pass
    return dest_path


def get_file_extension(file_path: str) -> str:
    """
    Extract the extension of a path, normalised to lowercase.

    Only the last suffix is considered ('archive.tar.GZ' gives '.gz'), and
    the leading dot is kept. Paths without an extension – including
    dot-files such as '.bashrc' – give an empty string.

    Parameters
    ----------
    file_path : str – file name or path to inspect

    Returns
    -------
    str – the lowercase extension with its dot (e.g. '.pdf'), or ''
    """
    _stem, extension = os.path.splitext(file_path)
    return extension.lower()


# ── text splitting ────────────────────────────────────────────────────────────

def split_documents(documents: list[Document]) -> list[Document]:
    """
    Break full-length Documents into chunks of at most CHUNK_SIZE characters.

    Why chunk at all?
      An LLM can only read a limited number of tokens per request. Working
      with small chunks lets the retriever hand over just the relevant
      passages instead of whole documents that would not fit in the prompt.

    The splitter is given separators ordered from coarsest to finest:
      blank line (paragraph) → line break → ". " (sentence) → space (word)
      → "" (individual characters)
    so a finer separator is only needed where a coarser one cannot produce
    pieces that are small enough. Neighbouring chunks share CHUNK_OVERLAP
    characters. Length is measured with len(), i.e. in characters.

    A one-line summary of the input and output counts is printed to stdout.

    Parameters
    ----------
    documents : list[Document] – full-length documents from the loaders

    Returns
    -------
    list[Document] – the chunks produced by the splitter's split_documents()
    """
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        length_function=len,
        chunk_overlap=CHUNK_OVERLAP,
        chunk_size=CHUNK_SIZE,
    )

    chunks = text_splitter.split_documents(documents)

    # Progress feedback for whoever is watching the console.
    print(f"  OK: Split {len(documents)} document(s) into {len(chunks)} chunk(s).")
    return chunks


# ── source formatting ─────────────────────────────────────────────────────────

def format_sources(source_docs: list[Document]) -> str:
    """
    Format the retrieved source documents into a readable string for display.

    Each document becomes a numbered entry of the form

        [1] report.pdf – Page 3
            first characters of the chunk …

    and entries are separated by a blank line.

    Parameters
    ----------
    source_docs : list[Document] – source documents returned by the RAG chain

    Returns
    -------
    str – formatted text listing each source and a snippet of its content,
          or "No sources found." when source_docs is empty
    """
    if not source_docs:
        return "No sources found."

    # Metadata keys that locate the chunk inside its source, in priority
    # order, paired with the label shown to the user. Only the first key
    # that is present is displayed.
    # NOTE(review): values are shown exactly as stored; some loaders may
    # store 0-based page numbers – confirm whether an offset is wanted.
    location_labels = (("page", "Page"), ("row", "Row"), ("paragraph", "Para"))
    preview_chars = 300

    lines = []
    for i, doc in enumerate(source_docs, start=1):
        # `or` instead of a .get() default: this also covers a "source" key
        # that is present but None or empty, which os.path.basename would
        # otherwise reject (None) or turn into a blank label ('').
        source = str(doc.metadata.get("source") or "Unknown")
        # A source ending in a path separator has an empty base name; show
        # the full source rather than nothing.
        filename = os.path.basename(source) or source

        extra = ""
        for key, label in location_labels:
            if key in doc.metadata:
                extra = f" – {label} {doc.metadata[key]}"
                break

        # One-line preview: collapse every run of whitespace (newlines,
        # carriage returns, tabs, repeated spaces) into a single space.
        snippet = " ".join(doc.page_content[:preview_chars].split())
        if len(doc.page_content) > preview_chars:
            snippet += " …"

        lines.append(f"[{i}] {filename}{extra}\n    {snippet}\n")

    return "\n".join(lines)