"""
utils.py
--------
Shared helper functions used across the project.

Keeping utilities here avoids repeating the same logic in multiple files
and makes it easier to improve or test each helper independently.
"""

import os
import shutil

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# ── configuration ─────────────────────────────────────────────────────────────

UPLOADS_DIR = os.path.join(os.path.dirname(__file__), "..", "uploads")

# Chunking settings
#   Chunk size   : how many characters per chunk (≈200–300 words)
#   Chunk overlap: how many characters are shared between neighbouring chunks
#                  (overlap helps avoid cutting a sentence right at a boundary)
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Maximum number of characters of chunk text shown per source in format_sources().
_SNIPPET_CHARS = 300

# Metadata keys that pinpoint a location inside a source, in priority order,
# paired with the label shown to the user. Only the first key present is used.
_LOCATION_LABELS = (
    ("page", "Page"),
    ("row", "Row"),
    ("paragraph", "Para"),
)


# ── file helpers ──────────────────────────────────────────────────────────────

def save_uploaded_file(tmp_path: str) -> str:
    """
    Copy a Gradio-uploaded temp file to our persistent uploads/ folder.

    Gradio saves uploaded files to a temporary location that may be cleaned up
    between sessions. This function copies the file to uploads/ so it stays
    available. The uploads/ folder is created if it does not exist, and an
    existing file with the same name is overwritten.

    Parameters
    ----------
    tmp_path : str – the temporary path Gradio gives us

    Returns
    -------
    str – the new permanent path inside uploads/

    Raises
    ------
    OSError – propagated from the copy if the source cannot be read or the
              destination cannot be written (e.g. FileNotFoundError)
    """
    os.makedirs(UPLOADS_DIR, exist_ok=True)
    dest_path = os.path.join(UPLOADS_DIR, os.path.basename(tmp_path))

    try:
        shutil.copy2(tmp_path, dest_path)
    except shutil.SameFileError:
        # The given path already *is* the file inside uploads/ (e.g. a
        # previously saved path was passed in again). Nothing to copy.
        pass

    return dest_path


def get_file_extension(file_path: str) -> str:
    """
    Return the lowercase file extension including the dot, e.g. '.pdf'.

    Parameters
    ----------
    file_path : str – a file name or path

    Returns
    -------
    str – the extension in lowercase, or '' when the name has no extension
    """
    _, extension = os.path.splitext(file_path)
    return extension.lower()


# ── text splitting ────────────────────────────────────────────────────────────

def split_documents(documents: list[Document]) -> list[Document]:
    """
    Split a list of Documents into smaller chunks.

    Why do we split?
    LLMs have a limited context window (maximum number of tokens they can
    process at once). Splitting ensures we can always fit the most relevant
    pieces into the prompt without exceeding the limit.

    RecursiveCharacterTextSplitter tries to split at natural boundaries:
        paragraph → sentence → word → character
    so that chunks remain readable.

    Parameters
    ----------
    documents : list[Document] – full-length documents from the loaders

    Returns
    -------
    list[Document] – smaller chunks, each inheriting the original metadata
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,  # sizes above are measured in characters
        # Tried in order: paragraph, line, sentence, word, single character.
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    chunks = splitter.split_documents(documents)
    print(f" OK: Split {len(documents)} document(s) into {len(chunks)} chunk(s).")
    return chunks


# ── source formatting ─────────────────────────────────────────────────────────

def format_sources(source_docs: list[Document]) -> str:
    """
    Format the retrieved source documents into a readable string for display.

    Each entry shows a running number, the source's file name, an optional
    location (page, row or paragraph – the first one present in the metadata)
    and a preview of at most 300 characters of the chunk text.

    Parameters
    ----------
    source_docs : list[Document] – source documents returned by the RAG chain

    Returns
    -------
    str – formatted text listing each source and a snippet of its content,
          or "No sources found." when the list is empty
    """
    if not source_docs:
        return "No sources found."

    lines = []
    for i, doc in enumerate(source_docs, start=1):
        # A missing, None or empty "source" must not crash basename().
        source = str(doc.metadata.get("source") or "Unknown")
        # basename() is '' for sources ending in a separator (e.g. a URL with
        # a trailing slash); show the full source rather than a blank label.
        filename = os.path.basename(source) or source

        # Add optional page / row / paragraph info if available
        extra = ""
        for key, label in _LOCATION_LABELS:
            if key in doc.metadata:
                extra = f" – {label} {doc.metadata[key]}"
                break

        # Show a short preview of the chunk text
        snippet = doc.page_content[:_SNIPPET_CHARS].strip().replace("\n", " ")
        if len(doc.page_content) > _SNIPPET_CHARS:
            snippet += " …"

        lines.append(f"[{i}] {filename}{extra}\n {snippet}\n")

    return "\n".join(lines)