Ragbase_Studio / src /utils.py
Abdul2000's picture
Rename utils.py to src/utils.py
50c80d3 verified
Raw
History Blame Contribute Delete
4.84 kB
"""
utils.py
--------
Shared helper functions used across the project.
Keeping utilities here avoids repeating the same logic in multiple files
and makes it easier to improve or test each helper independently.
"""
import os
import shutil
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
# ── configuration ─────────────────────────────────────────────────────────────
# Persistent upload folder: an "uploads" directory one level above the
# directory that contains this module.
UPLOADS_DIR = os.path.join(os.path.dirname(__file__), "..", "uploads")
# Chunking settings
# Chunk size   : how many characters per chunk (≈200–300 words)
# Chunk overlap: how many characters are shared between neighbouring chunks
#                (overlap helps avoid cutting a sentence right at a boundary)
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
# ── file helpers ──────────────────────────────────────────────────────────────
def save_uploaded_file(tmp_path: str) -> str:
    """
    Copy a Gradio-uploaded temp file to our persistent uploads/ folder.

    Gradio saves uploaded files to a temporary location that may be cleaned
    up between sessions. This function copies the file to uploads/ so it
    stays available. The uploads/ folder is created on demand. A file that
    already exists there under the same name is overwritten.

    Parameters
    ----------
    tmp_path : str – the temporary path Gradio gives us

    Returns
    -------
    str – the new permanent path inside uploads/

    Raises
    ------
    OSError – if the source cannot be read or the destination cannot be
              written (for example, ``tmp_path`` does not exist)
    """
    os.makedirs(UPLOADS_DIR, exist_ok=True)
    dest_path = os.path.join(UPLOADS_DIR, os.path.basename(tmp_path))
    try:
        shutil.copy2(tmp_path, dest_path)
    except shutil.SameFileError:
        # tmp_path already is the file inside uploads/ (e.g. the caller is
        # re-saving a previously stored upload). There is nothing to copy,
        # so just hand back the path instead of crashing.
        pass
    return dest_path
def get_file_extension(file_path: str) -> str:
    """
    Return the lowercase file extension including the dot, e.g. '.pdf'.

    A path with no extension yields an empty string.
    """
    _stem, extension = os.path.splitext(file_path)
    return extension.lower()
# ── text splitting ────────────────────────────────────────────────────────────
def split_documents(documents: list[Document]) -> list[Document]:
    """
    Break full-length Documents into smaller, overlapping chunks.

    Why do we split?
        LLMs have a limited context window (maximum number of tokens they
        can process at once). Splitting ensures we can always fit the most
        relevant pieces into the prompt without exceeding the limit.

    RecursiveCharacterTextSplitter tries to split at natural boundaries,
    coarsest first:
        paragraph → line → sentence → word → character
    so that chunks remain readable. Chunk length is measured in characters
    and governed by CHUNK_SIZE / CHUNK_OVERLAP.

    A one-line summary of the split is printed to stdout.

    Parameters
    ----------
    documents : list[Document] – full-length documents from the loaders

    Returns
    -------
    list[Document] – smaller chunks, each inheriting the original metadata
    """
    # Tried in order; the empty string is the last-resort per-character split.
    boundary_order = ["\n\n", "\n", ". ", " ", ""]
    chunker = RecursiveCharacterTextSplitter(
        separators=boundary_order,
        length_function=len,
        chunk_overlap=CHUNK_OVERLAP,
        chunk_size=CHUNK_SIZE,
    )
    chunks = chunker.split_documents(documents)
    print(f" OK: Split {len(documents)} document(s) into {len(chunks)} chunk(s).")
    return chunks
# ── source formatting ─────────────────────────────────────────────────────────
def format_sources(source_docs: list[Document]) -> str:
    """
    Render retrieved source documents as a numbered, human-readable list.

    Each entry shows the source's file name, an optional locator (page, row
    or paragraph – whichever is found first, in that order), and a preview
    of at most 300 characters of the chunk text with newlines flattened to
    spaces. Previews of longer chunks end with an ellipsis.

    Parameters
    ----------
    source_docs : list[Document] – source documents returned by the RAG chain

    Returns
    -------
    str – formatted text listing each source and a snippet of its content,
          or "No sources found." when the list is empty
    """
    if not source_docs:
        return "No sources found."

    # (metadata key, display label) in priority order; only the first key
    # present in a document's metadata is shown.
    locators = (("page", "Page"), ("row", "Row"), ("paragraph", "Para"))
    preview_limit = 300

    entries = []
    for number, doc in enumerate(source_docs, start=1):
        meta = doc.metadata
        name = os.path.basename(meta.get("source", "Unknown"))

        location = ""
        for key, label in locators:
            if key in meta:
                location = f" – {label} {meta[key]}"
                break

        # Truncate first, then tidy, so the ellipsis decision is based on
        # the raw chunk length rather than the cleaned-up preview.
        text = doc.page_content
        preview = text[:preview_limit].strip().replace("\n", " ")
        if len(text) > preview_limit:
            preview += " …"

        entries.append(f"[{number}] {name}{location}\n {preview}\n")

    return "\n".join(entries)