Spaces:

Abdul2000
/

Ragbase_Studio

Sleeping

App Files Files Community

Ragbase_Studio / src /utils.py

Abdul2000

Rename utils.py to src/utils.py

50c80d3 verified 19 days ago

Raw

History Blame Contribute Delete

4.84 kB

	"""
	utils.py
	--------
	Shared helper functions used across the project.

	Keeping utilities here avoids repeating the same logic in multiple files
	and makes it easier to improve or test each helper independently.
	"""

	import os
	import shutil
	from langchain_core.documents import Document
	from langchain_text_splitters import RecursiveCharacterTextSplitter

	# ── configuration ─────────────────────────────────────────────────────────────

	# Folder where uploaded files are kept: "uploads", one level above the
	# directory containing this file. The ".." segment is left in the string
	# as-is (the path is not normalised here).
	UPLOADS_DIR = os.path.join(os.path.dirname(__file__), "..", "uploads")

	# Chunking settings (passed to the text splitter in split_documents)
	# CHUNK_SIZE    : target number of characters per chunk, measured with len()
	#                 (roughly 150–200 English words – TODO confirm for your corpus)
	# CHUNK_OVERLAP : number of characters shared between neighbouring chunks
	#                 (overlap helps avoid cutting a sentence right at a boundary)
	CHUNK_SIZE = 1000
	CHUNK_OVERLAP = 200


	# ── file helpers ──────────────────────────────────────────────────────────────

	def save_uploaded_file(tmp_path: str) -> str:
	    """
	    Copy a Gradio-uploaded temp file to our persistent uploads/ folder.

	    Gradio saves uploaded files to a temporary location that may be cleaned
	    up between sessions. This function copies the file (contents and
	    metadata, via shutil.copy2) to uploads/ so it stays available. The
	    uploads/ folder is created on demand.

	    If a different file with the same name already exists in uploads/ it is
	    overwritten. If ``tmp_path`` already *is* the file inside uploads/
	    (e.g. a previously saved upload is processed again), nothing is copied
	    and its path is simply returned.

	    Parameters
	    ----------
	    tmp_path : str – the temporary path Gradio gives us

	    Returns
	    -------
	    str – the new permanent path inside uploads/

	    Raises
	    ------
	    OSError – propagated from the copy, e.g. FileNotFoundError when
	              ``tmp_path`` does not exist
	    """
	    os.makedirs(UPLOADS_DIR, exist_ok=True)
	    filename = os.path.basename(tmp_path)
	    dest_path = os.path.join(UPLOADS_DIR, filename)
	    try:
	        shutil.copy2(tmp_path, dest_path)
	    except shutil.SameFileError:
	        # Source and destination are the same file: it is already stored
	        # in uploads/, so there is nothing to copy.
	        pass
	    return dest_path


	def get_file_extension(file_path: str) -> str:
	    """Give back the path's extension, dot included and lowercased (e.g. '.pdf')."""
	    _stem, suffix = os.path.splitext(file_path)
	    return suffix.lower()


	# ── text splitting ────────────────────────────────────────────────────────────

	def split_documents(documents: list[Document]) -> list[Document]:
	    """
	    Break full-length Documents into smaller chunks.

	    Why chunk at all?
	    LLMs can only process a limited number of tokens at once (their
	    context window). Working with small chunks lets the most relevant
	    pieces be placed in the prompt without exceeding that limit.

	    The work is delegated to RecursiveCharacterTextSplitter, configured
	    with CHUNK_SIZE / CHUNK_OVERLAP (both measured with len) and a list of
	    separators tried from coarsest to finest:
	        blank line → newline → ". " → space → single character
	    so that chunks stay readable. A one-line summary of the split is
	    printed to stdout.

	    Parameters
	    ----------
	    documents : list[Document] – full-length documents from the loaders

	    Returns
	    -------
	    list[Document] – the chunks produced by the splitter (per the splitter's
	                     documented behaviour they carry over the source
	                     document's metadata)
	    """
	    # Separators in priority order, coarsest first.
	    boundaries = ["\n\n", "\n", ". ", " ", ""]
	    chunker = RecursiveCharacterTextSplitter(
	        separators=boundaries,
	        length_function=len,
	        chunk_size=CHUNK_SIZE,
	        chunk_overlap=CHUNK_OVERLAP,
	    )
	    pieces = chunker.split_documents(documents)

	    doc_count = len(documents)
	    piece_count = len(pieces)
	    print(f" OK: Split {doc_count} document(s) into {piece_count} chunk(s).")
	    return pieces


	# ── source formatting ─────────────────────────────────────────────────────────

	def format_sources(source_docs: list[Document]) -> str:
	    """
	    Format the retrieved source documents into a readable string for display.

	    Each document becomes a numbered entry (starting at 1) made of:
	      * the base name of its ``source`` metadata value; a missing, None or
	        empty source is shown as "Unknown", and a source whose base name is
	        empty (e.g. one ending in a path separator) is shown in full;
	      * an optional location taken from the first of the metadata keys
	        ``page``, ``row``, ``paragraph`` that is present;
	      * a preview of at most the first 300 characters of the content, with
	        newlines replaced by spaces and " …" appended when the content is
	        longer than 300 characters.

	    Parameters
	    ----------
	    source_docs : list[Document] – source documents returned by the RAG chain

	    Returns
	    -------
	    str – formatted text listing each source and a snippet of its content,
	          or "No sources found." when the list is empty
	    """
	    if not source_docs:
	        return "No sources found."

	    # Metadata keys that pinpoint a location, in priority order, paired with
	    # the label displayed for them. Only the first key present is used.
	    location_labels = (("page", "Page"), ("row", "Row"), ("paragraph", "Para"))

	    lines = []
	    for i, doc in enumerate(source_docs, start=1):
	        # `or` also covers an explicit None / empty value, and str() guards
	        # against non-string sources (os.path.basename would raise on None).
	        source = str(doc.metadata.get("source") or "Unknown")
	        # basename is "" when the source ends with a separator; show the
	        # whole source instead of an empty name in that case.
	        filename = os.path.basename(source) or source

	        # Add optional page / row / paragraph info if available
	        extra = next(
	            (
	                f" – {label} {doc.metadata[key]}"
	                for key, label in location_labels
	                if key in doc.metadata
	            ),
	            "",
	        )

	        # Show a short preview of the chunk text
	        snippet = doc.page_content[:300].strip().replace("\n", " ")
	        if len(doc.page_content) > 300:
	            snippet += " …"

	        lines.append(f"[{i}] {filename}{extra}\n {snippet}\n")

	    return "\n".join(lines)