Spaces:
Sleeping
Sleeping
| """ | |
| utils.py | |
| -------- | |
| Shared helper functions used across the project. | |
| Keeping utilities here avoids repeating the same logic in multiple files | |
| and makes it easier to improve or test each helper independently. | |
| """ | |
| import os | |
| import shutil | |
| from langchain_core.documents import Document | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| # ── configuration ───────────────────────────────────────────────────────────── | |
| # Persistent uploads folder: "uploads", one level above this module's directory. | |
| UPLOADS_DIR = os.path.join(os.path.dirname(__file__), "..", "uploads") | |
| # Chunking settings | |
| # Chunk size : how many characters per chunk (≈200–300 words) | |
| # Chunk overlap: how many characters are shared between neighbouring chunks | |
| # (overlap helps avoid cutting a sentence right at a boundary) | |
| CHUNK_SIZE = 1000 | |
| CHUNK_OVERLAP = 200 | |
| # ── file helpers ────────────────────────────────────────────────────────────── | |
def save_uploaded_file(tmp_path: str) -> str:
    """
    Copy a Gradio-uploaded temp file to our persistent uploads/ folder.

    Gradio saves uploaded files to a temporary location that may be cleaned
    up between sessions. This function copies the file to uploads/ so it
    stays available. The copy keeps the temp file's base name, so a file
    with the same name that already exists in uploads/ is replaced.

    Parameters
    ----------
    tmp_path : str — the temporary path Gradio gives us

    Returns
    -------
    str — the new permanent path inside uploads/
    """
    os.makedirs(UPLOADS_DIR, exist_ok=True)
    dest_path = os.path.join(UPLOADS_DIR, os.path.basename(tmp_path))
    try:
        shutil.copy2(tmp_path, dest_path)
    except shutil.SameFileError:
        # tmp_path already is the file inside uploads/ (for example a path
        # returned by an earlier call), so there is nothing left to copy.
        pass
    return dest_path
def get_file_extension(file_path: str) -> str:
    """Return the path's extension in lowercase, dot included (e.g. '.pdf').

    A path with no extension yields an empty string.
    """
    _root, extension = os.path.splitext(file_path)
    return extension.lower()
| # ── text splitting ──────────────────────────────────────────────────────────── | |
def split_documents(documents: list[Document]) -> list[Document]:
    """
    Break full-length Documents into smaller, overlapping chunks.

    Why do we split?
    LLMs have a limited context window (maximum number of tokens they can
    process at once). Working with chunks lets us fit the most relevant
    pieces into the prompt without exceeding that limit.

    RecursiveCharacterTextSplitter tries the separators in order, so it
    prefers to cut at natural boundaries:
        paragraph → line → sentence → word → character
    which keeps the chunks readable. Chunk length is measured in characters
    (CHUNK_SIZE), and neighbouring chunks share CHUNK_OVERLAP characters.

    Parameters
    ----------
    documents : list[Document] — full-length documents from the loaders

    Returns
    -------
    list[Document] — smaller chunks, each inheriting the original metadata
    """
    # Coarsest boundary first; the empty string is the last-resort split
    # between individual characters.
    boundaries = ["\n\n", "\n", ". ", " ", ""]
    text_splitter = RecursiveCharacterTextSplitter(
        separators=boundaries,
        length_function=len,
        chunk_overlap=CHUNK_OVERLAP,
        chunk_size=CHUNK_SIZE,
    )

    chunks = text_splitter.split_documents(documents)
    print(f" OK: Split {len(documents)} document(s) into {len(chunks)} chunk(s).")
    return chunks
| # ── source formatting ───────────────────────────────────────────────────────── | |
def format_sources(source_docs: list[Document]) -> str:
    """
    Format the retrieved source documents into a readable string for display.

    Each entry is a numbered header line with the source's base filename and,
    when the metadata carries one, a page / row / paragraph locator, followed
    by a preview of at most 300 characters of the chunk text. Entries are
    separated by a blank line.

    Parameters
    ----------
    source_docs : list[Document] — source documents returned by the RAG chain

    Returns
    -------
    str — formatted text listing each source and a snippet of its content,
          or "No sources found." when the list is empty
    """
    if not source_docs:
        return "No sources found."

    # Written as escapes so this file stays ASCII and the punctuation cannot
    # be garbled by an encoding mix-up.
    em_dash = "\u2014"
    ellipsis_mark = "\u2026"

    # Metadata keys that locate the chunk inside its source, in priority
    # order: the first key present wins.
    locator_labels = (("page", "Page"), ("row", "Row"), ("paragraph", "Para"))
    preview_limit = 300

    entries = []
    for index, doc in enumerate(source_docs, start=1):
        # `or` also covers a "source" key stored as None or "", which
        # os.path.basename() cannot turn into a usable name.
        source = doc.metadata.get("source") or "Unknown"
        filename = os.path.basename(source)

        # Add optional page / row / paragraph info if available
        extra = ""
        for key, label in locator_labels:
            if key in doc.metadata:
                extra = f" {em_dash} {label} {doc.metadata[key]}"
                break

        # Show a short, single-line preview of the chunk text
        text = doc.page_content
        snippet = text[:preview_limit].strip().replace("\n", " ")
        if len(text) > preview_limit:
            snippet += f" {ellipsis_mark}"

        entries.append(f"[{index}] {filename}{extra}\n {snippet}\n")

    return "\n".join(entries)