Spaces:

JARVISXIRONMAN
/

StratoPilot

Sleeping

JARVISXIRONMAN commited on Jul 27, 2025

Commit

7c86b28

verified ·

1 Parent(s): 8088d3a

Create utils/rag_utils.py

Files changed (1) hide show

utils/rag_utils.py ADDED Viewed

+import os
+import fitz  # PyMuPDF
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.vectorstores.base import VectorStoreRetriever
+from langchain_core.documents import Document
+CHROMA_PATH = "data/chroma_store"
+def extract_text_from_pdf(pdf_path):
+    text = ""
+    doc = fitz.open(pdf_path)
+    for page in doc:
+        text += page.get_text()
+    return text
+def chunk_text(text, chunk_size=800, chunk_overlap=100):
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
+    )
+    chunks = splitter.create_documents([text])
+    return chunks
+def store_chunks_in_chroma(chunks, persist_path=CHROMA_PATH):
+    os.makedirs(persist_path, exist_ok=True)
+    embeddings = OpenAIEmbeddings()  # Replace with Groq later
+    db = Chroma.from_documents(chunks, embedding=embeddings, persist_directory=persist_path)
+    db.persist()
+    return db
+def load_existing_chroma(persist_path=CHROMA_PATH):
+    embeddings = OpenAIEmbeddings()
+    db = Chroma(persist_directory=persist_path, embedding_function=embeddings)
+    return db
+def process_pdf_for_rag(pdf_file_path):
+    text = extract_text_from_pdf(pdf_file_path)
+    chunks = chunk_text(text)
+    db = store_chunks_in_chroma(chunks)
+    return db