# StratoPilot / utils/rag_utils.py
# JARVISXIRONMAN's picture
# Create utils/rag_utils.py
# 7c86b28 verified
import os
import fitz # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.base import VectorStoreRetriever
from langchain_core.documents import Document
# Default on-disk directory for the persisted Chroma vector store; used as the
# default ``persist_path`` argument by the store/load helpers in this module.
CHROMA_PATH = "data/chroma_store"
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``fitz.open``.

    Returns:
        One string holding each page's ``get_text()`` output, joined in page
        order with no separator (an empty string if there are no pages).
    """
    # Open the document as a context manager so it is closed (and its file
    # handle released) even when extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # join() avoids repeatedly re-allocating a growing string per page.
        return "".join(page.get_text() for page in doc)
def chunk_text(text, chunk_size=800, chunk_overlap=100):
    """Split a block of text into overlapping document chunks.

    Args:
        text: The raw text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.create_documents`` yields
        for the single input text.
    """
    # Separator list handed to the splitter, listed from the coarsest
    # boundary (paragraph break) down to the finest (empty string).
    boundaries = ["\n\n", "\n", ".", "!", "?", ",", " ", ""]
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=boundaries,
    ).create_documents([text])
def store_chunks_in_chroma(chunks, persist_path=CHROMA_PATH):
    """Embed the given chunks and save them in a Chroma store.

    Args:
        chunks: Documents to index, passed to ``Chroma.from_documents``.
        persist_path: Directory for the store; created if it is missing.

    Returns:
        The Chroma instance built from ``chunks``, after ``persist()`` has
        been called on it.
    """
    # Make sure the target directory exists before Chroma is pointed at it.
    os.makedirs(persist_path, exist_ok=True)
    store = Chroma.from_documents(
        chunks,
        embedding=OpenAIEmbeddings(),  # Replace with Groq later
        persist_directory=persist_path,
    )
    store.persist()
    return store
def load_existing_chroma(persist_path=CHROMA_PATH):
    """Open a Chroma store backed by an existing persistence directory.

    Args:
        persist_path: Directory handed to Chroma as ``persist_directory``.

    Returns:
        A Chroma instance configured with ``OpenAIEmbeddings`` as its
        embedding function.
    """
    return Chroma(
        persist_directory=persist_path,
        embedding_function=OpenAIEmbeddings(),
    )
def process_pdf_for_rag(pdf_file_path):
    """Run the full PDF pipeline: extract text, chunk it, store the chunks.

    Args:
        pdf_file_path: Path of the PDF to ingest.

    Returns:
        The value returned by ``store_chunks_in_chroma`` for the chunks
        derived from the PDF (using that function's default persist path).
    """
    raw_text = extract_text_from_pdf(pdf_file_path)
    documents = chunk_text(raw_text)
    return store_chunks_in_chroma(documents)