Spaces:

JARVISXIRONMAN
/

StratoPilot

Sleeping

StratoPilot / utils /rag_utils.py

Create utils/rag_utils.py

7c86b28 verified 6 months ago

1.45 kB

	import os
	import fitz # PyMuPDF
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.vectorstores import Chroma
	from langchain.embeddings import OpenAIEmbeddings
	from langchain.vectorstores.base import VectorStoreRetriever
	from langchain_core.documents import Document

	# Default directory for the persisted Chroma vector store; used as the
	# default `persist_path` by store_chunks_in_chroma and load_existing_chroma.
	CHROMA_PATH = "data/chroma_store"

	def extract_text_from_pdf(pdf_path):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        pdf_path: Location of the PDF, passed straight to ``fitz.open``.

	    Returns:
	        A single string made of each page's ``get_text()`` output, with no
	        separator inserted between pages.
	    """
	    # The context manager closes the document (and its file handle) even if
	    # text extraction fails part-way through.
	    with fitz.open(pdf_path) as doc:
	        # join() avoids rebuilding the accumulated string once per page.
	        return "".join(page.get_text() for page in doc)

	def chunk_text(text, chunk_size=800, chunk_overlap=100):
	    """Split ``text`` into chunks using a RecursiveCharacterTextSplitter.

	    Args:
	        text: The string to split.
	        chunk_size: Forwarded to the splitter as ``chunk_size``.
	        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

	    Returns:
	        The result of ``create_documents`` called on a one-element list
	        containing ``text``.
	    """
	    # Separator list runs from paragraph breaks down to the empty string.
	    split_points = ["\n\n", "\n", ".", "!", "?", ",", " ", ""]
	    text_splitter = RecursiveCharacterTextSplitter(
	        separators=split_points,
	        chunk_overlap=chunk_overlap,
	        chunk_size=chunk_size,
	    )
	    return text_splitter.create_documents([text])

	def store_chunks_in_chroma(chunks, persist_path=CHROMA_PATH):
	    """Embed ``chunks`` and write them to a Chroma store at ``persist_path``.

	    Args:
	        chunks: Documents handed to ``Chroma.from_documents``.
	        persist_path: Directory for the store; created if it does not exist.

	    Returns:
	        The Chroma instance built from ``chunks``.
	    """
	    os.makedirs(persist_path, exist_ok=True)
	    embeddings = OpenAIEmbeddings()  # Replace with Groq later
	    db = Chroma.from_documents(
	        chunks,
	        embedding=embeddings,
	        persist_directory=persist_path,
	    )
	    # Explicit persist() is deprecated in LangChain's Chroma wrapper (Chroma
	    # 0.4+ writes to disk automatically) and is absent from newer wrappers,
	    # so call it only when the returned object still provides it.
	    persist = getattr(db, "persist", None)
	    if callable(persist):
	        persist()
	    return db

	def load_existing_chroma(persist_path=CHROMA_PATH):
	    """Open the Chroma store at ``persist_path`` with OpenAI embeddings.

	    Args:
	        persist_path: Directory passed to Chroma as ``persist_directory``.

	    Returns:
	        A Chroma instance configured with a fresh ``OpenAIEmbeddings``
	        object as its embedding function.
	    """
	    return Chroma(
	        persist_directory=persist_path,
	        embedding_function=OpenAIEmbeddings(),
	    )

	def process_pdf_for_rag(pdf_file_path):
	    """Extract, chunk, and store a PDF's text in the default Chroma store.

	    Args:
	        pdf_file_path: Location of the PDF, handed to extract_text_from_pdf.

	    Returns:
	        The object returned by store_chunks_in_chroma for the PDF's chunks.
	    """
	    # Pipeline: PDF -> raw text -> chunks (default sizes) -> Chroma store.
	    return store_chunks_in_chroma(
	        chunk_text(extract_text_from_pdf(pdf_file_path))
	    )