JARVISXIRONMAN committed on
Commit
7c86b28
·
verified ·
1 Parent(s): 8088d3a

Create utils/rag_utils.py

Browse files
Files changed (1) hide show
  1. utils/rag_utils.py +43 -0
utils/rag_utils.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import fitz # PyMuPDF
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain_community.vectorstores import Chroma
5
+ from langchain.embeddings import OpenAIEmbeddings
6
+ from langchain.vectorstores.base import VectorStoreRetriever
7
+ from langchain_core.documents import Document
8
+
9
# Default directory for the Chroma vector store; used as the default
# `persist_path` of store_chunks_in_chroma() and load_existing_chroma().
CHROMA_PATH = "data/chroma_store"
10
+
11
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``fitz.open``.

    Returns:
        One string containing the ``get_text()`` output of each page,
        joined in page order with no extra separator.
    """
    # Use the document as a context manager so its file handle is released
    # even when extraction fails part-way through; the previous version
    # never closed the document.
    with fitz.open(pdf_path) as doc:
        # join() builds the result in one pass instead of re-copying the
        # accumulated string on every page as `+=` does.
        return "".join(page.get_text() for page in doc)
17
+
18
def chunk_text(text, chunk_size=800, chunk_overlap=100):
    """Split ``text`` into documents with a RecursiveCharacterTextSplitter.

    Args:
        text: The full text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``create_documents`` returns for the single input text.
    """
    # Separator list handed to the splitter, in its original order.
    separator_order = ["\n\n", "\n", ".", "!", "?", ",", " ", ""]
    text_splitter = RecursiveCharacterTextSplitter(
        separators=separator_order,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    return text_splitter.create_documents([text])
26
+
27
def store_chunks_in_chroma(chunks, persist_path=CHROMA_PATH):
    """Embed ``chunks`` into a Chroma store located at ``persist_path``.

    Args:
        chunks: Documents to index, as produced by ``chunk_text``.
        persist_path: Directory for the store; created if it is missing.

    Returns:
        The Chroma instance built from the chunks, after ``persist()`` has
        been called on it.
    """
    os.makedirs(persist_path, exist_ok=True)
    vector_store = Chroma.from_documents(
        chunks,
        embedding=OpenAIEmbeddings(),  # Replace with Groq later
        persist_directory=persist_path,
    )
    vector_store.persist()
    return vector_store
33
+
34
def load_existing_chroma(persist_path=CHROMA_PATH):
    """Open the Chroma store at ``persist_path`` with OpenAI embeddings.

    Args:
        persist_path: Directory passed to Chroma as ``persist_directory``.

    Returns:
        A Chroma instance bound to that directory.
    """
    return Chroma(
        embedding_function=OpenAIEmbeddings(),
        persist_directory=persist_path,
    )
38
+
39
def process_pdf_for_rag(pdf_file_path):
    """Run the PDF -> text -> chunks -> Chroma pipeline for one file.

    Args:
        pdf_file_path: Path of the PDF to ingest.

    Returns:
        The Chroma store returned by ``store_chunks_in_chroma`` (which uses
        its default persist path here).
    """
    raw_text = extract_text_from_pdf(pdf_file_path)
    documents = chunk_text(raw_text)
    return store_chunks_in_chroma(documents)