Spaces:
Sleeping
Sleeping
File size: 1,452 Bytes
7c86b28 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
import os
import fitz # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.base import VectorStoreRetriever
from langchain_core.documents import Document
CHROMA_PATH = "data/chroma_store"
def extract_text_from_pdf(pdf_path):
text = ""
doc = fitz.open(pdf_path)
for page in doc:
text += page.get_text()
return text
def chunk_text(text, chunk_size=800, chunk_overlap=100):
splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
)
chunks = splitter.create_documents([text])
return chunks
def store_chunks_in_chroma(chunks, persist_path=CHROMA_PATH):
os.makedirs(persist_path, exist_ok=True)
embeddings = OpenAIEmbeddings() # Replace with Groq later
db = Chroma.from_documents(chunks, embedding=embeddings, persist_directory=persist_path)
db.persist()
return db
def load_existing_chroma(persist_path=CHROMA_PATH):
embeddings = OpenAIEmbeddings()
db = Chroma(persist_directory=persist_path, embedding_function=embeddings)
return db
def process_pdf_for_rag(pdf_file_path):
text = extract_text_from_pdf(pdf_file_path)
chunks = chunk_text(text)
db = store_chunks_in_chroma(chunks)
return db
|