# UBA_AI_Support/rag.py
import os
import glob
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
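# Note: the imports above additionally require the pypdf, sentence-transformers,
# and chromadb packages to be installed alongside the langchain-* distributions.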
# Configuration
DATA_DIR = "./data"
CHROMA_DIR = "./chroma_db"
EMBEDDING_MODEL = "all-MiniLM-L6-v2"


class RAGSystem:
    def __init__(self):
        self.embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        self.vectorstore = None
        self._initialize_db()

    def _initialize_db(self):
        """Build a new Chroma DB from the PDFs in DATA_DIR, or load the existing one."""
        if not os.path.exists(CHROMA_DIR) or not os.listdir(CHROMA_DIR):
            print("Initializing new vector database from PDFs...")
            pdf_files = glob.glob(os.path.join(DATA_DIR, "*.pdf"))
            all_docs = []
            for pdf in pdf_files:
                try:
                    loader = PyPDFLoader(pdf)
                    docs = loader.load()
                    all_docs.extend(docs)
                except Exception as e:
                    print(f"Error loading {pdf}: {e}")
            if all_docs:
                splits = self.text_splitter.split_documents(all_docs)
                self.vectorstore = Chroma.from_documents(
                    documents=splits,
                    embedding=self.embeddings,
                    persist_directory=CHROMA_DIR,
                )
                print(f"Indexed {len(splits)} chunks.")
            else:
                # No PDFs were loaded; fall back to an empty persistent store.
                print(f"No documents found in {DATA_DIR}; starting with an empty database.")
                self.vectorstore = Chroma(persist_directory=CHROMA_DIR, embedding_function=self.embeddings)
        else:
            print("Loading existing vector database...")
            self.vectorstore = Chroma(persist_directory=CHROMA_DIR, embedding_function=self.embeddings)

    def query(self, text: str, k: int = 5) -> str:
        """Return the k most similar chunks, joined into a single context string."""
        if self.vectorstore is None:
            return ""
        results = self.vectorstore.similarity_search(text, k=k)
        return "\n\n".join(doc.page_content for doc in results)


# Singleton instance
rag_system = RAGSystem()
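

# A minimal smoke test, assuming a few PDFs have been dropped into ./data.
# Running `python rag.py` directly builds or loads the index via the singleton
# above, then prints the retrieved context for a sample question (the question
# text is illustrative only, not taken from the app).
if __name__ == "__main__":
    sample_question = "How do I reset my internet banking password?"
    context = rag_system.query(sample_question, k=3)
    print(context if context else "No context retrieved (is ./data populated?)")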