# DSTV_AI_Support / rag_engine.py
# alaselababatunde's picture
# Updated
# e8d69f7
import logging
import os
from typing import List

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
class RAGEngine:
    """Retrieval engine backed by a persisted Chroma vector store.

    On construction the engine loads the store persisted under ``db_dir``.
    When no persisted store is found there, it builds one from the PDF files
    in ``data_dir``. If there is nothing to index, ``vector_store`` stays
    ``None`` and :meth:`query` returns an empty list.
    """

    def __init__(self, data_dir: str = "data", db_dir: str = "chroma_db"):
        """Create the embedding model and load or build the vector store.

        Args:
            data_dir: Directory scanned for ``.pdf`` files to index.
            db_dir: Directory in which the Chroma store is persisted.
        """
        self.data_dir = data_dir
        self.db_dir = db_dir
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.vector_store = None
        self._initialize_vector_store()

    def _has_persisted_store(self) -> bool:
        """Return True when ``db_dir`` is a directory with at least one entry.

        An empty directory is treated as "no store": it is what an earlier,
        failed build leaves behind, and loading it would silently skip
        indexing forever.
        """
        return os.path.isdir(self.db_dir) and bool(os.listdir(self.db_dir))

    def _initialize_vector_store(self):
        """Load the persisted store if present, otherwise build it."""
        if self._has_persisted_store():
            self.vector_store = Chroma(
                persist_directory=self.db_dir,
                embedding_function=self.embeddings,
            )
        else:
            self._process_documents()

    def _process_documents(self):
        """Index every PDF in ``data_dir`` into a new persisted store.

        Leaves ``vector_store`` untouched (and does not create ``db_dir``)
        when ``data_dir`` is missing or yields no content, so that a later
        run can retry the build once documents are available.
        """
        log = logging.getLogger(__name__)

        if not os.path.isdir(self.data_dir):
            log.warning(
                "Data directory %r not found; no documents indexed.",
                self.data_dir,
            )
            return

        documents = []
        # Sorted for a reproducible indexing order; the extension check is
        # case-insensitive so files named "*.PDF" are not silently skipped.
        for name in sorted(os.listdir(self.data_dir)):
            if name.lower().endswith(".pdf"):
                loader = PyPDFLoader(os.path.join(self.data_dir, name))
                documents.extend(loader.load())

        if not documents:
            log.warning(
                "No PDF content found in %r; no documents indexed.",
                self.data_dir,
            )
            return

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=100
        )
        chunks = text_splitter.split_documents(documents)
        if not chunks:
            log.warning(
                "Documents in %r produced no text chunks; nothing indexed.",
                self.data_dir,
            )
            return

        # Create the directory only once there is something to persist, so a
        # failed or empty build never leaves a placeholder behind.
        os.makedirs(self.db_dir, exist_ok=True)
        self.vector_store = Chroma.from_documents(
            documents=chunks,
            embedding=self.embeddings,
            persist_directory=self.db_dir,
        )
        # NOTE(review): newer Chroma releases reportedly persist automatically
        # and deprecate persist() - confirm against the pinned version.
        self.vector_store.persist()

    def query(self, text: str, k: int = 3) -> List["Document"]:
        """Return up to ``k`` documents most similar to ``text``.

        Args:
            text: Query string passed to the store's similarity search.
            k: Number of results requested from the store.

        Returns:
            The store's similarity-search results, or an empty list when no
            usable vector store is available.
        """
        if not self.vector_store:
            return []
        return self.vector_store.similarity_search(text, k=k)
# Module-level engine instance built with the default "data" and "chroma_db"
# directories. Constructing it here means importing this module instantiates
# the embedding model and initializes the vector store.
rag_engine = RAGEngine()