"""Retrieval helper that indexes local PDF files into a persistent Chroma store."""

import logging
import os
from typing import List

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

logger = logging.getLogger(__name__)


class RAGEngine:
    """Similarity search over PDF documents backed by a Chroma vector store.

    On construction the engine either reopens the store persisted in
    ``db_dir`` or, when nothing has been persisted there yet, builds one from
    the PDF files found directly inside ``data_dir``.

    Attributes are plain instance fields:
        data_dir: Directory scanned for ``.pdf`` files.
        db_dir: Directory passed to Chroma as ``persist_directory``.
        embeddings: ``HuggingFaceEmbeddings`` built with "all-MiniLM-L6-v2".
        vector_store: The Chroma store, or ``None`` when nothing was indexed.
    """

    def __init__(self, data_dir: str = "data", db_dir: str = "chroma_db"):
        """Create the embedding model and load or build the vector store.

        Args:
            data_dir: Directory containing the PDF files to index.
            db_dir: Directory where the Chroma store is persisted.
        """
        self.data_dir = data_dir
        self.db_dir = db_dir
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.vector_store = None
        self._initialize_vector_store()

    def _initialize_vector_store(self) -> None:
        """Reopen the persisted store, or index the documents if none exists.

        A directory that exists but is empty is treated as "not indexed yet".
        Checking only for the directory's existence would let a single failed
        or empty first run (which still creates the directory) suppress
        indexing on every later start.
        """
        if os.path.isdir(self.db_dir) and os.listdir(self.db_dir):
            self.vector_store = Chroma(
                persist_directory=self.db_dir,
                embedding_function=self.embeddings,
            )
            return

        os.makedirs(self.db_dir, exist_ok=True)
        self._process_documents()

    def _process_documents(self) -> None:
        """Load every PDF in ``data_dir``, split it into chunks and index them.

        Leaves ``vector_store`` untouched (``None`` on a fresh engine) when
        ``data_dir`` is missing or yields no chunks; ``query`` then returns an
        empty list instead of the engine failing during construction.
        """
        if not os.path.isdir(self.data_dir):
            logger.warning(
                "Data directory %r does not exist; nothing to index.",
                self.data_dir,
            )
            return

        documents = []
        # Sorted so the indexing order does not depend on the filesystem;
        # lower-cased so files named "*.PDF" are picked up as well.
        for file_name in sorted(os.listdir(self.data_dir)):
            if not file_name.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(self.data_dir, file_name)
            documents.extend(PyPDFLoader(file_path).load())

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100,
        )
        chunks = text_splitter.split_documents(documents)
        if not chunks:
            # NOTE(review): building a store from an empty batch is believed to
            # fail on some Chroma versions - confirm; skipping it is safe either way.
            logger.warning(
                "No PDF content found in %r; vector store not created.",
                self.data_dir,
            )
            return

        self.vector_store = Chroma.from_documents(
            documents=chunks,
            embedding=self.embeddings,
            persist_directory=self.db_dir,
        )
        # NOTE(review): newer Chroma releases appear to persist automatically;
        # confirm against the pinned version before removing this call.
        self.vector_store.persist()

    def query(self, text: str, k: int = 3) -> List[Document]:
        """Return the ``k`` documents most similar to ``text``.

        Args:
            text: Query string handed to the store's similarity search.
            k: Number of results requested from the store.

        Returns:
            The matching documents, or an empty list when no store is loaded.
        """
        if self.vector_store is None:
            return []
        return self.vector_store.similarity_search(text, k=k)


# Shared module-level engine; constructing it loads or builds the index at import time.
rag_engine = RAGEngine()