Spaces:
Runtime error
Runtime error
| import os | |
| from typing import List | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import Chroma | |
| from langchain.schema import Document | |
class RAGEngine:
    """Similarity search over the PDF files in ``data_dir``.

    Chunks are embedded with the ``all-MiniLM-L6-v2`` HuggingFace model and
    kept in a Chroma store persisted under ``db_dir``.  If ``db_dir`` already
    holds files, that store is opened; otherwise the index is built from the
    PDFs.  When there is nothing to index, ``vector_store`` stays ``None`` and
    ``query`` returns an empty list.
    """

    def __init__(self, data_dir: str = "data", db_dir: str = "chroma_db"):
        """Create the embedding model and load or build the vector store.

        Args:
            data_dir: Directory scanned (non-recursively) for ``.pdf`` files.
            db_dir: Directory where the Chroma store is persisted.
        """
        self.data_dir = data_dir
        self.db_dir = db_dir
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.vector_store = None
        self._initialize_vector_store()

    def _initialize_vector_store(self) -> None:
        """Open the persisted store if one exists, otherwise build it."""
        # An existing but empty directory is what a failed earlier build
        # leaves behind; treat it like a missing one so indexing is retried
        # instead of silently serving an empty store forever.
        has_persisted_store = os.path.isdir(self.db_dir) and bool(
            os.listdir(self.db_dir)
        )
        if has_persisted_store:
            self.vector_store = Chroma(
                persist_directory=self.db_dir,
                embedding_function=self.embeddings,
            )
        else:
            self._process_documents()

    def _process_documents(self) -> None:
        """Load every PDF in ``data_dir``, chunk it and index it into Chroma.

        Leaves ``vector_store`` untouched (and does not create ``db_dir``)
        when ``data_dir`` is missing or yields no chunks, so a later start can
        retry once documents are available.
        """
        # Imported locally: the module's import block does not provide it.
        import logging

        log = logging.getLogger(__name__)

        if not os.path.isdir(self.data_dir):
            log.warning(
                "Data directory %r does not exist; nothing indexed.",
                self.data_dir,
            )
            return

        # Sorted for a deterministic indexing order; the extension check is
        # case-insensitive so "REPORT.PDF" is picked up as well.
        pdf_paths = [
            os.path.join(self.data_dir, name)
            for name in sorted(os.listdir(self.data_dir))
            if name.lower().endswith(".pdf")
        ]

        documents = []
        for pdf_path in pdf_paths:
            documents.extend(PyPDFLoader(pdf_path).load())

        if not documents:
            log.warning(
                "No PDF content found in %r; nothing indexed.", self.data_dir
            )
            return

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=100
        )
        chunks = text_splitter.split_documents(documents)
        if not chunks:
            # Avoid handing Chroma an empty batch.
            log.warning(
                "PDFs in %r produced no text chunks; nothing indexed.",
                self.data_dir,
            )
            return

        # Create the directory only once there is something to store, so a
        # failure above never leaves an empty directory behind.
        os.makedirs(self.db_dir, exist_ok=True)
        self.vector_store = Chroma.from_documents(
            documents=chunks,
            embedding=self.embeddings,
            persist_directory=self.db_dir,
        )

        # NOTE(review): persist() is reported as deprecated/removed in newer
        # Chroma wrappers (auto-persistence) - confirm against the pinned
        # version.  Call it only where the wrapper still provides it.
        persist = getattr(self.vector_store, "persist", None)
        if callable(persist):
            persist()

    def query(self, text: str, k: int = 3) -> "List[Document]":
        """Run a similarity search for ``text``.

        Args:
            text: Query string passed to the vector store.
            k: Number of results requested from the vector store.

        Returns:
            Whatever ``similarity_search(text, k=k)`` returns, or an empty
            list when the vector store is falsy (e.g. nothing was indexed).
        """
        if not self.vector_store:
            return []
        return self.vector_store.similarity_search(text, k=k)
# Shared module-level engine. Constructing it runs RAGEngine.__init__, which
# loads or builds the vector store as soon as this module is imported.
rag_engine = RAGEngine()