# NOTE(review): removed Hugging Face Spaces page residue ("Spaces:" / "Sleeping")
# that was scraped into the source and is not part of this Python module.
import os
from typing import List

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
class PrepareVectorDB:
    """Load PDF documents, chunk them, and persist a Chroma VectorDB
    built with OpenAI embeddings.

    Typical flow: instantiate with the source and persistence locations,
    then call :meth:`prepare_and_save_vectordb`.
    """

    def __init__(
        self,
        data_directory: str,
        persist_directory: str,
        embedding_model_engine: str,
        chunk_size: int,
        chunk_overlap: int,
    ) -> None:
        """Initialize the PrepareVectorDB instance.

        Args:
            data_directory: Either a directory path containing PDF files,
                or a list of individual PDF file paths (uploaded documents).
            persist_directory: Directory where the Chroma DB is saved.
            embedding_model_engine: Name of the OpenAI embedding model
                (e.g. ``"text-embedding-ada-002"``).
            chunk_size: Maximum characters per chunk.
            chunk_overlap: Characters of overlap between adjacent chunks.
        """
        self.embedding_model_engine = embedding_model_engine
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""],
        )
        self.data_directory = data_directory
        self.persist_directory = persist_directory
        # BUG FIX: the configured engine was stored but never used —
        # OpenAIEmbeddings() silently fell back to its default model.
        # Pass it through so the caller's choice actually takes effect.
        self.embedding = OpenAIEmbeddings(model=self.embedding_model_engine)

    def __load_all_documents(self) -> List:
        """Load all PDF documents from the configured source.

        Returns:
            A list of LangChain ``Document`` pages (one entry per PDF page,
            as produced by ``PyPDFLoader.load()``).
        """
        doc_counter = 0
        docs: List = []
        if isinstance(self.data_directory, list):
            # Uploaded mode: data_directory is a list of explicit file paths.
            print("Loading the uploaded documents...")
            for doc_path in self.data_directory:
                docs.extend(PyPDFLoader(doc_path).load())
                doc_counter += 1
        else:
            # Directory mode: scan a folder for PDF files.
            print("Loading documents manually...")
            if not os.path.exists(self.data_directory):
                # Create the directory rather than crashing on a fresh setup.
                os.makedirs(self.data_directory)
                print(f"Created missing directory: {self.data_directory}")
            for doc_name in os.listdir(self.data_directory):
                # BUG FIX: skip non-PDF entries (e.g. .DS_Store, subfolders)
                # that would make PyPDFLoader raise.
                if not doc_name.lower().endswith(".pdf"):
                    continue
                docs.extend(
                    PyPDFLoader(os.path.join(self.data_directory, doc_name)).load()
                )
                doc_counter += 1
        print("Number of loaded documents:", doc_counter)
        print("Number of pages:", len(docs), "\n\n")
        return docs

    def __chunk_documents(self, docs: List) -> List:
        """Split loaded documents into overlapping text chunks.

        Args:
            docs: Documents returned by :meth:`__load_all_documents`.

        Returns:
            The chunked documents produced by the text splitter.
        """
        print("Chunking documents...")
        chunked_documents = self.text_splitter.split_documents(docs)
        print("Number of chunks:", len(chunked_documents), "\n\n")
        return chunked_documents

    def prepare_and_save_vectordb(self):
        """Load, chunk, embed, and persist the documents as a Chroma DB.

        Returns:
            The populated ``Chroma`` vector store, persisted to
            ``self.persist_directory``.
        """
        docs = self.__load_all_documents()
        chunked_documents = self.__chunk_documents(docs)
        print("Preparing vectordb...")
        vectordb = Chroma.from_documents(
            documents=chunked_documents,
            embedding=self.embedding,
            persist_directory=self.persist_directory,
        )
        print("VectorDB is created and saved.")
        # _collection is a private Chroma attribute; used here only for a
        # diagnostic count of stored vectors.
        print("Number of vectors in vectordb:", vectordb._collection.count(), "\n\n")
        return vectordb