import os
from typing import List, Union

from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings


class PrepareVectorDB:
    """Prepare and persist a Chroma VectorDB from PDF documents using OpenAI embeddings.

    Loads PDFs (either from a directory on disk or from an explicit list of
    file paths), splits them into overlapping chunks, embeds the chunks with
    OpenAI, and saves the resulting Chroma collection to ``persist_directory``.
    """

    def __init__(
        self,
        data_directory: Union[str, List[str]],
        persist_directory: str,
        embedding_model_engine: str,
        chunk_size: int,
        chunk_overlap: int,
    ) -> None:
        """Initialize the PrepareVectorDB instance.

        Args:
            data_directory: Either a directory path containing PDF files, or a
                list of individual PDF file paths (uploaded documents).
            persist_directory: Directory where the Chroma DB is saved.
            embedding_model_engine: Name of the embedding model engine.
                NOTE(review): this value is stored but never passed to
                ``OpenAIEmbeddings`` below, which therefore uses its library
                default model — confirm whether ``OpenAIEmbeddings(model=...)``
                was intended.
            chunk_size: Maximum characters per chunk.
            chunk_overlap: Characters of overlap between adjacent chunks.
        """
        self.embedding_model_engine = embedding_model_engine
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # Prefer splitting on paragraph, then line, then word boundaries.
            separators=["\n\n", "\n", " ", ""],
        )
        self.data_directory = data_directory
        self.persist_directory = persist_directory
        self.embedding = OpenAIEmbeddings()

    def __load_all_documents(self) -> List:
        """Load all PDF documents from the configured location.

        Returns:
            A list of langchain ``Document`` pages from every loaded PDF.
        """
        doc_counter = 0
        docs = []
        if isinstance(self.data_directory, list):
            # Uploaded-files mode: each entry is already a full file path.
            print("Loading the uploaded documents...")
            for doc_dir in self.data_directory:
                docs.extend(PyPDFLoader(doc_dir).load())
                doc_counter += 1
        else:
            # Directory mode: load every PDF found in the directory.
            print("Loading documents manually...")
            if not os.path.exists(self.data_directory):
                os.makedirs(self.data_directory)  # Ensure the directory exists
                print(f"Created missing directory: {self.data_directory}")
            document_list = os.listdir(self.data_directory)
            for doc_name in document_list:
                # Skip non-PDF entries (e.g. .DS_Store, subdirectories) that
                # would otherwise crash PyPDFLoader.
                if not doc_name.lower().endswith(".pdf"):
                    continue
                docs.extend(
                    PyPDFLoader(os.path.join(self.data_directory, doc_name)).load()
                )
                doc_counter += 1
        print("Number of loaded documents:", doc_counter)
        print("Number of pages:", len(docs), "\n\n")
        return docs

    def __chunk_documents(self, docs: List) -> List:
        """Chunk the loaded documents using the configured text splitter.

        Args:
            docs: Documents returned by ``__load_all_documents``.

        Returns:
            The list of chunked documents.
        """
        print("Chunking documents...")
        chunked_documents = self.text_splitter.split_documents(docs)
        print("Number of chunks:", len(chunked_documents), "\n\n")
        return chunked_documents

    def prepare_and_save_vectordb(self):
        """Load, chunk, embed, and persist the VectorDB.

        Returns:
            The created Chroma vector store (already persisted to
            ``self.persist_directory``).
        """
        docs = self.__load_all_documents()
        chunked_documents = self.__chunk_documents(docs)
        print("Preparing vectordb...")
        vectordb = Chroma.from_documents(
            documents=chunked_documents,
            embedding=self.embedding,
            persist_directory=self.persist_directory,
        )
        print("VectorDB is created and saved.")
        # Uses Chroma's private _collection handle to report the vector count,
        # mirroring the original implementation.
        print("Number of vectors in vectordb:", vectordb._collection.count(), "\n\n")
        return vectordb