Spaces:
Build error
Build error
| import os | |
| from typing import List | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_community.vectorstores import Chroma | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain.embeddings import HuggingFaceBgeEmbeddings | |
class PrepareVectorDB:
    """
    Prepare and persist a Chroma VectorDB built from PDF documents.

    The pipeline loads the PDFs page by page, splits the pages into chunks,
    embeds the chunks with a HuggingFace BGE model and stores them in a Chroma
    collection under ``persist_directory``.

    Parameters:
        data_directory (str): Directory containing the PDF documents, or a
            list of PDF file paths (e.g. documents uploaded during a chat).
        persist_directory (str): Directory to save the VectorDB.
        embedding_model_engine (str): Name of the embedding engine. It is only
            stored on the instance; the embedding model actually used is the
            fixed BGE model created in ``__init__``.
        chunk_size (int): The size of the chunks for document processing.
        chunk_overlap (int): The overlap between chunks.
    """

    def __init__(
            self,
            data_directory: str,
            persist_directory: str,
            embedding_model_engine: str,
            chunk_size: int,
            chunk_overlap: int) -> None:
        """
        Initialize the PrepareVectorDB instance.

        Parameters:
            data_directory (str): Directory containing the PDF documents, or a
                list of PDF file paths.
            persist_directory (str): Directory to save the VectorDB.
            embedding_model_engine (str): Name of the embedding engine (stored
                for reference only, see the class docstring).
            chunk_size (int): The size of the chunks for document processing.
            chunk_overlap (int): The overlap between chunks.
        """
        self.embedding_model_engine = embedding_model_engine
        self.data_directory = data_directory
        self.persist_directory = persist_directory

        # The first five separators are regular expressions (Markdown headings,
        # code fences and horizontal rules). They only match when the splitter
        # is told to treat separators as regexes; otherwise they are escaped
        # and compared literally, which never matches real text.
        # Other splitter choices: MarkdownHeaderTextSplitter, TokenTextSplitter, etc.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[
                "\n#{1,6} ",
                "```\n",
                "\n\\*\\*\\*+\n",
                "\n---+\n",
                "\n___+\n",
                "\n\n",
                "\n",
                " ",
                "",
            ],
            is_separator_regex=True,
        )

        # Embeddings are computed locally on CPU and normalized.
        self.embedding = HuggingFaceBgeEmbeddings(
            model_name="BAAI/bge-small-en-v1.5",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True},
        )

    def _resolve_pdf_paths(self) -> List[str]:
        """
        Return the paths of the PDF files to load, one entry per document.

        Returns:
            List[str]: The given paths unchanged when ``data_directory`` is a
                list; otherwise the regular ``*.pdf`` files found directly in
                the directory, in sorted (deterministic) order.
        """
        if isinstance(self.data_directory, list):
            return list(self.data_directory)

        pdf_paths = []
        # os.listdir returns entries in arbitrary order and includes
        # sub-directories and non-PDF files, which PyPDFLoader cannot read.
        for entry_name in sorted(os.listdir(self.data_directory)):
            if not entry_name.lower().endswith(".pdf"):
                continue
            entry_path = os.path.join(self.data_directory, entry_name)
            if os.path.isfile(entry_path):
                pdf_paths.append(entry_path)
        return pdf_paths

    def __load_all_documents(self) -> List:
        """
        Load all documents from the specified directory or list of files.

        A list is used for documents uploaded live during a chat; a directory
        path is used for documents prepared in advance.

        Returns:
            List: A list of loaded documents (one entry per PDF page).
        """
        if isinstance(self.data_directory, list):
            print("Loading the uploaded documents...")
        else:
            print("Loading documents manually...")

        pdf_paths = self._resolve_pdf_paths()
        docs = [page
                for pdf_path in pdf_paths
                for page in PyPDFLoader(pdf_path).load()]

        # Documents are source files; PyPDFLoader yields one item per page.
        print(f"Number of loaded documents: {len(pdf_paths)}")
        print(f"Number of pages: {len(docs)}\n\n")
        return docs

    def __chunk_documents(self, docs: List) -> List:
        """
        Chunk the loaded documents using the configured text splitter.

        Parameters:
            docs (List): The list of loaded documents.

        Returns:
            List: A list of chunked documents.
        """
        print("Chunking documents...")
        chunked_documents = self.text_splitter.split_documents(docs)
        print("Number of chunks:", len(chunked_documents), "\n\n")
        return chunked_documents

    def prepare_and_save_vectordb(self):
        """
        Load, chunk, and embed the documents into a Chroma VectorDB, and save it.

        Returns:
            Chroma: The created VectorDB.
        """
        docs = self.__load_all_documents()
        chunked_documents = self.__chunk_documents(docs)
        print("Preparing vectordb...")
        vectordb = Chroma.from_documents(
            documents=chunked_documents,
            embedding=self.embedding,
            persist_directory=self.persist_directory
        )
        print("Vectordb created and saved!")
        print("Number of vectors in vectordb:", vectordb._collection.count(), "\n\n")
        return vectordb