Spaces:
Sleeping
Sleeping
| import os | |
| import yaml | |
| from pyprojroot import here | |
| from langchain_chroma import Chroma | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from dotenv import load_dotenv | |
class PrepareVectorDB:
    """
    Prepare a persistent vector database (VectorDB) from the PDF documents in a directory.

    The class performs the following tasks:
        - Loads the PDF documents found in `doc_dir`.
        - Splits the text into chunks based on the specified chunk size and overlap.
        - Embeds the document chunks using the specified embedding model.
        - Stores the embedded vectors in a persistent Chroma directory.

    Attributes:
        doc_dir (str): Path to the directory containing documents (PDFs) to be processed.
        chunk_size (int): Chunk size handed to the tiktoken-based text splitter.
        chunk_overlap (int): Chunk overlap handed to the tiktoken-based text splitter.
        embedding_model (str): Name of the HuggingFace embedding model used to embed the chunks.
        vectordb_dir (str): Directory where the resulting vector database will be stored.
        collection_name (str): Name of the collection to be used within the vector database.

    Both directories are resolved with `here(...)` before use.

    Methods:
        path_maker(file_name: str, doc_dir: str) -> str:
            Creates a full file path by joining the given directory and file name.
        run() -> None:
            Reads the documents, splits and embeds them, and saves the resulting
            vector database. If the vector database directory already exists, the
            creation process is skipped.
    """

    def __init__(self,
                 doc_dir: str,
                 chunk_size: int,
                 chunk_overlap: int,
                 embedding_model: str,
                 vectordb_dir: str,
                 collection_name: str
                 ) -> None:
        self.doc_dir = doc_dir
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.embedding_model = embedding_model
        self.vectordb_dir = vectordb_dir
        self.collection_name = collection_name

    @staticmethod
    def _is_pdf(file_name: str) -> bool:
        """Return True when `file_name` has a `.pdf` extension (case-insensitive)."""
        return file_name.lower().endswith(".pdf")

    def path_maker(self, file_name: str, doc_dir: str) -> str:
        """
        Creates a full file path by joining the provided directory and file name.

        Args:
            file_name (str): Name of the file.
            doc_dir (str): Path of the directory; it is passed through `here(...)`.

        Returns:
            str: Full path of the file.
        """
        return os.path.join(here(doc_dir), file_name)

    def run(self) -> None:
        """
        Executes the main logic to create and store document embeddings in a VectorDB.

        If the vector database directory doesn't exist:
            - It loads the PDF documents from `doc_dir` and splits them into chunks,
            - Embeds the document chunks using the specified embedding model,
            - Stores the embeddings in a persistent VectorDB directory.
        If the directory already exists, it skips the embedding creation process.

        Prints the creation status and the number of vectors in the vector database.

        Raises:
            ValueError: If `doc_dir` yields no document chunks to embed.

        Returns:
            None
        """
        import shutil

        vectordb_path = here(self.vectordb_dir)
        if os.path.exists(vectordb_path):
            print(f"Directory '{self.vectordb_dir}' already exists.")
            return

        # Only PDFs are handed to PyPDFLoader; stray files in the folder are
        # ignored. Sorting makes the ingestion order reproducible because
        # os.listdir returns entries in arbitrary order.
        pdf_names = sorted(
            fn for fn in os.listdir(here(self.doc_dir)) if self._is_pdf(fn)
        )

        # NOTE(review): load_and_split() appears to apply the loader's default
        # splitter before the token-based splitter below -- confirm that the
        # double split is intended.
        docs_list = [
            page
            for fn in pdf_names
            for page in PyPDFLoader(
                self.path_maker(fn, self.doc_dir)).load_and_split()
        ]

        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
        )
        doc_splits = text_splitter.split_documents(docs_list)
        if not doc_splits:
            raise ValueError(
                f"No PDF content found in '{self.doc_dir}'; nothing to embed."
            )

        # The directory is created only once there is something to store:
        # its existence is what makes later runs skip the build.
        os.makedirs(vectordb_path)
        print(f"Directory '{self.vectordb_dir}' was created.")
        try:
            vectordb = Chroma.from_documents(
                documents=doc_splits,
                collection_name=self.collection_name,
                embedding=HuggingFaceEmbeddings(
                    model_name=self.embedding_model
                ),
                persist_directory=str(vectordb_path)
            )
        except BaseException:
            # Includes KeyboardInterrupt: a half-built directory would make
            # every later run believe the database already exists.
            shutil.rmtree(vectordb_path, ignore_errors=True)
            raise

        print("VectorDB is created and saved.")
        print(
            "Number of vectors in vectordb:",
            vectordb._collection.count(),
            "\n\n"
        )
if __name__ == "__main__":
    # load_dotenv() exports the values from a local .env file into
    # os.environ, so no further re-assignment of individual keys is needed.
    load_dotenv()

    # NOTE(review): FullLoader can construct more than plain data;
    # yaml.safe_load is the safer choice if this config only holds scalars,
    # lists and mappings -- confirm before switching.
    with open(here("configs/tools_config.yml"), encoding="utf-8") as cfg:
        app_config = yaml.load(cfg, Loader=yaml.FullLoader)

    # Build one vector database per RAG section of the config: first the
    # swiss airline policy document, then the stories document.
    for section in ("swiss_airline_policy_rag", "stories_rag"):
        rag_config = app_config[section]
        vectordb_dir = rag_config["vectordb"]
        prepare_db_instance = PrepareVectorDB(
            doc_dir=rag_config["unstructured_docs"],
            chunk_size=rag_config["chunk_size"],
            chunk_overlap=rag_config["chunk_overlap"],
            embedding_model=rag_config["embedding_model"],
            vectordb_dir=vectordb_dir,
            collection_name=rag_config["collection_name"])
        prepare_db_instance.run()

    # Show the resolved location of the last vector database processed.
    print(here(vectordb_dir))