# First_Aid_Assistant / src / utils / prepare_vectordb.py
# Uploaded by DrSyedFaizan via huggingface_hub (commit f8bf7df, verified).
import os
from typing import List, Union

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
class PrepareVectorDB:
    """Prepare and persist a Chroma VectorDB from PDF documents.

    Loads PDFs (either an explicit list of file paths or every ``.pdf`` in a
    directory), chunks their pages with a recursive character splitter, embeds
    the chunks with OpenAI embeddings, and saves the resulting Chroma
    collection to ``persist_directory``.
    """

    def __init__(
        self,
        data_directory: Union[str, List[str]],
        persist_directory: str,
        embedding_model_engine: str,
        chunk_size: int,
        chunk_overlap: int,
    ) -> None:
        """Initialize the PrepareVectorDB instance.

        Args:
            data_directory: Either a directory containing PDF files, or a
                list of explicit PDF file paths (uploaded documents).
            persist_directory: Directory where the Chroma DB is saved.
            embedding_model_engine: Name of the OpenAI embedding model.
            chunk_size: Maximum characters per chunk.
            chunk_overlap: Character overlap between consecutive chunks.
        """
        self.data_directory = data_directory
        self.persist_directory = persist_directory
        self.embedding_model_engine = embedding_model_engine
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""],
        )
        # Bug fix: the configured engine was previously ignored —
        # OpenAIEmbeddings() was built with its default model. Pass the
        # requested model through so the parameter actually takes effect.
        self.embedding = OpenAIEmbeddings(model=self.embedding_model_engine)

    def __load_all_documents(self) -> List:
        """Load all pages from the configured PDF source(s).

        Returns:
            A flat list of langchain ``Document`` pages from every PDF.
        """
        doc_counter = 0
        docs = []
        if isinstance(self.data_directory, list):
            # Explicit file paths (e.g. user-uploaded documents).
            print("Loading the uploaded documents...")
            for doc_path in self.data_directory:
                docs.extend(PyPDFLoader(doc_path).load())
                doc_counter += 1
        else:
            print("Loading documents manually...")
            if not os.path.exists(self.data_directory):
                # Create the directory rather than crashing on a fresh setup.
                os.makedirs(self.data_directory)
                print(f"Created missing directory: {self.data_directory}")
            # Bug fix: only feed PDFs to PyPDFLoader — any stray non-PDF file
            # or subdirectory in the folder would previously raise. Sorted for
            # a deterministic load order across platforms.
            document_list = [
                name
                for name in sorted(os.listdir(self.data_directory))
                if name.lower().endswith(".pdf")
            ]
            for doc_name in document_list:
                docs.extend(
                    PyPDFLoader(os.path.join(self.data_directory, doc_name)).load()
                )
                doc_counter += 1
        print("Number of loaded documents:", doc_counter)
        print("Number of pages:", len(docs), "\n\n")
        return docs

    def __chunk_documents(self, docs: List) -> List:
        """Split loaded pages into overlapping chunks via the text splitter."""
        print("Chunking documents...")
        chunked_documents = self.text_splitter.split_documents(docs)
        print("Number of chunks:", len(chunked_documents), "\n\n")
        return chunked_documents

    def prepare_and_save_vectordb(self):
        """Load, chunk, embed, and persist the documents as a Chroma DB.

        Returns:
            The created ``Chroma`` vector store instance.
        """
        docs = self.__load_all_documents()
        chunked_documents = self.__chunk_documents(docs)
        print("Preparing vectordb...")
        vectordb = Chroma.from_documents(
            documents=chunked_documents,
            embedding=self.embedding,
            persist_directory=self.persist_directory,
        )
        print("VectorDB is created and saved.")
        print("Number of vectors in vectordb:", vectordb._collection.count(), "\n\n")
        return vectordb