# NOTE(review): removed Hugging Face Spaces page residue ("Spaces:" / "Sleeping")
# that was scraped into the source and is not part of this Python module.
import os
from typing import List

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
class PrepareVectorDB:
    """Load PDF documents, chunk them, and persist a Chroma VectorDB
    built with OpenAI embeddings.

    Typical flow: instantiate with the source and persistence locations,
    then call :meth:`prepare_and_save_vectordb`.
    """

    def __init__(
        self,
        data_directory: str,
        persist_directory: str,
        embedding_model_engine: str,
        chunk_size: int,
        chunk_overlap: int,
    ) -> None:
        """Initialize the PrepareVectorDB instance.

        Args:
            data_directory: Either a directory path containing PDF files,
                or a list of individual PDF file paths (uploaded documents).
            persist_directory: Directory where the Chroma DB is saved.
            embedding_model_engine: Name of the OpenAI embedding model
                (e.g. ``"text-embedding-ada-002"``).
            chunk_size: Maximum characters per chunk.
            chunk_overlap: Characters of overlap between adjacent chunks.
        """
        self.embedding_model_engine = embedding_model_engine
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""],
        )
        self.data_directory = data_directory
        self.persist_directory = persist_directory
        # BUG FIX: the configured engine was stored but never used —
        # OpenAIEmbeddings() silently fell back to its default model.
        # Pass it through so the caller's choice actually takes effect.
        self.embedding = OpenAIEmbeddings(model=self.embedding_model_engine)

    def __load_all_documents(self) -> List:
        """Load all PDF documents from the configured source.

        Returns:
            A list of LangChain ``Document`` pages (one entry per PDF page,
            as produced by ``PyPDFLoader.load()``).
        """
        doc_counter = 0
        docs: List = []
        if isinstance(self.data_directory, list):
            # Uploaded mode: data_directory is a list of explicit file paths.
            print("Loading the uploaded documents...")
            for doc_path in self.data_directory:
                docs.extend(PyPDFLoader(doc_path).load())
                doc_counter += 1
        else:
            # Directory mode: scan a folder for PDF files.
            print("Loading documents manually...")
            if not os.path.exists(self.data_directory):
                # Create the directory rather than crashing on a fresh setup.
                os.makedirs(self.data_directory)
                print(f"Created missing directory: {self.data_directory}")
            for doc_name in os.listdir(self.data_directory):
                # BUG FIX: skip non-PDF entries (e.g. .DS_Store, subfolders)
                # that would make PyPDFLoader raise.
                if not doc_name.lower().endswith(".pdf"):
                    continue
                docs.extend(
                    PyPDFLoader(os.path.join(self.data_directory, doc_name)).load()
                )
                doc_counter += 1
        print("Number of loaded documents:", doc_counter)
        print("Number of pages:", len(docs), "\n\n")
        return docs

    def __chunk_documents(self, docs: List) -> List:
        """Split loaded documents into overlapping text chunks.

        Args:
            docs: Documents returned by :meth:`__load_all_documents`.

        Returns:
            The chunked documents produced by the text splitter.
        """
        print("Chunking documents...")
        chunked_documents = self.text_splitter.split_documents(docs)
        print("Number of chunks:", len(chunked_documents), "\n\n")
        return chunked_documents

    def prepare_and_save_vectordb(self):
        """Load, chunk, embed, and persist the documents as a Chroma DB.

        Returns:
            The populated ``Chroma`` vector store, persisted to
            ``self.persist_directory``.
        """
        docs = self.__load_all_documents()
        chunked_documents = self.__chunk_documents(docs)
        print("Preparing vectordb...")
        vectordb = Chroma.from_documents(
            documents=chunked_documents,
            embedding=self.embedding,
            persist_directory=self.persist_directory,
        )
        print("VectorDB is created and saved.")
        # _collection is a private Chroma attribute; used here only for a
        # diagnostic count of stored vectors.
        print("Number of vectors in vectordb:", vectordb._collection.count(), "\n\n")
        return vectordb