Spaces:

7beshoyarnest
/

QueryMind

Sleeping

App Files Files Community

QueryMind / src /prepare_vector_db.py

7beshoyarnest

Clean initial commit without secrets

d8a070a about 2 months ago

Raw

History Blame Contribute Delete

7.07 kB

	import os
	import yaml
	from pyprojroot import here
	from langchain_chroma import Chroma
	from langchain_community.document_loaders import PyPDFLoader
	from langchain_huggingface import HuggingFaceEmbeddings
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from dotenv import load_dotenv


	class PrepareVectorDB:
	    """
	    Prepare and persist a vector database (VectorDB) from the PDFs in a directory.

	    The class performs the following tasks:
	    - Loads every PDF file found directly inside the document directory.
	    - Splits the text into chunks based on the specified chunk size and overlap.
	    - Embeds the document chunks using a specified embedding model.
	    - Stores the embedded vectors in a persistent VectorDB directory.

	    Directory arguments are passed through ``pyprojroot.here`` before use, so
	    they are expected to be given relative to the project root.

	    Attributes:
	        doc_dir (str): Path to the directory containing documents (PDFs) to be processed.
	        chunk_size (int): The maximum size of each chunk. The splitter is built with
	            ``from_tiktoken_encoder``, so the size is measured in tokens, not characters.
	        chunk_overlap (int): The overlap between consecutive chunks, in the same unit
	            as ``chunk_size``.
	        embedding_model (str): The name of the embedding model to be used for generating
	            vector representations of text.
	        vectordb_dir (str): Directory where the resulting vector database will be stored.
	        collection_name (str): The name of the collection to be used within the vector database.

	    Methods:
	        path_maker(file_name: str, doc_dir: str) -> str:
	            Creates a full file path by joining the given directory and file name.

	        run() -> None:
	            Executes the process of reading documents, splitting text, embedding them
	            into vectors, and saving the resulting vector database. If the vector
	            database directory already exists, it skips the creation process.
	    """

	    def __init__(self,
	                 doc_dir: str,
	                 chunk_size: int,
	                 chunk_overlap: int,
	                 embedding_model: str,
	                 vectordb_dir: str,
	                 collection_name: str
	                 ) -> None:
	        """Store the configuration; no I/O happens until `run` is called."""
	        self.doc_dir = doc_dir
	        self.chunk_size = chunk_size
	        self.chunk_overlap = chunk_overlap
	        self.embedding_model = embedding_model
	        self.vectordb_dir = vectordb_dir
	        self.collection_name = collection_name

	    def path_maker(self, file_name: str, doc_dir: str) -> str:
	        """
	        Creates a full file path by joining the provided directory and file name.

	        Args:
	            file_name (str): Name of the file.
	            doc_dir (str): Path of the directory (resolved with `here`).

	        Returns:
	            str: Full path of the file.
	        """
	        return os.path.join(here(doc_dir), file_name)

	    @staticmethod
	    def _list_pdf_files(directory) -> list:
	        """
	        Return the sorted names of the PDF files located directly in `directory`.

	        Entries that are not regular files, or whose name does not end in
	        ``.pdf`` (case-insensitive), are ignored so that stray files or
	        sub-directories cannot break PDF loading. Sorting makes the ingestion
	        order deterministic, since `os.listdir` returns names in arbitrary order.
	        """
	        return sorted(
	            name for name in os.listdir(directory)
	            if name.lower().endswith(".pdf")
	            and os.path.isfile(os.path.join(directory, name))
	        )

	    def run(self) -> None:
	        """
	        Executes the main logic to create and store document embeddings in a VectorDB.

	        If the vector database directory doesn't exist:
	        - It loads the PDF documents from `doc_dir` and splits them into chunks,
	        - Embeds the document chunks using the specified embedding model,
	        - Stores the embeddings in a persistent VectorDB directory.

	        If the directory already exists, it skips the embedding creation process.

	        The directory is only created once there is something to store, and it
	        is removed again if building the database fails, so that a failed run
	        does not make later runs skip the build.

	        Prints the creation status and the number of vectors in the vector database.

	        Returns:
	            None
	        """
	        # Local import: only needed for the rollback path below.
	        import shutil

	        vectordb_path = here(self.vectordb_dir)
	        if os.path.exists(vectordb_path):
	            print(f"Directory '{self.vectordb_dir}' already exists.")
	            return

	        # `load()` yields the raw page documents. The previous
	        # `load_and_split()` first applied LangChain's default splitter, so
	        # the documents were split twice with two different settings.
	        docs_list = []
	        for fn in self._list_pdf_files(here(self.doc_dir)):
	            docs_list.extend(
	                PyPDFLoader(self.path_maker(fn, self.doc_dir)).load()
	            )

	        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
	            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
	        )
	        doc_splits = text_splitter.split_documents(docs_list)
	        if not doc_splits:
	            # Nothing to embed: leave the directory absent so a later run
	            # (after documents have been added) can still build the database.
	            print(
	                f"No PDF content found in '{self.doc_dir}'; "
	                "VectorDB was not created."
	            )
	            return

	        os.makedirs(vectordb_path)
	        print(f"Directory '{self.vectordb_dir}' was created.")

	        try:
	            vectordb = Chroma.from_documents(
	                documents=doc_splits,
	                collection_name=self.collection_name,
	                embedding=HuggingFaceEmbeddings(
	                    model_name=self.embedding_model
	                ),
	                persist_directory=str(vectordb_path)
	            )
	        except Exception:
	            # Roll back the directory created above; otherwise the existence
	            # check at the top would skip the build on every later run.
	            shutil.rmtree(vectordb_path, ignore_errors=True)
	            raise

	        print("VectorDB is created and saved.")

	        print(
	            "Number of vectors in vectordb:",
	            vectordb._collection.count(),
	            "\n\n"
	        )


	if __name__ == "__main__":
	    # Build the vector databases for every RAG section of the tools config.

	    # Load variables from a local .env file into the process environment.
	    # The former `os.environ['GROQ_API_KEY'] = os.getenv("GROQ_API_KEY")`
	    # was dropped: it changed nothing when the key was set and raised a
	    # TypeError (environment values must be strings) when it was not.
	    load_dotenv()

	    # NOTE(review): `yaml.safe_load` replaces `yaml.load(..., FullLoader)`.
	    # It only builds plain YAML types, which is all this script reads from
	    # the config - confirm the file uses no custom YAML tags.
	    with open(here("configs/tools_config.yml"), encoding="utf-8") as cfg:
	        app_config = yaml.safe_load(cfg)

	    # Both sections share the same keys, so one loop replaces the two
	    # copy-pasted stanzas. Order is preserved: airline policy, then stories.
	    for rag_section in ("swiss_airline_policy_rag", "stories_rag"):
	        rag_config = app_config[rag_section]
	        vectordb_dir = rag_config["vectordb"]

	        prepare_db_instance = PrepareVectorDB(
	            doc_dir=rag_config["unstructured_docs"],
	            chunk_size=rag_config["chunk_size"],
	            chunk_overlap=rag_config["chunk_overlap"],
	            embedding_model=rag_config["embedding_model"],
	            vectordb_dir=vectordb_dir,
	            collection_name=rag_config["collection_name"])

	        prepare_db_instance.run()

	    # As before, print the resolved location of the last (stories) database.
	    print(here(vectordb_dir))