Spaces:

UltimateRepAIr
/

RepAIr

Sleeping

App Files Files Community

RepAIr / rag /vectorization_functions.py

w3680

updated version

6848c8b 12 months ago

raw

history blame contribute delete

2.29 kB

	# vectorization functions
	#%%
	# General
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_community.vectorstores import FAISS
	import helper_functions.load_embed_model

	def split_documents(documents, chunk_size=900, chunk_overlap=90):
	    """
	    Break the given documents into overlapping chunks.

	    Args:
	        documents (list): Documents to be split.
	        chunk_size (int): Size of each chunk.
	        chunk_overlap (int): Overlap between consecutive chunks.

	    Returns:
	        list: The chunks produced by
	        ``RecursiveCharacterTextSplitter.split_documents``.
	    """
	    # TODO: check chunk size and overlap defaults for our purpose.
	    # Separators handed to the splitter, in priority order: markdown-style
	    # headings and "Step " markers first, then paragraphs, lines, sentences,
	    # words, and finally single characters.
	    # TODO: check separators for our purpose.
	    separator_priority = ["###", "Step ", "\n\n", "\n", ".", " ", ""]

	    splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	        separators=separator_priority,
	    )
	    return splitter.split_documents(documents=documents)

	def create_embedding_vector_db(chunks):
	    """
	    Embed the given chunks and index them in a FAISS vector store,
	    which allows for efficient similarity search.

	    The embedding model is read from
	    ``helper_functions.load_embed_model.embedding_model`` (per the original
	    note this is an open-source HuggingFaceEmbeddings model -- confirm in
	    that module).

	    Args:
	        chunks (list): Text chunks to be embedded.

	    Returns:
	        The FAISS vector database containing the embedded chunks.
	    """
	    # Fetch the shared embedding model before building the store.
	    embedder = helper_functions.load_embed_model.embedding_model

	    # from_documents is used so that chunk metadata is stored with the embeddings.
	    # TODO: optimize
	    return FAISS.from_documents(
	        documents=chunks,
	        embedding=embedder,
	    )

	# Function to query the vector database and assemble the retrieved context
	# (retrieval only -- no LLM call happens here)
	def query_vector_db(query, vector_db, k=4):
	    """
	    Query the vector database with the user query and return the text of
	    the most similar chunks as a single context string.

	    Args:
	        query (str): The user query.
	        vector_db: The vector database to query. It must provide a
	            ``similarity_search(query, k=...)`` method that returns objects
	            with a ``page_content`` attribute (e.g. a LangChain FAISS store).
	        k (int): Number of neighbouring chunks to retrieve. Defaults to 4,
	            mirroring LangChain's own default for ``similarity_search``.

	    Returns:
	        str: The ``page_content`` of the retrieved chunks, joined with
	        newlines (an empty string when nothing is retrieved).
	    """
	    # The k nearest neighbours are the chunks used as context. k is passed
	    # by keyword so the call does not depend on parameter position.
	    docs = vector_db.similarity_search(query, k=k)
	    return "\n".join(doc.page_content for doc in docs)