# Aina-RAG — src/vectorstore.py
# (commit 5b68ef9 by nurasaki: "Improved no-context response and logs")
from langchain_community.vectorstores import FAISS
# from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from huggingface_hub import snapshot_download
import logging
log = logging.getLogger(__name__)
from termcolor import cprint
class VectorStore:
    """FAISS-backed vector store that retrieves documents similar to a query
    and formats them into a single context string for a RAG prompt."""

    # Class-level logger so the class does not depend on a module-level `log` name.
    _log = logging.getLogger(__name__)

    def __init__(self,
                 embeddings_model: str,
                 vs_local_path: str = None,
                 vs_hf_path: str = None,
                 # Retrieval parameters
                 number_of_contexts: int = 2,
                 embedding_score_threshold: float = None,
                 # Context formatting parameters
                 context_fmt: str = "Context document {num_document}:\n{document_content}",
                 join_str: str = "\n\n",
                 header_context_str: str = "",
                 footer_context_str: str = "",
                 no_context_str: str = "Answer 'no relevant context found'.",
                 ):
        """Initializes the VectorStore and loads the FAISS index.

        The index is loaded from the Hugging Face Hub when ``vs_hf_path`` is
        given; otherwise from ``vs_local_path``. At least one must be provided.

        Arguments:
        ----------
        embeddings_model : str
            Name of the HuggingFace embeddings model to use.
        vs_local_path : str, optional
            Local path to the vectorstore. Defaults to None.
        vs_hf_path : str, optional
            HuggingFace Hub repo id of the vectorstore. Takes precedence over
            ``vs_local_path`` when both are given. Defaults to None.
        number_of_contexts : int, optional
            Number of top similar contexts to retrieve. Defaults to 2.
        embedding_score_threshold : float, optional
            Minimum similarity score for retrieved documents. Defaults to None
            (no threshold).
        context_fmt : str, optional
            Template used to format each retrieved document. May use the
            {num_document} and {document_content} placeholders.
            Defaults to "Context document {num_document}:\\n{document_content}".
        join_str : str, optional
            String used to join multiple formatted documents. Defaults to "\\n\\n".
        header_context_str : str, optional
            String prepended to the final context. Defaults to "".
        footer_context_str : str, optional
            String appended to the final context. Defaults to "".
        no_context_str : str, optional
            String returned when no documents are retrieved.
            Defaults to "Answer 'no relevant context found'.".

        Raises:
        -------
        ValueError
            If neither ``vs_local_path`` nor ``vs_hf_path`` is provided.
        """
        self._log.info("Loading vectorstore...")

        # Fail fast with a clear message instead of letting FAISS.load_local
        # blow up on a None path.
        if vs_local_path is None and vs_hf_path is None:
            raise ValueError("Either 'vs_local_path' or 'vs_hf_path' must be provided.")

        # Retrieval parameters
        self.number_of_contexts = number_of_contexts
        self.embedding_score_threshold = embedding_score_threshold

        # Context formatting parameters
        self.context_fmt = context_fmt
        self.join_str = join_str
        self.header_context_str = header_context_str
        self.footer_context_str = footer_context_str
        self.no_context_str = no_context_str

        embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)
        self._log.info("Loaded embeddings model: %s", embeddings_model)

        # allow_dangerous_deserialization is required by FAISS.load_local for
        # pickled indexes; only load vectorstores from trusted sources.
        if vs_hf_path:
            hf_vectorstore = snapshot_download(repo_id=vs_hf_path)
            self.vdb = FAISS.load_local(hf_vectorstore, embeddings, allow_dangerous_deserialization=True)
            self._log.info("Loaded vectorstore from %s", vs_hf_path)
        else:
            self.vdb = FAISS.load_local(vs_local_path, embeddings, allow_dangerous_deserialization=True)
            self._log.info("Loaded vectorstore from %s", vs_local_path)

    def get_context(self, query):
        """Retrieve the most similar documents for *query* and return them
        formatted as a single context string (or ``no_context_str`` if none)."""
        results = self.vdb.similarity_search_with_relevance_scores(
            query=query,
            k=self.number_of_contexts,
            score_threshold=self.embedding_score_threshold,
        )
        self._log.info("Retrieved %d documents from the vectorstore.", len(results))
        return self._beautiful_context(results)

    def _beautiful_context(self, docs):
        """Format ``(document, score)`` pairs into one context string.

        Each document is rendered with ``context_fmt`` (1-based numbering),
        joined with ``join_str``, and wrapped with the header/footer strings.
        Returns ``no_context_str`` when *docs* is empty.
        """
        self._log.info("Formatting %d contexts...", len(docs))

        # No retrieved documents: return the configured fallback string.
        if not docs:
            return self.no_context_str

        contexts = []
        for i, (doc, score) in enumerate(docs):
            self._log.info("Document %d (score: %.4f): %r...", i + 1, score, doc.page_content[:100])
            contexts.append(self.context_fmt.format(num_document=i + 1, document_content=doc.page_content))

        return self.header_context_str + self.join_str.join(contexts) + self.footer_context_str