# Aina-RAG — src/vectorstore.py
# (commit 5b68ef9 by nurasaki: "Improved no-context response and logs")
from langchain_community.vectorstores import FAISS
# from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from huggingface_hub import snapshot_download
import logging
log = logging.getLogger(__name__)
from termcolor import cprint
class VectorStore:
    """FAISS-backed vector store that retrieves documents similar to a query
    and formats them into a single context string for a RAG prompt."""

    # Class-level logger so the class does not depend on a module-level `log` name.
    _log = logging.getLogger(__name__)

    def __init__(self,
                 embeddings_model: str,
                 vs_local_path: str = None,
                 vs_hf_path: str = None,
                 # Retrieval parameters
                 number_of_contexts: int = 2,
                 embedding_score_threshold: float = None,
                 # Context formatting parameters
                 context_fmt: str = "Context document {num_document}:\n{document_content}",
                 join_str: str = "\n\n",
                 header_context_str: str = "",
                 footer_context_str: str = "",
                 no_context_str: str = "Answer 'no relevant context found'.",
                 ):
        """Initializes the VectorStore and loads the FAISS index.

        The index is loaded from the Hugging Face Hub when ``vs_hf_path`` is
        given; otherwise from ``vs_local_path``. At least one must be provided.

        Arguments:
        ----------
        embeddings_model : str
            Name of the HuggingFace embeddings model to use.
        vs_local_path : str, optional
            Local path to the vectorstore. Defaults to None.
        vs_hf_path : str, optional
            HuggingFace Hub repo id of the vectorstore. Takes precedence over
            ``vs_local_path`` when both are given. Defaults to None.
        number_of_contexts : int, optional
            Number of top similar contexts to retrieve. Defaults to 2.
        embedding_score_threshold : float, optional
            Minimum similarity score for retrieved documents. Defaults to None
            (no threshold).
        context_fmt : str, optional
            Template used to format each retrieved document. May use the
            {num_document} and {document_content} placeholders.
            Defaults to "Context document {num_document}:\\n{document_content}".
        join_str : str, optional
            String used to join multiple formatted documents. Defaults to "\\n\\n".
        header_context_str : str, optional
            String prepended to the final context. Defaults to "".
        footer_context_str : str, optional
            String appended to the final context. Defaults to "".
        no_context_str : str, optional
            String returned when no documents are retrieved.
            Defaults to "Answer 'no relevant context found'.".

        Raises:
        -------
        ValueError
            If neither ``vs_local_path`` nor ``vs_hf_path`` is provided.
        """
        self._log.info("Loading vectorstore...")

        # Fail fast with a clear message instead of letting FAISS.load_local
        # blow up on a None path.
        if vs_local_path is None and vs_hf_path is None:
            raise ValueError("Either 'vs_local_path' or 'vs_hf_path' must be provided.")

        # Retrieval parameters
        self.number_of_contexts = number_of_contexts
        self.embedding_score_threshold = embedding_score_threshold

        # Context formatting parameters
        self.context_fmt = context_fmt
        self.join_str = join_str
        self.header_context_str = header_context_str
        self.footer_context_str = footer_context_str
        self.no_context_str = no_context_str

        embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)
        self._log.info("Loaded embeddings model: %s", embeddings_model)

        # allow_dangerous_deserialization is required by FAISS.load_local for
        # pickled indexes; only load vectorstores from trusted sources.
        if vs_hf_path:
            hf_vectorstore = snapshot_download(repo_id=vs_hf_path)
            self.vdb = FAISS.load_local(hf_vectorstore, embeddings, allow_dangerous_deserialization=True)
            self._log.info("Loaded vectorstore from %s", vs_hf_path)
        else:
            self.vdb = FAISS.load_local(vs_local_path, embeddings, allow_dangerous_deserialization=True)
            self._log.info("Loaded vectorstore from %s", vs_local_path)

    def get_context(self, query):
        """Retrieve the most similar documents for *query* and return them
        formatted as a single context string (or ``no_context_str`` if none)."""
        results = self.vdb.similarity_search_with_relevance_scores(
            query=query,
            k=self.number_of_contexts,
            score_threshold=self.embedding_score_threshold,
        )
        self._log.info("Retrieved %d documents from the vectorstore.", len(results))
        return self._beautiful_context(results)

    def _beautiful_context(self, docs):
        """Format ``(document, score)`` pairs into one context string.

        Each document is rendered with ``context_fmt`` (1-based numbering),
        joined with ``join_str``, and wrapped with the header/footer strings.
        Returns ``no_context_str`` when *docs* is empty.
        """
        self._log.info("Formatting %d contexts...", len(docs))

        # No retrieved documents: return the configured fallback string.
        if not docs:
            return self.no_context_str

        contexts = []
        for i, (doc, score) in enumerate(docs):
            self._log.info("Document %d (score: %.4f): %r...", i + 1, score, doc.page_content[:100])
            contexts.append(self.context_fmt.format(num_document=i + 1, document_content=doc.page_content))

        return self.header_context_str + self.join_str.join(contexts) + self.footer_context_str