Spaces:
Sleeping
Sleeping
| """ | |
| This file is used to search the most similar vectors in the database using the faiss library. | |
| used indexer class grabbed from daily-llama repo (https://github.com/Ransaka/daily-llama) | |
| """ | |
| import numpy as np | |
| import pandas as pd | |
| from embeddings.embeddings import load_model, model_id | |
| # from daily llama repo | |
| import faiss | |
| class Indexer: | |
| def __init__(self, embed_vec): | |
| self.embeddings_vec = embed_vec | |
| self.build_index() | |
| def build_index(self): | |
| """ | |
| Build the index for the embeddings. | |
| This function initializes the index for the embeddings. It calculates the dimension (self.d) | |
| of the embeddings vector and creates an IndexFlatL2 object (self.index) for the given dimension. | |
| It then adds the embeddings vector (self.embeddings_vec) to the index. | |
| Parameters: | |
| - None | |
| Return: | |
| - None | |
| """ | |
| self.d = self.embeddings_vec.shape[1] | |
| self.index = faiss.IndexFlatL2(self.d) | |
| self.index.add(self.embeddings_vec) | |
| def topk(self, vector, k = 4): | |
| """ | |
| A function that takes in a vector and an optional parameter k and returns the indices of the k nearest neighbors in the index. | |
| Parameters: | |
| vector: A numpy array representing the input vector. | |
| k (optional): An integer representing the number of nearest neighbors to retrieve. Defaults to 4 if not specified. | |
| Returns: | |
| I: A numpy array containing the indices of the k nearest neighbors in the index. | |
| """ | |
| # vec = self.retreaver.encode(text)['embeddings'].detach().cpu().numpy() | |
| _, I = self.index.search(vector, k) | |
| return I | |
| def get_embeddings_vec(file_path): | |
| """ | |
| This function loads the embeddings from the given file path. | |
| Parameters: | |
| - file_path: A string representing the path to the embeddings file. | |
| Return: | |
| - embeddings_vec: A numpy array containing the embeddings. | |
| """ | |
| return np.load(file_path) | |
| def get_similar(indexer, text_embeddings, top_k = 5): | |
| """ | |
| This function returns the top k similar sentences for the given query. | |
| Parameters: | |
| - indexer: An Indexer object representing the indexer for the embeddings. | |
| - text_embeddings: A np.array representing the query embeddings. | |
| - top_k (optional): An integer representing the number of nearest neighbors to retrieve. Defaults to 4 if not specified. | |
| Return: | |
| - top_results: A numpy array containing the indices of the k nearest neighbors in the index. | |
| """ | |
| return indexer.topk(text_embeddings,k=top_k).flatten() | |
| def search_demo(test_queries:list=None,top_k:int=1): | |
| """ | |
| This function returns the top k similar sentences for the given query. | |
| """ | |
| model = load_model(model_id) | |
| embeddings_vec = get_embeddings_vec(r"data\top_cluster_embeddings.npy") | |
| indexer = Indexer(embeddings_vec) | |
| cluster_dataset = pd.read_csv(r"data\top_cluster_dataset.csv",usecols=['Headline']) | |
| search_space = cluster_dataset['Headline'].values.tolist() | |
| if test_queries is None: | |
| test_queries = [ | |
| "ක්ෂය රෝග මර්දන ව්යාපාරයේ පී.සී.ආර්. යන්ත්ර 36 භාවිතයට ගන්නැයි ඉල්ලීමක්", | |
| "පොළොන්නරුව මහරෝහලේ අකුරට වැඩ කිරීමේ වෘත්තීය ක්රියාමාර්ගයක්", | |
| "අංගොඩ අයි ඩී එච් රෝහලේ ඩෙංගු විශේෂ ප්රතිකාර ඒකකය තවම නැහැ ", | |
| "කමිටු ගැන විශ්වාසයක් නැහැ - මාළඹේ පෞද්ගලික වෛද්ය විද්යාලයීය දෙමාපිය සංසදය" | |
| ] | |
| for query in test_queries: | |
| query_embeddings = model.encode(query).reshape(1,-1) | |
| print("Query: ", query) | |
| print("Results: ") | |
| for index in get_similar(indexer, query_embeddings, top_k = top_k): | |
| print("\t-",search_space[index]) | |
| print() |