Spaces:
Sleeping
Sleeping
from typing import Sequence, List, Tuple
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
class PromptSearchEngine:
    """Semantic search over a corpus of prompts.

    Wraps a SentenceTransformer encoder and a FAISS index: prompts are
    embedded once and stored; queries are embedded on demand and matched
    against the stored vectors.
    """

    def __init__(self, model_name='bert-base-nli-mean-tokens'):
        """Load the sentence-embedding model and create an empty FAISS index.

        Args:
            model_name: Name of the SentenceTransformer model to load.
        """
        print("Search engine started!")
        self.model = SentenceTransformer(model_name)
        # Size the index to the model's embedding dimensionality.
        self.embedding_dimension = self.model.get_sentence_embedding_dimension()
        # Euclidean-distance index; exact (brute-force) search, fine for small datasets.
        self.index = faiss.IndexFlatL2(self.embedding_dimension)
        self.prompts_track = []  # original prompt strings, row-aligned with the index

    def add_prompts_to_vector_database(self, prompts):
        """Encode prompts and add their embeddings to the FAISS index.

        Args:
            prompts: Sequence of prompt strings to make searchable.
        """
        if not prompts:
            return  # nothing to encode; avoids adding an empty batch
        print("Data encoding started...")
        embeddings = self.model.encode(prompts)
        # FAISS requires contiguous float32 input.
        self.index.add(np.array(embeddings).astype('float32'))
        self.prompts_track.extend(prompts)
        print("Data encoding completed!")

    def most_similar(self, query, top_k=5):
        """Return up to `top_k` stored prompts closest to `query`.

        Args:
            query: Query string to search for.
            top_k: Maximum number of results to return.

        Returns:
            Tuple of (list of similar prompts, array of their L2 distances).
        """
        print('Finding the most similar vectors')
        query_embedding = self.model.encode([query]).astype('float32')
        # Never ask FAISS for more neighbors than the index holds: FAISS pads
        # the result with -1 indices, and prompts_track[-1] would silently
        # return the LAST prompt as a bogus match.
        k = min(top_k, self.index.ntotal)
        if k == 0:
            return [], np.array([], dtype='float32')
        # NOTE: exact (brute-force) search; switch to an approximate index
        # type (e.g. IVF/HNSW) for production-scale corpora.
        distances, indices = self.index.search(query_embedding, k)
        # Map the found row indices back to the original prompt strings.
        similar_prompts = [self.prompts_track[idx] for idx in indices[0]]
        return similar_prompts, distances[0]

    def cosine_similarity(self, query_vector, index):
        """Compute cosine similarity between a query vector and every vector in a FAISS index.

        Args:
            query_vector: 1-D query vector to compare against the indexed vectors.
            index: FAISS index supporting `reconstruct_n` (e.g. an IndexFlat* variant).

        Returns:
            1-D numpy array of cosine similarities, one per indexed vector.
        """
        print('Searching for all similarities...')
        query_vector = np.array(query_vector).astype('float32')
        query_norm = query_vector / np.linalg.norm(query_vector)
        # Pull every stored vector back out of FAISS so we can normalize them.
        index_vectors = index.reconstruct_n(0, index.ntotal)
        index_norms = np.linalg.norm(index_vectors, axis=1, keepdims=True)
        normalized_index_vectors = index_vectors / index_norms
        return np.dot(normalized_index_vectors, query_norm.T)