Spaces:

ericlkc
/

in-class-01

Sleeping

App Files Files Community

in-class-01 / app.py

ericlkc

in-class 01 demo

2913b41 verified about 2 years ago

raw

history blame contribute delete

11.1 kB

	"""
	In this code block, you can develop a class for Embeddings -
	That can fetch embeddings of different kinds for the purpose of "Semantic Search"
	"""

	from sentence_transformers import SentenceTransformer
	import numpy as np
	import pickle

	import numpy.linalg as la


	class Embeddings:

	def __init__(self):
	"""
	Initialize the class
	"""
	self.glove_embedding_dimension = 50

	def download_glove_embeddings(self):
	"""
	Download glove embeddings from web or from your gdrive if in optimized format
	"""
	# use data from gdrive
	embeddings_temp = "/content/drive/MyDrive/LLM596/embeddings_50d_temp.npy"

	word_index_temp = "/content/drive/MyDrive/LLM596/word_index_dict_50d_temp.pkl"

	def load_glove_embeddings(self, embedding_dimension):
	# load data
	word_index_temp = "word_index_dict_50d_temp.pkl"
	embeddings_temp = "embeddings_50d_temp.npy"

	# Load word index dictionary
	word_index_dict = pickle.load(open(word_index_temp, "rb"), encoding="latin")

	# Load embeddings numpy
	embeddings = np.load(embeddings_temp)

	return word_index_dict, embeddings

	def get_glove_embedding(self, word, word_index_dict, embeddings):
	"""
	Retrieve GloVe embedding of a specific dimension
	"""
	word = word.lower()
	if word in word_index_dict:
	return embeddings[word_index_dict[word]]
	else:
	return np.zeros(self.glove_embedding_dimension)

	def embeddings_before_answer(self, word_index_dict, positive_words, negative_words, embeddings):
	new_embedding = np.zeros(self.glove_embedding_dimension)

	# for negative words
	for word in negative_words:
	new_embedding -= self.get_glove_embedding(word, word_index_dict, embeddings)

	# for positive words
	for word in positive_words:
	new_embedding += self.get_glove_embedding(word, word_index_dict, embeddings)

	return new_embedding

	def get_sentence_transformer_embedding(self, sentence, transformer_name="all-MiniLM-L6-v2"):
	"""
	Encode a sentence using sentence transformer and return embedding
	"""

	sentenceTransformer = SentenceTransformer(transformer_name)

	return sentenceTransformer.encode(sentence)

	def get_averaged_glove_embeddings(self, sentence, embeddings_dict):
	words = sentence.split(" ")
	# Initialize an array of zeros for the embedding
	glove_embedding = np.zeros(embeddings_dict['embeddings'].shape[1])

	count_words = 0
	for word in words:
	word = word.lower() # Convert to lowercase to match the embeddings dictionary
	if word in embeddings_dict['word_index']:
	# Sum up embeddings for each word
	glove_embedding += embeddings_dict['embeddings'][embeddings_dict['word_index'][word]]
	count_words += 1

	if count_words > 0:
	# Average the embeddings
	glove_embedding /= count_words

	return glove_embedding


	class Search:

	def __init__(self, embeddings_model):
	self.embeddings_model = embeddings_model

	def cosine_similarity(self, x, y):

	return np.dot(x, y) / max(la.norm(x) * la.norm(y), 1e-3)

	def normalize_func(self, vector):
	norm = np.linalg.norm(vector)
	if norm == 0:
	return vector
	return vector / norm

	def find_closest_words(self, current_embedding, answer_list, word_index_dict, embeddings):
	"""
	Find the closest word to the target embedding from a list of answer_list
	"""
	highest_similarity = -50
	closest_answer = None

	for choice in answer_list:
	choice_embedding = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings)
	similarity = self.cosine_similarity(current_embedding, choice_embedding)
	if similarity > highest_similarity:
	highest_similarity = similarity
	closest_answer = choice

	return closest_answer

	def find_word_as(self, current_relation, target_word, answer_list, word_index_dict, embeddings):

	base_vector_a = self.embeddings_model.get_glove_embedding(current_relation[0], word_index_dict, embeddings)
	base_vector_b = self.embeddings_model.get_glove_embedding(current_relation[1], word_index_dict, embeddings)
	target_vector = self.embeddings_model.get_glove_embedding(target_word, word_index_dict, embeddings)

	ref_difference = self.normalize_func(base_vector_b - base_vector_a)

	answer = None
	highest_similarity = -50

	for choice in answer_list:
	choice_vector = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings)
	choice_difference = self.normalize_func(choice_vector - target_vector)
	similarity = self.cosine_similarity(ref_difference, choice_difference)
	if similarity > highest_similarity:
	highest_similarity = similarity
	answer = choice

	return answer

	def find_similarity_scores(self, current_embedding, choices, word_index_dict, embeddings):

	similarity_scores = {}

	for choice in choices:
	choice_embedding = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings)
	similarity = self.cosine_similarity(current_embedding, choice_embedding)
	similarity_scores[choice] = similarity

	return similarity_scores

	def get_topK_similar_categories(self, sentence, categories, top_k=10):
	"""
	Return the most similar categories to a given sentence -
	This is a baseline implementation of a semantic search engine
	"""

	# Implement your code here
	sentence_embedding = self.embeddings_model.get_sentence_transformer_embedding(sentence)

	similarities = {}
	for category, category_embedding in categories.items():
	similarity = self.cosine_similarity(sentence_embedding, category_embedding)
	similarities[category] = similarity
	# print(similarity)

	# sorted_categories ={}
	# sorted_categories = sorted(similarities, key=lambda x: x[1], reverse=True)

	sorted_cosine_sim = dict(sorted(similarities.items(), key=lambda item: item[1], reverse=True))

	# Return top K categories
	return sorted_cosine_sim


	def plot_alatirchart(sorted_cosine_scores_models):
	models = list(sorted_cosine_scores_models.keys())
	tabs = st.tabs(models)
	figs = {}
	for model in models:
	# modified
	figs[model] = plot_piechart_helper(sorted_cosine_scores_models[model])

	for index in range(len(tabs)):
	with tabs[index]:
	st.pyplot(figs[models[index]])


	import matplotlib.pyplot as plt


	def plot_pie_chart(category_simiarity_scores):
	categories = list(category_simiarity_scores.keys())
	cur_similarities = list(category_simiarity_scores.values())

	similarities = [similar / sum(cur_similarities) for similar in cur_similarities]

	fig, ax = plt.subplots()
	ax.pie(similarities, labels=categories,
	autopct="%1.1f%%",
	startangle=90)
	ax.axis('equal')
	plt.show()


	def plot_piechart_helper(sorted_cosine_scores_items):
	sorted_cosine_scores = np.array(list(sorted_cosine_scores_items.values()))
	categories_sorted = list(sorted_cosine_scores_items.keys())

	fig, ax = plt.subplots(figsize=(3, 3))
	my_explode = np.zeros(len(categories_sorted))
	my_explode[0] = 0.2
	if len(categories_sorted) == 3:
	my_explode[1] = 0.1
	elif len(categories_sorted) > 3:
	my_explode[2] = 0.05

	ax.pie(
	sorted_cosine_scores,
	labels=categories_sorted,
	autopct="%1.1f%%",
	explode=my_explode,
	)

	return fig


	import streamlit as st

	### Text Search ###
	st.sidebar.title("GloVe Twitter")
	st.sidebar.markdown(
	"""
	GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Pretrained on
	2 billion tweets with vocabulary size of 1.2 million. Download from [Stanford NLP](http://nlp.stanford.edu/data/glove.twitter.27B.zip).

	Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation.
	"""
	)

	if 'categories' not in st.session_state:
	st.session_state['categories'] = "Flowers Colors Cars Weather Food"
	if 'text_search' not in st.session_state:
	st.session_state['text_search'] = "Roses are red, trucks are blue, and Seattle is grey right now"

	embeddings_model = Embeddings()

	model_type = st.sidebar.selectbox("Choose the model", ("25d", "50d"), index=1)

	st.title("Demo in in-class coding")
	st.subheader(
	"Pass in space separated categories you want this search demo to be about."
	)

	# categories of user input
	user_categories = st.text_input(
	label="Categories", value=st.session_state.categories
	)

	st.session_state.categories = user_categories.split(" ")

	print(st.session_state.get("categories"))

	print(type(st.session_state.get("categories")))

	st.subheader("Pass in an input word or even a sentence")
	user_text_search = st.text_input(
	label="Input your sentence",
	value=st.session_state.text_search,
	)

	st.session_state.text_search = user_text_search

	# Load glove embeddings
	word_index_dict, embeddings = embeddings_model.load_glove_embeddings(model_type)

	category_embeddings = {category: embeddings_model.get_sentence_transformer_embedding(category) for category in
	st.session_state.categories}

	search_using_cos = Search(embeddings_model)

	# Find closest word to an input word
	if st.session_state.text_search:
	# sentence transformer embeddings
	print("sentence transformer Embedding")
	embeddings_metadata = {
	"word_index_dict": word_index_dict,
	"embeddings": embeddings,
	"model_type": model_type,
	"text_search": st.session_state.text_search
	}
	with st.spinner("Obtaining Cosine similarity for Glove..."):
	sorted_cosine_sim_transformer = search_using_cos.get_topK_similar_categories(
	st.session_state.text_search, category_embeddings
	)

	# Results and Plot Pie Chart for Glove
	print("Categories are: ", st.session_state.categories)
	st.subheader(
	"Closest word I have between: "
	+ " ".join(st.session_state.categories)
	+ " as per different Embeddings"
	)

	# print(sorted_cosine_sim_glove)
	print(sorted_cosine_sim_transformer)
	print(list(sorted_cosine_sim_transformer.keys())[0])

	st.write(
	f"Closest category using sentence transformer embeddings : {list(sorted_cosine_sim_transformer.keys())[0]}")

	plot_alatirchart(
	{
	"sentence_transformer_384": sorted_cosine_sim_transformer,
	}
	)

	st.write("")
	st.write(
	"Demo developed by Kechen Liu"
	)