| """ | |
| In this code block, you can develop a class for Embeddings - | |
| That can fetch embeddings of different kinds for the purpose of "Semantic Search" | |
| """ | |
| from sentence_transformers import SentenceTransformer | |
| import numpy as np | |
| import pickle | |
| import numpy.linalg as la | |


class Embeddings:
    def __init__(self):
        """
        Initialize the class
        """
        self.glove_embedding_dimension = 50

    def download_glove_embeddings(self):
        """
        Download GloVe embeddings from the web, or use the pre-converted
        copies from your Google Drive if in optimized format.
        """
        # Pre-converted copies cached on Google Drive (Colab paths)
        embeddings_temp = "/content/drive/MyDrive/LLM596/embeddings_50d_temp.npy"
        word_index_temp = "/content/drive/MyDrive/LLM596/word_index_dict_50d_temp.pkl"
        return embeddings_temp, word_index_temp

    def load_glove_embeddings(self, embedding_dimension):
        """
        Load the word-index dictionary and embedding matrix from local files.
        """
        # Filenames follow the "<dim>d" convention, e.g. "50d" ->
        # "word_index_dict_50d_temp.pkl" (the app passes "25d" or "50d")
        word_index_temp = f"word_index_dict_{embedding_dimension}_temp.pkl"
        embeddings_temp = f"embeddings_{embedding_dimension}_temp.npy"

        # Load word index dictionary
        with open(word_index_temp, "rb") as f:
            word_index_dict = pickle.load(f, encoding="latin")

        # Load embeddings numpy array
        embeddings = np.load(embeddings_temp)

        # Keep the fallback zero-vector dimension in sync with the loaded matrix
        self.glove_embedding_dimension = embeddings.shape[1]

        return word_index_dict, embeddings
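
    # The cached files above can be produced once from the raw GloVe text
    # download (a hedged sketch; the filename "glove.twitter.27B.50d.txt" is
    # assumed from the Stanford link in the sidebar, not taken from this repo):
    #
    #     word_index_dict, vectors = {}, []
    #     with open("glove.twitter.27B.50d.txt", encoding="utf-8") as f:
    #         for i, line in enumerate(f):
    #             parts = line.rstrip().split(" ")
    #             word_index_dict[parts[0]] = i
    #             vectors.append(np.asarray(parts[1:], dtype=np.float32))
    #     np.save("embeddings_50d_temp.npy", np.stack(vectors))
    #     with open("word_index_dict_50d_temp.pkl", "wb") as f:
    #         pickle.dump(word_index_dict, f)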

    def get_glove_embedding(self, word, word_index_dict, embeddings):
        """
        Retrieve the GloVe embedding for a word, falling back to a zero
        vector when the word is out of vocabulary.
        """
        word = word.lower()
        if word in word_index_dict:
            return embeddings[word_index_dict[word]]
        else:
            return np.zeros(self.glove_embedding_dimension)

    def embeddings_before_answer(self, word_index_dict, positive_words, negative_words, embeddings):
        """
        Build an analogy query vector: subtract the embeddings of the
        negative words and add the embeddings of the positive words.
        """
        new_embedding = np.zeros(self.glove_embedding_dimension)

        # Subtract embeddings of the negative words
        for word in negative_words:
            new_embedding -= self.get_glove_embedding(word, word_index_dict, embeddings)

        # Add embeddings of the positive words
        for word in positive_words:
            new_embedding += self.get_glove_embedding(word, word_index_dict, embeddings)

        return new_embedding
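
    # A classic worked example of the vector arithmetic above: "king" - "man"
    # + "woman" should land near "queen" in GloVe space (illustrative only;
    # results depend on the embeddings actually loaded):
    #
    #     query = emb.embeddings_before_answer(
    #         word_index_dict,
    #         positive_words=["king", "woman"],
    #         negative_words=["man"],
    #         embeddings=embeddings,
    #     )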

    def get_sentence_transformer_embedding(self, sentence, transformer_name="all-MiniLM-L6-v2"):
        """
        Encode a sentence using a sentence transformer and return the embedding.
        """
        sentenceTransformer = SentenceTransformer(transformer_name)
        return sentenceTransformer.encode(sentence)

    def get_averaged_glove_embeddings(self, sentence, embeddings_dict):
        """
        Average the GloVe embeddings of all in-vocabulary words in a sentence.
        """
        words = sentence.split(" ")

        # Initialize an array of zeros for the embedding
        glove_embedding = np.zeros(embeddings_dict["embeddings"].shape[1])

        count_words = 0
        for word in words:
            word = word.lower()  # Convert to lowercase to match the embeddings dictionary
            if word in embeddings_dict["word_index"]:
                # Sum up embeddings for each word
                glove_embedding += embeddings_dict["embeddings"][embeddings_dict["word_index"][word]]
                count_words += 1

        if count_words > 0:
            # Average the embeddings
            glove_embedding /= count_words

        return glove_embedding
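
# A minimal usage sketch for the class above (commented so it does not run as
# part of the app; assumes the 50d cache files exist in the working directory):
#
#     emb = Embeddings()
#     word_index_dict, embeddings = emb.load_glove_embeddings("50d")
#     vec = emb.get_glove_embedding("seattle", word_index_dict, embeddings)
#     avg = emb.get_averaged_glove_embeddings(
#         "seattle is grey",
#         {"word_index": word_index_dict, "embeddings": embeddings},
#     )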


class Search:
    def __init__(self, embeddings_model):
        self.embeddings_model = embeddings_model

    def cosine_similarity(self, x, y):
        """
        Cosine similarity, with the denominator clamped away from zero so
        all-zero (out-of-vocabulary) vectors do not cause a division by zero.
        """
        return np.dot(x, y) / max(la.norm(x) * la.norm(y), 1e-3)
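
    # Quick sanity check of the formula (commented so it does not run in the
    # app): the cosine of [1, 0] and [1, 1] is 1 / sqrt(2) ≈ 0.707, orthogonal
    # vectors score 0, and opposite vectors score -1.
    #
    #     s = Search(Embeddings())
    #     s.cosine_similarity(np.array([1.0, 0.0]), np.array([1.0, 1.0]))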

    def normalize_func(self, vector):
        """
        Return the unit-length version of a vector (zero vectors pass through).
        """
        norm = np.linalg.norm(vector)
        if norm == 0:
            return vector
        return vector / norm

    def find_closest_words(self, current_embedding, answer_list, word_index_dict, embeddings):
        """
        Find the closest word to the target embedding from answer_list.
        """
        highest_similarity = -50  # cosine similarity is bounded below by -1, so -50 is a safe sentinel
        closest_answer = None

        for choice in answer_list:
            choice_embedding = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings)
            similarity = self.cosine_similarity(current_embedding, choice_embedding)
            if similarity > highest_similarity:
                highest_similarity = similarity
                closest_answer = choice

        return closest_answer

    def find_word_as(self, current_relation, target_word, answer_list, word_index_dict, embeddings):
        """
        Solve an analogy "a is to b as target_word is to ?" by comparing each
        candidate's normalized difference vector against (b - a).
        """
        base_vector_a = self.embeddings_model.get_glove_embedding(current_relation[0], word_index_dict, embeddings)
        base_vector_b = self.embeddings_model.get_glove_embedding(current_relation[1], word_index_dict, embeddings)
        target_vector = self.embeddings_model.get_glove_embedding(target_word, word_index_dict, embeddings)

        ref_difference = self.normalize_func(base_vector_b - base_vector_a)

        answer = None
        highest_similarity = -50
        for choice in answer_list:
            choice_vector = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings)
            choice_difference = self.normalize_func(choice_vector - target_vector)
            similarity = self.cosine_similarity(ref_difference, choice_difference)
            if similarity > highest_similarity:
                highest_similarity = similarity
                answer = choice

        return answer

    def find_similarity_scores(self, current_embedding, choices, word_index_dict, embeddings):
        """
        Map each choice word to its cosine similarity with current_embedding.
        """
        similarity_scores = {}
        for choice in choices:
            choice_embedding = self.embeddings_model.get_glove_embedding(choice, word_index_dict, embeddings)
            similarity_scores[choice] = self.cosine_similarity(current_embedding, choice_embedding)
        return similarity_scores

    def get_topK_similar_categories(self, sentence, categories, top_k=10):
        """
        Return the top_k categories most similar to a given sentence -
        this is a baseline implementation of a semantic search engine.
        `categories` maps each category name to its precomputed embedding.
        """
        sentence_embedding = self.embeddings_model.get_sentence_transformer_embedding(sentence)

        similarities = {}
        for category, category_embedding in categories.items():
            similarities[category] = self.cosine_similarity(sentence_embedding, category_embedding)

        # Sort by similarity (descending) and keep only the top K categories
        sorted_cosine_sim = dict(
            sorted(similarities.items(), key=lambda item: item[1], reverse=True)[:top_k]
        )
        return sorted_cosine_sim
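
# A usage sketch for Search (commented; assumes the GloVe cache files are
# present, and the word lists are purely illustrative):
#
#     emb = Embeddings()
#     word_index_dict, embeddings = emb.load_glove_embeddings("50d")
#     searcher = Search(emb)
#     target = emb.get_glove_embedding("rose", word_index_dict, embeddings)
#     searcher.find_closest_words(
#         target, ["flower", "car", "cloud"], word_index_dict, embeddings
#     )  # plausibly returns "flower"
#     # Analogy: man is to king as woman is to ...?
#     searcher.find_word_as(
#         ("man", "king"), "woman", ["queen", "prince", "duke"],
#         word_index_dict, embeddings,
#     )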


def plot_alatirchart(sorted_cosine_scores_models):
    """
    Render one pie chart per model, each in its own Streamlit tab.
    """
    models = list(sorted_cosine_scores_models.keys())
    tabs = st.tabs(models)
    figs = {}
    for model in models:
        figs[model] = plot_piechart_helper(sorted_cosine_scores_models[model])
    for index in range(len(tabs)):
        with tabs[index]:
            st.pyplot(figs[models[index]])


def plot_pie_chart(category_similarity_scores):
    """
    Plot a pie chart of normalized similarity scores (plain matplotlib,
    for use outside Streamlit).
    """
    categories = list(category_similarity_scores.keys())
    cur_similarities = list(category_similarity_scores.values())

    # Normalize the scores so the slices sum to 1
    similarities = [similar / sum(cur_similarities) for similar in cur_similarities]

    fig, ax = plt.subplots()
    ax.pie(similarities, labels=categories, autopct="%1.1f%%", startangle=90)
    ax.axis("equal")  # Equal aspect ratio so the pie is drawn as a circle
    plt.show()


def plot_piechart_helper(sorted_cosine_scores_items):
    """
    Build a pie-chart figure, "exploding" the highest-scoring slices for
    emphasis, and return it for Streamlit to render.
    """
    sorted_cosine_scores = np.array(list(sorted_cosine_scores_items.values()))
    categories_sorted = list(sorted_cosine_scores_items.keys())

    fig, ax = plt.subplots(figsize=(3, 3))

    # Pull the top slice out of the pie; nudge runner-up slices for larger inputs
    my_explode = np.zeros(len(categories_sorted))
    my_explode[0] = 0.2
    if len(categories_sorted) == 3:
        my_explode[1] = 0.1
    elif len(categories_sorted) > 3:
        my_explode[2] = 0.05

    ax.pie(
        sorted_cosine_scores,
        labels=categories_sorted,
        autopct="%1.1f%%",
        explode=my_explode,
    )
    return fig
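
# Example of using the helper with toy scores (commented; in the app the
# returned figure is passed to st.pyplot, as plot_alatirchart does):
#
#     fig = plot_piechart_helper({"Flowers": 0.62, "Colors": 0.25, "Food": 0.13})
#     fig.savefig("similarities.png")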


### Text Search ###
st.sidebar.title("GloVe Twitter")
st.sidebar.markdown(
    """
    GloVe is an unsupervised learning algorithm for obtaining vector representations for words.
    This variant was pretrained on 2 billion tweets with a vocabulary size of 1.2 million.
    Download from [Stanford NLP](http://nlp.stanford.edu/data/glove.twitter.27B.zip).

    Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. *GloVe: Global Vectors for Word Representation*.
    """
)

if "categories" not in st.session_state:
    st.session_state["categories"] = "Flowers Colors Cars Weather Food"

if "text_search" not in st.session_state:
    st.session_state["text_search"] = "Roses are red, trucks are blue, and Seattle is grey right now"

embeddings_model = Embeddings()

model_type = st.sidebar.selectbox("Choose the model", ("25d", "50d"), index=1)
| st.title("Demo in in-class coding") | |
| st.subheader( | |
| "Pass in space separated categories you want this search demo to be about." | |
| ) | |

# Categories from user input (kept as a space-separated string in
# st.session_state so the text box re-populates correctly on rerun)
user_categories = st.text_input(label="Categories", value=st.session_state.categories)
st.session_state.categories = user_categories
category_list = user_categories.split(" ")
print(category_list)

st.subheader("Pass in an input word or even a sentence")
user_text_search = st.text_input(
    label="Input your sentence",
    value=st.session_state.text_search,
)
st.session_state.text_search = user_text_search

# Load GloVe embeddings
word_index_dict, embeddings = embeddings_model.load_glove_embeddings(model_type)

# Precompute a sentence-transformer embedding for each category
category_embeddings = {
    category: embeddings_model.get_sentence_transformer_embedding(category)
    for category in category_list
}

search_using_cos = Search(embeddings_model)

# Find the closest category to the input sentence
if st.session_state.text_search:
    # Sentence-transformer embeddings
    print("sentence transformer Embedding")
    embeddings_metadata = {
        "word_index_dict": word_index_dict,
        "embeddings": embeddings,
        "model_type": model_type,
        "text_search": st.session_state.text_search,
    }

    with st.spinner("Obtaining cosine similarity for the sentence transformer..."):
        sorted_cosine_sim_transformer = search_using_cos.get_topK_similar_categories(
            st.session_state.text_search, category_embeddings
        )

    # Results and pie chart for the sentence-transformer model
    print("Categories are: ", category_list)
    st.subheader(
        "Closest category between: "
        + " ".join(category_list)
        + " as per different embeddings"
    )

    print(sorted_cosine_sim_transformer)
    print(list(sorted_cosine_sim_transformer.keys())[0])

    st.write(
        f"Closest category using sentence transformer embeddings : "
        f"{list(sorted_cosine_sim_transformer.keys())[0]}"
    )

    plot_alatirchart(
        {
            "sentence_transformer_384": sorted_cosine_sim_transformer,
        }
    )
| st.write("") | |
| st.write( | |
| "Demo developed by Kechen Liu" | |
| ) | |