Spaces:
Sleeping
Sleeping
Arko Banik
committed on
Commit
·
5c9e161
1
Parent(s):
d02c4b6
produces pie chart but breaks when trying to change category
Browse files- README.md +4 -4
- app.py +45 -54
- embeddings_25d_temp.npy +3 -0
- embeddings_50d_temp.npy +3 -0
- requirements.txt +0 -0
- word_index_dict_25d_temp.pkl +3 -0
- word_index_dict_50d_temp.pkl +3 -0
README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: streamlit
|
| 7 |
sdk_version: 1.30.0
|
| 8 |
app_file: app.py
|
|
|
|
| 1 |
---
|
| 2 |
+
title: MiniProject1 P4
|
| 3 |
+
emoji: 🌍
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: red
|
| 6 |
sdk: streamlit
|
| 7 |
sdk_version: 1.30.0
|
| 8 |
app_file: app.py
|
app.py
CHANGED
|
@@ -4,7 +4,6 @@ import numpy.linalg as la
|
|
| 4 |
import pickle
|
| 5 |
import os
|
| 6 |
import gdown
|
| 7 |
-
#import sentence_transformers
|
| 8 |
from sentence_transformers import SentenceTransformer
|
| 9 |
import matplotlib.pyplot as plt
|
| 10 |
import math
|
|
@@ -21,11 +20,8 @@ def cosine_similarity(x, y):
|
|
| 21 |
"""
|
| 22 |
##################################
|
| 23 |
### TODO: Add code here ##########
|
| 24 |
-
cos_sim = np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y))
|
| 25 |
-
exp_cos = np.exp(cos_sim) ######## find formula for exponentiated cosine similarity
|
| 26 |
-
|
| 27 |
##################################
|
| 28 |
-
return
|
| 29 |
|
| 30 |
|
| 31 |
# Function to Load Glove Embeddings
|
|
@@ -67,7 +63,7 @@ def download_glove_embeddings_gdrive(model_type):
|
|
| 67 |
gdown.download(id=embeddings_id, output=embeddings_temp, quiet=False)
|
| 68 |
|
| 69 |
|
| 70 |
-
@st.cache_data()
|
| 71 |
def load_glove_embeddings_gdrive(model_type):
|
| 72 |
word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
|
| 73 |
embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
|
|
@@ -128,22 +124,18 @@ def averaged_glove_embeddings_gdrive(sentence, word_index_dict, embeddings, mode
|
|
| 128 |
embedding = np.zeros(int(model_type.split("d")[0]))
|
| 129 |
##################################
|
| 130 |
##### TODO: Add code here ########
|
| 131 |
-
|
| 132 |
-
#glove_word_set= load_glove_embeddings_gdrive(model_type)
|
| 133 |
-
|
| 134 |
-
for word in sentence:
|
| 135 |
-
#print(sentence)
|
| 136 |
-
words = [word.strip('.,?!').lower() for word in sentence.split()]
|
| 137 |
-
total = 0
|
| 138 |
-
for w in words:
|
| 139 |
-
if w in embeddings:
|
| 140 |
-
embed += embeddings[w]
|
| 141 |
-
total +=1
|
| 142 |
-
if total != 0:
|
| 143 |
-
embed = embed/total
|
| 144 |
-
|
| 145 |
-
return embed
|
| 146 |
##################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
|
| 149 |
def get_category_embeddings(embeddings_metadata):
|
|
@@ -182,7 +174,7 @@ def get_sorted_cosine_similarity(embeddings_metadata):
|
|
| 182 |
(50 pts)
|
| 183 |
"""
|
| 184 |
categories = st.session_state.categories.split(" ")
|
| 185 |
-
|
| 186 |
if embeddings_metadata["embedding_model"] == "glove":
|
| 187 |
word_index_dict = embeddings_metadata["word_index_dict"]
|
| 188 |
embeddings = embeddings_metadata["embeddings"]
|
|
@@ -194,11 +186,10 @@ def get_sorted_cosine_similarity(embeddings_metadata):
|
|
| 194 |
|
| 195 |
##########################################
|
| 196 |
## TODO: Get embeddings for categories ###
|
| 197 |
-
cat_embed = []
|
| 198 |
-
for cat in categories:
|
| 199 |
-
cat_embed.append(get_glove_embeddings(cat))
|
| 200 |
-
|
| 201 |
##########################################
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
else:
|
| 204 |
model_name = embeddings_metadata["model_name"]
|
|
@@ -208,38 +199,36 @@ def get_sorted_cosine_similarity(embeddings_metadata):
|
|
| 208 |
category_embeddings = st.session_state["cat_embed_" + model_name]
|
| 209 |
|
| 210 |
print("text_search = ", st.session_state.text_search)
|
|
|
|
| 211 |
if model_name:
|
| 212 |
input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search, model_name=model_name)
|
| 213 |
else:
|
| 214 |
input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search)
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
sorted_list = sorted(cat_scores, key=lambda x: x[1])
|
| 230 |
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
|
| 242 |
-
return
|
| 243 |
|
| 244 |
|
| 245 |
def plot_piechart(sorted_cosine_scores_items):
|
|
@@ -397,7 +386,8 @@ if st.session_state.text_search:
|
|
| 397 |
}
|
| 398 |
with st.spinner("Obtaining Cosine similarity for Glove..."):
|
| 399 |
sorted_cosine_sim_glove = get_sorted_cosine_similarity(
|
| 400 |
-
st.session_state.text_search,
|
|
|
|
| 401 |
)
|
| 402 |
|
| 403 |
# Sentence transformer embeddings
|
|
@@ -405,7 +395,8 @@ if st.session_state.text_search:
|
|
| 405 |
embeddings_metadata = {"embedding_model": "transformers", "model_name": ""}
|
| 406 |
with st.spinner("Obtaining Cosine similarity for 384d sentence transformer..."):
|
| 407 |
sorted_cosine_sim_transformer = get_sorted_cosine_similarity(
|
| 408 |
-
st.session_state.text_search,
|
|
|
|
| 409 |
)
|
| 410 |
|
| 411 |
# Results and Plot Pie Chart for Glove
|
|
|
|
| 4 |
import pickle
|
| 5 |
import os
|
| 6 |
import gdown
|
|
|
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
import matplotlib.pyplot as plt
|
| 9 |
import math
|
|
|
|
| 20 |
"""
|
| 21 |
##################################
|
| 22 |
### TODO: Add code here ##########
|
|
|
|
|
|
|
|
|
|
| 23 |
##################################
|
| 24 |
+
return np.exp(np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y)))
|
| 25 |
|
| 26 |
|
| 27 |
# Function to Load Glove Embeddings
|
|
|
|
| 63 |
gdown.download(id=embeddings_id, output=embeddings_temp, quiet=False)
|
| 64 |
|
| 65 |
|
| 66 |
+
# @st.cache_data()
|
| 67 |
def load_glove_embeddings_gdrive(model_type):
|
| 68 |
word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
|
| 69 |
embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
|
|
|
|
| 124 |
embedding = np.zeros(int(model_type.split("d")[0]))
|
| 125 |
##################################
|
| 126 |
##### TODO: Add code here ########
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
##################################
|
| 128 |
+
|
| 129 |
+
words = [word.strip('.,?!').lower() for word in sentence.split()]
|
| 130 |
+
total = 0
|
| 131 |
+
for w in words:
|
| 132 |
+
if w in word_index_dict:
|
| 133 |
+
embedding += embeddings[word_index_dict[w]]
|
| 134 |
+
total +=1
|
| 135 |
+
if total != 0:
|
| 136 |
+
embedding = embedding/total
|
| 137 |
+
|
| 138 |
+
return embedding
|
| 139 |
|
| 140 |
|
| 141 |
def get_category_embeddings(embeddings_metadata):
|
|
|
|
| 174 |
(50 pts)
|
| 175 |
"""
|
| 176 |
categories = st.session_state.categories.split(" ")
|
| 177 |
+
cosine_sim = {}
|
| 178 |
if embeddings_metadata["embedding_model"] == "glove":
|
| 179 |
word_index_dict = embeddings_metadata["word_index_dict"]
|
| 180 |
embeddings = embeddings_metadata["embeddings"]
|
|
|
|
| 186 |
|
| 187 |
##########################################
|
| 188 |
## TODO: Get embeddings for categories ###
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
##########################################
|
| 190 |
+
category_embeddings = {}
|
| 191 |
+
for cat in categories:
|
| 192 |
+
category_embeddings[cat] = get_glove_embeddings(cat, word_index_dict, embeddings, model_type)
|
| 193 |
|
| 194 |
else:
|
| 195 |
model_name = embeddings_metadata["model_name"]
|
|
|
|
| 199 |
category_embeddings = st.session_state["cat_embed_" + model_name]
|
| 200 |
|
| 201 |
print("text_search = ", st.session_state.text_search)
|
| 202 |
+
print(category_embeddings)
|
| 203 |
if model_name:
|
| 204 |
input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search, model_name=model_name)
|
| 205 |
else:
|
| 206 |
input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search)
|
| 207 |
+
|
| 208 |
+
cat_scores = []
|
| 209 |
+
for index in range(len(categories)):
|
| 210 |
+
##########################################
|
| 211 |
+
# TODO: Compute cosine similarity between input sentence and categories
|
| 212 |
+
# TODO: Update category embeddings if category not found
|
| 213 |
+
##########################################
|
| 214 |
+
cat = categories[index]
|
| 215 |
+
cat_embed = category_embeddings[cat]
|
| 216 |
+
# Calc cosine sim
|
| 217 |
+
cat_scores.append((index, np.dot(input_embedding,cat_embed)))
|
| 218 |
+
# Store doc_id and score as a tuple
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
+
|
| 221 |
+
sorted_list = sorted(cat_scores, key=lambda x: x[1])
|
| 222 |
+
sorted_list = sorted_list[::-1]
|
| 223 |
+
|
| 224 |
+
sorted_cats = [element[0] for element in sorted_list]
|
| 225 |
+
|
| 226 |
+
#flip sorting order
|
| 227 |
+
# Add list to Map
|
| 228 |
+
# for cat_pair in sorted_cats:
|
| 229 |
+
# cosine_sim[cat_pair[0]] = cat_pair[1]
|
| 230 |
|
| 231 |
+
return sorted_list
|
| 232 |
|
| 233 |
|
| 234 |
def plot_piechart(sorted_cosine_scores_items):
|
|
|
|
| 386 |
}
|
| 387 |
with st.spinner("Obtaining Cosine similarity for Glove..."):
|
| 388 |
sorted_cosine_sim_glove = get_sorted_cosine_similarity(
|
| 389 |
+
# st.session_state.text_search,
|
| 390 |
+
embeddings_metadata
|
| 391 |
)
|
| 392 |
|
| 393 |
# Sentence transformer embeddings
|
|
|
|
| 395 |
embeddings_metadata = {"embedding_model": "transformers", "model_name": ""}
|
| 396 |
with st.spinner("Obtaining Cosine similarity for 384d sentence transformer..."):
|
| 397 |
sorted_cosine_sim_transformer = get_sorted_cosine_similarity(
|
| 398 |
+
# st.session_state.text_search,
|
| 399 |
+
embeddings_metadata
|
| 400 |
)
|
| 401 |
|
| 402 |
# Results and Plot Pie Chart for Glove
|
embeddings_25d_temp.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5eec0acf13b5c7d7c3bd178c1c84332347b9c0d55a474e37f4313e5289aacde3
|
| 3 |
+
size 238702880
|
embeddings_50d_temp.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e74f88cde3ff2e36c815d13955c67983cf6f81829d2582cb6789c10786e5ef66
|
| 3 |
+
size 477405680
|
requirements.txt
CHANGED
|
Binary files a/requirements.txt and b/requirements.txt differ
|
|
|
word_index_dict_25d_temp.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:674af352f703098ef122f6a8db7c5e08c5081829d49daea32e5aeac1fe582900
|
| 3 |
+
size 60284151
|
word_index_dict_50d_temp.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:674af352f703098ef122f6a8db7c5e08c5081829d49daea32e5aeac1fe582900
|
| 3 |
+
size 60284151
|