yuanjunchai
committed on
Commit
·
de5ed54
1
Parent(s):
8266b4e
add application files
Browse files
app.py
CHANGED
|
@@ -140,10 +140,7 @@ def update_category_embeddings(embeddings_metadata):
|
|
| 140 |
get_category_embeddings(embeddings_metadata)
|
| 141 |
|
| 142 |
|
| 143 |
-
|
| 144 |
-
|
| 145 |
### Plotting utility functions
|
| 146 |
-
|
| 147 |
def plot_piechart(sorted_cosine_scores_items):
|
| 148 |
sorted_cosine_scores = np.array([
|
| 149 |
sorted_cosine_scores_items[index][1]
|
|
@@ -243,25 +240,17 @@ def cosine_similarity(x, y):
|
|
| 243 |
3. Return exponentiated cosine similarity
|
| 244 |
(20 pts)
|
| 245 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
dot_product = np.dot(x, y)
|
| 250 |
-
|
| 251 |
-
norm_x = np.linalg.norm(x)
|
| 252 |
-
norm_y = np.linalg.norm(y)
|
| 253 |
-
|
| 254 |
-
if norm_x == 0 or norm_y == 0:
|
| 255 |
-
cosine_sim = 0
|
| 256 |
-
else:
|
| 257 |
-
cosine_sim = dot_product / (norm_x * norm_y)
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
exp_cosine_sim = math.exp(cosine_sim)
|
| 261 |
|
| 262 |
return exp_cosine_sim
|
| 263 |
|
| 264 |
-
|
| 265 |
|
| 266 |
# Task II: Average Glove Embedding Calculation
|
| 267 |
def averaged_glove_embeddings_gdrive(sentence, word_index_dict, embeddings, model_type=50):
|
|
@@ -274,22 +263,23 @@ def averaged_glove_embeddings_gdrive(sentence, word_index_dict, embeddings, mode
|
|
| 274 |
5. Return averaged embeddings
|
| 275 |
(30 pts)
|
| 276 |
"""
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
embedding_dim = np.zeros(int(model_type.split("d")[0]))
|
| 280 |
-
embedding = np.zeros(embedding_dim)
|
| 281 |
|
|
|
|
| 282 |
words = sentence.split()
|
| 283 |
-
|
| 284 |
-
valid_word_count = 0
|
| 285 |
|
| 286 |
for word in words:
|
| 287 |
-
if word
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
|
| 294 |
return embedding
|
| 295 |
|
|
@@ -395,7 +385,7 @@ if __name__ == "__main__":
|
|
| 395 |
key="text_search",
|
| 396 |
value="Roses are red, trucks are blue, and Seattle is grey right now",
|
| 397 |
)
|
| 398 |
-
|
| 399 |
|
| 400 |
# Download glove embeddings if it doesn't exist
|
| 401 |
embeddings_path = "embeddings_" + str(model_type) + "_temp.npy"
|
|
@@ -426,7 +416,7 @@ if __name__ == "__main__":
|
|
| 426 |
}
|
| 427 |
with st.spinner("Obtaining Cosine similarity for Glove..."):
|
| 428 |
sorted_cosine_sim_glove = get_sorted_cosine_similarity(
|
| 429 |
-
|
| 430 |
)
|
| 431 |
|
| 432 |
# Sentence transformer embeddings
|
|
@@ -434,7 +424,7 @@ if __name__ == "__main__":
|
|
| 434 |
embeddings_metadata = {"embedding_model": "transformers", "model_name": ""}
|
| 435 |
with st.spinner("Obtaining Cosine similarity for 384d sentence transformer..."):
|
| 436 |
sorted_cosine_sim_transformer = get_sorted_cosine_similarity(
|
| 437 |
-
|
| 438 |
)
|
| 439 |
|
| 440 |
# Results and Plot Pie Chart for Glove
|
|
|
|
| 140 |
get_category_embeddings(embeddings_metadata)
|
| 141 |
|
| 142 |
|
|
|
|
|
|
|
| 143 |
### Plotting utility functions
|
|
|
|
| 144 |
def plot_piechart(sorted_cosine_scores_items):
|
| 145 |
sorted_cosine_scores = np.array([
|
| 146 |
sorted_cosine_scores_items[index][1]
|
|
|
|
| 240 |
3. Return exponentiated cosine similarity
|
| 241 |
(20 pts)
|
| 242 |
"""
|
| 243 |
+
x_norm = np.linalg.norm(x)
|
| 244 |
+
y_norm = np.linalg.norm(y)
|
| 245 |
+
if x_norm == 0 or y_norm == 0:
|
| 246 |
+
raise ValueError("Cannot compute cosine similarity with zero vector")
|
| 247 |
+
|
| 248 |
+
cosine_sim = np.dot(x, y) / (x_norm * y_norm)
|
| 249 |
|
| 250 |
+
exp_cosine_sim = np.exp(cosine_sim)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
|
| 252 |
return exp_cosine_sim
|
| 253 |
|
|
|
|
| 254 |
|
| 255 |
# Task II: Average Glove Embedding Calculation
|
| 256 |
def averaged_glove_embeddings_gdrive(sentence, word_index_dict, embeddings, model_type=50):
    """Return the mean GloVe embedding of the known words in ``sentence``.

    The sentence is split on whitespace. Every token that is a key of
    ``word_index_dict`` contributes the vector
    ``embeddings[word_index_dict[token]]``; the contributions are summed and
    divided by the number of contributing tokens. Unknown tokens are ignored.

    Args:
        sentence: Text to embed.
        word_index_dict: Mapping from a word to its index into ``embeddings``.
        embeddings: Indexable collection of word vectors. Each vector is
            assumed to have the length given by ``model_type`` (TODO confirm
            against the loader).
        model_type: Embedding dimension, either as an int (``50``, the
            default) or as a string with a trailing ``d`` (``"50d"``).

    Returns:
        A 1-D NumPy array holding the averaged embedding.

    Raises:
        ValueError: If no token of ``sentence`` is in ``word_index_dict``.
    """
    # str() lets the integer default (50) go through the same parsing as
    # "50d"; calling .split() on the int directly raised AttributeError.
    embedding_dim = int(str(model_type).split("d")[0])
    embedding = np.zeros(embedding_dim)

    valid_words = 0
    for word in sentence.split():
        if word in word_index_dict:
            embedding += embeddings[word_index_dict[word]]
            valid_words += 1

    # The guard must fire when nothing matched. The previous `> 0` test
    # rejected every usable sentence and divided by zero for empty matches.
    if valid_words == 0:
        raise ValueError("No valid words in sentence")

    embedding /= valid_words
    return embedding
|
| 285 |
|
|
|
|
| 385 |
key="text_search",
|
| 386 |
value="Roses are red, trucks are blue, and Seattle is grey right now",
|
| 387 |
)
|
| 388 |
+
st.session_state.text_search = text_search
|
| 389 |
|
| 390 |
# Download glove embeddings if it doesn't exist
|
| 391 |
embeddings_path = "embeddings_" + str(model_type) + "_temp.npy"
|
|
|
|
| 416 |
}
|
| 417 |
with st.spinner("Obtaining Cosine similarity for Glove..."):
|
| 418 |
sorted_cosine_sim_glove = get_sorted_cosine_similarity(
|
| 419 |
+
embeddings_metadata
|
| 420 |
)
|
| 421 |
|
| 422 |
# Sentence transformer embeddings
|
|
|
|
| 424 |
embeddings_metadata = {"embedding_model": "transformers", "model_name": ""}
|
| 425 |
with st.spinner("Obtaining Cosine similarity for 384d sentence transformer..."):
|
| 426 |
sorted_cosine_sim_transformer = get_sorted_cosine_similarity(
|
| 427 |
+
embeddings_metadata
|
| 428 |
)
|
| 429 |
|
| 430 |
# Results and Plot Pie Chart for Glove
|