Spaces:

vives
/

bert_cvent_top_k_sim

Runtime error

App Files Files Community

vives committed on May 23, 2022

Commit

caee7a0

1 Parent(s): 560e686

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -3

app.py CHANGED Viewed

@@ -4,11 +4,13 @@ from sklearn.metrics.pairwise import cosine_similarity
 import streamlit as st
 import torch
 import pickle
 model_checkpoint = "vives/distilbert-base-uncased-finetuned-cvent-2019_2022"
 model = AutoModelForMaskedLM.from_pretrained(model_checkpoint, output_hidden_states=True)
 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
 text = st.text_input("Enter word or key-phrase")
 exclude_words = st.radio("exclude_words",[True,False], help="Exclude results that contain any words in the query (i.e exclude 'hot coffee' if the query is 'cold coffee')")
 exclude_text = st.radio("exclude_text",[True,False], help="Exclude results that contain the query (i.e exclude 'tomato soup recipe' if the query is 'tomato soup')")
@@ -17,12 +19,17 @@ k = st.number_input("Top k nearest key-phrases",1,10,5)
 with st.sidebar:
   diversify_box = st.checkbox("Diversify results",True)
   if diversify_box:
-    k = st.number_input("Top k nearest key-phrases",10,30,20)
 with open("kp_dict_merged.pickle",'rb') as handle:
   kp_dict = pickle.load(handle)
 for key in kp_dict.keys():
   kp_dict[key] = kp_dict[key].detach().numpy()
 def calculate_top_k(out, tokens,text,exclude_text=False,exclude_words=False, k=5):
   sim_dict = {}
@@ -65,10 +72,30 @@ def pool_embeddings(out, tok):
   mean_pooled = summed / summed_mask
   return mean_pooled
 if text:
   new_tokens = concat_tokens([text])
   new_tokens.pop("KPS")
   with torch.no_grad():
     outputs = model(**new_tokens)
-  sim_dict = calculate_top_k(outputs, new_tokens, text, exclude_text=exclude_text,exclude_words=exclude_words,k=k)
-  st.json(sim_dict)

 import streamlit as st
 import torch
 import pickle
+import itertools
 model_checkpoint = "vives/distilbert-base-uncased-finetuned-cvent-2019_2022"
 model = AutoModelForMaskedLM.from_pretrained(model_checkpoint, output_hidden_states=True)
 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
 text = st.text_input("Enter word or key-phrase")
 exclude_words = st.radio("exclude_words",[True,False], help="Exclude results that contain any words in the query (i.e exclude 'hot coffee' if the query is 'cold coffee')")
 exclude_text = st.radio("exclude_text",[True,False], help="Exclude results that contain the query (i.e exclude 'tomato soup recipe' if the query is 'tomato soup')")
 with st.sidebar:
   diversify_box = st.checkbox("Diversify results",True)
   if diversify_box:
+    k_diversify = st.number_input("Set of key-phrases to diversify from",10,30,20)
+# Load the key-phrase dictionary (kp_dict) from its pickle file
 with open("kp_dict_merged.pickle",'rb') as handle:
   kp_dict = pickle.load(handle)
 for key in kp_dict.keys():
   kp_dict[key] = kp_dict[key].detach().numpy()
+#load cosine distances of kp dict
+with open("cosine_kp.pickle",'rb') as handle:
+  cosine_kp = pickle.load(handle)
 def calculate_top_k(out, tokens,text,exclude_text=False,exclude_words=False, k=5):
   sim_dict = {}
   mean_pooled = summed / summed_mask
   return mean_pooled
def extract_idxs(top_dict, kp_dict):
  """Return the positions in ``kp_dict`` of every key also present in ``top_dict``.

  Positions are zero-based and follow ``kp_dict``'s iteration order, so the
  returned list is always ascending regardless of the order of ``top_dict``.

  Args:
    top_dict: Mapping whose keys are the entries to locate.
    kp_dict: Mapping whose key order defines the positions.

  Returns:
    A list of integer positions; empty when the two mappings share no key.
  """
  # enumerate replaces the hand-maintained counter, and iterating the dict
  # directly avoids copying all of its keys into a temporary list.
  return [pos for pos, key in enumerate(kp_dict) if key in top_dict]
 if text:
   new_tokens = concat_tokens([text])
   new_tokens.pop("KPS")
   with torch.no_grad():
     outputs = model(**new_tokens)
+  if not diversify_box:
+    sim_dict = calculate_top_k(outputs, new_tokens, text, exclude_text=exclude_text,exclude_words=exclude_words,k=k)
+    st.json(sim_dict)
+  else:
+    sim_dict = calculate_top_k(outputs, new_tokens, text, exclude_text=exclude_text,exclude_words=exclude_words,k=k_diversify)
+    idxs = extract_idxs(sim_dict, kp_dict)
+    distances_candidates = cosine_kp[np.ix_(idxs, idxs)]
+    min_sim = np.inf
+    candidate = None
+    for combination in itertools.combinations(range(len(idxs)), k):
+      sim = sum([distances_candidates[i][j] for i in combination for j in combination if i != j])