Spaces:
Build error
Build error
taskswithcode
committed on
Commit
·
889fe1c
1
Parent(s):
aea620e
Fix
Browse files- app.py +1 -1
- twc_clustering.py +29 -5
app.py
CHANGED
|
@@ -160,7 +160,7 @@ def display_results(orig_sentences,results,response_info,app_mode,model_name):
|
|
| 160 |
main_sent = f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">{response_info}<br/><br/></div>"
|
| 161 |
main_sent += f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">Showing results for model: <b>{model_name}</b></div>"
|
| 162 |
score_text = "cosine distance"
|
| 163 |
-
main_sent += f"<div style=\"font-size:14px; color: #6f6f6f; text-align: left\">Clustering by {score_text}. <b>{len(results['clusters'])} clusters</b>. mean:{results['info']['mean']:.2f}
|
| 164 |
body_sent = []
|
| 165 |
download_data = {}
|
| 166 |
for i in range(len(results["clusters"])):
|
|
|
|
| 160 |
main_sent = f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">{response_info}<br/><br/></div>"
|
| 161 |
main_sent += f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">Showing results for model: <b>{model_name}</b></div>"
|
| 162 |
score_text = "cosine distance"
|
| 163 |
+
main_sent += f"<div style=\"font-size:14px; color: #6f6f6f; text-align: left\">Clustering by {score_text}. <b>{len(results['clusters'])} clusters</b>. mean:{results['info']['mean']:.2f}; std:{results['info']['std']:.2f}; current threshold:{results['info']['current_threshold']}<br/>Threshold hints:{str(results['info']['zscores'])}<br/>Overlap stats(overlap,freq):{str(results['info']['overlap'])}</div>"
|
| 164 |
body_sent = []
|
| 165 |
download_data = {}
|
| 166 |
for i in range(len(results["clusters"])):
|
twc_clustering.py
CHANGED
|
@@ -14,7 +14,7 @@ class TWCClustering:
|
|
| 14 |
print("In Zscore Clustering")
|
| 15 |
|
| 16 |
def compute_matrix(self,embeddings):
|
| 17 |
-
print("Computing similarity matrix ...)")
|
| 18 |
embeddings= np.array(embeddings)
|
| 19 |
start = time.time()
|
| 20 |
vec_a = embeddings.T #vec_a shape (1024,)
|
|
@@ -23,7 +23,7 @@ class TWCClustering:
|
|
| 23 |
similarity_matrix = np.inner(vec_a,vec_a)
|
| 24 |
end = time.time()
|
| 25 |
time_val = (end-start)*1000
|
| 26 |
-
print(f"Similarity matrix computation complete. Time taken:{(time_val/(1000*60)):.2f} minutes")
|
| 27 |
return similarity_matrix
|
| 28 |
|
| 29 |
def get_terms_above_threshold(self,matrix,embeddings,pivot_index,threshold):
|
|
@@ -63,6 +63,24 @@ class TWCClustering:
|
|
| 63 |
return {"pivot_index":center_index,"orig_index":pivot_index,"neighs":sorted_d}
|
| 64 |
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
def cluster(self,output_file,texts,embeddings,threshold = 1.5):
|
| 67 |
matrix = self.compute_matrix(embeddings)
|
| 68 |
mean = np.mean(matrix)
|
|
@@ -71,13 +89,14 @@ class TWCClustering:
|
|
| 71 |
inc = 0
|
| 72 |
value = mean
|
| 73 |
while (value < 1):
|
| 74 |
-
zscores.append(round(value,2))
|
| 75 |
inc += 1
|
| 76 |
value = mean + inc*std
|
| 77 |
-
print("In clustering:",round(std,2),zscores)
|
| 78 |
cluster_dict = {}
|
| 79 |
cluster_dict["clusters"] = []
|
| 80 |
picked_dict = {}
|
|
|
|
| 81 |
|
| 82 |
for i in range(len(embeddings)):
|
| 83 |
if (i in picked_dict):
|
|
@@ -86,8 +105,13 @@ class TWCClustering:
|
|
| 86 |
arr = self.get_terms_above_threshold(matrix,embeddings,i,zscore)
|
| 87 |
cluster_info = self.find_pivot_subgraph(i,arr,matrix,zscore)
|
| 88 |
self.update_picked_dict(picked_dict,cluster_info["neighs"])
|
|
|
|
| 89 |
cluster_dict["clusters"].append(cluster_info)
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
return cluster_dict
|
| 92 |
|
| 93 |
|
|
|
|
| 14 |
print("In Zscore Clustering")
|
| 15 |
|
| 16 |
def compute_matrix(self,embeddings):
|
| 17 |
+
#print("Computing similarity matrix ...)")
|
| 18 |
embeddings= np.array(embeddings)
|
| 19 |
start = time.time()
|
| 20 |
vec_a = embeddings.T #vec_a shape (1024,)
|
|
|
|
| 23 |
similarity_matrix = np.inner(vec_a,vec_a)
|
| 24 |
end = time.time()
|
| 25 |
time_val = (end-start)*1000
|
| 26 |
+
#print(f"Similarity matrix computation complete. Time taken:{(time_val/(1000*60)):.2f} minutes")
|
| 27 |
return similarity_matrix
|
| 28 |
|
| 29 |
def get_terms_above_threshold(self,matrix,embeddings,pivot_index,threshold):
|
|
|
|
| 63 |
return {"pivot_index":center_index,"orig_index":pivot_index,"neighs":sorted_d}
|
| 64 |
|
| 65 |
|
| 66 |
+
def update_overlap_stats(self, overlap_dict, cluster_info):
    """Add one cluster's neighbours to a running overlap tally.

    Args:
        overlap_dict: Mapping from a neighbour key to the number of times
            that key has been seen so far. Mutated in place.
        cluster_info: Cluster record; only the keys of its ``"neighs"``
            mapping are read.

    Returns:
        None. The result is accumulated in ``overlap_dict``.
    """
    # Iterate the mapping directly (no throwaway key list) and let
    # dict.get supply the starting zero for keys seen for the first time.
    for neigh in cluster_info["neighs"]:
        overlap_dict[neigh] = overlap_dict.get(neigh, 0) + 1
|
| 73 |
+
|
| 74 |
+
def bucket_overlap(self, overlap_dict):
    """Build a histogram of the overlap counts.

    Args:
        overlap_dict: Mapping from an item to its overlap count. Only the
            values are used.

    Returns:
        An ``OrderedDict`` mapping each distinct overlap count to the
        number of items that have that count, ordered by that number in
        ascending order. Ties keep the order in which the counts were first
        encountered, because ``sorted`` is stable.
    """
    bucket_dict = {}
    # Only the counts matter here, so walk the values and avoid a second
    # lookup per key.
    for overlap in overlap_dict.values():
        bucket_dict[overlap] = bucket_dict.get(overlap, 0) + 1
    # Sort by bucket size (the histogram value), smallest bucket first.
    return OrderedDict(sorted(bucket_dict.items(), key=lambda kv: kv[1]))
|
| 83 |
+
|
| 84 |
def cluster(self,output_file,texts,embeddings,threshold = 1.5):
|
| 85 |
matrix = self.compute_matrix(embeddings)
|
| 86 |
mean = np.mean(matrix)
|
|
|
|
| 89 |
inc = 0
|
| 90 |
value = mean
|
| 91 |
while (value < 1):
|
| 92 |
+
zscores.append({"threshold":inc,"cosine":round(value,2)})
|
| 93 |
inc += 1
|
| 94 |
value = mean + inc*std
|
| 95 |
+
#print("In clustering:",round(std,2),zscores)
|
| 96 |
cluster_dict = {}
|
| 97 |
cluster_dict["clusters"] = []
|
| 98 |
picked_dict = {}
|
| 99 |
+
overlap_dict = {}
|
| 100 |
|
| 101 |
for i in range(len(embeddings)):
|
| 102 |
if (i in picked_dict):
|
|
|
|
| 105 |
arr = self.get_terms_above_threshold(matrix,embeddings,i,zscore)
|
| 106 |
cluster_info = self.find_pivot_subgraph(i,arr,matrix,zscore)
|
| 107 |
self.update_picked_dict(picked_dict,cluster_info["neighs"])
|
| 108 |
+
self.update_overlap_stats(overlap_dict,cluster_info)
|
| 109 |
cluster_dict["clusters"].append(cluster_info)
|
| 110 |
+
curr_threshold = f"{threshold} (cosine:{mean+threshold*std:.2f})"
|
| 111 |
+
sorted_d = OrderedDict(sorted(overlap_dict.items(), key=lambda kv: kv[1], reverse=True))
|
| 112 |
+
#print(sorted_d)
|
| 113 |
+
sorted_d = self.bucket_overlap(overlap_dict)
|
| 114 |
+
cluster_dict["info"] ={"mean":mean,"std":std,"current_threshold":curr_threshold,"zscores":zscores,"overlap":list(sorted_d.items())}
|
| 115 |
return cluster_dict
|
| 116 |
|
| 117 |
|