Milad Alshomary
committed on
Commit
·
685033a
1
Parent(s):
95d09b1
updates
Browse files
utils/clustering_utils.py
CHANGED
|
@@ -139,6 +139,15 @@ def clustering_author(background_corpus_df: pd.DataFrame,
|
|
| 139 |
background_corpus_df['cluster_label'] = final_labels_for_df
|
| 140 |
return background_corpus_df
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
if eps_values is None:
|
| 143 |
if metric == 'cosine':
|
| 144 |
eps_values = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
|
|
|
|
| 139 |
background_corpus_df['cluster_label'] = final_labels_for_df
|
| 140 |
return background_corpus_df
|
| 141 |
|
| 142 |
+
# For cosine metric, normalize embeddings to unit length.
|
| 143 |
+
# This is standard practice: on L2-normalized vectors, squared Euclidean
|
| 144 |
+
# distance equals 2 * (1 - cosine similarity), i.e. twice the cosine distance.
|
| 145 |
+
# Cosine distance itself is scale-invariant, so normalizing does not change it.
|
| 146 |
+
if metric == 'cosine':
|
| 147 |
+
from sklearn.preprocessing import normalize
|
| 148 |
+
print("Normalizing embeddings for cosine distance...")
|
| 149 |
+
X = normalize(X, norm='l2', axis=1)
|
| 150 |
+
|
| 151 |
if eps_values is None:
|
| 152 |
if metric == 'cosine':
|
| 153 |
eps_values = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
|