Milad Alshomary committed on
Commit
685033a
·
1 Parent(s): 95d09b1
Files changed (1) hide show
  1. utils/clustering_utils.py +9 -0
utils/clustering_utils.py CHANGED
@@ -139,6 +139,15 @@ def clustering_author(background_corpus_df: pd.DataFrame,
139
  background_corpus_df['cluster_label'] = final_labels_for_df
140
  return background_corpus_df
141
 
 
 
 
 
 
 
 
 
 
142
  if eps_values is None:
143
  if metric == 'cosine':
144
  eps_values = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
 
139
  background_corpus_df['cluster_label'] = final_labels_for_df
140
  return background_corpus_df
141
 
142
+ # For cosine metric, normalize embeddings to unit length.
143
+ # This is standard practice: on L2-normalized vectors, squared Euclidean distance
144
+ # equals 2 * (1 - cosine similarity), so the two metrics rank neighbors identically.
145
+ # DBSCAN's 'cosine' metric is scale-invariant, so its distances are unchanged.
146
+ if metric == 'cosine':
147
+ from sklearn.preprocessing import normalize
148
+ print("Normalizing embeddings for cosine distance...")
149
+ X = normalize(X, norm='l2', axis=1)
150
+
151
  if eps_values is None:
152
  if metric == 'cosine':
153
  eps_values = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]