Spaces:

ExplainabiliyForAATeam
/

explainability-tool-for-aa

Running

App Files Files Community

Milad Alshomary committed on Oct 20

Commit

bd0cb8d

1 Parent(s): 016bb2f

updates

Browse files

Files changed (2) hide show

cluster_corpus.py +14 -0
utils/clustering_utils.py +136 -61

cluster_corpus.py CHANGED Viewed

@@ -37,6 +37,11 @@ def main():
         type=str,
         help="Path to the corpus file (.csv or .pkl)."
     )
     parser.add_argument(
         "model_name",
         type=str,
@@ -65,6 +70,7 @@ def main():
     # 1. Load the corpus
     corpus_df = load_corpus(args.corpus_path)
     # 2. Generate style embeddings
     print(f"\nGenerating style embeddings with model: {args.model_name}")
@@ -76,6 +82,13 @@ def main():
         model_name=args.model_name,
         task_authors_df=None
     )
     embedding_col_name = f'{args.model_name.split("/")[-1]}_style_embedding'
     print(f"Embeddings generated and stored in column '{embedding_col_name}'.")
@@ -83,6 +96,7 @@ def main():
     print(f"\nPerforming DBSCAN clustering with metric='{args.metric}' and min_samples={args.min_samples}...")
     clustered_df = clustering_author(
         background_corpus_df=clustered_df,
         embedding_clm=embedding_col_name,
         min_samples=args.min_samples,
         metric=args.metric

         type=str,
         help="Path to the corpus file (.csv or .pkl)."
     )
+    parser.add_argument(
+        "test_corpus_path",
+        type=str,
+        help="Path to the test corpus file (.csv or .pkl)."
+    )
     parser.add_argument(
         "model_name",
         type=str,
     # 1. Load the corpus
     corpus_df = load_corpus(args.corpus_path)
+    test_corpus_df = load_corpus(args.test_corpus_path)
     # 2. Generate style embeddings
     print(f"\nGenerating style embeddings with model: {args.model_name}")
         model_name=args.model_name,
         task_authors_df=None
     )
+    clustered_test_df, _ = cached_generate_style_embedding(
+        background_corpus_df=test_corpus_df,
+        text_clm='fullText',
+        model_name=args.model_name,
+        task_authors_df=None
+    )
     embedding_col_name = f'{args.model_name.split("/")[-1]}_style_embedding'
     print(f"Embeddings generated and stored in column '{embedding_col_name}'.")
     print(f"\nPerforming DBSCAN clustering with metric='{args.metric}' and min_samples={args.min_samples}...")
     clustered_df = clustering_author(
         background_corpus_df=clustered_df,
+        test_corpus_df=clustered_test_df,
         embedding_clm=embedding_col_name,
         min_samples=args.min_samples,
         metric=args.metric

utils/clustering_utils.py CHANGED Viewed

@@ -5,7 +5,7 @@ from sklearn.cluster import DBSCAN
 from sklearn.metrics import silhouette_score
 # Required for analyze_space_distance_preservation
 from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
-from scipy.stats import pearsonr
 from typing import List, Dict, Any
 import json
@@ -30,63 +30,35 @@ def sample_ds(input_file, output_file, num_insts=10000, min_num_text_per_inst=0,
     df = pd.DataFrame(out_list)
     df.to_pickle(output_file)
-def _find_best_dbscan_eps(X: np.ndarray,
-                          eps_values: List[float],
-                          min_samples: int,
-                          metric: str) -> tuple[float | None, np.ndarray | None, float]:
     """
-    Iterates through eps_values for DBSCAN and returns the parameters
-    that yield the highest silhouette score.
     Args:
         X (np.ndarray): The input data (embeddings).
-        eps_values (List[float]): List of eps values to try.
-        min_samples (int): DBSCAN min_samples parameter.
-        metric (str): Distance metric for DBSCAN and silhouette score.
     Returns:
-        tuple[float | None, np.ndarray | None, float]:
-            - best_eps: The eps value that resulted in the best score. None if no suitable clustering.
-            - best_labels: The cluster labels from the best DBSCAN run. None if no suitable clustering.
-            - best_score: The highest silhouette score achieved.
     """
-    best_score = -1.001  # Silhouette score is in [-1, 1]
-    best_labels = None
-    best_eps = None
-    for eps in eps_values:
-        if eps <= 1e-9:  # eps must be positive
-            continue
-        db = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
-        labels = db.fit_predict(X)
-        unique_labels_set = set(labels)
-        n_clusters_ = len(unique_labels_set) - (1 if -1 in unique_labels_set else 0)
-        if n_clusters_ > 1:
-            clustered_mask = (labels != -1)
-            if np.sum(clustered_mask) >= 2:  # Need at least 2 non-noise points
-                X_clustered = X[clustered_mask]
-                labels_clustered = labels[clustered_mask]
-                try:
-                    score = silhouette_score(X_clustered, labels_clustered, metric=metric)
-                    if score > best_score:
-                        best_score = score
-                        best_labels = labels.copy()
-                        best_eps = eps
-                    print('EPS:', eps, 'SCORE:', score)
-                except ValueError:  # Catch errors from silhouette_score
-                    pass
-        elif n_clusters_ == 1 and best_labels is None: # Fallback for single cluster
-            if not all(l == -1 for l in labels):
-                current_score_for_single_cluster = -0.5 # Nominal score
-                if current_score_for_single_cluster > best_score:
-                    best_score = current_score_for_single_cluster
-                    best_labels = labels.copy()
-                    best_eps = eps
-    return best_eps, best_labels, best_score
 def clustering_author(background_corpus_df: pd.DataFrame,
                       embedding_clm: str = 'style_embedding',
                       eps_values: List[float] = None,
                       min_samples: int = 5,
@@ -178,14 +150,62 @@ def clustering_author(background_corpus_df: pd.DataFrame,
         print(f"Warning: `eps_values` not provided. Using default range for metric '{metric}': {eps_values}. "
               f"It's recommended to supply `eps_values` tuned to your data.")
-    print(f"Performing DBSCAN clustering (min_samples={min_samples}, metric='{metric}') with eps values: "
-          f"{[f'{e:.2f}' for e in eps_values]}")
-    best_eps, best_labels, best_score = _find_best_dbscan_eps(X, eps_values, min_samples, metric)
     if best_labels is not None:
         num_found_clusters = len(set(best_labels) - {-1})
-        print(f"Best clustering found: eps={best_eps:.2f}, Silhouette Score={best_score:.4f} ({num_found_clusters} clusters).")
         for i, label in enumerate(best_labels):
             original_df_idx = original_indices[i]
             final_labels_for_df.iloc[original_df_idx] = label
@@ -334,17 +354,72 @@ def analyze_space_distance_preservation(
        distances_original_space.size != distances_new_space.size:
         return None # Mismatch or empty distances
-    # Handle cases where variance is zero in one of the distance arrays (leads to NaN correlation)
-    if np.all(distances_new_space == distances_new_space[0]) or \
-       np.all(distances_original_space == distances_original_space[0]):
-        return 0.0 # Correlation is undefined or 0 if one variable is constant
     try:
-        correlation, _ = pearsonr(distances_original_space, distances_new_space)
-    except ValueError: # Should be caught by variance checks, but as a safeguard
         return None
     if np.isnan(correlation):
         return 0.0 # Default for NaN correlation
-    return correlation

 from sklearn.metrics import silhouette_score
 # Required for analyze_space_distance_preservation
 from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
+from scipy.stats import pearsonr, ConstantInputWarning
 from typing import List, Dict, Any
 import json
     df = pd.DataFrame(out_list)
     df.to_pickle(output_file)
+def _calculate_silhouette_score(X: np.ndarray, labels: np.ndarray, metric: str) -> float | None:
     """
+    Calculates the silhouette score for a given clustering result.
+    Points labelled -1 (noise) are left out of both the cluster count and the
+    score computation.
     Args:
         X (np.ndarray): The input data (embeddings).
+        labels (np.ndarray): The cluster labels for each point in X.
+        metric (str): The distance metric used for the score calculation.
     Returns:
+        float | None: The silhouette score, or None if it cannot be computed
+            (fewer than two non-noise clusters, fewer than two non-noise points,
+            or `silhouette_score` raised ValueError).
     """
+    unique_labels_set = set(labels)
+    # Number of distinct labels, not counting the -1 (noise) label as a cluster.
+    n_clusters_ = len(unique_labels_set) - (1 if -1 in unique_labels_set else 0)
+    if n_clusters_ > 1:
+        # Only points that were assigned to a cluster take part in the score.
+        clustered_mask = (labels != -1)
+        if np.sum(clustered_mask) > 1:
+            X_clustered = X[clustered_mask]
+            labels_clustered = labels[clustered_mask]
+            try:
+                return silhouette_score(X_clustered, labels_clustered, metric=metric)
+            except ValueError:
+                # A ValueError from silhouette_score is reported as "not computable".
+                return None
+    return None
 def clustering_author(background_corpus_df: pd.DataFrame,
+                      test_corpus_df: pd.DataFrame = None,
                       embedding_clm: str = 'style_embedding',
                       eps_values: List[float] = None,
                       min_samples: int = 5,
         print(f"Warning: `eps_values` not provided. Using default range for metric '{metric}': {eps_values}. "
               f"It's recommended to supply `eps_values` tuned to your data.")
+    print(f"\n--- Starting DBSCAN Clustering & Evaluation ---")
+    print(f"Metric: '{metric}', Min Samples: {min_samples}, EPS values: {[f'{e:.2f}' for e in eps_values]}")
+    best_score = -1.001
+    best_labels = None
+    best_eps = None
+    # This loop now lives in `clustering_author` to have access to the full DataFrame for evaluation.
+    for eps in eps_values:
+        if eps <= 1e-9: continue
+        print(f"\nTesting eps = {eps:.3f}...")
+        db = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
+        current_labels = db.fit_predict(X)
+        # --- Evaluation Step 1: Silhouette Score ---
+        score = _calculate_silhouette_score(X, current_labels, metric)
+        if score is not None:
+            print(f"  - Silhouette Score: {score:.4f}")
+            if score > best_score:
+                best_score = score
+                best_labels = current_labels.copy()
+                best_eps = eps
+        else:
+            print("  - Silhouette Score: N/A (not enough clusters found)")
+        # --- Evaluation Step 2: Distance Preservation ---
+        # Temporarily assign labels to a copy of the DataFrame for evaluation
+        temp_df = background_corpus_df.copy()
+        temp_labels_for_df = pd.Series(-1, index=temp_df.index, dtype=int)
+        temp_labels_for_df.iloc[original_indices] = current_labels
+        temp_df['cluster_label'] = temp_labels_for_df
+        correlation = analyze_space_distance_preservation(temp_df, embedding_clm, 'cluster_label')
+        if correlation is not None:
+            print(f"  - Distance Preservation (Pearson r): {correlation:.4f}")
+        else:
+            print("  - Distance Preservation (Pearson r): N/A (not enough clusters/data)")
+        # --- Evaluation Step 3: Distance Preservation on Test Corpus (if provided) ---
+        if test_corpus_df is not None:
+            # We need the centroids from the current clustering of the background corpus
+            centroids = _compute_cluster_centroids(temp_df[temp_df['cluster_label'] != -1], embedding_clm, 'cluster_label')
+            test_correlation = evaluate_test_set_distance_preservation(test_corpus_df, centroids, embedding_clm)
+            if test_correlation is not None:
+                print(f"  - Test Set Distance Preservation (Pearson r): {test_correlation:.4f}")
+            else:
+                print("  - Test Set Distance Preservation (Pearson r): N/A (not enough test data or clusters)")
+        print('Eps {}, #clusters {}, solihouette {}, Pearson {}'.format(eps, len(set(current_labels) - {-1}), score, test_correlation))
     if best_labels is not None:
         num_found_clusters = len(set(best_labels) - {-1})
+        print(f"\n--- Best Clustering Result ---")
+        print(f"Best eps: {best_eps:.3f} yielded the highest Silhouette Score: {best_score:.4f} ({num_found_clusters} clusters).")
         for i, label in enumerate(best_labels):
             original_df_idx = original_indices[i]
             final_labels_for_df.iloc[original_df_idx] = label
        distances_original_space.size != distances_new_space.size:
         return None # Mismatch or empty distances
     try:
+        # Catching ConstantInputWarning that pearsonr can raise
+        import warnings
+        with warnings.catch_warnings():
+            warnings.filterwarnings('error', category=ConstantInputWarning)
+            correlation, _ = pearsonr(distances_original_space, distances_new_space)
+    except (ValueError, ConstantInputWarning):
+        # This happens if one of the distance arrays has zero variance (all distances are the same).
+        # This is a valid case where correlation is undefined or 0.
+        return 0.0
+    except Exception: # Safeguard for other unexpected errors
         return None
     if np.isnan(correlation):
         return 0.0 # Default for NaN correlation
+    return correlation
+def evaluate_test_set_distance_preservation(
+    test_df: pd.DataFrame,
+    centroids_map: Dict[Any, np.ndarray],
+    embedding_clm: str = 'style_embedding'
+) -> float | None:
+    """
+    Evaluates how well a centroid space (from a background corpus) preserves
+    distances for a separate test corpus.
+    Pairwise cosine distances between test embeddings are computed in the
+    original space and again after projection onto the centroids; the two
+    sets of distances are then compared with Pearson correlation.
+    Args:
+        test_df (pd.DataFrame): The test corpus DataFrame with embeddings.
+        centroids_map (Dict[Any, np.ndarray]): A map of cluster IDs to centroid vectors,
+                                               pre-computed from the background corpus.
+        embedding_clm (str): The name of the embedding column.
+    Returns:
+        float | None: Pearson correlation coefficient, or None if analysis is not possible
+            (fewer than 2 test rows, fewer than 2 centroids, unusable embedding or
+            projection matrices, or empty/mismatched distance arrays). 0.0 is returned
+            when `pearsonr` raises ValueError or ConstantInputWarning, or when the
+            resulting correlation is NaN.
+    """
+    if test_df.shape[0] < 2:
+        return None # Need at least 2 items for pairwise distances
+    if not centroids_map or len(centroids_map) < 2:
+        return None # Need at least 2 centroids to define a meaningful projected space
+    # 1. Get original embeddings and distances for the test set
+    test_embeddings_matrix = _safe_embeddings_to_matrix(test_df[embedding_clm])
+    if test_embeddings_matrix.ndim != 2 or test_embeddings_matrix.shape[0] < 2:
+        return None # Not enough valid embeddings in the test set
+    distances_original_space = _get_pairwise_cosine_distances(test_embeddings_matrix)
+    # 2. Project test embeddings into the centroid space and get new distances
+    projected_embeddings_matrix = _project_to_centroid_space(test_embeddings_matrix, centroids_map)
+    if projected_embeddings_matrix.ndim != 2 or projected_embeddings_matrix.shape[1] < 2:
+        return None # Projection failed or resulted in a space with <2 dimensions
+    distances_new_space = _get_pairwise_cosine_distances(projected_embeddings_matrix)
+    # 3. Calculate Pearson correlation
+    if distances_original_space.size != distances_new_space.size or distances_original_space.size == 0:
+        return None
+    try:
+        import warnings
+        with warnings.catch_warnings():
+            # Escalate ConstantInputWarning to an exception so that it reaches the
+            # except clause below instead of only being printed as a warning.
+            warnings.filterwarnings('error', category=ConstantInputWarning)
+            correlation, _ = pearsonr(distances_original_space, distances_new_space)
+    except (ValueError, ConstantInputWarning):
+        return 0.0 # Zero variance in one of the distance sets
+    # A NaN correlation is reported as 0.0 rather than propagated to the caller.
+    return correlation if not np.isnan(correlation) else 0.0