Spaces:

ExplainabiliyForAATeam
/

explainability-tool-for-aa

Running

App Files Files Community

Milad Alshomary committed on Oct 20

Commit

ea3113e

1 Parent(s): b623cb3

updates

Browse files

Files changed (3) hide show

cluster_corpus.py +101 -0
utils/clustering_utils.py +28 -4
utils/interp_space_utils.py +0 -1

cluster_corpus.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import argparse
+import pandas as pd
+import numpy as np
+import os
+from utils.interp_space_utils import cached_generate_style_embedding
+from utils.clustering_utils import clustering_author
def load_corpus(filepath: str) -> pd.DataFrame:
    """
    Load a corpus from a CSV or pickle file into a pandas DataFrame.

    The loader is chosen from the file extension, compared
    case-insensitively: ``.csv`` is read with ``pd.read_csv`` and ``.pkl``
    with ``pd.read_pickle``.

    Args:
        filepath: Path to the corpus file.

    Returns:
        The loaded DataFrame, which contains at least the 'authorID' and
        'fullText' columns.

    Raises:
        ValueError: If the extension is neither .csv nor .pkl, or if one of
            the required columns is missing.
    """
    print(f"Loading corpus from {filepath}...")

    # Compare on a lower-cased copy so 'corpus.CSV' / 'corpus.PKL' are accepted.
    lowered_path = filepath.lower()
    if lowered_path.endswith('.csv'):
        df = pd.read_csv(filepath)
    elif lowered_path.endswith('.pkl'):
        # NOTE(security): unpickling can execute arbitrary code; only load
        # pickle files that come from a trusted source.
        df = pd.read_pickle(filepath)
    else:
        raise ValueError("Unsupported file format. Please use .csv or .pkl")

    required_columns = ('authorID', 'fullText')
    if any(column not in df.columns for column in required_columns):
        raise ValueError("Corpus must contain 'authorID' and 'fullText' columns.")

    print(f"Corpus loaded successfully with {len(df)} documents.")
    return df
def _build_parser():
    """Build the command-line argument parser for the clustering script."""
    parser = argparse.ArgumentParser(
        description="Generate style embeddings and cluster a corpus of documents."
    )
    parser.add_argument(
        "corpus_path",
        type=str,
        help="Path to the corpus file (.csv or .pkl)."
    )
    parser.add_argument(
        "model_name",
        type=str,
        help="Hugging Face model name for sentence-transformer embeddings (e.g., 'AnnaWegmann/Style-Embedding')."
    )
    parser.add_argument(
        "output_path",
        type=str,
        help="Path to save the output DataFrame with embeddings and clusters (.pkl)."
    )
    parser.add_argument(
        "--min_samples",
        type=int,
        default=5,
        help="min_samples parameter for DBSCAN clustering."
    )
    parser.add_argument(
        "--metric",
        type=str,
        default='cosine',
        choices=['cosine', 'euclidean'],
        help="Distance metric for DBSCAN clustering."
    )
    return parser


def main(argv=None):
    """
    Run the clustering workflow: load a corpus, embed it, cluster it, save it.

    Args:
        argv: Optional list of command-line arguments. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, so invoking the script
            from the shell behaves exactly as before.

    Raises:
        SystemExit: Raised by argparse when the arguments are invalid.
        ValueError: Propagated from ``load_corpus`` for an unsupported file
            format or a corpus missing the required columns.
    """
    args = _build_parser().parse_args(argv)

    # 1. Load the corpus
    corpus_df = load_corpus(args.corpus_path)

    # 2. Generate style embeddings
    print(f"\nGenerating style embeddings with model: {args.model_name}")
    # The call result is unpacked as a pair; only the first element is used here.
    # `task_authors_df=None` is passed because a single corpus is processed.
    clustered_df, _ = cached_generate_style_embedding(
        background_corpus_df=corpus_df,
        text_clm='fullText',
        model_name=args.model_name,
        task_authors_df=None
    )
    # NOTE(review): this column name is derived here by convention from the
    # model name; confirm it matches what the embedding helper writes.
    embedding_col_name = f'{args.model_name.split("/")[-1]}_style_embedding'
    print(f"Embeddings generated and stored in column '{embedding_col_name}'.")

    # 3. Perform clustering
    print(f"\nPerforming DBSCAN clustering with metric='{args.metric}' and min_samples={args.min_samples}...")
    clustered_df = clustering_author(
        background_corpus_df=clustered_df,
        embedding_clm=embedding_col_name,
        min_samples=args.min_samples,
        metric=args.metric
    )

    # 4. Save the results
    output_dir = os.path.dirname(args.output_path)
    if output_dir:
        # A bare filename has no directory component, so only create one when present.
        os.makedirs(output_dir, exist_ok=True)
    clustered_df.to_pickle(args.output_path)
    print(f"\nSuccessfully saved clustered DataFrame to: {args.output_path}")
    print("DataFrame includes cluster labels in the 'cluster_label' column.")


if __name__ == "__main__":
    main()

utils/clustering_utils.py CHANGED Viewed

@@ -8,6 +8,28 @@ from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
 from scipy.stats import pearsonr
 from typing import List, Dict, Any
 def _find_best_dbscan_eps(X: np.ndarray,
                           eps_values: List[float],
                           min_samples: int,
@@ -143,12 +165,14 @@ def clustering_author(background_corpus_df: pd.DataFrame,
     if eps_values is None:
         if metric == 'cosine':
             eps_values = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
-        else:
             if X.shape[0] > 1:
-                data_spread = np.std(X)
                 eps_values = [round(data_spread * f, 2) for f in [0.25, 0.5, 1.0]]
-                eps_values = [e for e in eps_values if e > 1e-6]
-            if not eps_values or X.shape[0] <=1:
                  eps_values = [0.5, 1.0, 1.5]
         print(f"Warning: `eps_values` not provided. Using default range for metric '{metric}': {eps_values}. "
               f"It's recommended to supply `eps_values` tuned to your data.")

 from scipy.stats import pearsonr
 from typing import List, Dict, Any
+import json
def sample_ds(input_file, output_file, num_insts=10000, min_num_text_per_inst=0, max_num_text_per_inst=3):
    """
    Take the first ``num_insts`` records of a JSON Lines file and pickle them.

    Each non-blank input line is parsed as a JSON object; its 'syms' value is
    stored under 'fullText' and its 'author_id' value under 'authorID'. The
    resulting DataFrame is written to ``output_file`` with ``to_pickle``.

    Args:
        input_file: Path to the JSON Lines input file (read as UTF-8).
        output_file: Path of the pickle file to write.
        num_insts: Maximum number of records to read. If the file holds fewer
            records, all of them are used.
        min_num_text_per_inst: Currently unused; kept for interface
            compatibility.
        max_num_text_per_inst: Currently unused; kept for interface
            compatibility.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the 'syms' or 'author_id' key.

    Usage
    sample_ds('/mnt/swordfish-pool2/nikhil/raw_all/data.jsonl', '/mnt/swordfish-pool2/milad/hiatus-data/reddit_cluster_training.pkl',
          num_insts=5000,
          min_num_text_per_inst=3,
          max_num_text_per_inst=10)
    """
    records = []
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(input_file, encoding="utf-8") as source:
        for raw_line in source:
            if len(records) >= num_insts:
                break
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            json_obj = json.loads(line)
            records.append({
                'fullText': json_obj['syms'],
                'authorID': json_obj['author_id']
            })

    # Naming the columns keeps the schema stable even when no record was read.
    df = pd.DataFrame(records, columns=['fullText', 'authorID'])
    df.to_pickle(output_file)
 def _find_best_dbscan_eps(X: np.ndarray,
                           eps_values: List[float],
                           min_samples: int,
     if eps_values is None:
         if metric == 'cosine':
             eps_values = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
+        else: # 'euclidean' or other
             if X.shape[0] > 1:
+                # For Euclidean, eps depends on the scale of the data.
+                # A simple heuristic: a fraction of the data's standard deviation.
+                data_spread = np.std(X)
                 eps_values = [round(data_spread * f, 2) for f in [0.25, 0.5, 1.0]]
+                eps_values = [e for e in eps_values if e > 1e-6] # Filter out zero or near-zero eps
+            if not eps_values or X.shape[0] <=1: # Fallback if heuristic fails or not enough data
                  eps_values = [0.5, 1.0, 1.5]
         print(f"Warning: `eps_values` not provided. Using default range for metric '{metric}': {eps_values}. "
               f"It's recommended to supply `eps_values` tuned to your data.")

utils/interp_space_utils.py CHANGED Viewed

@@ -172,7 +172,6 @@ def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str,
     print(f"Generating style embeddings using {model_name} on column '{text_clm}'...")
-    print(background_corpus_df.fullText.tolist()[:10])
     model = SentenceTransformer(model_name)
     embedding_dim = model.get_sentence_embedding_dimension()

     print(f"Generating style embeddings using {model_name} on column '{text_clm}'...")
     model = SentenceTransformer(model_name)
     embedding_dim = model.get_sentence_embedding_dimension()