Spaces:

ExplainabiliyForAATeam
/

explainability-tool-for-aa

Running

App Files Files Community

Anisha Bhatnagar committed on Aug 24

Commit

8367823

1 Parent(s): 6fc987a

showing g2v with z scores

Browse files

Files changed (3) hide show

app.py +7 -0
utils/interp_space_utils.py +21 -9
utils/visualizations.py +46 -4

app.py CHANGED Viewed

@@ -424,6 +424,13 @@ def app(share=False, use_cluster_feats=False):
                     ">
                         Gram2Vec Features prominent in the zoomed-in region
                     </div>
                     """)
                 gram2vec_rb    = gr.Radio(choices=[], label="Gram2Vec features for this zoomed-in region")#, label="Top-10 Gram2Vec Features most likely to occur in Mystery Author", info="Most prominent Gram2Vec features in the mystery text")
                 gram2vec_state = gr.State()

                     ">
                         Gram2Vec Features prominent in the zoomed-in region
                     </div>
+                    <div style="
+                        font-size: 0.9em;
+                        color: #666;
+                        margin-bottom: 1em;
+                    ">
+                        Features shown with normalized z-scores
+                    </div>
                     """)
                 gram2vec_rb    = gr.Radio(choices=[], label="Gram2Vec features for this zoomed-in region")#, label="Top-10 Gram2Vec Features most likely to occur in Mystery Author", info="Most prominent Gram2Vec features in the mystery text")
                 gram2vec_state = gr.State()

utils/interp_space_utils.py CHANGED Viewed

@@ -571,7 +571,7 @@ def compute_clusters_g2v_representation(
     mode: str = "contrastive",
     sharedness_method: str = "mean_minus_alpha_std",
     alpha: float = 0.5
-) -> List[str]:
     selected_mask = background_corpus_df['authorID'].isin(author_ids).to_numpy()
@@ -600,10 +600,10 @@ def compute_clusters_g2v_representation(
             stds  = selected_matrix.std(axis=0)
             scores = means - float(alpha) * stds
-        # Rank and return
         feature_scores = [(feat, score) for feat, score in zip(all_g2v_feats, scores) if score > 0]
         feature_scores.sort(key=lambda x: x[1], reverse=True)
-        return [feat for feat, _ in feature_scores[:top_n]]
     # Contrastive mode (default): compute target mean and subtract contrast mean
@@ -626,11 +626,23 @@ def compute_clusters_g2v_representation(
     final_g2v_feats_values = all_g2v_values - all_g2v_other_values
     # Keep only features that have a positive contrastive score
     top_g2v_feats = sorted(
-        [(feat, val) for feat, val in zip(all_g2v_feats, final_g2v_feats_values) if val > 0],
-        key=lambda x: -x[1]
     )
     # Filter out features that are not present in any of the authors
@@ -638,18 +650,18 @@ def compute_clusters_g2v_representation(
     print('Filtering in g2v features for only the following authors: ', selected_authors)
     authors_g2v_feats = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
     filtered_features = []
-    for feature, score in top_g2v_feats:
         found_in_any_author = False
         for author_g2v_feats in authors_g2v_feats:
             if author_g2v_feats[feature] > 0:
                 found_in_any_author = True
                 break
         if found_in_any_author:
-            filtered_features.append(feature)
-    print('Filtered G2V features: ', filtered_features)
-    return filtered_features[:top_n]
 def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):

     mode: str = "contrastive",
     sharedness_method: str = "mean_minus_alpha_std",
     alpha: float = 0.5
+) -> List[tuple]:  # Changed return type to List[tuple] to include scores
     selected_mask = background_corpus_df['authorID'].isin(author_ids).to_numpy()
             stds  = selected_matrix.std(axis=0)
             scores = means - float(alpha) * stds
+        # Rank and return with scores
         feature_scores = [(feat, score) for feat, score in zip(all_g2v_feats, scores) if score > 0]
         feature_scores.sort(key=lambda x: x[1], reverse=True)
+        return feature_scores[:top_n]  # Return tuples instead of just features
     # Contrastive mode (default): compute target mean and subtract contrast mean
     final_g2v_feats_values = all_g2v_values - all_g2v_other_values
+    # Compute z-scores for normalization
+    # Get population statistics from all features (both selected and contrast)
+    all_feats = background_corpus_df[features_clm_name].tolist()
+    population_matrix = np.array([list(x.values()) for x in all_feats])
+    population_mean = population_matrix.mean(axis=0)
+    population_std = population_matrix.std(axis=0)
+    # Avoid division by zero
+    population_std = np.where(population_std == 0, 1, population_std)
+    # Calculate z-scores for the contrastive values
+    z_scores = (final_g2v_feats_values - population_mean) / population_std
     # Keep only features that have a positive contrastive score
     top_g2v_feats = sorted(
+        [(feat, val, z_score) for feat, val, z_score in zip(all_g2v_feats, final_g2v_feats_values, z_scores) if val > 0],
+        key=lambda x: -x[1]  # Sort by contrastive score
     )
     # Filter out features that are not present in any of the authors
     print('Filtering in g2v features for only the following authors: ', selected_authors)
     authors_g2v_feats = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
     filtered_features = []
+    for feature, score, z_score in top_g2v_feats:
         found_in_any_author = False
         for author_g2v_feats in authors_g2v_feats:
             if author_g2v_feats[feature] > 0:
                 found_in_any_author = True
                 break
         if found_in_any_author:
+            filtered_features.append((feature, score, z_score))
+    print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features])  # Print feature names and z-scores
+    return filtered_features[:top_n]  # Return tuples with z-scores
 def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):

utils/visualizations.py CHANGED Viewed

@@ -194,6 +194,47 @@ def load_interp_space(cfg):
     }
 #function to handle zoom events
 def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors_df):
     """
@@ -268,7 +309,7 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
     for feat in g2v_feats:
         try:
             # `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
-            if any(find_feature_spans(txt, feat) for txt in task_texts):
                 filtered_g2v_feats.append(feat)
             else:
                 print(f"[INFO] Dropping G2V feature with no spans in task texts: {feat}")
@@ -278,19 +319,20 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
     # Convert to human readable for display
     HR_g2v_list = []
     for feat in filtered_g2v_feats:
-        HR_g2v = get_fullform(feat)
         print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
         if HR_g2v is None:
             print(f"Skipping Gram2Vec feature without human readable form: {feat}")
         else:
-            HR_g2v_list.append(HR_g2v)
-    HR_g2v_list = ["None"] + HR_g2v_list
     print(f"[INFO] Found {len(llm_feats)} LLM features and {len(g2v_feats)} Gram2Vec features in the zoomed region.")
     print(f"[INFO] unfiltered g2v features: {g2v_feats}")
     print(f"[INFO] LLM features: {llm_feats}")
     print(f"[INFO] Gram2Vec features: {HR_g2v_list}")
     return (

     }
+# Function to process G2V features and create display choices
def format_g2v_features_for_display(g2v_features_with_scores):
    """
    Convert G2V features with scores into display strings for Gradio radio buttons.

    Each well-formed entry is a 2-item tuple/list ``(feature_name, score)``.
    The score is rendered after the label ``Z=`` with two decimals; this
    function does not compute or validate the score, it only formats whatever
    number the caller supplies.

    Args:
        g2v_features_with_scores: Iterable of entries like
            [('None', None), ('Feature Name', z_score), ...]

    Returns:
        tuple: (display_choices, original_values), two parallel lists.
            ``display_choices`` holds the strings to show
            (e.g. ``"Feature Name | Z=1.23"``), ``original_values`` holds the
            bare feature names. The placeholder entry (name ``"None"`` or a
            missing score) maps to ``"None"`` in both lists. Entries that are
            not 2-item tuples/lists are passed through as ``str(item)``.
    """
    display_choices = []
    original_values = []

    for item in g2v_features_with_scores:
        # Only real (name, score) pairs are unpacked. Checking the container
        # type keeps 2-character strings and unsized objects out of this
        # branch so they reach the fallback instead of crashing.
        is_pair = isinstance(item, (tuple, list)) and len(item) == 2
        if not is_pair:
            # Unexpected format: surface it verbatim rather than failing.
            display_choices.append(str(item))
            original_values.append(str(item))
            continue

        feature_name, z_score = item

        # Placeholder choice (or an entry without a score) is shown as "None".
        if feature_name == "None" or z_score is None:
            display_choices.append("None")
            original_values.append("None")
            continue

        # numpy scalars / 0-d arrays expose .item(); unwrap to a plain float
        # so the format spec below behaves uniformly.
        if hasattr(z_score, 'item'):
            z_score = float(z_score.item())
        else:
            z_score = float(z_score)

        # No trailing bracket: the previous format string ended in a stray "]".
        display_choices.append(f"{feature_name} | Z={z_score:.2f}")
        original_values.append(feature_name)

    return display_choices, original_values
 #function to handle zoom events
 def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors_df):
     """
     for feat in g2v_feats:
         try:
             # `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
+            if any(find_feature_spans(txt, feat[0]) for txt in task_texts):
                 filtered_g2v_feats.append(feat)
             else:
                 print(f"[INFO] Dropping G2V feature with no spans in task texts: {feat}")
     # Convert to human readable for display
     HR_g2v_list = []
     for feat in filtered_g2v_feats:
+        HR_g2v = get_fullform(feat[0])
         print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
         if HR_g2v is None:
             print(f"Skipping Gram2Vec feature without human readable form: {feat}")
         else:
+            HR_g2v_list.append((HR_g2v, feat[1])) #get the score
+    HR_g2v_list = [("None", None)] + HR_g2v_list
     print(f"[INFO] Found {len(llm_feats)} LLM features and {len(g2v_feats)} Gram2Vec features in the zoomed region.")
     print(f"[INFO] unfiltered g2v features: {g2v_feats}")
     print(f"[INFO] LLM features: {llm_feats}")
+    HR_g2v_list, _ = format_g2v_features_for_display(HR_g2v_list)
     print(f"[INFO] Gram2Vec features: {HR_g2v_list}")
     return (