Spaces:

ExplainabiliyForAATeam
/

explainability-tool-for-aa

Running

App Files Files Community

peter-zeng committed on Aug 22

Commit

c0e59d3

1 Parent(s): 75cc8bf

changed default g2v clustering to contrastive, and added filtering to ensure spans show

Browse files

Files changed (3) hide show

utils/gram2vec_feat_utils.py +11 -3
utils/interp_space_utils.py +20 -5
utils/visualizations.py +26 -2

utils/gram2vec_feat_utils.py CHANGED Viewed

@@ -198,10 +198,18 @@ def show_combined_spans_all(selected_feature_llm, selected_feature_g2v,
     )
     combined_html = "<div>" + "\n<hr>\n".join(html_task_authors) + "</div>"
     html_background_authors = create_html(
-        texts[4:], #last three are background
-        llm_spans_list,
-        gram_spans_list,
         selected_feature_llm,
         selected_feature_g2v,
         short,

     )
     combined_html = "<div>" + "\n<hr>\n".join(html_task_authors) + "</div>"
+    # Filter background authors to those with at least one Gram2Vec span
+    bg_start = 4
+    bg_indices = list(range(bg_start, len(texts)))
+    kept_indices = [i for i in bg_indices if gram_spans_list[i]]
+    filtered_texts_bg = [texts[i] for i in kept_indices]
+    filtered_llm_bg   = [llm_spans_list[i] for i in kept_indices]
+    filtered_gram_bg  = [gram_spans_list[i] for i in kept_indices]
     html_background_authors = create_html(
+        filtered_texts_bg,
+        filtered_llm_bg,
+        filtered_gram_bg,
         selected_feature_llm,
         selected_feature_g2v,
         short,

utils/interp_space_utils.py CHANGED Viewed

@@ -528,7 +528,7 @@ def compute_clusters_g2v_representation(
     other_author_ids: List[Any],
     features_clm_name: str,
     top_n: int = 10,
-    mode: str = "sharedness",
     sharedness_method: str = "mean_minus_alpha_std",
     alpha: float = 0.5
 ) -> List[str]:
@@ -569,14 +569,29 @@ def compute_clusters_g2v_representation(
     # Contrastive mode (default): compute target mean and subtract contrast mean
     all_g2v_values = np.array([list(x.values()) for x in selected_feats]).mean(axis=0)
-    other_selected_feats = background_corpus_df[~selected_mask][features_clm_name].tolist()
-    all_g2v_other_feats  = list(other_selected_feats[0].keys())
-    all_g2v_other_values = np.array([list(x.values()) for x in other_selected_feats]).mean(axis=0)
     final_g2v_feats_values = all_g2v_values - all_g2v_other_values
-    top_g2v_feats = sorted(list(zip(all_g2v_feats, final_g2v_feats_values)), key=lambda x: -x[1])
     # Filter out features that are not present in any of the authors
     selected_authors = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(author_ids))

     other_author_ids: List[Any],
     features_clm_name: str,
     top_n: int = 10,
+    mode: str = "contrastive",
     sharedness_method: str = "mean_minus_alpha_std",
     alpha: float = 0.5
 ) -> List[str]:
     # Contrastive mode (default): compute target mean and subtract contrast mean
     all_g2v_values = np.array([list(x.values()) for x in selected_feats]).mean(axis=0)
+    # If an explicit contrast set is provided, use it; otherwise use everyone outside selection
+    if other_author_ids:
+        explicit_mask = background_corpus_df['authorID'].isin(other_author_ids).to_numpy()
+        # Ensure contrast set is disjoint from the selected set
+        contrast_mask = np.logical_and(explicit_mask, ~selected_mask)
+    else:
+        contrast_mask = ~selected_mask
+    other_selected_feats = background_corpus_df[contrast_mask][features_clm_name].tolist()
+    if len(other_selected_feats) > 0:
+        all_g2v_other_values = np.array([list(x.values()) for x in other_selected_feats]).mean(axis=0)
+    else:
+        # No contrast docs → treat contrast mean as zeros
+        all_g2v_other_values = np.zeros_like(all_g2v_values)
     final_g2v_feats_values = all_g2v_values - all_g2v_other_values
+    # Keep only features that have a positive contrastive score
+    top_g2v_feats = sorted(
+        [(feat, val) for feat, val in zip(all_g2v_feats, final_g2v_feats_values) if val > 0],
+        key=lambda x: -x[1]
+    )
     # Filter out features that are not present in any of the authors
     selected_authors = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(author_ids))

utils/visualizations.py CHANGED Viewed

@@ -13,6 +13,7 @@ import re
 from utils.interp_space_utils import compute_clusters_style_representation_3, compute_clusters_g2v_representation
 from utils.llm_feat_utils import split_features
 from utils.gram2vec_feat_utils import get_shorthand, get_fullform
 import plotly.io as pio
@@ -251,9 +252,32 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
         features_clm_name='g2v_vector'
     )
-    # Gram2vec features are already in shorthand. convert to human readable for display
-    HR_g2v_list = []
     for feat in g2v_feats:
         HR_g2v = get_fullform(feat)
         print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
         if HR_g2v is None:

 from utils.interp_space_utils import compute_clusters_style_representation_3, compute_clusters_g2v_representation
 from utils.llm_feat_utils import split_features
 from utils.gram2vec_feat_utils import get_shorthand, get_fullform
+from gram2vec.feature_locator import find_feature_spans
 import plotly.io as pio
         features_clm_name='g2v_vector'
     )
+    # ── Span-existence filter on task authors in the zoom ───────────────────
+    # Keep only features that have at least one detected span in any of the
+    # visible task authors' texts
+    visible_task_authors = task_authors_df[task_authors_df['authorID'].isin(visible_authors)]
+    if visible_task_authors.empty:
+        visible_task_authors = task_authors_df
+    def _to_text(x):
+        return '\n\n =========== \n\n'.join(x) if isinstance(x, list) else x
+    task_texts = [_to_text(x) for x in visible_task_authors['fullText'].tolist()]
+    filtered_g2v_feats = []
     for feat in g2v_feats:
+        try:
+            # `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
+            if any(find_feature_spans(txt, feat) for txt in task_texts):
+                filtered_g2v_feats.append(feat)
+            else:
+                print(f"[INFO] Dropping G2V feature with no spans in task texts: {feat}")
+        except Exception as e:
+            print(f"[WARN] Error while checking spans for {feat}: {e}")
+    # Convert to human readable for display
+    HR_g2v_list = []
+    for feat in filtered_g2v_feats:
         HR_g2v = get_fullform(feat)
         print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
         if HR_g2v is None: