peter-zeng committed on
Commit
8e5c429
·
1 Parent(s): a6ee680

changed filtering and selection of g2v features

Browse files
Files changed (2) hide show
  1. utils/interp_space_utils.py +26 -153
  2. utils/visualizations.py +21 -9
utils/interp_space_utils.py CHANGED
@@ -613,168 +613,41 @@ def compute_clusters_g2v_representation(
613
  other_author_ids: List[Any],
614
  features_clm_name: str,
615
  top_n: int = 10,
616
- mode: str = "contrastive",
617
- sharedness_method: str = "mean_minus_alpha_std",
618
- alpha: float = 0.5
619
  ) -> List[tuple]: # Changed return type to List[tuple] to include scores
620
 
621
-
622
  selected_mask = background_corpus_df['authorID'].isin(author_ids).to_numpy()
623
 
624
  if not selected_mask.any():
625
- return [] # No documents found for the given cluster_ids
626
-
627
- selected_feats = background_corpus_df[selected_mask][features_clm_name].tolist()
628
- all_g2v_feats = list(selected_feats[0].keys())
629
-
630
- # If the user requested a sharedness-based score, compute it and return top-N.
631
- if mode == "sharedness":
632
- selected_matrix = np.array([list(x.values()) for x in selected_feats], dtype=float)
633
-
634
- if sharedness_method == "mean":
635
- scores = selected_matrix.mean(axis=0)
636
- elif sharedness_method in ("mean_minus_alpha_std", "mean-std", "mean_minus_std"):
637
- means = selected_matrix.mean(axis=0)
638
- stds = selected_matrix.std(axis=0)
639
- scores = means - float(alpha) * stds
640
- elif sharedness_method == "min":
641
- scores = selected_matrix.min(axis=0)
642
- else:
643
- # Default fallback to mean-minus-alpha*std if unknown method
644
- means = selected_matrix.mean(axis=0)
645
- stds = selected_matrix.std(axis=0)
646
- scores = means - float(alpha) * stds
647
-
648
- # Rank and return with scores
649
- feature_scores = [(feat, score) for feat, score in zip(all_g2v_feats, scores) if score > 0]
650
- feature_scores.sort(key=lambda x: x[1], reverse=True)
651
- return feature_scores[:top_n] # Return tuples instead of just features
652
-
653
-
654
- # Contrastive mode (default): compute target mean and subtract contrast mean
655
- all_g2v_values = np.array([list(x.values()) for x in selected_feats]).mean(axis=0)
656
 
657
- # If an explicit contrast set is provided, use it; otherwise use everyone outside selection
658
- if other_author_ids:
659
- explicit_mask = background_corpus_df['authorID'].isin(other_author_ids).to_numpy()
660
- # Ensure contrast set is disjoint from the selected set
661
- contrast_mask = np.logical_and(explicit_mask, ~selected_mask)
662
- else:
663
- contrast_mask = ~selected_mask
664
 
665
- other_selected_feats = background_corpus_df[contrast_mask][features_clm_name].tolist()
666
- if len(other_selected_feats) > 0:
667
- all_g2v_other_values = np.array([list(x.values()) for x in other_selected_feats]).mean(axis=0)
668
- else:
669
- # No contrast docs β†’ treat contrast mean as zeros
670
- all_g2v_other_values = np.zeros_like(all_g2v_values)
671
 
672
- final_g2v_feats_values = all_g2v_values - all_g2v_other_values
 
 
673
 
674
- # Compute z-scores for normalization
675
- # Get population statistics from all features (both selected and contrast)
676
- all_feats = background_corpus_df[features_clm_name].tolist()
677
- population_matrix = np.array([list(x.values()) for x in all_feats])
678
- population_mean = population_matrix.mean(axis=0)
679
- population_std = population_matrix.std(axis=0)
680
-
681
- # Avoid division by zero
682
- population_std = np.where(population_std == 0, 1, population_std)
683
-
684
- # Calculate z-scores for the contrastive values
685
- z_scores = (final_g2v_feats_values - population_mean) / population_std
686
-
687
- # Keep only features that have a positive contrastive score
688
- top_g2v_feats = sorted(
689
- [(feat, val, z_score) for feat, val, z_score in zip(all_g2v_feats, final_g2v_feats_values, z_scores) if val > 0],
690
- key=lambda x: -x[1] # Sort by contrastive score
691
- )
692
-
693
- # Filter in only features that are present in selected_authors
694
- selected_authors = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(author_ids))
695
-
696
- # DEBUG: Print what we're actually working with
697
- print(f"[DEBUG] author_ids parameter: {author_ids}")
698
- print(f"[DEBUG] Hardcoded selected_authors set: {{'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}}")
699
- print(f"[DEBUG] Intersection result: {selected_authors}")
700
- print(f"[DEBUG] Is selected_authors empty? {len(selected_authors) == 0}")
701
-
702
- # Filter in only features that are present in selected_authors
703
- selected_authors_g2v_data = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
704
-
705
- # print(f"[DEBUG] selected_authors_g2v_data length: {len(selected_authors_g2v_data)}")
706
- # print(f"[DEBUG] selected_authors_g2v_data content: {selected_authors_g2v_data}")
707
-
708
- # Get the actual text documents for the selected authors to verify feature presence
709
- selected_authors_docs = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)]['fullText'].tolist()
710
- print(f"[DEBUG] Found {len(selected_authors_docs)} documents for selected authors")
711
-
712
- # Import find_feature_spans for text-based feature verification
713
- try:
714
- from gram2vec.feature_locator import find_feature_spans
715
- print("[DEBUG] Successfully imported find_feature_spans")
716
- except ImportError:
717
- print("[WARNING] Could not import find_feature_spans, falling back to vector-based filtering")
718
- find_feature_spans = None
719
-
720
- filtered_features = []
721
- for feature, score, z_score in top_g2v_feats:
722
- # DEBUG: Print what we're checking for this feature
723
- # print(f"[DEBUG] Checking feature: {feature}")
724
- # print(f"[DEBUG] Feature score: {score}, z_score: {z_score}")
725
-
726
- # Check if the feature has a non-zero value in all of the selected authors
727
- feature_presence = []
728
- for i, author_g2v_feats in enumerate(selected_authors_g2v_data):
729
- feature_value = author_g2v_feats.get(feature, 0)
730
- feature_presence.append(feature_value)
731
- # print(f"[DEBUG] Author {i} has feature '{feature}' = {feature_value}")
732
-
733
- # print(f"[DEBUG] All feature values: {feature_presence}")
734
- # print(f"[DEBUG] All values > 0? {[v > 0 for v in feature_presence]}")
735
- # print(f"[DEBUG] All values > 0? {all(v > 0 for v in feature_presence)}")
736
-
737
- # First check: feature must be present in Gram2Vec vectors
738
- vector_present = all(author_g2v_feats.get(feature, 0) > 0 for author_g2v_feats in selected_authors_g2v_data)
739
-
740
- # Second check: feature must be present in actual text documents
741
- text_present = True
742
- if find_feature_spans and selected_authors_docs:
743
- try:
744
- # Check if feature appears in at least one document from each selected author
745
- for i, doc in enumerate(selected_authors_docs):
746
- if isinstance(doc, list):
747
- doc_text = '\n\n'.join(doc)
748
- else:
749
- doc_text = str(doc)
750
-
751
- spans = find_feature_spans(doc_text, feature)
752
- if not spans: # No spans found in this document
753
- # print(f"[DEBUG] βœ— Feature '{feature}' not found in document {i} of selected author")
754
- text_present = False
755
- break
756
- # else:
757
- # print(f"[DEBUG] βœ“ Feature '{feature}' found in document {i} with {len(spans)} spans")
758
- except Exception as e:
759
- print(f"[WARNING] Error checking text presence for feature '{feature}': {e}")
760
- # Fall back to vector-based filtering if text checking fails
761
- text_present = vector_present
762
-
763
- # Feature must pass BOTH checks
764
- if vector_present and text_present:
765
- filtered_features.append((feature, score, z_score))
766
- # print(f"[DEBUG] βœ“ Feature '{feature}' PASSED both vector and text checks")
767
- # else:
768
- # if not vector_present:
769
- # # print(f"[DEBUG] βœ— Feature '{feature}' FAILED vector check")
770
- # if not text_present:
771
- # # print(f"[DEBUG] βœ— Feature '{feature}' FAILED text check")
772
- # # print(f"[DEBUG] βœ— Feature '{feature}' FAILED the filter")
773
-
774
-
775
- print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features]) # Print feature names and z-scores
776
-
777
- return filtered_features[:top_n] # Return tuples with z-scores
778
 
779
  def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
780
 
 
613
  other_author_ids: List[Any],
614
  features_clm_name: str,
615
  top_n: int = 10,
 
 
 
616
  ) -> List[tuple]: # Changed return type to List[tuple] to include scores
617
 
618
+ # 1) Identify selected authors in the zoom region
619
  selected_mask = background_corpus_df['authorID'].isin(author_ids).to_numpy()
620
 
621
  if not selected_mask.any():
622
+ return [] # No authors found for the given author_ids
623
+
624
+ # 2) Build a population matrix of all authors' Gram2Vec features
625
+ # Expect each row in features_clm_name to be a dict {feature_name: value}
626
+ all_feature_dicts = background_corpus_df[features_clm_name].tolist()
627
+ if not all_feature_dicts:
628
+ return []
629
+
630
+ # Use the first row to get consistent feature ordering
631
+ all_features = list(all_feature_dicts[0].keys())
632
+ population_matrix = np.array(
633
+ [[feat_dict.get(feat, 0.0) for feat in all_features] for feat_dict in all_feature_dicts],
634
+ dtype=float
635
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
636
 
637
+ # 3) Z-normalize columnwise across the entire corpus
638
+ col_means = population_matrix.mean(axis=0)
639
+ col_stds = population_matrix.std(axis=0)
640
+ col_stds[col_stds == 0] = 1.0
641
+ z_population = (population_matrix - col_means) / col_stds
 
 
642
 
643
+ # 4) Take the mean across the selected authors (zoom region)
644
+ selected_mean = z_population[selected_mask].mean(axis=0)
 
 
 
 
645
 
646
+ # 5) Rank features by mean z-score, keep positives only
647
+ feature_scores = [(feat, float(score)) for feat, score in zip(all_features, selected_mean) if score > 0]
648
+ feature_scores.sort(key=lambda x: x[1], reverse=True)
649
 
650
+ return feature_scores[:top_n]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651
 
652
  def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
653
 
utils/visualizations.py CHANGED
@@ -290,32 +290,44 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
290
  background_corpus_df=merged_authors_df,
291
  author_ids=visible_authors,
292
  other_author_ids=[],
293
- features_clm_name='g2v_vector'
 
294
  )
295
 
296
  # ── Span-existence filter on task authors in the zoom ───────────────────
297
- # Keep only features that have at least one detected span in any of the
298
- # visible task authors' texts
299
- visible_task_authors = task_authors_df[task_authors_df['authorID'].isin(visible_authors)]
300
- if visible_task_authors.empty:
301
- visible_task_authors = task_authors_df
 
 
302
 
303
  def _to_text(x):
304
  return '\n\n =========== \n\n'.join(x) if isinstance(x, list) else x
305
 
306
- task_texts = [_to_text(x) for x in visible_task_authors['fullText'].tolist()]
307
 
 
308
  filtered_g2v_feats = []
309
  for feat in g2v_feats:
310
  try:
311
  # `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
312
- if any(find_feature_spans(txt, feat[0]) for txt in task_texts):
 
 
 
 
 
313
  filtered_g2v_feats.append(feat)
314
  else:
315
- print(f"[INFO] Dropping G2V feature with no spans in task texts: {feat}")
316
  except Exception as e:
317
  print(f"[WARN] Error while checking spans for {feat}: {e}")
318
 
 
 
 
319
  # Convert to human readable for display
320
  HR_g2v_list = []
321
  for feat in filtered_g2v_feats:
 
290
  background_corpus_df=merged_authors_df,
291
  author_ids=visible_authors,
292
  other_author_ids=[],
293
+ features_clm_name='g2v_vector',
294
+ top_n=50
295
  )
296
 
297
  # ── Span-existence filter on task authors in the zoom ───────────────────
298
+ # Keep only features that have detected spans in at least 2 of the
299
+ # task authors' texts (Mystery + Candidates 1-3)
300
+ # Use only the task authors (Mystery + Candidates 1-3), not the zoom-visible set
301
+ task_author_ids = {"Mystery author", "Candidate Author 1", "Candidate Author 2", "Candidate Author 3"}
302
+ task_only_df = task_authors_df[task_authors_df['authorID'].isin(task_author_ids)]
303
+ if task_only_df.empty:
304
+ task_only_df = task_authors_df
305
 
306
  def _to_text(x):
307
  return '\n\n =========== \n\n'.join(x) if isinstance(x, list) else x
308
 
309
+ task_texts = [_to_text(x) for x in task_only_df['fullText'].tolist()]
310
 
311
+ print(f"task_texts: {task_texts}")
312
  filtered_g2v_feats = []
313
  for feat in g2v_feats:
314
  try:
315
  # `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
316
+ occurrences = 0
317
+ for txt in task_texts:
318
+ spans = find_feature_spans(txt, feat[0])
319
+ if spans:
320
+ occurrences += 1
321
+ if occurrences >= 2:
322
  filtered_g2v_feats.append(feat)
323
  else:
324
+ print(f"[INFO] Dropping G2V feature with <2 task-author spans: {feat}")
325
  except Exception as e:
326
  print(f"[WARN] Error while checking spans for {feat}: {e}")
327
 
328
+ # After filtering by spans, keep top-N by score
329
+ filtered_g2v_feats = filtered_g2v_feats[:10]
330
+
331
  # Convert to human readable for display
332
  HR_g2v_list = []
333
  for feat in filtered_g2v_feats: