Anisha Bhatnagar committed
Commit a1e49f6 · 1 Parent(s): 0ce5cd2

Reduce the number of precomputed regions; update the cache script; point the config at the new data URL; relax g2v feature filtering (Peter)

app.py CHANGED
```diff
@@ -22,7 +22,7 @@ def load_config(path="config/config.yaml"):
         return yaml.safe_load(f)
 
 # A comment to trigger change in spaces
-# comment 2
+# comment 3
 cfg = load_config()
 
 
```
config/config.yaml CHANGED
```diff
@@ -1,6 +1,6 @@
 # config.yaml
-instances_to_explain_path: "./datasets/hrs_explanations_luar_clusters_18_balanced.json"
-instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_18_balanced.json?download=true"
+instances_to_explain_path: "./datasets/hrs_explanations_luar_clusters_2_35_balanced.json"
+instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_2_35_balanced.json?download=true"
 interp_space_path: "./datasets/sentence_luar_interp_space_2_35/"
 interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_space_2_35.zip?download=true"
 gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
```
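Each `*_path`/`*_url` pair in the config gives a local file plus a remote fallback. A minimal sketch of the download-if-missing pattern such a pair is presumably consumed by (the helper below is illustrative, not code from this repo):

```python
import os
import urllib.request

def ensure_dataset(local_path: str, url: str) -> str:
    """Fetch `url` to `local_path` unless the file is already present."""
    if not os.path.exists(local_path):
        os.makedirs(os.path.dirname(local_path) or ".", exist_ok=True)
        urllib.request.urlretrieve(url, local_path)  # blocking download
    return local_path

# Hypothetical usage with the updated config values:
# ensure_dataset(cfg["instances_to_explain_path"], cfg["instances_to_explain_url"])
```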
precompute_caches.py CHANGED
```diff
@@ -19,8 +19,7 @@ def load_config(path="config/config.yaml"):
 def precompute_all_caches(
     models_to_test=None,
     instances_to_process=None,
-    config_path="config/config.yaml",
-    force_regenerate=False
+    config_path="config/config.yaml"
 ):
     """
     Precompute all cache files using the EXACT same methods as app.py.
@@ -194,13 +193,12 @@ from utils.visualizations import visualize_clusters_plotly
 
 if __name__ == "__main__":
     # Test with a small subset first
-    instances=[i for i in range(2)] # First 2 instances for testing
+    instances=[i for i in range(20)] # First 20 instances for testing
     cache_stats = precompute_all_caches(
         models_to_test=[
             'gabrielloiseau/LUAR-MUD-sentence-transformers'
         ],
-        instances_to_process=instances,
-        force_regenerate=False
+        instances_to_process=instances
     )
 
     print(f"\nCache precomputation completed with {len(cache_stats['errors'])} errors.")
```
utils/interp_space_utils.py CHANGED
```diff
@@ -546,7 +546,9 @@ def compute_clusters_style_representation_3(
     max_num_feats: int = 20,
     max_num_documents_per_author=1,
     max_num_authors=10,
-    max_authors_for_span_extraction=4
+    max_authors_for_span_extraction=4,
+    min_authors_required: int = 2,
+    top_k: int = 10
 ):
 
     print(f"Computing style representation for visible clusters: {len(cluster_ids)}")
@@ -568,19 +570,40 @@ def compute_clusters_style_representation_3(
     print(author_names)
     spans_by_author = extract_all_spans(span_df, features, cluster_label_clm_name)
 
-    # Filter out features that are not present in any of the authors
-    filtered_spans_by_author = {x[0] : x[1] for x in spans_by_author.items() if x[0] in {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(cluster_ids))}
-    print(filtered_spans_by_author.keys())
-    filtered_spans_by_author = [set([f[0] for f in x[1].items() if len(f[1]) > 0]) for x in filtered_spans_by_author.items()]
-
-    filtered_set_of_features = filtered_spans_by_author[0] # all features that appear in all the sets in the filtered_spans_by_author list
-    for x in filtered_spans_by_author[1:]:
-        filtered_set_of_features = filtered_set_of_features.intersection(x)
-
-    print('filtered set of features: ', filtered_set_of_features)
+    # Filter in only task authors that are part of the current selection
+    task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
+    filtered_task_authors = {author: feat_map for author, feat_map in spans_by_author.items() if author in task_author_names.intersection(set(cluster_ids))}
+
+    print(filtered_task_authors.keys())
+
+    # Build per-author sets of features that have at least one span
+    author_present_feature_sets = [
+        {feature for feature, spans in feature_map.items() if len(spans) > 0}
+        for _, feature_map in filtered_task_authors.items()
+    ]
+
+    # If nothing to aggregate (e.g., no task authors in selection), fall back to an empty list
+    selected_features_ranked = []
+    if author_present_feature_sets:
+        coverage_counter = Counter()
+        for present_set in author_present_feature_sets:
+            coverage_counter.update(present_set)
+
+        # Keep features present in at least `min_authors_required` authors
+        eligible_features = [feat for feat, cnt in coverage_counter.items() if cnt >= int(min_authors_required)]
+
+        # Preserve the original LLM feature ordering as a secondary sort key where possible
+        feature_original_index = {feat: idx for idx, feat in enumerate(features)} if features else {}
+
+        selected_features_ranked = sorted(
+            eligible_features,
+            key=lambda f: (-coverage_counter[f], feature_original_index.get(f, 10**9))
+        )[:int(top_k)]
+
+    print('filtered set of features (min coverage', min_authors_required, '): ', selected_features_ranked)
 
     return {
-        "features": list(filtered_set_of_features),
+        "features": list(selected_features_ranked),
         "spans": spans_by_author
     }
 
```
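The net effect of this hunk: the old code kept only features present in every selected task author (a strict set intersection, which can easily come up empty), while the new code keeps features covered by at least `min_authors_required` authors, ranks them by coverage with the original LLM feature order as tie-breaker, and truncates to `top_k`. A self-contained sketch of that selection rule on toy data (the feature names are invented for illustration):

```python
from collections import Counter

def select_features(present_sets, feature_order, min_authors_required=2, top_k=10):
    """Coverage-threshold relaxation of the old all-authors intersection filter."""
    coverage = Counter()
    for present in present_sets:
        coverage.update(present)
    eligible = [f for f, n in coverage.items() if n >= min_authors_required]
    order = {f: i for i, f in enumerate(feature_order)}
    # Descending coverage first, then the original feature ordering.
    return sorted(eligible, key=lambda f: (-coverage[f], order.get(f, 10**9)))[:top_k]

feature_order = ["short sentences", "semicolon use", "formal tone", "rare words"]
present_sets = [
    {"short sentences", "formal tone"},                 # Mystery author
    {"short sentences", "semicolon use"},               # Candidate Author 1
    {"short sentences", "formal tone", "rare words"},   # Candidate Author 2
]
print(select_features(present_sets, feature_order))
# ['short sentences', 'formal tone'] -- the strict intersection kept only 'short sentences'
```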
```diff
@@ -679,8 +702,8 @@ def compute_clusters_g2v_representation(
     # Filter in only features that are present in selected_authors
     selected_authors_g2v_data = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
 
-    print(f"[DEBUG] selected_authors_g2v_data length: {len(selected_authors_g2v_data)}")
-    print(f"[DEBUG] selected_authors_g2v_data content: {selected_authors_g2v_data}")
+    # print(f"[DEBUG] selected_authors_g2v_data length: {len(selected_authors_g2v_data)}")
+    # print(f"[DEBUG] selected_authors_g2v_data content: {selected_authors_g2v_data}")
 
     # Get the actual text documents for the selected authors to verify feature presence
     selected_authors_docs = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)]['fullText'].tolist()
```
```diff
@@ -828,7 +851,7 @@ def compute_predicted_author(task_authors_df: pd.DataFrame, col_name: str) -> int:
     return predicted_author
 
 
-def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, model_name, n_neighbors=7):
+def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, pred_idx, model_name, n_neighbors=7):
     """
     Compute precomputed regions for mystery author and candidates.
 
```
```diff
@@ -914,41 +937,43 @@ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, model_name, n_neighbors=7):
         include_points = np.vstack([point1.reshape(1, -1), point2.reshape(1, -1)])
         return get_region_around_point(midpoint, region_name, include_points=include_points)
 
-    # Region 1: Around mystery author only
-    regions["Mystery Author Neighborhood"] = get_region_around_point(
-        q_proj, "Mystery Author"
-    )
+    # # Region 1: Around mystery author only
+    # regions["Mystery Author Neighborhood"] = get_region_around_point(
+    #     q_proj, "Mystery Author"
+    # )
 
-    # Regions 2-4: Around each candidate
-    for i in range(3):
-        regions[f"Candidate {i+1} Neighborhood"] = get_region_around_point(
-            c_proj[i], f"Candidate {i+1}"
-        )
+    # # Regions 2-4: Around each candidate
+    # for i in range(3):
+    #     regions[f"Candidate {i+1} Neighborhood"] = get_region_around_point(
+    #         c_proj[i], f"Candidate {i+1}"
+    #     )
 
     # Regions 5-7: Between mystery and each candidate
     for i in range(3):
-        region_name = f"Mystery & Candidate {i+1}"
-        regions[region_name] = get_region_between_points(
-            q_proj, c_proj[i], "Mystery", f"Candidate {i+1}"
-        )
+        if i == pred_idx:  # keep only mystery and the predicted candidate
+            region_name = f"Mystery & Candidate {i+1}"
+            regions[region_name] = get_region_between_points(
+                q_proj, c_proj[i], "Mystery", f"Candidate {i+1}"
+            )
 
     # Regions 8-10: Between candidate pairs
     candidate_pairs = [(0, 1), (0, 2), (1, 2)]
     for i, (c1, c2) in enumerate(candidate_pairs):
-        region_name = f"Candidate {c1+1} & Candidate {c2+1}"
-        regions[region_name] = get_region_between_points(
-            c_proj[c1], c_proj[c2], f"Candidate {c1+1}", f"Candidate {c2+1}"
-        )
+        if c1 != pred_idx and c2 != pred_idx:  # keep only the non-predicted candidates
+            region_name = f"Candidate {c1+1} & Candidate {c2+1}"
+            regions[region_name] = get_region_between_points(
+                c_proj[c1], c_proj[c2], f"Candidate {c1+1}", f"Candidate {c2+1}"
+            )
 
     # Regions 11-12: Around predicted and ground truth (if different)
     # This would need predicted_author and ground_truth_author indices
     # For now, we'll create generic regions
 
     # Region 11: Centroid of all task authors (mystery + 3 candidates)
-    task_centroid = np.mean(np.vstack([q_proj, c_proj]), axis=0)
-    regions["All Task Authors Centroid"] = get_region_around_point(
-        task_centroid, "All Task Authors", include_points=np.vstack([q_proj, c_proj])
-    )
+    # task_centroid = np.mean(np.vstack([q_proj, c_proj]), axis=0)
+    # regions["All Task Authors Centroid"] = get_region_around_point(
+    #     task_centroid, "All Task Authors", include_points=np.vstack([q_proj, c_proj])
+    # )
 
 def serialize_numpy_dtypes(obj):
     if isinstance(obj, np.ndarray):
```
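With `pred_idx` threaded in, the function now builds only two between-point regions per instance instead of the former eleven (mystery neighborhood, three candidate neighborhoods, three mystery-candidate midpoints, three candidate pairs, and the task centroid): mystery vs. the predicted candidate, plus the pair of non-predicted candidates. A self-contained illustration of which region names survive for a given `pred_idx`:

```python
def surviving_region_names(pred_idx: int) -> list[str]:
    """Mirror the new selection logic in compute_precomputed_regions."""
    names = []
    # Mystery vs. candidates: keep only the predicted candidate.
    for i in range(3):
        if i == pred_idx:
            names.append(f"Mystery & Candidate {i+1}")
    # Candidate pairs: keep only the pair that excludes the predicted candidate.
    for c1, c2 in [(0, 1), (0, 2), (1, 2)]:
        if c1 != pred_idx and c2 != pred_idx:
            names.append(f"Candidate {c1+1} & Candidate {c2+1}")
    return names

print(surviving_region_names(0))  # ['Mystery & Candidate 1', 'Candidate 2 & Candidate 3']
```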
  max_num_feats: int = 20,
547
  max_num_documents_per_author=1,
548
  max_num_authors=10,
549
+ max_authors_for_span_extraction=4,
550
+ min_authors_required: int = 2,
551
+ top_k: int = 10
552
  ):
553
 
554
  print(f"Computing style representation for visible clusters: {len(cluster_ids)}")
 
570
  print(author_names)
571
  spans_by_author = extract_all_spans(span_df, features, cluster_label_clm_name)
572
 
573
+ # Filter-in only task authors that are part of the current selection
574
+ task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
575
+ filtered_task_authors = {author: feat_map for author, feat_map in spans_by_author.items() if author in task_author_names.intersection(set(cluster_ids))}
 
576
 
577
+ print(filtered_task_authors.keys())
 
 
578
 
579
+ # Build per-author sets of features that have at least one span
580
+ author_present_feature_sets = [
581
+ {feature for feature, spans in feature_map.items() if len(spans) > 0}
582
+ for _, feature_map in filtered_task_authors.items()
583
+ ]
584
+
585
+ # If nothing to aggregate (e.g., no task authors in selection), fall back to empty list
586
+ selected_features_ranked = []
587
+ if author_present_feature_sets:
588
+ coverage_counter = Counter()
589
+ for present_set in author_present_feature_sets:
590
+ coverage_counter.update(present_set)
591
+
592
+ # Keep features present in at least `min_authors_required` authors
593
+ eligible_features = [feat for feat, cnt in coverage_counter.items() if cnt >= int(min_authors_required)]
594
+
595
+ # Preserve original LLM feature ordering as a secondary key where possible
596
+ feature_original_index = {feat: idx for idx, feat in enumerate(features)} if features else {}
597
+
598
+ selected_features_ranked = sorted(
599
+ eligible_features,
600
+ key=lambda f: (-coverage_counter[f], feature_original_index.get(f, 10**9))
601
+ )[:int(top_k)]
602
+
603
+ print('filtered set of features (min coverage', min_authors_required, '): ', selected_features_ranked)
604
 
605
  return {
606
+ "features": list(selected_features_ranked),
607
  "spans": spans_by_author
608
  }
609
 
 
702
  # Filter in only features that are present in selected_authors
703
  selected_authors_g2v_data = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
704
 
705
+ # print(f"[DEBUG] selected_authors_g2v_data length: {len(selected_authors_g2v_data)}")
706
+ # print(f"[DEBUG] selected_authors_g2v_data content: {selected_authors_g2v_data}")
707
 
708
  # Get the actual text documents for the selected authors to verify feature presence
709
  selected_authors_docs = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)]['fullText'].tolist()
 
851
  return predicted_author
852
 
853
 
854
+ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, pred_idx, model_name, n_neighbors=7):
855
  """
856
  Compute precomputed regions for mystery author and candidates.
857
 
 
937
  include_points = np.vstack([point1.reshape(1, -1), point2.reshape(1, -1)])
938
  return get_region_around_point(midpoint, region_name, include_points=include_points)
939
 
940
+ # # Region 1: Around mystery author only
941
+ # regions["Mystery Author Neighborhood"] = get_region_around_point(
942
+ # q_proj, "Mystery Author"
943
+ # )
944
 
945
+ # # Regions 2-4: Around each candidate
946
+ # for i in range(3):
947
+ # regions[f"Candidate {i+1} Neighborhood"] = get_region_around_point(
948
+ # c_proj[i], f"Candidate {i+1}"
949
+ # )
950
 
951
  # Regions 5-7: Between mystery and each candidate
952
  for i in range(3):
953
+ if i == pred_idx: #selecting only mystery and predicted candidate
954
+ region_name = f"Mystery & Candidate {i+1}"
955
+ regions[region_name] = get_region_between_points(
956
+ q_proj, c_proj[i], "Mystery", f"Candidate {i+1}"
957
+ )
958
 
959
  # Regions 8-10: Between candidate pairs
960
  candidate_pairs = [(0, 1), (0, 2), (1, 2)]
961
  for i, (c1, c2) in enumerate(candidate_pairs):
962
+ if c1 != pred_idx and c2 != pred_idx: #selecting only the non predicated candidates
963
+ region_name = f"Candidate {c1+1} & Candidate {c2+1}"
964
+ regions[region_name] = get_region_between_points(
965
+ c_proj[c1], c_proj[c2], f"Candidate {c1+1}", f"Candidate {c2+1}"
966
+ )
967
 
968
  # Regions 11-12: Around predicted and ground truth (if different)
969
  # This would need predicted_author and ground_truth_author indices
970
  # For now, we'll create generic regions
971
 
972
  # Region 11: Centroid of all task authors (mystery + 3 candidates)
973
+ # task_centroid = np.mean(np.vstack([q_proj, c_proj]), axis=0)
974
+ # regions["All Task Authors Centroid"] = get_region_around_point(
975
+ # task_centroid, "All Task Authors", include_points=np.vstack([q_proj, c_proj])
976
+ # )
977
 
978
  def serialize_numpy_dtypes(obj):
979
  if isinstance(obj, np.ndarray):
utils/visualizations.py CHANGED
```diff
@@ -519,7 +519,7 @@ def visualize_clusters_plotly(iid, cfg, instances, model_radio, custom_model_inpu
     candidate_ids = task_authors_df['authorID'].iloc[1:4].tolist() # 3 candidate IDs
 
     precomputed_regions = compute_precomputed_regions(
-        bg_proj_for_regions, bg_ids_for_regions, q_proj, c_proj, model_name
+        bg_proj_for_regions, bg_ids_for_regions, q_proj, c_proj, pred_idx, model_name
    )
 
     # Create choices for radio buttons
```
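The diff shows `pred_idx` arriving at the call site but not where it is computed. One plausible wiring, assuming `pred_idx` is the 0-based index of the predicted candidate among the three candidates; `compute_predicted_author` is real (see `utils/interp_space_utils.py` above), but the column name and index mapping below are assumptions:

```python
# Hypothetical derivation of pred_idx at the call site.
predicted_author = compute_predicted_author(task_authors_df, 'authorID')  # column name assumed
candidate_ids = task_authors_df['authorID'].iloc[1:4].tolist()            # 3 candidate IDs (from the diff)
pred_idx = candidate_ids.index(predicted_author) if predicted_author in candidate_ids else 0
```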