Anisha Bhatnagar committed on
Commit
40fde16
Β·
1 Parent(s): c70a836

precompute zoom regions

Browse files
Files changed (3) hide show
  1. app.py +36 -2
  2. utils/interp_space_utils.py +155 -1
  3. utils/visualizations.py +32 -2
app.py CHANGED
@@ -387,13 +387,41 @@ def app(share=False):#, use_cluster_feats=False):
387
  """
388
  gr.HTML(styled_html(expl_html))
389
 
390
- cluster_dropdown = gr.Dropdown(choices=[], label="Select Cluster to Inspect", visible=False)
391
  style_map_state = gr.State()
392
  llm_style_feats_analysis = gr.State()
393
  visible_zoomed_authors = gr.State()
394
 
395
  gr.HTML(instruction_callout("Zoom in on the plot to select a set of background authors and see the presence of the top features from this set in candidate and mystery authors."))
396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  with gr.Row():
398
  # ── LLM Features Column ──────────────────────────────────
399
  with gr.Column(scale=1, min_width=0):
@@ -439,7 +467,13 @@ def app(share=False):#, use_cluster_feats=False):
439
  custom_model_input, task_authors_embeddings_df, background_authors_embeddings_df, predicted_author, ground_truth_author
440
  ),
441
  inputs=[task_dropdown, model_radio, custom_model, task_authors_embeddings_df, background_authors_embeddings_df, predicted_author, ground_truth_author],
442
- outputs=[plot, style_map_state, bg_proj_state, bg_lbls_state, bg_authors_df]
 
 
 
 
 
 
443
  )
444
 
445
  axis_ranges.change(
 
387
  """
388
  gr.HTML(styled_html(expl_html))
389
 
 
390
  style_map_state = gr.State()
391
  llm_style_feats_analysis = gr.State()
392
  visible_zoomed_authors = gr.State()
393
 
394
  gr.HTML(instruction_callout("Zoom in on the plot to select a set of background authors and see the presence of the top features from this set in candidate and mystery authors."))
395
 
396
+ # State to store precomputed regions
397
+ precomputed_regions_state = gr.State()
398
+
399
+ # Add this after the plot generation
400
+ gr.HTML("""
401
+ <div style="
402
+ font-size: 1.2em;
403
+ font-weight: 600;
404
+ margin: 1em 0 0.5em 0;
405
+ ">
406
+ Quick Region Selection
407
+ </div>
408
+ <div style="
409
+ font-size: 0.9em;
410
+ color: #666;
411
+ margin-bottom: 1em;
412
+ ">
413
+ Select a precomputed region to analyze, or zoom manually on the plot above
414
+ </div>
415
+ """)
416
+
417
+ precomputed_regions_radio = gr.Radio(
418
+ choices=["None"],
419
+ value="None",
420
+ label="Precomputed Regions",
421
+ info="Select a region to automatically zoom and analyze"
422
+ )
423
+
424
+
425
  with gr.Row():
426
  # ── LLM Features Column ──────────────────────────────────
427
  with gr.Column(scale=1, min_width=0):
 
467
  custom_model_input, task_authors_embeddings_df, background_authors_embeddings_df, predicted_author, ground_truth_author
468
  ),
469
  inputs=[task_dropdown, model_radio, custom_model, task_authors_embeddings_df, background_authors_embeddings_df, predicted_author, ground_truth_author],
470
+ outputs=[plot, style_map_state, bg_proj_state, bg_lbls_state, bg_authors_df, precomputed_regions_state, precomputed_regions_radio]
471
+ )
472
+
473
+ precomputed_regions_radio.change(
474
+ fn=lambda region_name, precomputed_regions: trigger_precomputed_region(region_name, precomputed_regions),
475
+ inputs=[precomputed_regions_radio, precomputed_regions_state],
476
+ outputs=[axis_ranges]
477
  )
478
 
479
  axis_ranges.change(
utils/interp_space_utils.py CHANGED
@@ -19,13 +19,15 @@ import time
19
  from utils.llm_feat_utils import generate_feature_spans_cached
20
  from collections import Counter
21
  import numpy as np
22
- from sklearn.metrics.pairwise import cosine_similarity
23
  from sklearn.decomposition import PCA
24
 
25
  CACHE_DIR = "datasets/embeddings_cache"
26
  ZOOM_CACHE = "datasets/zoom_cache/features_cache.json"
 
27
  os.makedirs(CACHE_DIR, exist_ok=True)
28
  os.makedirs(os.path.dirname(ZOOM_CACHE), exist_ok=True)
 
29
  # Bump this whenever there is a change etc...
30
  CACHE_VERSION = 1
31
 
@@ -826,6 +828,158 @@ def compute_predicted_author(task_authors_df: pd.DataFrame, col_name: str) -> in
826
  return predicted_author
827
 
828
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
829
  if __name__ == "__main__":
830
  background_corpus = pd.read_pickle('../datasets/luar_interp_space_cluster_19/train_authors.pkl')
831
  print(background_corpus.columns)
 
19
  from utils.llm_feat_utils import generate_feature_spans_cached
20
  from collections import Counter
21
  import numpy as np
22
+ from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
23
  from sklearn.decomposition import PCA
24
 
25
  CACHE_DIR = "datasets/embeddings_cache"
26
  ZOOM_CACHE = "datasets/zoom_cache/features_cache.json"
27
+ REGION_CACHE = "datasets/region_cache/regions_cache.pkl"
28
  os.makedirs(CACHE_DIR, exist_ok=True)
29
  os.makedirs(os.path.dirname(ZOOM_CACHE), exist_ok=True)
30
+ os.makedirs(os.path.dirname(REGION_CACHE), exist_ok=True)
31
  # Bump this whenever there is a change etc...
32
  CACHE_VERSION = 1
33
 
 
828
  return predicted_author
829
 
830
 
831
+ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, n_neighbors=7):
832
+ """
833
+ Compute precomputed regions for mystery author and candidates.
834
+
835
+ Args:
836
+ bg_proj: (N,2) numpy array with 2D coordinates of background authors
837
+ bg_ids: list of N author IDs for background authors
838
+ q_proj: (1,2) numpy array with mystery author coordinates
839
+ c_proj: (3,2) numpy array with candidate author coordinates
840
+ n_neighbors: number of closest neighbors to include in each region
841
+
842
+ Returns:
843
+ dict: mapping region names to bounding boxes and author lists
844
+ """
845
+ print("Computing sugested regions for zoom...")
846
+ key = f"{hashlib.md5((str(q_proj.tolist()) + str(c_proj.tolist()) + str(n_neighbors)).encode()).hexdigest()}"
847
+
848
+ if os.path.exists(REGION_CACHE):
849
+ with open(REGION_CACHE, 'rb') as f:
850
+ cache = pickle.load(f)
851
+ else:
852
+ cache = {}
853
+ if key in cache:
854
+ print(f"\nCache hit! Using cached regions.")
855
+ return cache[key]
856
+ else:
857
+ print(f"Cache miss. Computing regions.")
858
+
859
+ regions = {}
860
+ # All points for distance calculation (mystery + candidates + background)
861
+ all_points = np.vstack([q_proj.reshape(1, -1), c_proj, bg_proj])
862
+ all_ids = ['mystery'] + [f'candidate_{i}' for i in range(3)] + bg_ids
863
+
864
+ def get_region_around_point(center_point, region_name):
865
+ """Get region around a specific point"""
866
+ # Calculate distances from center point to all background authors
867
+ distances = euclidean_distances([center_point], bg_proj)[0]
868
+
869
+ # Get indices of closest neighbors
870
+ closest_indices = np.argsort(distances)[:n_neighbors]
871
+ closest_authors = [bg_ids[i] for i in closest_indices]
872
+ closest_points = bg_proj[closest_indices]
873
+
874
+ # Include the center point in the region
875
+ region_points = np.vstack([center_point.reshape(1, -1), closest_points])
876
+
877
+ # Calculate bounding box with some padding
878
+ x_min, x_max = region_points[:, 0].min(), region_points[:, 0].max()
879
+ y_min, y_max = region_points[:, 1].min(), region_points[:, 1].max()
880
+
881
+ # Add padding (10% of range)
882
+ x_padding = (x_max - x_min) * 0.1
883
+ y_padding = (y_max - y_min) * 0.1
884
+
885
+ bbox = {
886
+ 'xaxis': [x_min - x_padding, x_max + x_padding],
887
+ 'yaxis': [y_min - y_padding, y_max + y_padding]
888
+ }
889
+
890
+ return {
891
+ 'bbox': bbox,
892
+ 'authors': closest_authors,
893
+ 'center_point': center_point,
894
+ 'description': f"Region around {region_name} ({len(closest_authors)} closest authors)"
895
+ }
896
+
897
+ def get_region_between_points(point1, point2, name1, name2):
898
+ """Get region around the midpoint between two points"""
899
+ midpoint = (point1 + point2) / 2
900
+ region_name = f"{name1} & {name2}"
901
+ return get_region_around_point(midpoint, region_name)
902
+
903
+ # Region 1: Around mystery author only
904
+ print(f"Mystery author: {q_proj}")
905
+ regions["Mystery Author Neighborhood"] = get_region_around_point(
906
+ q_proj, "Mystery Author"
907
+ )
908
+
909
+ # Regions 2-4: Around each candidate
910
+ for i in range(3):
911
+ regions[f"Candidate {i+1} Neighborhood"] = get_region_around_point(
912
+ c_proj[i], f"Candidate {i+1}"
913
+ )
914
+
915
+ # Regions 5-7: Between mystery and each candidate
916
+
917
+ for i in range(3):
918
+ region_name = f"Mystery & Candidate {i+1}"
919
+ print(q_proj, c_proj[i])
920
+ regions[region_name] = get_region_between_points(
921
+ q_proj, c_proj[i], "Mystery", f"Candidate {i+1}"
922
+ )
923
+
924
+ # Regions 8-10: Between candidate pairs
925
+ candidate_pairs = [(0, 1), (0, 2), (1, 2)]
926
+ for i, (c1, c2) in enumerate(candidate_pairs):
927
+ region_name = f"Candidate {c1+1} & Candidate {c2+1}"
928
+ regions[region_name] = get_region_between_points(
929
+ c_proj[c1], c_proj[c2], f"Candidate {c1+1}", f"Candidate {c2+1}"
930
+ )
931
+
932
+ # Regions 11-12: Around predicted and ground truth (if different)
933
+ # This would need predicted_author and ground_truth_author indices
934
+ # For now, we'll create generic regions
935
+
936
+ # Region 11: Centroid of all task authors (mystery + 3 candidates)
937
+ task_centroid = np.mean(np.vstack([q_proj, c_proj]), axis=0)
938
+ regions["All Task Authors Centroid"] = get_region_around_point(
939
+ task_centroid, "All Task Authors"
940
+ )
941
+
942
+ # Region 12: Wider region encompassing all task authors
943
+ all_task_points = np.vstack([q_proj, c_proj])
944
+ task_centroid = np.mean(all_task_points, axis=0)
945
+
946
+ # Find distances from task centroid to all background authors
947
+ distances_from_centroid = euclidean_distances([task_centroid], bg_proj)[0]
948
+
949
+ # Take a larger number of neighbors (e.g., 20) for the expanded region
950
+ n_expanded = min(20, len(bg_ids)) # Don't exceed available authors
951
+ expanded_indices = np.argsort(distances_from_centroid)[:n_expanded]
952
+ expanded_authors = [bg_ids[i] for i in expanded_indices]
953
+ expanded_points = bg_proj[expanded_indices]
954
+
955
+ # Include all task points in the bounding box calculation
956
+ all_region_points = np.vstack([all_task_points, expanded_points])
957
+
958
+ x_min, x_max = all_region_points[:, 0].min(), all_region_points[:, 0].max()
959
+ y_min, y_max = all_region_points[:, 1].min(), all_region_points[:, 1].max()
960
+
961
+ # Add moderate padding
962
+ x_padding = (x_max - x_min) * 0.15
963
+ y_padding = (y_max - y_min) * 0.15
964
+
965
+ expanded_bbox = {
966
+ 'xaxis': [x_min - x_padding, x_max + x_padding],
967
+ 'yaxis': [y_min - y_padding, y_max + y_padding]
968
+ }
969
+
970
+ regions["Expanded Task Region"] = {
971
+ 'bbox': expanded_bbox,
972
+ 'authors': expanded_authors,
973
+ 'center_point': task_centroid,
974
+ 'description': f"Expanded region around all task authors ({len(expanded_authors)} authors)"
975
+ }
976
+
977
+ cache[key] = regions
978
+ with open(REGION_CACHE, 'wb') as f:
979
+ pickle.dump(cache, f)
980
+
981
+ return regions
982
+
983
  if __name__ == "__main__":
984
  background_corpus = pd.read_pickle('../datasets/luar_interp_space_cluster_19/train_authors.pkl')
985
  print(background_corpus.columns)
utils/visualizations.py CHANGED
@@ -10,7 +10,7 @@ import plotly.graph_objects as go
10
  from plotly.colors import sample_colorscale
11
  from gradio import update
12
  import re
13
- from utils.interp_space_utils import compute_clusters_style_representation_3, compute_clusters_g2v_representation
14
  from utils.llm_feat_utils import split_features
15
  from utils.gram2vec_feat_utils import get_shorthand, get_fullform
16
  from gram2vec.feature_locator import find_feature_spans
@@ -510,6 +510,18 @@ def visualize_clusters_plotly(iid, cfg, instances, model_radio, custom_model_inp
510
  font=dict(color='darkblue', size=12)
511
  )
512
 
 
 
 
 
 
 
 
 
 
 
 
 
513
  print('Done processing....')
514
 
515
  return (
@@ -519,6 +531,8 @@ def visualize_clusters_plotly(iid, cfg, instances, model_radio, custom_model_inp
519
  bg_proj, # Return background points
520
  bg_ids, # Return background labels
521
  background_authors_embeddings_df, # Return the DataFrame for zoom handling
 
 
522
 
523
  )
524
  # return fig, update(choices=feature_list, value=feature_list[0]),feature_list
@@ -535,7 +549,23 @@ def extract_cluster_key(display_label: str) -> int:
535
  raise ValueError(f"Unrecognized cluster label: {display_label}")
536
  return int(m.group(1))
537
 
538
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
 
540
  # When a cluster is selected, split features and populate radio buttons
541
  def on_cluster_change(selected_cluster, style_map):
 
10
  from plotly.colors import sample_colorscale
11
  from gradio import update
12
  import re
13
+ from utils.interp_space_utils import compute_clusters_style_representation_3, compute_clusters_g2v_representation, compute_precomputed_regions
14
  from utils.llm_feat_utils import split_features
15
  from utils.gram2vec_feat_utils import get_shorthand, get_fullform
16
  from gram2vec.feature_locator import find_feature_spans
 
510
  font=dict(color='darkblue', size=12)
511
  )
512
 
513
+ # Compute precomputed regions
514
+ bg_proj_for_regions = proj[4:] # Background projections
515
+ bg_ids_for_regions = bg_ids[4:] # Background IDs
516
+
517
+ # Compute precomputed regions
518
+ precomputed_regions = compute_precomputed_regions(
519
+ bg_proj_for_regions, bg_ids_for_regions, q_proj, c_proj
520
+ )
521
+
522
+ # Create choices for radio buttons
523
+ region_choices = ["None"] + list(precomputed_regions.keys())
524
+
525
  print('Done processing....')
526
 
527
  return (
 
531
  bg_proj, # Return background points
532
  bg_ids, # Return background labels
533
  background_authors_embeddings_df, # Return the DataFrame for zoom handling
534
+ precomputed_regions, # Return region choices
535
+ gr.update(choices=region_choices, value="None")
536
 
537
  )
538
  # return fig, update(choices=feature_list, value=feature_list[0]),feature_list
 
549
  raise ValueError(f"Unrecognized cluster label: {display_label}")
550
  return int(m.group(1))
551
 
552
+ def trigger_precomputed_region(region_name, precomputed_regions):
553
+ """
554
+ Simulate a zoom event for a precomputed region.
555
+ Returns the JSON payload that would be sent to axis_ranges.
556
+ """
557
+ print(f"[INFO] Triggering precomputed region: {region_name}")
558
+ print(f"Available regions: {list(precomputed_regions.keys())}")
559
+ if region_name == "None" or region_name not in precomputed_regions:
560
+ return ""
561
+
562
+ region = precomputed_regions[region_name]
563
+ payload = region['bbox']
564
+ json_payload = {
565
+ 'xaxis': [float(payload['xaxis'][0]), float(payload['xaxis'][1])],
566
+ 'yaxis': [float(payload['yaxis'][0]), float(payload['yaxis'][1])]
567
+ }
568
+ return json.dumps(json_payload)
569
 
570
  # When a cluster is selected, split features and populate radio buttons
571
  def on_cluster_change(selected_cluster, style_map):