Anisha Bhatnagar committed on
Commit
40fde16
Β·
1 Parent(s): c70a836

precompute zoom regions

Browse files
Files changed (3) hide show
  1. app.py +36 -2
  2. utils/interp_space_utils.py +155 -1
  3. utils/visualizations.py +32 -2
app.py CHANGED
@@ -387,13 +387,41 @@ def app(share=False):#, use_cluster_feats=False):
387
  """
388
  gr.HTML(styled_html(expl_html))
389
 
390
- cluster_dropdown = gr.Dropdown(choices=[], label="Select Cluster to Inspect", visible=False)
391
  style_map_state = gr.State()
392
  llm_style_feats_analysis = gr.State()
393
  visible_zoomed_authors = gr.State()
394
 
395
  gr.HTML(instruction_callout("Zoom in on the plot to select a set of background authors and see the presence of the top features from this set in candidate and mystery authors."))
396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  with gr.Row():
398
  # ── LLM Features Column ──────────────────────────────────
399
  with gr.Column(scale=1, min_width=0):
@@ -439,7 +467,13 @@ def app(share=False):#, use_cluster_feats=False):
439
  custom_model_input, task_authors_embeddings_df, background_authors_embeddings_df, predicted_author, ground_truth_author
440
  ),
441
  inputs=[task_dropdown, model_radio, custom_model, task_authors_embeddings_df, background_authors_embeddings_df, predicted_author, ground_truth_author],
442
- outputs=[plot, style_map_state, bg_proj_state, bg_lbls_state, bg_authors_df]
 
 
 
 
 
 
443
  )
444
 
445
  axis_ranges.change(
 
387
  """
388
  gr.HTML(styled_html(expl_html))
389
 
 
390
  style_map_state = gr.State()
391
  llm_style_feats_analysis = gr.State()
392
  visible_zoomed_authors = gr.State()
393
 
394
  gr.HTML(instruction_callout("Zoom in on the plot to select a set of background authors and see the presence of the top features from this set in candidate and mystery authors."))
395
 
396
+ # State to store precomputed regions
397
+ precomputed_regions_state = gr.State()
398
+
399
+ # Add this after the plot generation
400
+ gr.HTML("""
401
+ <div style="
402
+ font-size: 1.2em;
403
+ font-weight: 600;
404
+ margin: 1em 0 0.5em 0;
405
+ ">
406
+ Quick Region Selection
407
+ </div>
408
+ <div style="
409
+ font-size: 0.9em;
410
+ color: #666;
411
+ margin-bottom: 1em;
412
+ ">
413
+ Select a precomputed region to analyze, or zoom manually on the plot above
414
+ </div>
415
+ """)
416
+
417
+ precomputed_regions_radio = gr.Radio(
418
+ choices=["None"],
419
+ value="None",
420
+ label="Precomputed Regions",
421
+ info="Select a region to automatically zoom and analyze"
422
+ )
423
+
424
+
425
  with gr.Row():
426
  # ── LLM Features Column ──────────────────────────────────
427
  with gr.Column(scale=1, min_width=0):
 
467
  custom_model_input, task_authors_embeddings_df, background_authors_embeddings_df, predicted_author, ground_truth_author
468
  ),
469
  inputs=[task_dropdown, model_radio, custom_model, task_authors_embeddings_df, background_authors_embeddings_df, predicted_author, ground_truth_author],
470
+ outputs=[plot, style_map_state, bg_proj_state, bg_lbls_state, bg_authors_df, precomputed_regions_state, precomputed_regions_radio]
471
+ )
472
+
473
+ precomputed_regions_radio.change(
474
+ fn=lambda region_name, precomputed_regions: trigger_precomputed_region(region_name, precomputed_regions),
475
+ inputs=[precomputed_regions_radio, precomputed_regions_state],
476
+ outputs=[axis_ranges]
477
  )
478
 
479
  axis_ranges.change(
utils/interp_space_utils.py CHANGED
@@ -19,13 +19,15 @@ import time
19
  from utils.llm_feat_utils import generate_feature_spans_cached
20
  from collections import Counter
21
  import numpy as np
22
- from sklearn.metrics.pairwise import cosine_similarity
23
  from sklearn.decomposition import PCA
24
 
25
  CACHE_DIR = "datasets/embeddings_cache"
26
  ZOOM_CACHE = "datasets/zoom_cache/features_cache.json"
 
27
  os.makedirs(CACHE_DIR, exist_ok=True)
28
  os.makedirs(os.path.dirname(ZOOM_CACHE), exist_ok=True)
 
29
  # Bump this whenever there is a change etc...
30
  CACHE_VERSION = 1
31
 
@@ -826,6 +828,158 @@ def compute_predicted_author(task_authors_df: pd.DataFrame, col_name: str) -> in
826
  return predicted_author
827
 
828
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
829
  if __name__ == "__main__":
830
  background_corpus = pd.read_pickle('../datasets/luar_interp_space_cluster_19/train_authors.pkl')
831
  print(background_corpus.columns)
 
19
  from utils.llm_feat_utils import generate_feature_spans_cached
20
  from collections import Counter
21
  import numpy as np
22
+ from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
23
  from sklearn.decomposition import PCA
24
 
25
  CACHE_DIR = "datasets/embeddings_cache"
26
  ZOOM_CACHE = "datasets/zoom_cache/features_cache.json"
27
+ REGION_CACHE = "datasets/region_cache/regions_cache.pkl"
28
  os.makedirs(CACHE_DIR, exist_ok=True)
29
  os.makedirs(os.path.dirname(ZOOM_CACHE), exist_ok=True)
30
+ os.makedirs(os.path.dirname(REGION_CACHE), exist_ok=True)
31
  # Bump this whenever there is a change etc...
32
  CACHE_VERSION = 1
33
 
 
828
  return predicted_author
829
 
830
 
831
+ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, n_neighbors=7):
832
+ """
833
+ Compute precomputed regions for mystery author and candidates.
834
+
835
+ Args:
836
+ bg_proj: (N,2) numpy array with 2D coordinates of background authors
837
+ bg_ids: list of N author IDs for background authors
838
+ q_proj: (1,2) numpy array with mystery author coordinates
839
+ c_proj: (3,2) numpy array with candidate author coordinates
840
+ n_neighbors: number of closest neighbors to include in each region
841
+
842
+ Returns:
843
+ dict: mapping region names to bounding boxes and author lists
844
+ """
845
+ print("Computing sugested regions for zoom...")
846
+ key = f"{hashlib.md5((str(q_proj.tolist()) + str(c_proj.tolist()) + str(n_neighbors)).encode()).hexdigest()}"
847
+
848
+ if os.path.exists(REGION_CACHE):
849
+ with open(REGION_CACHE, 'rb') as f:
850
+ cache = pickle.load(f)
851
+ else:
852
+ cache = {}
853
+ if key in cache:
854
+ print(f"\nCache hit! Using cached regions.")
855
+ return cache[key]
856
+ else:
857
+ print(f"Cache miss. Computing regions.")
858
+
859
+ regions = {}
860
+ # All points for distance calculation (mystery + candidates + background)
861
+ all_points = np.vstack([q_proj.reshape(1, -1), c_proj, bg_proj])
862
+ all_ids = ['mystery'] + [f'candidate_{i}' for i in range(3)] + bg_ids
863
+
864
+ def get_region_around_point(center_point, region_name):
865
+ """Get region around a specific point"""
866
+ # Calculate distances from center point to all background authors
867
+ distances = euclidean_distances([center_point], bg_proj)[0]
868
+
869
+ # Get indices of closest neighbors
870
+ closest_indices = np.argsort(distances)[:n_neighbors]
871
+ closest_authors = [bg_ids[i] for i in closest_indices]
872
+ closest_points = bg_proj[closest_indices]
873
+
874
+ # Include the center point in the region
875
+ region_points = np.vstack([center_point.reshape(1, -1), closest_points])
876
+
877
+ # Calculate bounding box with some padding
878
+ x_min, x_max = region_points[:, 0].min(), region_points[:, 0].max()
879
+ y_min, y_max = region_points[:, 1].min(), region_points[:, 1].max()
880
+
881
+ # Add padding (10% of range)
882
+ x_padding = (x_max - x_min) * 0.1
883
+ y_padding = (y_max - y_min) * 0.1
884
+
885
+ bbox = {
886
+ 'xaxis': [x_min - x_padding, x_max + x_padding],
887
+ 'yaxis': [y_min - y_padding, y_max + y_padding]
888
+ }
889
+
890
+ return {
891
+ 'bbox': bbox,
892
+ 'authors': closest_authors,
893
+ 'center_point': center_point,
894
+ 'description': f"Region around {region_name} ({len(closest_authors)} closest authors)"
895
+ }
896
+
897
+ def get_region_between_points(point1, point2, name1, name2):
898
+ """Get region around the midpoint between two points"""
899
+ midpoint = (point1 + point2) / 2
900
+ region_name = f"{name1} & {name2}"
901
+ return get_region_around_point(midpoint, region_name)
902
+
903
+ # Region 1: Around mystery author only
904
+ print(f"Mystery author: {q_proj}")
905
+ regions["Mystery Author Neighborhood"] = get_region_around_point(
906
+ q_proj, "Mystery Author"
907
+ )
908
+
909
+ # Regions 2-4: Around each candidate
910
+ for i in range(3):
911
+ regions[f"Candidate {i+1} Neighborhood"] = get_region_around_point(
912
+ c_proj[i], f"Candidate {i+1}"
913
+ )
914
+
915
+ # Regions 5-7: Between mystery and each candidate
916
+
917
+ for i in range(3):
918
+ region_name = f"Mystery & Candidate {i+1}"
919
+ print(q_proj, c_proj[i])
920
+ regions[region_name] = get_region_between_points(
921
+ q_proj, c_proj[i], "Mystery", f"Candidate {i+1}"
922
+ )
923
+
924
+ # Regions 8-10: Between candidate pairs
925
+ candidate_pairs = [(0, 1), (0, 2), (1, 2)]
926
+ for i, (c1, c2) in enumerate(candidate_pairs):
927
+ region_name = f"Candidate {c1+1} & Candidate {c2+1}"
928
+ regions[region_name] = get_region_between_points(
929
+ c_proj[c1], c_proj[c2], f"Candidate {c1+1}", f"Candidate {c2+1}"
930
+ )
931
+
932
+ # Regions 11-12: Around predicted and ground truth (if different)
933
+ # This would need predicted_author and ground_truth_author indices
934
+ # For now, we'll create generic regions
935
+
936
+ # Region 11: Centroid of all task authors (mystery + 3 candidates)
937
+ task_centroid = np.mean(np.vstack([q_proj, c_proj]), axis=0)
938
+ regions["All Task Authors Centroid"] = get_region_around_point(
939
+ task_centroid, "All Task Authors"
940
+ )
941
+
942
+ # Region 12: Wider region encompassing all task authors
943
+ all_task_points = np.vstack([q_proj, c_proj])
944
+ task_centroid = np.mean(all_task_points, axis=0)
945
+
946
+ # Find distances from task centroid to all background authors
947
+ distances_from_centroid = euclidean_distances([task_centroid], bg_proj)[0]
948
+
949
+ # Take a larger number of neighbors (e.g., 20) for the expanded region
950
+ n_expanded = min(20, len(bg_ids)) # Don't exceed available authors
951
+ expanded_indices = np.argsort(distances_from_centroid)[:n_expanded]
952
+ expanded_authors = [bg_ids[i] for i in expanded_indices]
953
+ expanded_points = bg_proj[expanded_indices]
954
+
955
+ # Include all task points in the bounding box calculation
956
+ all_region_points = np.vstack([all_task_points, expanded_points])
957
+
958
+ x_min, x_max = all_region_points[:, 0].min(), all_region_points[:, 0].max()
959
+ y_min, y_max = all_region_points[:, 1].min(), all_region_points[:, 1].max()
960
+
961
+ # Add moderate padding
962
+ x_padding = (x_max - x_min) * 0.15
963
+ y_padding = (y_max - y_min) * 0.15
964
+
965
+ expanded_bbox = {
966
+ 'xaxis': [x_min - x_padding, x_max + x_padding],
967
+ 'yaxis': [y_min - y_padding, y_max + y_padding]
968
+ }
969
+
970
+ regions["Expanded Task Region"] = {
971
+ 'bbox': expanded_bbox,
972
+ 'authors': expanded_authors,
973
+ 'center_point': task_centroid,
974
+ 'description': f"Expanded region around all task authors ({len(expanded_authors)} authors)"
975
+ }
976
+
977
+ cache[key] = regions
978
+ with open(REGION_CACHE, 'wb') as f:
979
+ pickle.dump(cache, f)
980
+
981
+ return regions
982
+
983
  if __name__ == "__main__":
984
  background_corpus = pd.read_pickle('../datasets/luar_interp_space_cluster_19/train_authors.pkl')
985
  print(background_corpus.columns)
utils/visualizations.py CHANGED
@@ -10,7 +10,7 @@ import plotly.graph_objects as go
10
  from plotly.colors import sample_colorscale
11
  from gradio import update
12
  import re
13
- from utils.interp_space_utils import compute_clusters_style_representation_3, compute_clusters_g2v_representation
14
  from utils.llm_feat_utils import split_features
15
  from utils.gram2vec_feat_utils import get_shorthand, get_fullform
16
  from gram2vec.feature_locator import find_feature_spans
@@ -510,6 +510,18 @@ def visualize_clusters_plotly(iid, cfg, instances, model_radio, custom_model_inp
510
  font=dict(color='darkblue', size=12)
511
  )
512
 
 
 
 
 
 
 
 
 
 
 
 
 
513
  print('Done processing....')
514
 
515
  return (
@@ -519,6 +531,8 @@ def visualize_clusters_plotly(iid, cfg, instances, model_radio, custom_model_inp
519
  bg_proj, # Return background points
520
  bg_ids, # Return background labels
521
  background_authors_embeddings_df, # Return the DataFrame for zoom handling
 
 
522
 
523
  )
524
  # return fig, update(choices=feature_list, value=feature_list[0]),feature_list
@@ -535,7 +549,23 @@ def extract_cluster_key(display_label: str) -> int:
535
  raise ValueError(f"Unrecognized cluster label: {display_label}")
536
  return int(m.group(1))
537
 
538
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
 
540
  # When a cluster is selected, split features and populate radio buttons
541
  def on_cluster_change(selected_cluster, style_map):
 
10
  from plotly.colors import sample_colorscale
11
  from gradio import update
12
  import re
13
+ from utils.interp_space_utils import compute_clusters_style_representation_3, compute_clusters_g2v_representation, compute_precomputed_regions
14
  from utils.llm_feat_utils import split_features
15
  from utils.gram2vec_feat_utils import get_shorthand, get_fullform
16
  from gram2vec.feature_locator import find_feature_spans
 
510
  font=dict(color='darkblue', size=12)
511
  )
512
 
513
+ # Compute precomputed regions
514
+ bg_proj_for_regions = proj[4:] # Background projections
515
+ bg_ids_for_regions = bg_ids[4:] # Background IDs
516
+
517
+ # Compute precomputed regions
518
+ precomputed_regions = compute_precomputed_regions(
519
+ bg_proj_for_regions, bg_ids_for_regions, q_proj, c_proj
520
+ )
521
+
522
+ # Create choices for radio buttons
523
+ region_choices = ["None"] + list(precomputed_regions.keys())
524
+
525
  print('Done processing....')
526
 
527
  return (
 
531
  bg_proj, # Return background points
532
  bg_ids, # Return background labels
533
  background_authors_embeddings_df, # Return the DataFrame for zoom handling
534
+ precomputed_regions, # Return region choices
535
+ gr.update(choices=region_choices, value="None")
536
 
537
  )
538
  # return fig, update(choices=feature_list, value=feature_list[0]),feature_list
 
549
  raise ValueError(f"Unrecognized cluster label: {display_label}")
550
  return int(m.group(1))
551
 
552
+ def trigger_precomputed_region(region_name, precomputed_regions):
553
+ """
554
+ Simulate a zoom event for a precomputed region.
555
+ Returns the JSON payload that would be sent to axis_ranges.
556
+ """
557
+ print(f"[INFO] Triggering precomputed region: {region_name}")
558
+ print(f"Available regions: {list(precomputed_regions.keys())}")
559
+ if region_name == "None" or region_name not in precomputed_regions:
560
+ return ""
561
+
562
+ region = precomputed_regions[region_name]
563
+ payload = region['bbox']
564
+ json_payload = {
565
+ 'xaxis': [float(payload['xaxis'][0]), float(payload['xaxis'][1])],
566
+ 'yaxis': [float(payload['yaxis'][0]), float(payload['yaxis'][1])]
567
+ }
568
+ return json.dumps(json_payload)
569
 
570
  # When a cluster is selected, split features and populate radio buttons
571
  def on_cluster_change(selected_cluster, style_map):