Anisha Bhatnagar committed
Commit a1e49f6 · 1 Parent(s): 0ce5cd2

Reduce the number of precomputed regions; update the cache script; point the config at the new data URL; relax g2v feature filtering (Peter)

app.py CHANGED
```diff
@@ -22,7 +22,7 @@ def load_config(path="config/config.yaml"):
         return yaml.safe_load(f)
 
 # A comment to trigger change in spaces
-# comment 2
+# comment 3
 cfg = load_config()
 
 
```
config/config.yaml CHANGED
```diff
@@ -1,6 +1,6 @@
 # config.yaml
-instances_to_explain_path: "./datasets/hrs_explanations_luar_clusters_18_balanced.json"
-instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_18_balanced.json?download=true"
+instances_to_explain_path: "./datasets/hrs_explanations_luar_clusters_2_35_balanced.json"
+instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_2_35_balanced.json?download=true"
 interp_space_path: "./datasets/sentence_luar_interp_space_2_35/"
 interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_space_2_35.zip?download=true"
 gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
```
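Each `*_path`/`*_url` pair in the config gives a local file plus a remote fallback. A minimal sketch of the download-if-missing pattern such a pair is presumably consumed by (the helper below is illustrative, not code from this repo):

```python
import os
import urllib.request

def ensure_dataset(local_path: str, url: str) -> str:
    """Fetch `url` to `local_path` unless the file is already present."""
    if not os.path.exists(local_path):
        os.makedirs(os.path.dirname(local_path) or ".", exist_ok=True)
        urllib.request.urlretrieve(url, local_path)  # blocking download
    return local_path

# Hypothetical usage with the updated config values:
# ensure_dataset(cfg["instances_to_explain_path"], cfg["instances_to_explain_url"])
```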
precompute_caches.py CHANGED
```diff
@@ -19,8 +19,7 @@ def load_config(path="config/config.yaml"):
 def precompute_all_caches(
     models_to_test=None,
     instances_to_process=None,
-    config_path="config/config.yaml",
-    force_regenerate=False
+    config_path="config/config.yaml"
 ):
     """
     Precompute all cache files using the EXACT same methods as app.py.
@@ -194,13 +193,12 @@ from utils.visualizations import visualize_clusters_plotly
 
 if __name__ == "__main__":
     # Test with a small subset first
-    instances=[i for i in range(2)] # First 2 instances for testing
+    instances=[i for i in range(20)] # First 20 instances for testing
     cache_stats = precompute_all_caches(
         models_to_test=[
             'gabrielloiseau/LUAR-MUD-sentence-transformers'
         ],
-        instances_to_process=instances,
-        force_regenerate=False
+        instances_to_process=instances
     )
 
     print(f"\nCache precomputation completed with {len(cache_stats['errors'])} errors.")
```
utils/interp_space_utils.py CHANGED
```diff
@@ -546,7 +546,9 @@ def compute_clusters_style_representation_3(
     max_num_feats: int = 20,
     max_num_documents_per_author=1,
     max_num_authors=10,
-    max_authors_for_span_extraction=4
+    max_authors_for_span_extraction=4,
+    min_authors_required: int = 2,
+    top_k: int = 10
 ):
 
     print(f"Computing style representation for visible clusters: {len(cluster_ids)}")
@@ -568,19 +570,40 @@ def compute_clusters_style_representation_3(
     print(author_names)
     spans_by_author = extract_all_spans(span_df, features, cluster_label_clm_name)
 
-    # Filter out features that are not present in any of the authors
-    filtered_spans_by_author = {x[0] : x[1] for x in spans_by_author.items() if x[0] in {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(cluster_ids))}
-    print(filtered_spans_by_author.keys())
-    filtered_spans_by_author = [set([f[0] for f in x[1].items() if len(f[1]) > 0]) for x in filtered_spans_by_author.items()]
-
-    filtered_set_of_features = filtered_spans_by_author[0] # all features that appear in all the sets in the filtered_spans_by_author list
-    for x in filtered_spans_by_author[1:]:
-        filtered_set_of_features = filtered_set_of_features.intersection(x)
-
-    print('filtered set of features: ', filtered_set_of_features)
+    # Filter in only task authors that are part of the current selection
+    task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
+    filtered_task_authors = {author: feat_map for author, feat_map in spans_by_author.items() if author in task_author_names.intersection(set(cluster_ids))}
+
+    print(filtered_task_authors.keys())
+
+    # Build per-author sets of features that have at least one span
+    author_present_feature_sets = [
+        {feature for feature, spans in feature_map.items() if len(spans) > 0}
+        for _, feature_map in filtered_task_authors.items()
+    ]
+
+    # If nothing to aggregate (e.g., no task authors in selection), fall back to an empty list
+    selected_features_ranked = []
+    if author_present_feature_sets:
+        coverage_counter = Counter()
+        for present_set in author_present_feature_sets:
+            coverage_counter.update(present_set)
+
+        # Keep features present in at least `min_authors_required` authors
+        eligible_features = [feat for feat, cnt in coverage_counter.items() if cnt >= int(min_authors_required)]
+
+        # Preserve the original LLM feature ordering as a secondary sort key where possible
+        feature_original_index = {feat: idx for idx, feat in enumerate(features)} if features else {}
+
+        selected_features_ranked = sorted(
+            eligible_features,
+            key=lambda f: (-coverage_counter[f], feature_original_index.get(f, 10**9))
+        )[:int(top_k)]
+
+    print('filtered set of features (min coverage', min_authors_required, '): ', selected_features_ranked)
 
     return {
-        "features": list(filtered_set_of_features),
+        "features": list(selected_features_ranked),
         "spans": spans_by_author
     }
 
```
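The net effect of this hunk: the old code kept only features present in every selected task author (a strict set intersection, which can easily come up empty), while the new code keeps features covered by at least `min_authors_required` authors, ranks them by coverage with the original LLM feature order as tie-breaker, and truncates to `top_k`. A self-contained sketch of that selection rule on toy data (the feature names are invented for illustration):

```python
from collections import Counter

def select_features(present_sets, feature_order, min_authors_required=2, top_k=10):
    """Coverage-threshold relaxation of the old all-authors intersection filter."""
    coverage = Counter()
    for present in present_sets:
        coverage.update(present)
    eligible = [f for f, n in coverage.items() if n >= min_authors_required]
    order = {f: i for i, f in enumerate(feature_order)}
    # Descending coverage first, then the original feature ordering.
    return sorted(eligible, key=lambda f: (-coverage[f], order.get(f, 10**9)))[:top_k]

feature_order = ["short sentences", "semicolon use", "formal tone", "rare words"]
present_sets = [
    {"short sentences", "formal tone"},                 # Mystery author
    {"short sentences", "semicolon use"},               # Candidate Author 1
    {"short sentences", "formal tone", "rare words"},   # Candidate Author 2
]
print(select_features(present_sets, feature_order))
# ['short sentences', 'formal tone'] -- the strict intersection kept only 'short sentences'
```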
```diff
@@ -679,8 +702,8 @@ def compute_clusters_g2v_representation(
     # Filter in only features that are present in selected_authors
     selected_authors_g2v_data = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
 
-    print(f"[DEBUG] selected_authors_g2v_data length: {len(selected_authors_g2v_data)}")
-    print(f"[DEBUG] selected_authors_g2v_data content: {selected_authors_g2v_data}")
+    # print(f"[DEBUG] selected_authors_g2v_data length: {len(selected_authors_g2v_data)}")
+    # print(f"[DEBUG] selected_authors_g2v_data content: {selected_authors_g2v_data}")
 
     # Get the actual text documents for the selected authors to verify feature presence
     selected_authors_docs = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)]['fullText'].tolist()
```
```diff
@@ -828,7 +851,7 @@ def compute_predicted_author(task_authors_df: pd.DataFrame, col_name: str) -> int:
     return predicted_author
 
 
-def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, model_name, n_neighbors=7):
+def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, pred_idx, model_name, n_neighbors=7):
     """
     Compute precomputed regions for mystery author and candidates.
 
```
```diff
@@ -914,41 +937,43 @@ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, model_name, n_neighbors=7):
         include_points = np.vstack([point1.reshape(1, -1), point2.reshape(1, -1)])
         return get_region_around_point(midpoint, region_name, include_points=include_points)
 
-    # Region 1: Around mystery author only
-    regions["Mystery Author Neighborhood"] = get_region_around_point(
-        q_proj, "Mystery Author"
-    )
+    # # Region 1: Around mystery author only
+    # regions["Mystery Author Neighborhood"] = get_region_around_point(
+    #     q_proj, "Mystery Author"
+    # )
 
-    # Regions 2-4: Around each candidate
-    for i in range(3):
-        regions[f"Candidate {i+1} Neighborhood"] = get_region_around_point(
-            c_proj[i], f"Candidate {i+1}"
-        )
+    # # Regions 2-4: Around each candidate
+    # for i in range(3):
+    #     regions[f"Candidate {i+1} Neighborhood"] = get_region_around_point(
+    #         c_proj[i], f"Candidate {i+1}"
+    #     )
 
     # Regions 5-7: Between mystery and each candidate
     for i in range(3):
-        region_name = f"Mystery & Candidate {i+1}"
-        regions[region_name] = get_region_between_points(
-            q_proj, c_proj[i], "Mystery", f"Candidate {i+1}"
-        )
+        if i == pred_idx:  # keep only mystery and the predicted candidate
+            region_name = f"Mystery & Candidate {i+1}"
+            regions[region_name] = get_region_between_points(
+                q_proj, c_proj[i], "Mystery", f"Candidate {i+1}"
+            )
 
     # Regions 8-10: Between candidate pairs
     candidate_pairs = [(0, 1), (0, 2), (1, 2)]
     for i, (c1, c2) in enumerate(candidate_pairs):
-        region_name = f"Candidate {c1+1} & Candidate {c2+1}"
-        regions[region_name] = get_region_between_points(
-            c_proj[c1], c_proj[c2], f"Candidate {c1+1}", f"Candidate {c2+1}"
-        )
+        if c1 != pred_idx and c2 != pred_idx:  # keep only the non-predicted candidates
+            region_name = f"Candidate {c1+1} & Candidate {c2+1}"
+            regions[region_name] = get_region_between_points(
+                c_proj[c1], c_proj[c2], f"Candidate {c1+1}", f"Candidate {c2+1}"
+            )
 
     # Regions 11-12: Around predicted and ground truth (if different)
     # This would need predicted_author and ground_truth_author indices
     # For now, we'll create generic regions
 
     # Region 11: Centroid of all task authors (mystery + 3 candidates)
-    task_centroid = np.mean(np.vstack([q_proj, c_proj]), axis=0)
-    regions["All Task Authors Centroid"] = get_region_around_point(
-        task_centroid, "All Task Authors", include_points=np.vstack([q_proj, c_proj])
-    )
+    # task_centroid = np.mean(np.vstack([q_proj, c_proj]), axis=0)
+    # regions["All Task Authors Centroid"] = get_region_around_point(
+    #     task_centroid, "All Task Authors", include_points=np.vstack([q_proj, c_proj])
+    # )
 
 def serialize_numpy_dtypes(obj):
     if isinstance(obj, np.ndarray):
```
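With `pred_idx` threaded in, the function now builds only two between-point regions per instance instead of the former eleven (mystery neighborhood, three candidate neighborhoods, three mystery-candidate midpoints, three candidate pairs, and the task centroid): mystery vs. the predicted candidate, plus the pair of non-predicted candidates. A self-contained illustration of which region names survive for a given `pred_idx`:

```python
def surviving_region_names(pred_idx: int) -> list[str]:
    """Mirror the new selection logic in compute_precomputed_regions."""
    names = []
    # Mystery vs. candidates: keep only the predicted candidate.
    for i in range(3):
        if i == pred_idx:
            names.append(f"Mystery & Candidate {i+1}")
    # Candidate pairs: keep only the pair that excludes the predicted candidate.
    for c1, c2 in [(0, 1), (0, 2), (1, 2)]:
        if c1 != pred_idx and c2 != pred_idx:
            names.append(f"Candidate {c1+1} & Candidate {c2+1}")
    return names

print(surviving_region_names(0))  # ['Mystery & Candidate 1', 'Candidate 2 & Candidate 3']
```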
  max_num_feats: int = 20,
547
  max_num_documents_per_author=1,
548
  max_num_authors=10,
549
+ max_authors_for_span_extraction=4,
550
+ min_authors_required: int = 2,
551
+ top_k: int = 10
552
  ):
553
 
554
  print(f"Computing style representation for visible clusters: {len(cluster_ids)}")
 
570
  print(author_names)
571
  spans_by_author = extract_all_spans(span_df, features, cluster_label_clm_name)
572
 
573
+ # Filter-in only task authors that are part of the current selection
574
+ task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
575
+ filtered_task_authors = {author: feat_map for author, feat_map in spans_by_author.items() if author in task_author_names.intersection(set(cluster_ids))}
 
576
 
577
+ print(filtered_task_authors.keys())
 
 
578
 
579
+ # Build per-author sets of features that have at least one span
580
+ author_present_feature_sets = [
581
+ {feature for feature, spans in feature_map.items() if len(spans) > 0}
582
+ for _, feature_map in filtered_task_authors.items()
583
+ ]
584
+
585
+ # If nothing to aggregate (e.g., no task authors in selection), fall back to empty list
586
+ selected_features_ranked = []
587
+ if author_present_feature_sets:
588
+ coverage_counter = Counter()
589
+ for present_set in author_present_feature_sets:
590
+ coverage_counter.update(present_set)
591
+
592
+ # Keep features present in at least `min_authors_required` authors
593
+ eligible_features = [feat for feat, cnt in coverage_counter.items() if cnt >= int(min_authors_required)]
594
+
595
+ # Preserve original LLM feature ordering as a secondary key where possible
596
+ feature_original_index = {feat: idx for idx, feat in enumerate(features)} if features else {}
597
+
598
+ selected_features_ranked = sorted(
599
+ eligible_features,
600
+ key=lambda f: (-coverage_counter[f], feature_original_index.get(f, 10**9))
601
+ )[:int(top_k)]
602
+
603
+ print('filtered set of features (min coverage', min_authors_required, '): ', selected_features_ranked)
604
 
605
  return {
606
+ "features": list(selected_features_ranked),
607
  "spans": spans_by_author
608
  }
609
 
 
702
  # Filter in only features that are present in selected_authors
703
  selected_authors_g2v_data = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
704
 
705
+ # print(f"[DEBUG] selected_authors_g2v_data length: {len(selected_authors_g2v_data)}")
706
+ # print(f"[DEBUG] selected_authors_g2v_data content: {selected_authors_g2v_data}")
707
 
708
  # Get the actual text documents for the selected authors to verify feature presence
709
  selected_authors_docs = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)]['fullText'].tolist()
 
851
  return predicted_author
852
 
853
 
854
+ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, pred_idx, model_name, n_neighbors=7):
855
  """
856
  Compute precomputed regions for mystery author and candidates.
857
 
 
937
  include_points = np.vstack([point1.reshape(1, -1), point2.reshape(1, -1)])
938
  return get_region_around_point(midpoint, region_name, include_points=include_points)
939
 
940
+ # # Region 1: Around mystery author only
941
+ # regions["Mystery Author Neighborhood"] = get_region_around_point(
942
+ # q_proj, "Mystery Author"
943
+ # )
944
 
945
+ # # Regions 2-4: Around each candidate
946
+ # for i in range(3):
947
+ # regions[f"Candidate {i+1} Neighborhood"] = get_region_around_point(
948
+ # c_proj[i], f"Candidate {i+1}"
949
+ # )
950
 
951
  # Regions 5-7: Between mystery and each candidate
952
  for i in range(3):
953
+ if i == pred_idx: #selecting only mystery and predicted candidate
954
+ region_name = f"Mystery & Candidate {i+1}"
955
+ regions[region_name] = get_region_between_points(
956
+ q_proj, c_proj[i], "Mystery", f"Candidate {i+1}"
957
+ )
958
 
959
  # Regions 8-10: Between candidate pairs
960
  candidate_pairs = [(0, 1), (0, 2), (1, 2)]
961
  for i, (c1, c2) in enumerate(candidate_pairs):
962
+ if c1 != pred_idx and c2 != pred_idx: #selecting only the non predicated candidates
963
+ region_name = f"Candidate {c1+1} & Candidate {c2+1}"
964
+ regions[region_name] = get_region_between_points(
965
+ c_proj[c1], c_proj[c2], f"Candidate {c1+1}", f"Candidate {c2+1}"
966
+ )
967
 
968
  # Regions 11-12: Around predicted and ground truth (if different)
969
  # This would need predicted_author and ground_truth_author indices
970
  # For now, we'll create generic regions
971
 
972
  # Region 11: Centroid of all task authors (mystery + 3 candidates)
973
+ # task_centroid = np.mean(np.vstack([q_proj, c_proj]), axis=0)
974
+ # regions["All Task Authors Centroid"] = get_region_around_point(
975
+ # task_centroid, "All Task Authors", include_points=np.vstack([q_proj, c_proj])
976
+ # )
977
 
978
  def serialize_numpy_dtypes(obj):
979
  if isinstance(obj, np.ndarray):
utils/visualizations.py CHANGED
```diff
@@ -519,7 +519,7 @@ def visualize_clusters_plotly(iid, cfg, instances, model_radio, custom_model_inpu
     candidate_ids = task_authors_df['authorID'].iloc[1:4].tolist() # 3 candidate IDs
 
     precomputed_regions = compute_precomputed_regions(
-        bg_proj_for_regions, bg_ids_for_regions, q_proj, c_proj, model_name
+        bg_proj_for_regions, bg_ids_for_regions, q_proj, c_proj, pred_idx, model_name
    )
 
     # Create choices for radio buttons
```
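The diff shows `pred_idx` arriving at the call site but not where it is computed. One plausible wiring, assuming `pred_idx` is the 0-based index of the predicted candidate among the three candidates; `compute_predicted_author` is real (see `utils/interp_space_utils.py` above), but the column name and index mapping below are assumptions:

```python
# Hypothetical derivation of pred_idx at the call site.
predicted_author = compute_predicted_author(task_authors_df, 'authorID')  # column name assumed
candidate_ids = task_authors_df['authorID'].iloc[1:4].tolist()            # 3 candidate IDs (from the diff)
pred_idx = candidate_ids.index(predicted_author) if predicted_author in candidate_ids else 0
```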