peter-zeng committed on
Commit
258c7f3
·
1 Parent(s): dcbbcbd

added span frequency feature ranking

Browse files
utils/gram2vec_feat_utils.py CHANGED
@@ -49,7 +49,17 @@ def get_shorthand(feature_str: str) -> str:
49
  return None
50
  if category not in FEATURE_HANDLERS:
51
  return None
52
- code = load_code_map().get(human)
 
 
 
 
 
 
 
 
 
 
53
  if code is None:
54
  # print(f"Warning: No code found for human-readable feature '{human}'")
55
  return None # fallback to the human-readable name
@@ -78,6 +88,14 @@ def get_fullform(shorthand: str) -> str:
78
  if human is None:
79
  return None
80
 
 
 
 
 
 
 
 
 
81
  return f"{category}:{human}"
82
 
83
  def highlight_both_spans(text, llm_spans, gram_spans):
@@ -169,8 +187,9 @@ def show_combined_spans_all(selected_feature_llm, selected_feature_g2v,
169
  if selected_feature_g2v and selected_feature_g2v != "None":
170
  # get gram2vec spans
171
  gram_spans_list = []
172
- # clean the display string and get the feature name without the zscore
173
- selected_feature_g2v = selected_feature_g2v.split(" | [Z=")[0].strip()
 
174
  print(f"Selected Gram2Vec feature: {selected_feature_g2v}")
175
  short = get_shorthand(selected_feature_g2v)
176
  print(f"short hand: {short}")
 
49
  return None
50
  if category not in FEATURE_HANDLERS:
51
  return None
52
+ code_map = load_code_map()
53
+ code = code_map.get(human)
54
+ if code is None:
55
+ # Try normalizing terminology shown in UI
56
+ # Convert 'Preposition' phrasing back to 'Adposition' used in the code map
57
+ human_alt = (human
58
+ .replace("Preposition", "Adposition")
59
+ .replace("preposition", "adposition")
60
+ .replace("Prepositional", "Adpositional")
61
+ .replace("prepositional", "adpositional"))
62
+ code = code_map.get(human_alt)
63
  if code is None:
64
  # print(f"Warning: No code found for human-readable feature '{human}'")
65
  return None # fallback to the human-readable name
 
88
  if human is None:
89
  return None
90
 
91
+ # Normalize terminology for UI: prefer "Preposition" over "Adposition"
92
+ # Also handle potential "adpositional" variants if present
93
+ human = (human
94
+ .replace("Adposition", "Preposition")
95
+ .replace("adposition", "preposition")
96
+ .replace("Adpositional", "Prepositional")
97
+ .replace("adpositional", "prepositional"))
98
+
99
  return f"{category}:{human}"
100
 
101
  def highlight_both_spans(text, llm_spans, gram_spans):
 
187
  if selected_feature_g2v and selected_feature_g2v != "None":
188
  # get gram2vec spans
189
  gram_spans_list = []
190
+ # In case any old label formatting with z-scores leaks through, strip it defensively
191
+ if "| [Z=" in selected_feature_g2v:
192
+ selected_feature_g2v = selected_feature_g2v.split(" | [Z=")[0].strip()
193
  print(f"Selected Gram2Vec feature: {selected_feature_g2v}")
194
  short = get_shorthand(selected_feature_g2v)
195
  print(f"short hand: {short}")
utils/interp_space_utils.py CHANGED
@@ -17,6 +17,8 @@ from pydantic import BaseModel
17
  from pydantic import ValidationError
18
  import time
19
  from utils.llm_feat_utils import generate_feature_spans_cached
 
 
20
  from collections import Counter
21
  import numpy as np
22
  from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
@@ -633,6 +635,7 @@ def compute_clusters_g2v_representation(
633
  other_author_ids: List[Any],
634
  features_clm_name: str,
635
  top_n: int = 10,
 
636
  ) -> List[tuple]: # Changed return type to List[tuple] to include scores
637
 
638
  # 1) Identify selected authors in the zoom region
@@ -666,8 +669,54 @@ def compute_clusters_g2v_representation(
666
  # 5) Rank features by mean z-score, keep positives only
667
  feature_scores = [(feat, float(score)) for feat, score in zip(all_features, selected_mean) if score > 0]
668
  feature_scores.sort(key=lambda x: x[1], reverse=True)
669
-
670
- return feature_scores[:top_n]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
671
 
672
  def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
673
 
 
17
  from pydantic import ValidationError
18
  import time
19
  from utils.llm_feat_utils import generate_feature_spans_cached
20
+ from utils.gram2vec_feat_utils import get_shorthand, get_fullform
21
+ from gram2vec.feature_locator import find_feature_spans
22
  from collections import Counter
23
  import numpy as np
24
  from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
 
635
  other_author_ids: List[Any],
636
  features_clm_name: str,
637
  top_n: int = 10,
638
+ max_candidates_for_span_sorting: int = 50,
639
  ) -> List[tuple]: # Changed return type to List[tuple] to include scores
640
 
641
  # 1) Identify selected authors in the zoom region
 
669
  # 5) Rank features by mean z-score, keep positives only
670
  feature_scores = [(feat, float(score)) for feat, score in zip(all_features, selected_mean) if score > 0]
671
  feature_scores.sort(key=lambda x: x[1], reverse=True)
672
+
673
+ # 6) Extract top candidates for span-based sorting
674
+ candidate_features = feature_scores[:max_candidates_for_span_sorting]
675
+
676
+ # 7) Extract spans for task authors to sort by frequency
677
+ task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
678
+ task_authors_in_selection = [aid for aid in author_ids if aid in task_author_names]
679
+
680
+ if not task_authors_in_selection:
681
+ # If no task authors in selection, just return the z-score sorted features
682
+ print("[INFO] No task authors in selection, returning z-score sorted features")
683
+ return feature_scores[:top_n]
684
+
685
+ # Get task author data
686
+ task_authors_df = background_corpus_df[background_corpus_df['authorID'].isin(task_authors_in_selection)]
687
+
688
+ # Count spans for each feature across task authors
689
+ feature_span_counts = {}
690
+ for feat_shorthand, z_score in candidate_features:
691
+ span_count = 0
692
+
693
+ # Convert shorthand to human-readable for display (if needed)
694
+ # Note: features in gram2vec dict are in shorthand format like "pos_unigrams:ADJ"
695
+
696
+ for _, author_row in task_authors_df.iterrows():
697
+ author_text = author_row['fullText']
698
+ if isinstance(author_text, list):
699
+ author_text = '\n\n'.join(author_text)
700
+
701
+ try:
702
+ # find_feature_spans expects shorthand format like "pos_unigrams:ADJ"
703
+ spans = find_feature_spans(author_text, feat_shorthand)
704
+ span_count += len(spans)
705
+ except Exception as e:
706
+ # If span extraction fails, continue with 0 spans for this author
707
+ pass
708
+
709
+ feature_span_counts[feat_shorthand] = span_count
710
+
711
+ # 8) Sort features by span frequency, then by z-score as tiebreaker
712
+ sorted_by_spans = sorted(
713
+ candidate_features,
714
+ key=lambda x: (-feature_span_counts.get(x[0], 0), -x[1])
715
+ )
716
+
717
+ print(f"[INFO] Sorted gram2vec features by span frequency: {[(f, feature_span_counts.get(f, 0), z) for f, z in sorted_by_spans[:top_n]]}")
718
+
719
+ return sorted_by_spans[:top_n]
720
 
721
  def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
722