peter-zeng committed on
Commit
8e5c429
·
1 Parent(s): a6ee680

changed filtering and selection of g2v features

Browse files
Files changed (2) hide show
  1. utils/interp_space_utils.py +26 -153
  2. utils/visualizations.py +21 -9
utils/interp_space_utils.py CHANGED
@@ -613,168 +613,41 @@ def compute_clusters_g2v_representation(
613
  other_author_ids: List[Any],
614
  features_clm_name: str,
615
  top_n: int = 10,
616
- mode: str = "contrastive",
617
- sharedness_method: str = "mean_minus_alpha_std",
618
- alpha: float = 0.5
619
  ) -> List[tuple]: # Changed return type to List[tuple] to include scores
620
 
621
-
622
  selected_mask = background_corpus_df['authorID'].isin(author_ids).to_numpy()
623
 
624
  if not selected_mask.any():
625
- return [] # No documents found for the given cluster_ids
626
-
627
- selected_feats = background_corpus_df[selected_mask][features_clm_name].tolist()
628
- all_g2v_feats = list(selected_feats[0].keys())
629
-
630
- # If the user requested a sharedness-based score, compute it and return top-N.
631
- if mode == "sharedness":
632
- selected_matrix = np.array([list(x.values()) for x in selected_feats], dtype=float)
633
-
634
- if sharedness_method == "mean":
635
- scores = selected_matrix.mean(axis=0)
636
- elif sharedness_method in ("mean_minus_alpha_std", "mean-std", "mean_minus_std"):
637
- means = selected_matrix.mean(axis=0)
638
- stds = selected_matrix.std(axis=0)
639
- scores = means - float(alpha) * stds
640
- elif sharedness_method == "min":
641
- scores = selected_matrix.min(axis=0)
642
- else:
643
- # Default fallback to mean-minus-alpha*std if unknown method
644
- means = selected_matrix.mean(axis=0)
645
- stds = selected_matrix.std(axis=0)
646
- scores = means - float(alpha) * stds
647
-
648
- # Rank and return with scores
649
- feature_scores = [(feat, score) for feat, score in zip(all_g2v_feats, scores) if score > 0]
650
- feature_scores.sort(key=lambda x: x[1], reverse=True)
651
- return feature_scores[:top_n] # Return tuples instead of just features
652
-
653
-
654
- # Contrastive mode (default): compute target mean and subtract contrast mean
655
- all_g2v_values = np.array([list(x.values()) for x in selected_feats]).mean(axis=0)
656
 
657
- # If an explicit contrast set is provided, use it; otherwise use everyone outside selection
658
- if other_author_ids:
659
- explicit_mask = background_corpus_df['authorID'].isin(other_author_ids).to_numpy()
660
- # Ensure contrast set is disjoint from the selected set
661
- contrast_mask = np.logical_and(explicit_mask, ~selected_mask)
662
- else:
663
- contrast_mask = ~selected_mask
664
 
665
- other_selected_feats = background_corpus_df[contrast_mask][features_clm_name].tolist()
666
- if len(other_selected_feats) > 0:
667
- all_g2v_other_values = np.array([list(x.values()) for x in other_selected_feats]).mean(axis=0)
668
- else:
669
- # No contrast docs β†’ treat contrast mean as zeros
670
- all_g2v_other_values = np.zeros_like(all_g2v_values)
671
 
672
- final_g2v_feats_values = all_g2v_values - all_g2v_other_values
 
 
673
 
674
- # Compute z-scores for normalization
675
- # Get population statistics from all features (both selected and contrast)
676
- all_feats = background_corpus_df[features_clm_name].tolist()
677
- population_matrix = np.array([list(x.values()) for x in all_feats])
678
- population_mean = population_matrix.mean(axis=0)
679
- population_std = population_matrix.std(axis=0)
680
-
681
- # Avoid division by zero
682
- population_std = np.where(population_std == 0, 1, population_std)
683
-
684
- # Calculate z-scores for the contrastive values
685
- z_scores = (final_g2v_feats_values - population_mean) / population_std
686
-
687
- # Keep only features that have a positive contrastive score
688
- top_g2v_feats = sorted(
689
- [(feat, val, z_score) for feat, val, z_score in zip(all_g2v_feats, final_g2v_feats_values, z_scores) if val > 0],
690
- key=lambda x: -x[1] # Sort by contrastive score
691
- )
692
-
693
- # Filter in only features that are present in selected_authors
694
- selected_authors = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(author_ids))
695
-
696
- # DEBUG: Print what we're actually working with
697
- print(f"[DEBUG] author_ids parameter: {author_ids}")
698
- print(f"[DEBUG] Hardcoded selected_authors set: {{'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}}")
699
- print(f"[DEBUG] Intersection result: {selected_authors}")
700
- print(f"[DEBUG] Is selected_authors empty? {len(selected_authors) == 0}")
701
-
702
- # Filter in only features that are present in selected_authors
703
- selected_authors_g2v_data = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
704
-
705
- # print(f"[DEBUG] selected_authors_g2v_data length: {len(selected_authors_g2v_data)}")
706
- # print(f"[DEBUG] selected_authors_g2v_data content: {selected_authors_g2v_data}")
707
-
708
- # Get the actual text documents for the selected authors to verify feature presence
709
- selected_authors_docs = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)]['fullText'].tolist()
710
- print(f"[DEBUG] Found {len(selected_authors_docs)} documents for selected authors")
711
-
712
- # Import find_feature_spans for text-based feature verification
713
- try:
714
- from gram2vec.feature_locator import find_feature_spans
715
- print("[DEBUG] Successfully imported find_feature_spans")
716
- except ImportError:
717
- print("[WARNING] Could not import find_feature_spans, falling back to vector-based filtering")
718
- find_feature_spans = None
719
-
720
- filtered_features = []
721
- for feature, score, z_score in top_g2v_feats:
722
- # DEBUG: Print what we're checking for this feature
723
- # print(f"[DEBUG] Checking feature: {feature}")
724
- # print(f"[DEBUG] Feature score: {score}, z_score: {z_score}")
725
-
726
- # Check if the feature has a non-zero value in all of the selected authors
727
- feature_presence = []
728
- for i, author_g2v_feats in enumerate(selected_authors_g2v_data):
729
- feature_value = author_g2v_feats.get(feature, 0)
730
- feature_presence.append(feature_value)
731
- # print(f"[DEBUG] Author {i} has feature '{feature}' = {feature_value}")
732
-
733
- # print(f"[DEBUG] All feature values: {feature_presence}")
734
- # print(f"[DEBUG] All values > 0? {[v > 0 for v in feature_presence]}")
735
- # print(f"[DEBUG] All values > 0? {all(v > 0 for v in feature_presence)}")
736
-
737
- # First check: feature must be present in Gram2Vec vectors
738
- vector_present = all(author_g2v_feats.get(feature, 0) > 0 for author_g2v_feats in selected_authors_g2v_data)
739
-
740
- # Second check: feature must be present in actual text documents
741
- text_present = True
742
- if find_feature_spans and selected_authors_docs:
743
- try:
744
- # Check if feature appears in at least one document from each selected author
745
- for i, doc in enumerate(selected_authors_docs):
746
- if isinstance(doc, list):
747
- doc_text = '\n\n'.join(doc)
748
- else:
749
- doc_text = str(doc)
750
-
751
- spans = find_feature_spans(doc_text, feature)
752
- if not spans: # No spans found in this document
753
- # print(f"[DEBUG] βœ— Feature '{feature}' not found in document {i} of selected author")
754
- text_present = False
755
- break
756
- # else:
757
- # print(f"[DEBUG] βœ“ Feature '{feature}' found in document {i} with {len(spans)} spans")
758
- except Exception as e:
759
- print(f"[WARNING] Error checking text presence for feature '{feature}': {e}")
760
- # Fall back to vector-based filtering if text checking fails
761
- text_present = vector_present
762
-
763
- # Feature must pass BOTH checks
764
- if vector_present and text_present:
765
- filtered_features.append((feature, score, z_score))
766
- # print(f"[DEBUG] βœ“ Feature '{feature}' PASSED both vector and text checks")
767
- # else:
768
- # if not vector_present:
769
- # # print(f"[DEBUG] βœ— Feature '{feature}' FAILED vector check")
770
- # if not text_present:
771
- # # print(f"[DEBUG] βœ— Feature '{feature}' FAILED text check")
772
- # # print(f"[DEBUG] βœ— Feature '{feature}' FAILED the filter")
773
-
774
-
775
- print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features]) # Print feature names and z-scores
776
-
777
- return filtered_features[:top_n] # Return tuples with z-scores
778
 
779
  def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
780
 
 
613
  other_author_ids: List[Any],
614
  features_clm_name: str,
615
  top_n: int = 10,
 
 
 
616
  ) -> List[tuple]: # Changed return type to List[tuple] to include scores
617
 
618
+ # 1) Identify selected authors in the zoom region
619
  selected_mask = background_corpus_df['authorID'].isin(author_ids).to_numpy()
620
 
621
  if not selected_mask.any():
622
+ return [] # No authors found for the given author_ids
623
+
624
+ # 2) Build a population matrix of all authors' Gram2Vec features
625
+ # Expect each row in features_clm_name to be a dict {feature_name: value}
626
+ all_feature_dicts = background_corpus_df[features_clm_name].tolist()
627
+ if not all_feature_dicts:
628
+ return []
629
+
630
+ # Use the first row to get consistent feature ordering
631
+ all_features = list(all_feature_dicts[0].keys())
632
+ population_matrix = np.array(
633
+ [[feat_dict.get(feat, 0.0) for feat in all_features] for feat_dict in all_feature_dicts],
634
+ dtype=float
635
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
636
 
637
+ # 3) Z-normalize columnwise across the entire corpus
638
+ col_means = population_matrix.mean(axis=0)
639
+ col_stds = population_matrix.std(axis=0)
640
+ col_stds[col_stds == 0] = 1.0
641
+ z_population = (population_matrix - col_means) / col_stds
 
 
642
 
643
+ # 4) Take the mean across the selected authors (zoom region)
644
+ selected_mean = z_population[selected_mask].mean(axis=0)
 
 
 
 
645
 
646
+ # 5) Rank features by mean z-score, keep positives only
647
+ feature_scores = [(feat, float(score)) for feat, score in zip(all_features, selected_mean) if score > 0]
648
+ feature_scores.sort(key=lambda x: x[1], reverse=True)
649
 
650
+ return feature_scores[:top_n]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651
 
652
  def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
653
 
utils/visualizations.py CHANGED
@@ -290,32 +290,44 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
290
  background_corpus_df=merged_authors_df,
291
  author_ids=visible_authors,
292
  other_author_ids=[],
293
- features_clm_name='g2v_vector'
 
294
  )
295
 
296
  # ── Span-existence filter on task authors in the zoom ───────────────────
297
- # Keep only features that have at least one detected span in any of the
298
- # visible task authors' texts
299
- visible_task_authors = task_authors_df[task_authors_df['authorID'].isin(visible_authors)]
300
- if visible_task_authors.empty:
301
- visible_task_authors = task_authors_df
 
 
302
 
303
  def _to_text(x):
304
  return '\n\n =========== \n\n'.join(x) if isinstance(x, list) else x
305
 
306
- task_texts = [_to_text(x) for x in visible_task_authors['fullText'].tolist()]
307
 
 
308
  filtered_g2v_feats = []
309
  for feat in g2v_feats:
310
  try:
311
  # `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
312
- if any(find_feature_spans(txt, feat[0]) for txt in task_texts):
 
 
 
 
 
313
  filtered_g2v_feats.append(feat)
314
  else:
315
- print(f"[INFO] Dropping G2V feature with no spans in task texts: {feat}")
316
  except Exception as e:
317
  print(f"[WARN] Error while checking spans for {feat}: {e}")
318
 
 
 
 
319
  # Convert to human readable for display
320
  HR_g2v_list = []
321
  for feat in filtered_g2v_feats:
 
290
  background_corpus_df=merged_authors_df,
291
  author_ids=visible_authors,
292
  other_author_ids=[],
293
+ features_clm_name='g2v_vector',
294
+ top_n=50
295
  )
296
 
297
  # ── Span-existence filter on task authors in the zoom ───────────────────
298
+ # Keep only features that have detected spans in at least 2 of the
299
+ # task authors' texts (Mystery + Candidates 1-3)
300
+ # Use only the task authors (Mystery + Candidates 1-3), not the zoom-visible set
301
+ task_author_ids = {"Mystery author", "Candidate Author 1", "Candidate Author 2", "Candidate Author 3"}
302
+ task_only_df = task_authors_df[task_authors_df['authorID'].isin(task_author_ids)]
303
+ if task_only_df.empty:
304
+ task_only_df = task_authors_df
305
 
306
  def _to_text(x):
307
  return '\n\n =========== \n\n'.join(x) if isinstance(x, list) else x
308
 
309
+ task_texts = [_to_text(x) for x in task_only_df['fullText'].tolist()]
310
 
311
+ print(f"task_texts: {task_texts}")
312
  filtered_g2v_feats = []
313
  for feat in g2v_feats:
314
  try:
315
  # `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
316
+ occurrences = 0
317
+ for txt in task_texts:
318
+ spans = find_feature_spans(txt, feat[0])
319
+ if spans:
320
+ occurrences += 1
321
+ if occurrences >= 2:
322
  filtered_g2v_feats.append(feat)
323
  else:
324
+ print(f"[INFO] Dropping G2V feature with <2 task-author spans: {feat}")
325
  except Exception as e:
326
  print(f"[WARN] Error while checking spans for {feat}: {e}")
327
 
328
+ # After filtering by spans, keep top-N by score
329
+ filtered_g2v_feats = filtered_g2v_feats[:10]
330
+
331
  # Convert to human readable for display
332
  HR_g2v_list = []
333
  for feat in filtered_g2v_feats: