Milad Alshomary committed on
Commit
51ad242
·
1 Parent(s): 224c491
Files changed (1) hide show
  1. utils/interp_space_utils.py +32 -21
utils/interp_space_utils.py CHANGED
@@ -577,37 +577,48 @@ def compute_clusters_style_representation_3(
577
 
578
  # Filter-in only task authors that are part of the current selection
579
  task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
580
- filtered_task_authors = {author: feat_map for author, feat_map in spans_by_author.items() if author in task_author_names.intersection(set(cluster_ids))}
581
 
582
  # Build per-author sets of features that have at least one span
583
- author_present_feature_sets = [
584
- {feature for feature, spans in feature_map.items() if spans and len(spans) > 0}
585
- for _, feature_map in filtered_task_authors.items()
586
- ]
587
 
588
- print(filtered_task_authors.keys(), author_present_feature_sets)
589
 
590
 
591
- if len(author_present_feature_sets) > 0: # we have more than one task author
592
- coverage_counter = Counter()
593
- for present_set in author_present_feature_sets:
594
- coverage_counter.update(present_set)
595
 
596
- # Keep features present in at least `min_authors_required` authors
597
- eligible_features = [feat for feat, cnt in coverage_counter.items() if cnt >= len(author_present_feature_sets)]
598
 
599
- # Preserve original LLM feature ordering as a secondary key where possible
600
- feature_original_index = {feat: idx for idx, feat in enumerate(features)} if features else {}
 
 
 
 
 
 
 
601
 
602
- selected_features_ranked = sorted(
603
- eligible_features,
604
- key=lambda f: (-coverage_counter[f], feature_original_index.get(f, 10**9))
605
- )[:int(top_k)]
606
- else:
607
- selected_features_ranked = features
608
 
 
 
 
 
 
 
 
 
 
 
609
 
610
- print('filtered set of features (min coverage', len(author_present_feature_sets), '): ', selected_features_ranked)
611
 
612
  return {
613
  "features": list(selected_features_ranked),
 
577
 
578
  # Filter-in only task authors that are part of the current selection
579
  task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
580
+ #filtered_task_authors = {author: feat_map for author, feat_map in spans_by_author.items() if author in task_author_names.intersection(set(cluster_ids))}
581
 
582
  # Build per-author sets of features that have at least one span
583
+ # author_present_feature_sets = [
584
+ # {feature for feature, spans in feature_map.items() if spans and len(spans) > 0}
585
+ # for _, feature_map in filtered_task_authors.items()
586
+ # ]
587
 
588
+ # print(filtered_task_authors.keys(), author_present_feature_sets)
589
 
590
 
591
+ # if len(author_present_feature_sets) > 0: # we have more than one task author
592
+ # coverage_counter = Counter()
593
+ # for present_set in author_present_feature_sets:
594
+ # coverage_counter.update(present_set)
595
 
596
+ # # Keep features present in at least `min_authors_required` authors
597
+ # eligible_features = [feat for feat, cnt in coverage_counter.items() if cnt >= len(author_present_feature_sets)]
598
 
599
+ # # Preserve original LLM feature ordering as a secondary key where possible
600
+ # feature_original_index = {feat: idx for idx, feat in enumerate(features)} if features else {}
601
+
602
+ # selected_features_ranked = sorted(
603
+ # eligible_features,
604
+ # key=lambda f: (-coverage_counter[f], feature_original_index.get(f, 10**9))
605
+ # )[:int(top_k)]
606
+ # else:
607
+ # selected_features_ranked = features
608
 
 
 
 
 
 
 
609
 
610
+ feature_importance = {f : 0 for f in features}
611
+ for author, feature_map in spans_by_author.items():
612
+ if author in task_author_names.intersection(set(cluster_ids)):
613
+ for feature, spans in feature_map.items():
614
+ feature_importance[feature] += len(spans)
615
+ else:
616
+ for feature, spans in feature_map.items():
617
+ feature_importance[feature] -= len(spans)
618
+ print(feature_importance)
619
+ selected_features_ranked = sorted(feature_importance, key=lambda f: -feature_importance[f])[:int(top_k)]
620
 
621
+ #print('filtered set of features (min coverage', len(author_present_feature_sets), '): ', selected_features_ranked)
622
 
623
  return {
624
  "features": list(selected_features_ranked),