Milad Alshomary
committed on
Commit
·
51ad242
1
Parent(s):
224c491
updates
Browse files
- utils/interp_space_utils.py +32 -21
utils/interp_space_utils.py
CHANGED
|
@@ -577,37 +577,48 @@ def compute_clusters_style_representation_3(
|
|
| 577 |
|
| 578 |
# Filter-in only task authors that are part of the current selection
|
| 579 |
task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
|
| 580 |
-
filtered_task_authors = {author: feat_map for author, feat_map in spans_by_author.items() if author in task_author_names.intersection(set(cluster_ids))}
|
| 581 |
|
| 582 |
# Build per-author sets of features that have at least one span
|
| 583 |
-
author_present_feature_sets = [
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
]
|
| 587 |
|
| 588 |
-
print(filtered_task_authors.keys(), author_present_feature_sets)
|
| 589 |
|
| 590 |
|
| 591 |
-
if len(author_present_feature_sets) > 0: # we have more than one task author
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
|
| 596 |
-
|
| 597 |
-
|
| 598 |
|
| 599 |
-
|
| 600 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
|
| 602 |
-
selected_features_ranked = sorted(
|
| 603 |
-
eligible_features,
|
| 604 |
-
key=lambda f: (-coverage_counter[f], feature_original_index.get(f, 10**9))
|
| 605 |
-
)[:int(top_k)]
|
| 606 |
-
else:
|
| 607 |
-
selected_features_ranked = features
|
| 608 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 609 |
|
| 610 |
-
print('filtered set of features (min coverage', len(author_present_feature_sets), '): ', selected_features_ranked)
|
| 611 |
|
| 612 |
return {
|
| 613 |
"features": list(selected_features_ranked),
|
|
|
|
| 577 |
|
| 578 |
# Filter-in only task authors that are part of the current selection
|
| 579 |
task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
|
| 580 |
+
#filtered_task_authors = {author: feat_map for author, feat_map in spans_by_author.items() if author in task_author_names.intersection(set(cluster_ids))}
|
| 581 |
|
| 582 |
# Build per-author sets of features that have at least one span
|
| 583 |
+
# author_present_feature_sets = [
|
| 584 |
+
# {feature for feature, spans in feature_map.items() if spans and len(spans) > 0}
|
| 585 |
+
# for _, feature_map in filtered_task_authors.items()
|
| 586 |
+
# ]
|
| 587 |
|
| 588 |
+
# print(filtered_task_authors.keys(), author_present_feature_sets)
|
| 589 |
|
| 590 |
|
| 591 |
+
# if len(author_present_feature_sets) > 0: # we have more than one task author
|
| 592 |
+
# coverage_counter = Counter()
|
| 593 |
+
# for present_set in author_present_feature_sets:
|
| 594 |
+
# coverage_counter.update(present_set)
|
| 595 |
|
| 596 |
+
# # Keep features present in at least `min_authors_required` authors
|
| 597 |
+
# eligible_features = [feat for feat, cnt in coverage_counter.items() if cnt >= len(author_present_feature_sets)]
|
| 598 |
|
| 599 |
+
# # Preserve original LLM feature ordering as a secondary key where possible
|
| 600 |
+
# feature_original_index = {feat: idx for idx, feat in enumerate(features)} if features else {}
|
| 601 |
+
|
| 602 |
+
# selected_features_ranked = sorted(
|
| 603 |
+
# eligible_features,
|
| 604 |
+
# key=lambda f: (-coverage_counter[f], feature_original_index.get(f, 10**9))
|
| 605 |
+
# )[:int(top_k)]
|
| 606 |
+
# else:
|
| 607 |
+
# selected_features_ranked = features
|
| 608 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 609 |
|
| 610 |
+
feature_importance = {f : 0 for f in features}
|
| 611 |
+
for author, feature_map in spans_by_author.items():
|
| 612 |
+
if author in task_author_names.intersection(set(cluster_ids)):
|
| 613 |
+
for feature, spans in feature_map.items():
|
| 614 |
+
feature_importance[feature] += len(spans)
|
| 615 |
+
else:
|
| 616 |
+
for feature, spans in feature_map.items():
|
| 617 |
+
feature_importance[feature] -= len(spans)
|
| 618 |
+
print(feature_importance)
|
| 619 |
+
selected_features_ranked = sorted(feature_importance, key=lambda f: -feature_importance[f])[:int(top_k)]
|
| 620 |
|
| 621 |
+
#print('filtered set of features (min coverage', len(author_present_feature_sets), '): ', selected_features_ranked)
|
| 622 |
|
| 623 |
return {
|
| 624 |
"features": list(selected_features_ranked),
|