Commit
·
258c7f3
1
Parent(s):
dcbbcbd
added span frequency feature ranking
Browse files
- utils/gram2vec_feat_utils.py +22 -3
- utils/interp_space_utils.py +51 -2
utils/gram2vec_feat_utils.py
CHANGED
|
@@ -49,7 +49,17 @@ def get_shorthand(feature_str: str) -> str:
|
|
| 49 |
return None
|
| 50 |
if category not in FEATURE_HANDLERS:
|
| 51 |
return None
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
if code is None:
|
| 54 |
# print(f"Warning: No code found for human-readable feature '{human}'")
|
| 55 |
return None # fallback to the human-readable name
|
|
@@ -78,6 +88,14 @@ def get_fullform(shorthand: str) -> str:
|
|
| 78 |
if human is None:
|
| 79 |
return None
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
return f"{category}:{human}"
|
| 82 |
|
| 83 |
def highlight_both_spans(text, llm_spans, gram_spans):
|
|
@@ -169,8 +187,9 @@ def show_combined_spans_all(selected_feature_llm, selected_feature_g2v,
|
|
| 169 |
if selected_feature_g2v and selected_feature_g2v != "None":
|
| 170 |
# get gram2vec spans
|
| 171 |
gram_spans_list = []
|
| 172 |
-
#
|
| 173 |
-
|
|
|
|
| 174 |
print(f"Selected Gram2Vec feature: {selected_feature_g2v}")
|
| 175 |
short = get_shorthand(selected_feature_g2v)
|
| 176 |
print(f"short hand: {short}")
|
|
|
|
| 49 |
return None
|
| 50 |
if category not in FEATURE_HANDLERS:
|
| 51 |
return None
|
| 52 |
+
code_map = load_code_map()
|
| 53 |
+
code = code_map.get(human)
|
| 54 |
+
if code is None:
|
| 55 |
+
# Try normalizing terminology shown in UI
|
| 56 |
+
# Convert 'Preposition' phrasing back to 'Adposition' used in the code map
|
| 57 |
+
human_alt = (human
|
| 58 |
+
.replace("Preposition", "Adposition")
|
| 59 |
+
.replace("preposition", "adposition")
|
| 60 |
+
.replace("Prepositional", "Adpositional")
|
| 61 |
+
.replace("prepositional", "adpositional"))
|
| 62 |
+
code = code_map.get(human_alt)
|
| 63 |
if code is None:
|
| 64 |
# print(f"Warning: No code found for human-readable feature '{human}'")
|
| 65 |
return None # fallback to the human-readable name
|
|
|
|
| 88 |
if human is None:
|
| 89 |
return None
|
| 90 |
|
| 91 |
+
# Normalize terminology for UI: prefer "Preposition" over "Adposition"
|
| 92 |
+
# Also handle potential "adpositional" variants if present
|
| 93 |
+
human = (human
|
| 94 |
+
.replace("Adposition", "Preposition")
|
| 95 |
+
.replace("adposition", "preposition")
|
| 96 |
+
.replace("Adpositional", "Prepositional")
|
| 97 |
+
.replace("adpositional", "prepositional"))
|
| 98 |
+
|
| 99 |
return f"{category}:{human}"
|
| 100 |
|
| 101 |
def highlight_both_spans(text, llm_spans, gram_spans):
|
|
|
|
| 187 |
if selected_feature_g2v and selected_feature_g2v != "None":
|
| 188 |
# get gram2vec spans
|
| 189 |
gram_spans_list = []
|
| 190 |
+
# In case any old label formatting with z-scores leaks through, strip it defensively
|
| 191 |
+
if "| [Z=" in selected_feature_g2v:
|
| 192 |
+
selected_feature_g2v = selected_feature_g2v.split(" | [Z=")[0].strip()
|
| 193 |
print(f"Selected Gram2Vec feature: {selected_feature_g2v}")
|
| 194 |
short = get_shorthand(selected_feature_g2v)
|
| 195 |
print(f"short hand: {short}")
|
utils/interp_space_utils.py
CHANGED
|
@@ -17,6 +17,8 @@ from pydantic import BaseModel
|
|
| 17 |
from pydantic import ValidationError
|
| 18 |
import time
|
| 19 |
from utils.llm_feat_utils import generate_feature_spans_cached
|
|
|
|
|
|
|
| 20 |
from collections import Counter
|
| 21 |
import numpy as np
|
| 22 |
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
|
|
@@ -633,6 +635,7 @@ def compute_clusters_g2v_representation(
|
|
| 633 |
other_author_ids: List[Any],
|
| 634 |
features_clm_name: str,
|
| 635 |
top_n: int = 10,
|
|
|
|
| 636 |
) -> List[tuple]: # Changed return type to List[tuple] to include scores
|
| 637 |
|
| 638 |
# 1) Identify selected authors in the zoom region
|
|
@@ -666,8 +669,54 @@ def compute_clusters_g2v_representation(
|
|
| 666 |
# 5) Rank features by mean z-score, keep positives only
|
| 667 |
feature_scores = [(feat, float(score)) for feat, score in zip(all_features, selected_mean) if score > 0]
|
| 668 |
feature_scores.sort(key=lambda x: x[1], reverse=True)
|
| 669 |
-
|
| 670 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 671 |
|
| 672 |
def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
|
| 673 |
|
|
|
|
| 17 |
from pydantic import ValidationError
|
| 18 |
import time
|
| 19 |
from utils.llm_feat_utils import generate_feature_spans_cached
|
| 20 |
+
from utils.gram2vec_feat_utils import get_shorthand, get_fullform
|
| 21 |
+
from gram2vec.feature_locator import find_feature_spans
|
| 22 |
from collections import Counter
|
| 23 |
import numpy as np
|
| 24 |
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
|
|
|
|
| 635 |
other_author_ids: List[Any],
|
| 636 |
features_clm_name: str,
|
| 637 |
top_n: int = 10,
|
| 638 |
+
max_candidates_for_span_sorting: int = 50,
|
| 639 |
) -> List[tuple]: # Changed return type to List[tuple] to include scores
|
| 640 |
|
| 641 |
# 1) Identify selected authors in the zoom region
|
|
|
|
| 669 |
# 5) Rank features by mean z-score, keep positives only
|
| 670 |
feature_scores = [(feat, float(score)) for feat, score in zip(all_features, selected_mean) if score > 0]
|
| 671 |
feature_scores.sort(key=lambda x: x[1], reverse=True)
|
| 672 |
+
|
| 673 |
+
# 6) Extract top candidates for span-based sorting
|
| 674 |
+
candidate_features = feature_scores[:max_candidates_for_span_sorting]
|
| 675 |
+
|
| 676 |
+
# 7) Extract spans for task authors to sort by frequency
|
| 677 |
+
task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
|
| 678 |
+
task_authors_in_selection = [aid for aid in author_ids if aid in task_author_names]
|
| 679 |
+
|
| 680 |
+
if not task_authors_in_selection:
|
| 681 |
+
# If no task authors in selection, just return the z-score sorted features
|
| 682 |
+
print("[INFO] No task authors in selection, returning z-score sorted features")
|
| 683 |
+
return feature_scores[:top_n]
|
| 684 |
+
|
| 685 |
+
# Get task author data
|
| 686 |
+
task_authors_df = background_corpus_df[background_corpus_df['authorID'].isin(task_authors_in_selection)]
|
| 687 |
+
|
| 688 |
+
# Count spans for each feature across task authors
|
| 689 |
+
feature_span_counts = {}
|
| 690 |
+
for feat_shorthand, z_score in candidate_features:
|
| 691 |
+
span_count = 0
|
| 692 |
+
|
| 693 |
+
# No conversion to human-readable form is done here; the shorthand is passed to find_feature_spans as-is
|
| 694 |
+
# Note: features in gram2vec dict are in shorthand format like "pos_unigrams:ADJ"
|
| 695 |
+
|
| 696 |
+
for _, author_row in task_authors_df.iterrows():
|
| 697 |
+
author_text = author_row['fullText']
|
| 698 |
+
if isinstance(author_text, list):
|
| 699 |
+
author_text = '\n\n'.join(author_text)
|
| 700 |
+
|
| 701 |
+
try:
|
| 702 |
+
# find_feature_spans expects shorthand format like "pos_unigrams:ADJ"
|
| 703 |
+
spans = find_feature_spans(author_text, feat_shorthand)
|
| 704 |
+
span_count += len(spans)
|
| 705 |
+
except Exception as e:
|
| 706 |
+
# If span extraction fails, continue with 0 spans for this author
|
| 707 |
+
pass
|
| 708 |
+
|
| 709 |
+
feature_span_counts[feat_shorthand] = span_count
|
| 710 |
+
|
| 711 |
+
# 8) Sort features by span frequency, then by z-score as tiebreaker
|
| 712 |
+
sorted_by_spans = sorted(
|
| 713 |
+
candidate_features,
|
| 714 |
+
key=lambda x: (-feature_span_counts.get(x[0], 0), -x[1])
|
| 715 |
+
)
|
| 716 |
+
|
| 717 |
+
print(f"[INFO] Sorted gram2vec features by span frequency: {[(f, feature_span_counts.get(f, 0), z) for f, z in sorted_by_spans[:top_n]]}")
|
| 718 |
+
|
| 719 |
+
return sorted_by_spans[:top_n]
|
| 720 |
|
| 721 |
def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
|
| 722 |
|