Anisha Bhatnagar
committed on
Commit
·
8367823
1
Parent(s):
6fc987a
showing g2v with z scores
Browse files- app.py +7 -0
- utils/interp_space_utils.py +21 -9
- utils/visualizations.py +46 -4
app.py
CHANGED
|
@@ -424,6 +424,13 @@ def app(share=False, use_cluster_feats=False):
|
|
| 424 |
">
|
| 425 |
Gram2Vec Features prominent in the zoomed-in region
|
| 426 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
""")
|
| 428 |
gram2vec_rb = gr.Radio(choices=[], label="Gram2Vec features for this zoomed-in region")#, label="Top-10 Gram2Vec Features most likely to occur in Mystery Author", info="Most prominent Gram2Vec features in the mystery text")
|
| 429 |
gram2vec_state = gr.State()
|
|
|
|
| 424 |
">
|
| 425 |
Gram2Vec Features prominent in the zoomed-in region
|
| 426 |
</div>
|
| 427 |
+
<div style="
|
| 428 |
+
font-size: 0.9em;
|
| 429 |
+
color: #666;
|
| 430 |
+
margin-bottom: 1em;
|
| 431 |
+
">
|
| 432 |
+
Features shown with normalized z-scores
|
| 433 |
+
</div>
|
| 434 |
""")
|
| 435 |
gram2vec_rb = gr.Radio(choices=[], label="Gram2Vec features for this zoomed-in region")#, label="Top-10 Gram2Vec Features most likely to occur in Mystery Author", info="Most prominent Gram2Vec features in the mystery text")
|
| 436 |
gram2vec_state = gr.State()
|
utils/interp_space_utils.py
CHANGED
|
@@ -571,7 +571,7 @@ def compute_clusters_g2v_representation(
|
|
| 571 |
mode: str = "contrastive",
|
| 572 |
sharedness_method: str = "mean_minus_alpha_std",
|
| 573 |
alpha: float = 0.5
|
| 574 |
-
) -> List[
|
| 575 |
|
| 576 |
|
| 577 |
selected_mask = background_corpus_df['authorID'].isin(author_ids).to_numpy()
|
|
@@ -600,10 +600,10 @@ def compute_clusters_g2v_representation(
|
|
| 600 |
stds = selected_matrix.std(axis=0)
|
| 601 |
scores = means - float(alpha) * stds
|
| 602 |
|
| 603 |
-
# Rank and return
|
| 604 |
feature_scores = [(feat, score) for feat, score in zip(all_g2v_feats, scores) if score > 0]
|
| 605 |
feature_scores.sort(key=lambda x: x[1], reverse=True)
|
| 606 |
-
return [
|
| 607 |
|
| 608 |
|
| 609 |
# Contrastive mode (default): compute target mean and subtract contrast mean
|
|
@@ -626,11 +626,23 @@ def compute_clusters_g2v_representation(
|
|
| 626 |
|
| 627 |
final_g2v_feats_values = all_g2v_values - all_g2v_other_values
|
| 628 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 629 |
|
| 630 |
# Keep only features that have a positive contrastive score
|
| 631 |
top_g2v_feats = sorted(
|
| 632 |
-
[(feat, val) for feat, val in zip(all_g2v_feats, final_g2v_feats_values) if val > 0],
|
| 633 |
-
key=lambda x: -x[1]
|
| 634 |
)
|
| 635 |
|
| 636 |
# Filter out features that are not present in any of the authors
|
|
@@ -638,18 +650,18 @@ def compute_clusters_g2v_representation(
|
|
| 638 |
print('Filtering in g2v features for only the following authors: ', selected_authors)
|
| 639 |
authors_g2v_feats = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
|
| 640 |
filtered_features = []
|
| 641 |
-
for feature, score in top_g2v_feats:
|
| 642 |
found_in_any_author = False
|
| 643 |
for author_g2v_feats in authors_g2v_feats:
|
| 644 |
if author_g2v_feats[feature] > 0:
|
| 645 |
found_in_any_author = True
|
| 646 |
break
|
| 647 |
if found_in_any_author:
|
| 648 |
-
filtered_features.append(feature)
|
| 649 |
|
| 650 |
-
print('Filtered G2V features: ', filtered_features)
|
| 651 |
|
| 652 |
-
return filtered_features[:top_n]
|
| 653 |
|
| 654 |
def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
|
| 655 |
|
|
|
|
| 571 |
mode: str = "contrastive",
|
| 572 |
sharedness_method: str = "mean_minus_alpha_std",
|
| 573 |
alpha: float = 0.5
|
| 574 |
+
) -> List[tuple]: # Changed return type to List[tuple] to include scores
|
| 575 |
|
| 576 |
|
| 577 |
selected_mask = background_corpus_df['authorID'].isin(author_ids).to_numpy()
|
|
|
|
| 600 |
stds = selected_matrix.std(axis=0)
|
| 601 |
scores = means - float(alpha) * stds
|
| 602 |
|
| 603 |
+
# Rank and return with scores
|
| 604 |
feature_scores = [(feat, score) for feat, score in zip(all_g2v_feats, scores) if score > 0]
|
| 605 |
feature_scores.sort(key=lambda x: x[1], reverse=True)
|
| 606 |
+
return feature_scores[:top_n] # Return tuples instead of just features
|
| 607 |
|
| 608 |
|
| 609 |
# Contrastive mode (default): compute target mean and subtract contrast mean
|
|
|
|
| 626 |
|
| 627 |
final_g2v_feats_values = all_g2v_values - all_g2v_other_values
|
| 628 |
|
| 629 |
+
# Compute z-scores for normalization
|
| 630 |
+
# Get population statistics from all features (both selected and contrast)
|
| 631 |
+
all_feats = background_corpus_df[features_clm_name].tolist()
|
| 632 |
+
population_matrix = np.array([list(x.values()) for x in all_feats])
|
| 633 |
+
population_mean = population_matrix.mean(axis=0)
|
| 634 |
+
population_std = population_matrix.std(axis=0)
|
| 635 |
+
|
| 636 |
+
# Avoid division by zero
|
| 637 |
+
population_std = np.where(population_std == 0, 1, population_std)
|
| 638 |
+
|
| 639 |
+
# Calculate z-scores for the contrastive values
|
| 640 |
+
z_scores = (final_g2v_feats_values - population_mean) / population_std
|
| 641 |
|
| 642 |
# Keep only features that have a positive contrastive score
|
| 643 |
top_g2v_feats = sorted(
|
| 644 |
+
[(feat, val, z_score) for feat, val, z_score in zip(all_g2v_feats, final_g2v_feats_values, z_scores) if val > 0],
|
| 645 |
+
key=lambda x: -x[1] # Sort by contrastive score
|
| 646 |
)
|
| 647 |
|
| 648 |
# Filter out features that are not present in any of the authors
|
|
|
|
| 650 |
print('Filtering in g2v features for only the following authors: ', selected_authors)
|
| 651 |
authors_g2v_feats = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
|
| 652 |
filtered_features = []
|
| 653 |
+
for feature, score, z_score in top_g2v_feats:
|
| 654 |
found_in_any_author = False
|
| 655 |
for author_g2v_feats in authors_g2v_feats:
|
| 656 |
if author_g2v_feats[feature] > 0:
|
| 657 |
found_in_any_author = True
|
| 658 |
break
|
| 659 |
if found_in_any_author:
|
| 660 |
+
filtered_features.append((feature, score, z_score))
|
| 661 |
|
| 662 |
+
print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features]) # Print feature names and z-scores
|
| 663 |
|
| 664 |
+
return filtered_features[:top_n] # Return tuples with z-scores
|
| 665 |
|
| 666 |
def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
|
| 667 |
|
utils/visualizations.py
CHANGED
|
@@ -194,6 +194,47 @@ def load_interp_space(cfg):
|
|
| 194 |
|
| 195 |
}
|
| 196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
#function to handle zoom events
|
| 198 |
def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors_df):
|
| 199 |
"""
|
|
@@ -268,7 +309,7 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
|
|
| 268 |
for feat in g2v_feats:
|
| 269 |
try:
|
| 270 |
# `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
|
| 271 |
-
if any(find_feature_spans(txt, feat) for txt in task_texts):
|
| 272 |
filtered_g2v_feats.append(feat)
|
| 273 |
else:
|
| 274 |
print(f"[INFO] Dropping G2V feature with no spans in task texts: {feat}")
|
|
@@ -278,19 +319,20 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
|
|
| 278 |
# Convert to human readable for display
|
| 279 |
HR_g2v_list = []
|
| 280 |
for feat in filtered_g2v_feats:
|
| 281 |
-
HR_g2v = get_fullform(feat)
|
| 282 |
print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
|
| 283 |
if HR_g2v is None:
|
| 284 |
print(f"Skipping Gram2Vec feature without human readable form: {feat}")
|
| 285 |
else:
|
| 286 |
-
HR_g2v_list.append(HR_g2v)
|
| 287 |
|
| 288 |
-
HR_g2v_list = ["None"] + HR_g2v_list
|
| 289 |
|
| 290 |
print(f"[INFO] Found {len(llm_feats)} LLM features and {len(g2v_feats)} Gram2Vec features in the zoomed region.")
|
| 291 |
print(f"[INFO] unfiltered g2v features: {g2v_feats}")
|
| 292 |
|
| 293 |
print(f"[INFO] LLM features: {llm_feats}")
|
|
|
|
| 294 |
print(f"[INFO] Gram2Vec features: {HR_g2v_list}")
|
| 295 |
|
| 296 |
return (
|
|
|
|
| 194 |
|
| 195 |
}
|
| 196 |
|
| 197 |
+
# Function to process G2V features and create display choices
|
| 198 |
+
def format_g2v_features_for_display(g2v_features_with_scores):
    """
    Convert G2V features with z-scores into display format for Gradio radio buttons.

    Args:
        g2v_features_with_scores: Iterable of ``(feature_name, z_score)`` pairs
            (tuples or lists), e.g.
            [('None', None), ('Feature Name', z_score), ...]
            Anything that is not a 2-element tuple/list is passed through as
            ``str(item)``.

    Returns:
        tuple: (display_choices, original_values) -- two parallel lists.
            ``display_choices`` holds the label strings
            (``"<feature> | Z=<score to 2 decimals>"`` or ``"None"``) and
            ``original_values`` holds the matching bare feature names.
    """
    display_choices = []
    original_values = []

    for item in g2v_features_with_scores:
        # Only genuine 2-element sequences are (name, score) pairs. A bare
        # len() check would also match 2-character strings or 2-key dicts and
        # then fail while unpacking/converting, and would raise on items that
        # have no length at all.
        if not (isinstance(item, (tuple, list)) and len(item) == 2):
            fallback = str(item)
            display_choices.append(fallback)
            original_values.append(fallback)
            continue

        feature_name, z_score = item

        # The "None" placeholder (or any entry lacking a score) is shown as-is.
        if feature_name == "None" or z_score is None:
            display_choices.append("None")
            original_values.append("None")
            continue

        # Objects exposing .item() (e.g. numpy scalars) are unwrapped to a
        # plain Python number before formatting.
        if hasattr(z_score, "item"):
            z_value = float(z_score.item())
        else:
            z_value = float(z_score)

        # No trailing bracket: the previous format string emitted an
        # unbalanced "]" at the end of every label.
        display_choices.append(f"{feature_name} | Z={z_value:.2f}")
        original_values.append(feature_name)

    return display_choices, original_values
|
| 237 |
+
|
| 238 |
#function to handle zoom events
|
| 239 |
def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors_df):
|
| 240 |
"""
|
|
|
|
| 309 |
for feat in g2v_feats:
|
| 310 |
try:
|
| 311 |
# `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
|
| 312 |
+
if any(find_feature_spans(txt, feat[0]) for txt in task_texts):
|
| 313 |
filtered_g2v_feats.append(feat)
|
| 314 |
else:
|
| 315 |
print(f"[INFO] Dropping G2V feature with no spans in task texts: {feat}")
|
|
|
|
| 319 |
# Convert to human readable for display
|
| 320 |
HR_g2v_list = []
|
| 321 |
for feat in filtered_g2v_feats:
|
| 322 |
+
HR_g2v = get_fullform(feat[0])
|
| 323 |
print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
|
| 324 |
if HR_g2v is None:
|
| 325 |
print(f"Skipping Gram2Vec feature without human readable form: {feat}")
|
| 326 |
else:
|
| 327 |
+
HR_g2v_list.append((HR_g2v, feat[1])) #get the score
|
| 328 |
|
| 329 |
+
HR_g2v_list = [("None", None)] + HR_g2v_list
|
| 330 |
|
| 331 |
print(f"[INFO] Found {len(llm_feats)} LLM features and {len(g2v_feats)} Gram2Vec features in the zoomed region.")
|
| 332 |
print(f"[INFO] unfiltered g2v features: {g2v_feats}")
|
| 333 |
|
| 334 |
print(f"[INFO] LLM features: {llm_feats}")
|
| 335 |
+
HR_g2v_list, _ = format_g2v_features_for_display(HR_g2v_list)
|
| 336 |
print(f"[INFO] Gram2Vec features: {HR_g2v_list}")
|
| 337 |
|
| 338 |
return (
|