Commit
·
8e5c429
1
Parent(s):
a6ee680
changed filtering and selection of g2v features
Browse files
- utils/interp_space_utils.py +26 -153
- utils/visualizations.py +21 -9
utils/interp_space_utils.py
CHANGED
|
@@ -613,168 +613,41 @@ def compute_clusters_g2v_representation(
|
|
| 613 |
other_author_ids: List[Any],
|
| 614 |
features_clm_name: str,
|
| 615 |
top_n: int = 10,
|
| 616 |
-
mode: str = "contrastive",
|
| 617 |
-
sharedness_method: str = "mean_minus_alpha_std",
|
| 618 |
-
alpha: float = 0.5
|
| 619 |
) -> List[tuple]: # Changed return type to List[tuple] to include scores
|
| 620 |
|
| 621 |
-
|
| 622 |
selected_mask = background_corpus_df['authorID'].isin(author_ids).to_numpy()
|
| 623 |
|
| 624 |
if not selected_mask.any():
|
| 625 |
-
return []
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
scores = means - float(alpha) * stds
|
| 640 |
-
elif sharedness_method == "min":
|
| 641 |
-
scores = selected_matrix.min(axis=0)
|
| 642 |
-
else:
|
| 643 |
-
# Default fallback to mean-minus-alpha*std if unknown method
|
| 644 |
-
means = selected_matrix.mean(axis=0)
|
| 645 |
-
stds = selected_matrix.std(axis=0)
|
| 646 |
-
scores = means - float(alpha) * stds
|
| 647 |
-
|
| 648 |
-
# Rank and return with scores
|
| 649 |
-
feature_scores = [(feat, score) for feat, score in zip(all_g2v_feats, scores) if score > 0]
|
| 650 |
-
feature_scores.sort(key=lambda x: x[1], reverse=True)
|
| 651 |
-
return feature_scores[:top_n] # Return tuples instead of just features
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
# Contrastive mode (default): compute target mean and subtract contrast mean
|
| 655 |
-
all_g2v_values = np.array([list(x.values()) for x in selected_feats]).mean(axis=0)
|
| 656 |
|
| 657 |
-
#
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
else:
|
| 663 |
-
contrast_mask = ~selected_mask
|
| 664 |
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
all_g2v_other_values = np.array([list(x.values()) for x in other_selected_feats]).mean(axis=0)
|
| 668 |
-
else:
|
| 669 |
-
# No contrast docs — treat contrast mean as zeros
|
| 670 |
-
all_g2v_other_values = np.zeros_like(all_g2v_values)
|
| 671 |
|
| 672 |
-
|
|
|
|
|
|
|
| 673 |
|
| 674 |
-
|
| 675 |
-
# Get population statistics from all features (both selected and contrast)
|
| 676 |
-
all_feats = background_corpus_df[features_clm_name].tolist()
|
| 677 |
-
population_matrix = np.array([list(x.values()) for x in all_feats])
|
| 678 |
-
population_mean = population_matrix.mean(axis=0)
|
| 679 |
-
population_std = population_matrix.std(axis=0)
|
| 680 |
-
|
| 681 |
-
# Avoid division by zero
|
| 682 |
-
population_std = np.where(population_std == 0, 1, population_std)
|
| 683 |
-
|
| 684 |
-
# Calculate z-scores for the contrastive values
|
| 685 |
-
z_scores = (final_g2v_feats_values - population_mean) / population_std
|
| 686 |
-
|
| 687 |
-
# Keep only features that have a positive contrastive score
|
| 688 |
-
top_g2v_feats = sorted(
|
| 689 |
-
[(feat, val, z_score) for feat, val, z_score in zip(all_g2v_feats, final_g2v_feats_values, z_scores) if val > 0],
|
| 690 |
-
key=lambda x: -x[1] # Sort by contrastive score
|
| 691 |
-
)
|
| 692 |
-
|
| 693 |
-
# Filter in only features that are present in selected_authors
|
| 694 |
-
selected_authors = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(author_ids))
|
| 695 |
-
|
| 696 |
-
# DEBUG: Print what we're actually working with
|
| 697 |
-
print(f"[DEBUG] author_ids parameter: {author_ids}")
|
| 698 |
-
print(f"[DEBUG] Hardcoded selected_authors set: {{'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}}")
|
| 699 |
-
print(f"[DEBUG] Intersection result: {selected_authors}")
|
| 700 |
-
print(f"[DEBUG] Is selected_authors empty? {len(selected_authors) == 0}")
|
| 701 |
-
|
| 702 |
-
# Filter in only features that are present in selected_authors
|
| 703 |
-
selected_authors_g2v_data = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
|
| 704 |
-
|
| 705 |
-
# print(f"[DEBUG] selected_authors_g2v_data length: {len(selected_authors_g2v_data)}")
|
| 706 |
-
# print(f"[DEBUG] selected_authors_g2v_data content: {selected_authors_g2v_data}")
|
| 707 |
-
|
| 708 |
-
# Get the actual text documents for the selected authors to verify feature presence
|
| 709 |
-
selected_authors_docs = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)]['fullText'].tolist()
|
| 710 |
-
print(f"[DEBUG] Found {len(selected_authors_docs)} documents for selected authors")
|
| 711 |
-
|
| 712 |
-
# Import find_feature_spans for text-based feature verification
|
| 713 |
-
try:
|
| 714 |
-
from gram2vec.feature_locator import find_feature_spans
|
| 715 |
-
print("[DEBUG] Successfully imported find_feature_spans")
|
| 716 |
-
except ImportError:
|
| 717 |
-
print("[WARNING] Could not import find_feature_spans, falling back to vector-based filtering")
|
| 718 |
-
find_feature_spans = None
|
| 719 |
-
|
| 720 |
-
filtered_features = []
|
| 721 |
-
for feature, score, z_score in top_g2v_feats:
|
| 722 |
-
# DEBUG: Print what we're checking for this feature
|
| 723 |
-
# print(f"[DEBUG] Checking feature: {feature}")
|
| 724 |
-
# print(f"[DEBUG] Feature score: {score}, z_score: {z_score}")
|
| 725 |
-
|
| 726 |
-
# Check if the feature has a non-zero value in all of the selected authors
|
| 727 |
-
feature_presence = []
|
| 728 |
-
for i, author_g2v_feats in enumerate(selected_authors_g2v_data):
|
| 729 |
-
feature_value = author_g2v_feats.get(feature, 0)
|
| 730 |
-
feature_presence.append(feature_value)
|
| 731 |
-
# print(f"[DEBUG] Author {i} has feature '{feature}' = {feature_value}")
|
| 732 |
-
|
| 733 |
-
# print(f"[DEBUG] All feature values: {feature_presence}")
|
| 734 |
-
# print(f"[DEBUG] All values > 0? {[v > 0 for v in feature_presence]}")
|
| 735 |
-
# print(f"[DEBUG] All values > 0? {all(v > 0 for v in feature_presence)}")
|
| 736 |
-
|
| 737 |
-
# First check: feature must be present in Gram2Vec vectors
|
| 738 |
-
vector_present = all(author_g2v_feats.get(feature, 0) > 0 for author_g2v_feats in selected_authors_g2v_data)
|
| 739 |
-
|
| 740 |
-
# Second check: feature must be present in actual text documents
|
| 741 |
-
text_present = True
|
| 742 |
-
if find_feature_spans and selected_authors_docs:
|
| 743 |
-
try:
|
| 744 |
-
# Check if feature appears in at least one document from each selected author
|
| 745 |
-
for i, doc in enumerate(selected_authors_docs):
|
| 746 |
-
if isinstance(doc, list):
|
| 747 |
-
doc_text = '\n\n'.join(doc)
|
| 748 |
-
else:
|
| 749 |
-
doc_text = str(doc)
|
| 750 |
-
|
| 751 |
-
spans = find_feature_spans(doc_text, feature)
|
| 752 |
-
if not spans: # No spans found in this document
|
| 753 |
-
# print(f"[DEBUG] ✗ Feature '{feature}' not found in document {i} of selected author")
|
| 754 |
-
text_present = False
|
| 755 |
-
break
|
| 756 |
-
# else:
|
| 757 |
-
# print(f"[DEBUG] ✓ Feature '{feature}' found in document {i} with {len(spans)} spans")
|
| 758 |
-
except Exception as e:
|
| 759 |
-
print(f"[WARNING] Error checking text presence for feature '{feature}': {e}")
|
| 760 |
-
# Fall back to vector-based filtering if text checking fails
|
| 761 |
-
text_present = vector_present
|
| 762 |
-
|
| 763 |
-
# Feature must pass BOTH checks
|
| 764 |
-
if vector_present and text_present:
|
| 765 |
-
filtered_features.append((feature, score, z_score))
|
| 766 |
-
# print(f"[DEBUG] ✓ Feature '{feature}' PASSED both vector and text checks")
|
| 767 |
-
# else:
|
| 768 |
-
# if not vector_present:
|
| 769 |
-
# # print(f"[DEBUG] ✗ Feature '{feature}' FAILED vector check")
|
| 770 |
-
# if not text_present:
|
| 771 |
-
# # print(f"[DEBUG] ✗ Feature '{feature}' FAILED text check")
|
| 772 |
-
# # print(f"[DEBUG] ✗ Feature '{feature}' FAILED the filter")
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features]) # Print feature names and z-scores
|
| 776 |
-
|
| 777 |
-
return filtered_features[:top_n] # Return tuples with z-scores
|
| 778 |
|
| 779 |
def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
|
| 780 |
|
|
|
|
| 613 |
other_author_ids: List[Any],
|
| 614 |
features_clm_name: str,
|
| 615 |
top_n: int = 10,
|
|
|
|
|
|
|
|
|
|
| 616 |
) -> List[tuple]: # Changed return type to List[tuple] to include scores
|
| 617 |
|
| 618 |
+
# 1) Identify selected authors in the zoom region
|
| 619 |
selected_mask = background_corpus_df['authorID'].isin(author_ids).to_numpy()
|
| 620 |
|
| 621 |
if not selected_mask.any():
|
| 622 |
+
return [] # No authors found for the given author_ids
|
| 623 |
+
|
| 624 |
+
# 2) Build a population matrix of all authors' Gram2Vec features
|
| 625 |
+
# Expect each row in features_clm_name to be a dict {feature_name: value}
|
| 626 |
+
all_feature_dicts = background_corpus_df[features_clm_name].tolist()
|
| 627 |
+
if not all_feature_dicts:
|
| 628 |
+
return []
|
| 629 |
+
|
| 630 |
+
# Use the first row to get consistent feature ordering
|
| 631 |
+
all_features = list(all_feature_dicts[0].keys())
|
| 632 |
+
population_matrix = np.array(
|
| 633 |
+
[[feat_dict.get(feat, 0.0) for feat in all_features] for feat_dict in all_feature_dicts],
|
| 634 |
+
dtype=float
|
| 635 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 636 |
|
| 637 |
+
# 3) Z-normalize columnwise across the entire corpus
|
| 638 |
+
col_means = population_matrix.mean(axis=0)
|
| 639 |
+
col_stds = population_matrix.std(axis=0)
|
| 640 |
+
col_stds[col_stds == 0] = 1.0
|
| 641 |
+
z_population = (population_matrix - col_means) / col_stds
|
|
|
|
|
|
|
| 642 |
|
| 643 |
+
# 4) Take the mean across the selected authors (zoom region)
|
| 644 |
+
selected_mean = z_population[selected_mask].mean(axis=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 645 |
|
| 646 |
+
# 5) Rank features by mean z-score, keep positives only
|
| 647 |
+
feature_scores = [(feat, float(score)) for feat, score in zip(all_features, selected_mean) if score > 0]
|
| 648 |
+
feature_scores.sort(key=lambda x: x[1], reverse=True)
|
| 649 |
|
| 650 |
+
return feature_scores[:top_n]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 651 |
|
| 652 |
def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
|
| 653 |
|
utils/visualizations.py
CHANGED
|
@@ -290,32 +290,44 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
|
|
| 290 |
background_corpus_df=merged_authors_df,
|
| 291 |
author_ids=visible_authors,
|
| 292 |
other_author_ids=[],
|
| 293 |
-
features_clm_name='g2v_vector'
|
|
|
|
| 294 |
)
|
| 295 |
|
| 296 |
# ── Span-existence filter on task authors in the zoom ───────────────────
|
| 297 |
-
# Keep only features that have
|
| 298 |
-
#
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
|
|
|
|
|
|
| 302 |
|
| 303 |
def _to_text(x):
|
| 304 |
return '\n\n =========== \n\n'.join(x) if isinstance(x, list) else x
|
| 305 |
|
| 306 |
-
task_texts = [_to_text(x) for x in
|
| 307 |
|
|
|
|
| 308 |
filtered_g2v_feats = []
|
| 309 |
for feat in g2v_feats:
|
| 310 |
try:
|
| 311 |
# `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
|
| 312 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
filtered_g2v_feats.append(feat)
|
| 314 |
else:
|
| 315 |
-
print(f"[INFO] Dropping G2V feature with
|
| 316 |
except Exception as e:
|
| 317 |
print(f"[WARN] Error while checking spans for {feat}: {e}")
|
| 318 |
|
|
|
|
|
|
|
|
|
|
| 319 |
# Convert to human readable for display
|
| 320 |
HR_g2v_list = []
|
| 321 |
for feat in filtered_g2v_feats:
|
|
|
|
| 290 |
background_corpus_df=merged_authors_df,
|
| 291 |
author_ids=visible_authors,
|
| 292 |
other_author_ids=[],
|
| 293 |
+
features_clm_name='g2v_vector',
|
| 294 |
+
top_n=50
|
| 295 |
)
|
| 296 |
|
| 297 |
# ── Span-existence filter on task authors in the zoom ───────────────────
|
| 298 |
+
# Keep only features that have detected spans in at least 2 of the
|
| 299 |
+
# task authors' texts (Mystery + Candidates 1-3)
|
| 300 |
+
# Use only the task authors (Mystery + Candidates 1-3), not the zoom-visible set
|
| 301 |
+
task_author_ids = {"Mystery author", "Candidate Author 1", "Candidate Author 2", "Candidate Author 3"}
|
| 302 |
+
task_only_df = task_authors_df[task_authors_df['authorID'].isin(task_author_ids)]
|
| 303 |
+
if task_only_df.empty:
|
| 304 |
+
task_only_df = task_authors_df
|
| 305 |
|
| 306 |
def _to_text(x):
|
| 307 |
return '\n\n =========== \n\n'.join(x) if isinstance(x, list) else x
|
| 308 |
|
| 309 |
+
task_texts = [_to_text(x) for x in task_only_df['fullText'].tolist()]
|
| 310 |
|
| 311 |
+
print(f"task_texts: {task_texts}")
|
| 312 |
filtered_g2v_feats = []
|
| 313 |
for feat in g2v_feats:
|
| 314 |
try:
|
| 315 |
# `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
|
| 316 |
+
occurrences = 0
|
| 317 |
+
for txt in task_texts:
|
| 318 |
+
spans = find_feature_spans(txt, feat[0])
|
| 319 |
+
if spans:
|
| 320 |
+
occurrences += 1
|
| 321 |
+
if occurrences >= 2:
|
| 322 |
filtered_g2v_feats.append(feat)
|
| 323 |
else:
|
| 324 |
+
print(f"[INFO] Dropping G2V feature with <2 task-author spans: {feat}")
|
| 325 |
except Exception as e:
|
| 326 |
print(f"[WARN] Error while checking spans for {feat}: {e}")
|
| 327 |
|
| 328 |
+
# After filtering by spans, keep top-N by score
|
| 329 |
+
filtered_g2v_feats = filtered_g2v_feats[:10]
|
| 330 |
+
|
| 331 |
# Convert to human readable for display
|
| 332 |
HR_g2v_list = []
|
| 333 |
for feat in filtered_g2v_feats:
|