Anisha Bhatnagar committed
Commit · a1e49f6
Parent(s): 0ce5cd2

reducing number of precomputed regions; updating cache; data url; g2v relaxed filtering (peter)

Browse files
- app.py +1 -1
- config/config.yaml +2 -2
- precompute_caches.py +3 -5
- utils/interp_space_utils.py +59 -34
- utils/visualizations.py +1 -1
app.py
CHANGED
@@ -22,7 +22,7 @@ def load_config(path="config/config.yaml"):
         return yaml.safe_load(f)
 
 # A comment to trigger change in spaces
-# comment
+# comment 3
 cfg = load_config()
 
 
config/config.yaml
CHANGED
@@ -1,6 +1,6 @@
 # config.yaml
-instances_to_explain_path: "./datasets/
-instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/
+instances_to_explain_path: "./datasets/hrs_explanations_luar_clusters_2_35_balanced.json"
+instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_2_35_balanced.json?download=true"
 interp_space_path: "./datasets/sentence_luar_interp_space_2_35/"
 interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_space_2_35.zip?download=true"
 gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
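For context, config keys like these are usually consumed by downloading once from the `?download=true` URL when the local path is missing, then loading from disk. A minimal hypothetical sketch; the download-on-first-run logic and variable names below are assumptions, not code from this repo:

    import json
    import os
    import urllib.request

    import yaml

    # Load the config the same way app.py does (yaml.safe_load appears in the app.py diff above).
    with open("config/config.yaml") as f:
        cfg = yaml.safe_load(f)

    path, url = cfg["instances_to_explain_path"], cfg["instances_to_explain_url"]
    if not os.path.exists(path):  # assumed behaviour: fetch the dataset on first run only
        os.makedirs(os.path.dirname(path), exist_ok=True)
        urllib.request.urlretrieve(url, path)

    with open(path) as f:
        instances_to_explain = json.load(f)  # the balanced HRS explanation instances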
precompute_caches.py
CHANGED
@@ -19,8 +19,7 @@ def load_config(path="config/config.yaml"):
 def precompute_all_caches(
     models_to_test=None,
     instances_to_process=None,
-    config_path="config/config.yaml"
-    force_regenerate=False
+    config_path="config/config.yaml"
 ):
     """
     Precompute all cache files using the EXACT same methods as app.py.
@@ -194,13 +193,12 @@ from utils.visualizations import visualize_clusters_plotly
 
 if __name__ == "__main__":
     # Test with a small subset first
-    instances=[i for i in range(
+    instances=[i for i in range(20)] # First 20 instances for testing
     cache_stats = precompute_all_caches(
         models_to_test=[
             'gabrielloiseau/LUAR-MUD-sentence-transformers'
         ],
-        instances_to_process=instances
-        force_regenerate=False
+        instances_to_process=instances
     )
 
     print(f"\nCache precomputation completed with {len(cache_stats['errors'])} errors.")
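A note on these hunks: as rendered, the removed lines have no comma before force_regenerate=False, which would be a SyntaxError in Python; whether the commas were lost in rendering or in the previous commit, dropping force_regenerate leaves a well-formed parameter list and call. The updated entry point is then invoked as a sketch of the new __main__ block:

    # Mirrors the new __main__ above; the 20-instance subset comes from the diff.
    instances = [i for i in range(20)]  # first 20 instances, for testing

    cache_stats = precompute_all_caches(
        models_to_test=['gabrielloiseau/LUAR-MUD-sentence-transformers'],
        instances_to_process=instances,  # now the final argument; no force_regenerate
    )
    print(f"Cache precomputation completed with {len(cache_stats['errors'])} errors.")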
utils/interp_space_utils.py
CHANGED
@@ -546,7 +546,9 @@ def compute_clusters_style_representation_3(
     max_num_feats: int = 20,
     max_num_documents_per_author=1,
     max_num_authors=10,
-    max_authors_for_span_extraction=4
+    max_authors_for_span_extraction=4,
+    min_authors_required: int = 2,
+    top_k: int = 10
 ):
 
     print(f"Computing style representation for visible clusters: {len(cluster_ids)}")
@@ -568,19 +570,40 @@ def compute_clusters_style_representation_3(
     print(author_names)
     spans_by_author = extract_all_spans(span_df, features, cluster_label_clm_name)
 
-    # Filter
-
-    filtered_spans_by_author = [set([f[0] for f in x[1].items() if len(f[1]) > 0]) for x in filtered_spans_by_author.items()]
-
-    for x in filtered_spans_by_author[1:]:
-        filtered_set_of_features = filtered_set_of_features.intersection(x)
-
+    # Filter-in only task authors that are part of the current selection
+    task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
+    filtered_task_authors = {author: feat_map for author, feat_map in spans_by_author.items() if author in task_author_names.intersection(set(cluster_ids))}
+
+    print(filtered_task_authors.keys())
+
+    # Build per-author sets of features that have at least one span
+    author_present_feature_sets = [
+        {feature for feature, spans in feature_map.items() if len(spans) > 0}
+        for _, feature_map in filtered_task_authors.items()
+    ]
+
+    # If nothing to aggregate (e.g., no task authors in selection), fall back to empty list
+    selected_features_ranked = []
+    if author_present_feature_sets:
+        coverage_counter = Counter()
+        for present_set in author_present_feature_sets:
+            coverage_counter.update(present_set)
+
+        # Keep features present in at least `min_authors_required` authors
+        eligible_features = [feat for feat, cnt in coverage_counter.items() if cnt >= int(min_authors_required)]
+
+        # Preserve original LLM feature ordering as a secondary key where possible
+        feature_original_index = {feat: idx for idx, feat in enumerate(features)} if features else {}
+
+        selected_features_ranked = sorted(
+            eligible_features,
+            key=lambda f: (-coverage_counter[f], feature_original_index.get(f, 10**9))
+        )[:int(top_k)]
+
+        print('filtered set of features (min coverage', min_authors_required, '): ', selected_features_ranked)
 
     return {
-        "features": list(
+        "features": list(selected_features_ranked),
         "spans": spans_by_author
     }
 
@@ -679,8 +702,8 @@ def compute_clusters_g2v_representation(
     # Filter in only features that are present in selected_authors
     selected_authors_g2v_data = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
 
-    print(f"[DEBUG] selected_authors_g2v_data length: {len(selected_authors_g2v_data)}")
-    print(f"[DEBUG] selected_authors_g2v_data content: {selected_authors_g2v_data}")
+    # print(f"[DEBUG] selected_authors_g2v_data length: {len(selected_authors_g2v_data)}")
+    # print(f"[DEBUG] selected_authors_g2v_data content: {selected_authors_g2v_data}")
 
     # Get the actual text documents for the selected authors to verify feature presence
     selected_authors_docs = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)]['fullText'].tolist()
@@ -828,7 +851,7 @@ def compute_predicted_author(task_authors_df: pd.DataFrame, col_name: str) -> int:
     return predicted_author
 
 
-def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, model_name, n_neighbors=7):
+def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, pred_idx, model_name, n_neighbors=7):
     """
     Compute precomputed regions for mystery author and candidates.
 
@@ -914,41 +937,43 @@ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, model_name, n_neighbors=7):
         include_points = np.vstack([point1.reshape(1, -1), point2.reshape(1, -1)])
         return get_region_around_point(midpoint, region_name, include_points=include_points)
 
-    # Region 1: Around mystery author only
-    regions["Mystery Author Neighborhood"] = get_region_around_point(
-        q_proj, "Mystery Author"
-    )
+    # # Region 1: Around mystery author only
+    # regions["Mystery Author Neighborhood"] = get_region_around_point(
+    #     q_proj, "Mystery Author"
+    # )
 
-    # Regions 2-4: Around each candidate
-    for i in range(3):
-        regions[f"Candidate {i+1} Neighborhood"] = get_region_around_point(
-            c_proj[i], f"Candidate {i+1}"
-        )
+    # # Regions 2-4: Around each candidate
+    # for i in range(3):
+    #     regions[f"Candidate {i+1} Neighborhood"] = get_region_around_point(
+    #         c_proj[i], f"Candidate {i+1}"
+    #     )
 
     # Regions 5-7: Between mystery and each candidate
     for i in range(3):
-        region_name = f"Mystery & Candidate {i+1}"
-        regions[region_name] = get_region_between_points(
-            q_proj, c_proj[i], "Mystery", f"Candidate {i+1}"
-        )
+        if i == pred_idx: #selecting only mystery and predicted candidate
+            region_name = f"Mystery & Candidate {i+1}"
+            regions[region_name] = get_region_between_points(
+                q_proj, c_proj[i], "Mystery", f"Candidate {i+1}"
+            )
 
     # Regions 8-10: Between candidate pairs
     candidate_pairs = [(0, 1), (0, 2), (1, 2)]
     for i, (c1, c2) in enumerate(candidate_pairs):
-        region_name = f"Candidate {c1+1} & Candidate {c2+1}"
-        regions[region_name] = get_region_between_points(
-            c_proj[c1], c_proj[c2], f"Candidate {c1+1}", f"Candidate {c2+1}"
-        )
+        if c1 != pred_idx and c2 != pred_idx: #selecting only the non predicted candidates
+            region_name = f"Candidate {c1+1} & Candidate {c2+1}"
+            regions[region_name] = get_region_between_points(
+                c_proj[c1], c_proj[c2], f"Candidate {c1+1}", f"Candidate {c2+1}"
+            )
 
     # Regions 11-12: Around predicted and ground truth (if different)
     # This would need predicted_author and ground_truth_author indices
     # For now, we'll create generic regions
 
     # Region 11: Centroid of all task authors (mystery + 3 candidates)
-    task_centroid = np.mean(np.vstack([q_proj, c_proj]), axis=0)
-    regions["All Task Authors Centroid"] = get_region_around_point(
-        task_centroid, "All Task Authors", include_points=np.vstack([q_proj, c_proj])
-    )
+    # task_centroid = np.mean(np.vstack([q_proj, c_proj]), axis=0)
+    # regions["All Task Authors Centroid"] = get_region_around_point(
+    #     task_centroid, "All Task Authors", include_points=np.vstack([q_proj, c_proj])
+    # )
 
     def serialize_numpy_dtypes(obj):
         if isinstance(obj, np.ndarray):
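The behavioural change behind the relaxed filtering in compute_clusters_style_representation_3: the removed code intersected the per-author feature sets, so a feature missing for any one task author was discarded entirely; the new code counts how many task authors exhibit each feature and keeps those covered by at least min_authors_required, ranked by coverage and then original feature order, truncated to top_k. A self-contained toy illustration of that relaxation; the span data below is made up:

    from collections import Counter

    # Three task authors, three candidate style features; a feature "is present"
    # for an author when it has at least one extracted span.
    spans_by_author = {
        "Mystery author":     {"contractions": ["s1"], "semicolons": [],     "exclamations": ["s2"]},
        "Candidate Author 1": {"contractions": ["s3"], "semicolons": ["s4"], "exclamations": []},
        "Candidate Author 2": {"contractions": [],     "semicolons": ["s5"], "exclamations": ["s6"]},
    }
    features = ["contractions", "semicolons", "exclamations"]
    min_authors_required, top_k = 2, 10

    present_sets = [{f for f, spans in fm.items() if spans} for fm in spans_by_author.values()]

    strict = set.intersection(*present_sets)  # old behaviour: empty here, every feature misses one author
    coverage = Counter()
    for s in present_sets:
        coverage.update(s)
    order = {f: i for i, f in enumerate(features)}
    relaxed = sorted(
        (f for f, c in coverage.items() if c >= min_authors_required),
        key=lambda f: (-coverage[f], order.get(f, 10**9)),
    )[:top_k]

    print(strict)   # set() -> the strict intersection keeps nothing
    print(relaxed)  # ['contractions', 'semicolons', 'exclamations'], each covered by 2 of 3 authors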
utils/visualizations.py
CHANGED
@@ -519,7 +519,7 @@ def visualize_clusters_plotly(iid, cfg, instances, model_radio, custom_model_inp
     candidate_ids = task_authors_df['authorID'].iloc[1:4].tolist() # 3 candidate IDs
 
     precomputed_regions = compute_precomputed_regions(
-        bg_proj_for_regions, bg_ids_for_regions, q_proj, c_proj, model_name
+        bg_proj_for_regions, bg_ids_for_regions, q_proj, c_proj, pred_idx, model_name
    )
 
     # Create choices for radio buttons
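And the change behind "reducing number of precomputed regions": with the neighborhood and centroid regions commented out and the new pred_idx gating, each task precomputes only two between-point regions: mystery vs. the predicted candidate, and the one candidate pair that excludes the prediction. A small sketch of the surviving region names; the helper is illustrative, not repo code:

    # Which "between" regions survive the new pred_idx gating in compute_precomputed_regions.
    def surviving_region_names(pred_idx: int) -> list[str]:
        names = []
        # Mystery <-> candidate regions: only the predicted candidate is kept
        for i in range(3):
            if i == pred_idx:
                names.append(f"Mystery & Candidate {i+1}")
        # Candidate <-> candidate regions: only the pair excluding the prediction
        for c1, c2 in [(0, 1), (0, 2), (1, 2)]:
            if c1 != pred_idx and c2 != pred_idx:
                names.append(f"Candidate {c1+1} & Candidate {c2+1}")
        return names

    print(surviving_region_names(0))  # ['Mystery & Candidate 1', 'Candidate 2 & Candidate 3']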