Milad Alshomary
committed on
Commit
·
5199ee7
1
Parent(s):
c30deb9
updates
Browse files- app.py +10 -9
- config/config.yaml +4 -2
- utils/interp_space_utils.py +7 -6
- utils/ui.py +5 -5
- utils/visualizations.py +17 -19
app.py
CHANGED
|
@@ -58,8 +58,7 @@ def app(share=False, use_cluster_feats=False):
|
|
| 58 |
instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
|
| 59 |
|
| 60 |
interp = load_interp_space(cfg)
|
| 61 |
-
clustered_authors_df = interp['clustered_authors_df']
|
| 62 |
-
clustered_authors_df['fullText'] = clustered_authors_df['fullText']
|
| 63 |
|
| 64 |
with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
|
| 65 |
# ── Big Centered Title ──────────────────────────────
|
|
@@ -352,13 +351,15 @@ def app(share=False, use_cluster_feats=False):
|
|
| 352 |
yaxis: [ev['yaxis.range[0]'], ev['yaxis.range[1]']]
|
| 353 |
};
|
| 354 |
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
|
|
|
|
|
|
| 362 |
}
|
| 363 |
});
|
| 364 |
};
|
|
|
|
| 58 |
instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
|
| 59 |
|
| 60 |
interp = load_interp_space(cfg)
|
| 61 |
+
clustered_authors_df = interp['clustered_authors_df']
|
|
|
|
| 62 |
|
| 63 |
with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
|
| 64 |
# ── Big Centered Title ──────────────────────────────
|
|
|
|
| 351 |
yaxis: [ev['yaxis.range[0]'], ev['yaxis.range[1]']]
|
| 352 |
};
|
| 353 |
|
| 354 |
+
if (window.confirm("Do you want to analyze the writing style of the authors in this region?")) {
|
| 355 |
+
const txtbox = document.querySelector('#axis-ranges textarea');
|
| 356 |
+
if (txtbox) {
|
| 357 |
+
txtbox.value = JSON.stringify(payload);
|
| 358 |
+
txtbox.dispatchEvent(new Event('input', { bubbles: true }));
|
| 359 |
+
console.log("------------> Zoom payload dispatched:<------------", payload);
|
| 360 |
+
} else {
|
| 361 |
+
console.warn("------------> No hidden textbox found to write zoom payload.<------------");
|
| 362 |
+
}
|
| 363 |
}
|
| 364 |
});
|
| 365 |
};
|
config/config.yaml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# config.yaml
|
| 2 |
-
instances_to_explain_path: "./datasets/
|
| 3 |
-
instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/
|
| 4 |
interp_space_path: "./datasets/sentence_luar_interp_space_2_35/"
|
| 5 |
interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_space_2_35.zip?download=true"
|
| 6 |
gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
|
|
@@ -10,3 +10,5 @@ style_feat_clm: "llm_tfidf_weights"
|
|
| 10 |
top_k: 10
|
| 11 |
only_llm_feats: false
|
| 12 |
only_gram2vec_feats: false
|
|
|
|
|
|
|
|
|
| 1 |
# config.yaml
|
| 2 |
+
instances_to_explain_path: "./datasets/hrs_explanations_luar_clusters_2_35_balanced.json"
|
| 3 |
+
instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_2_35_balanced.json?download=true"
|
| 4 |
interp_space_path: "./datasets/sentence_luar_interp_space_2_35/"
|
| 5 |
interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_space_2_35.zip?download=true"
|
| 6 |
gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
|
|
|
|
| 10 |
top_k: 10
|
| 11 |
only_llm_feats: false
|
| 12 |
only_gram2vec_feats: false
|
| 13 |
+
max_num_docs_per_authors: 1
|
| 14 |
+
max_num_bg_authors: 1000
|
utils/interp_space_utils.py
CHANGED
|
@@ -129,9 +129,9 @@ def instance_to_df(instance, predicted_author=None, ground_truth_author=None):
|
|
| 129 |
#create a dataframe of the task authors
|
| 130 |
task_authos_df = pd.DataFrame([
|
| 131 |
{'authorID': 'Mystery author', 'fullText': instance['Q_fullText'], 'predicted': None, 'ground_truth': None},
|
| 132 |
-
{'authorID': 'Candidate Author 1', 'fullText': instance['a0_fullText'], 'predicted': int(predicted_author) == 0, 'ground_truth': int(ground_truth_author) == 0},
|
| 133 |
-
{'authorID': 'Candidate Author 2', 'fullText': instance['a1_fullText'], 'predicted': int(predicted_author) == 1, 'ground_truth': int(ground_truth_author) == 1},
|
| 134 |
-
{'authorID': 'Candidate Author 3', 'fullText': instance['a2_fullText'], 'predicted': int(predicted_author) == 2, 'ground_truth': int(ground_truth_author) == 2}
|
| 135 |
|
| 136 |
])
|
| 137 |
|
|
@@ -170,6 +170,7 @@ def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str,
|
|
| 170 |
|
| 171 |
print(f"Generating style embeddings using {model_name} on column '{text_clm}'...")
|
| 172 |
|
|
|
|
| 173 |
model = SentenceTransformer(model_name)
|
| 174 |
embedding_dim = model.get_sentence_embedding_dimension()
|
| 175 |
|
|
@@ -265,7 +266,7 @@ def cached_generate_style_embedding(background_corpus_df: pd.DataFrame,
|
|
| 265 |
print(f"Cache hit for {model_name} on column '{text_clm}'")
|
| 266 |
print(cache_path)
|
| 267 |
with open(cache_path, "rb") as f:
|
| 268 |
-
|
| 269 |
|
| 270 |
else:
|
| 271 |
# Otherwise, compute, cache, and return
|
|
@@ -541,8 +542,8 @@ def compute_clusters_style_representation_3(
|
|
| 541 |
cluster_ids: List[Any],
|
| 542 |
cluster_label_clm_name: str = 'authorID',
|
| 543 |
max_num_feats: int = 20,
|
| 544 |
-
max_num_documents_per_author=
|
| 545 |
-
max_num_authors=
|
| 546 |
max_authors_for_span_extraction=4
|
| 547 |
):
|
| 548 |
|
|
|
|
| 129 |
#create a dataframe of the task authors
|
| 130 |
task_authos_df = pd.DataFrame([
|
| 131 |
{'authorID': 'Mystery author', 'fullText': instance['Q_fullText'], 'predicted': None, 'ground_truth': None},
|
| 132 |
+
{'authorID': 'Candidate Author 1', 'fullText': instance['a0_fullText'], 'predicted': int(predicted_author) == 0 if predicted_author is not None else None, 'ground_truth': int(ground_truth_author) == 0 if ground_truth_author is not None else None},
|
| 133 |
+
{'authorID': 'Candidate Author 2', 'fullText': instance['a1_fullText'], 'predicted': int(predicted_author) == 1 if predicted_author is not None else None, 'ground_truth': int(ground_truth_author) == 1 if ground_truth_author is not None else None},
|
| 134 |
+
{'authorID': 'Candidate Author 3', 'fullText': instance['a2_fullText'], 'predicted': int(predicted_author) == 2 if predicted_author is not None else None, 'ground_truth': int(ground_truth_author) == 2 if ground_truth_author is not None else None}
|
| 135 |
|
| 136 |
])
|
| 137 |
|
|
|
|
| 170 |
|
| 171 |
print(f"Generating style embeddings using {model_name} on column '{text_clm}'...")
|
| 172 |
|
| 173 |
+
print(background_corpus_df.fullText.tolist()[:10])
|
| 174 |
model = SentenceTransformer(model_name)
|
| 175 |
embedding_dim = model.get_sentence_embedding_dimension()
|
| 176 |
|
|
|
|
| 266 |
print(f"Cache hit for {model_name} on column '{text_clm}'")
|
| 267 |
print(cache_path)
|
| 268 |
with open(cache_path, "rb") as f:
|
| 269 |
+
background_corpus_df = pickle.load(f)
|
| 270 |
|
| 271 |
else:
|
| 272 |
# Otherwise, compute, cache, and return
|
|
|
|
| 542 |
cluster_ids: List[Any],
|
| 543 |
cluster_label_clm_name: str = 'authorID',
|
| 544 |
max_num_feats: int = 20,
|
| 545 |
+
max_num_documents_per_author=1,
|
| 546 |
+
max_num_authors=10,
|
| 547 |
max_authors_for_span_extraction=4
|
| 548 |
):
|
| 549 |
|
utils/ui.py
CHANGED
|
@@ -117,7 +117,7 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
|
|
| 117 |
'a1_fullText': c2_txt,
|
| 118 |
'a2_fullText': c3_txt
|
| 119 |
}
|
| 120 |
-
task_authors_df = instance_to_df(custom_task_instance)
|
| 121 |
|
| 122 |
#print(f"Generating embeddings for {model_name} on task authors")
|
| 123 |
# task_authors_df = cached_generate_style_embedding(task_authors_df, 'fullText', model_name)
|
|
@@ -136,10 +136,10 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
|
|
| 136 |
task_authors_df['g2v_vector'] = task_authors_g2v
|
| 137 |
print(f"Gram2Vec feature generation complete")
|
| 138 |
|
| 139 |
-
if mode != "Predefined HRS Task":
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
|
| 144 |
#generating html for the task
|
| 145 |
header_html, mystery_html, candidate_htmls = task_HTML(mystery_txt, candidate_texts, predicted_author, ground_truth_author)
|
|
|
|
| 117 |
'a1_fullText': c2_txt,
|
| 118 |
'a2_fullText': c3_txt
|
| 119 |
}
|
| 120 |
+
task_authors_df = instance_to_df(custom_task_instance, predicted_author=None, ground_truth_author=true_author)
|
| 121 |
|
| 122 |
#print(f"Generating embeddings for {model_name} on task authors")
|
| 123 |
# task_authors_df = cached_generate_style_embedding(task_authors_df, 'fullText', model_name)
|
|
|
|
| 136 |
task_authors_df['g2v_vector'] = task_authors_g2v
|
| 137 |
print(f"Gram2Vec feature generation complete")
|
| 138 |
|
| 139 |
+
#if mode != "Predefined HRS Task":
|
| 140 |
+
# Computing predicted author by checking pairwise cosine similarity over luar embeddings
|
| 141 |
+
col_name = f'{model_name.split("/")[-1]}_style_embedding'
|
| 142 |
+
predicted_author = compute_predicted_author(task_authors_df, col_name)
|
| 143 |
|
| 144 |
#generating html for the task
|
| 145 |
header_html, mystery_html, candidate_htmls = task_HTML(mystery_txt, candidate_texts, predicted_author, ground_truth_author)
|
utils/visualizations.py
CHANGED
|
@@ -132,9 +132,9 @@ def compute_tsne_with_cache(embeddings: np.ndarray, cache_path: str = 'datasets/
|
|
| 132 |
return cache[hash_key]
|
| 133 |
else:
|
| 134 |
print("Computing t-SNE")
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
tsne_result = umap.UMAP(n_components=2, n_neighbors=
|
| 138 |
|
| 139 |
cache[hash_key] = tsne_result
|
| 140 |
with open(cache_path, 'wb') as f:
|
|
@@ -147,9 +147,15 @@ def load_interp_space(cfg):
|
|
| 147 |
gram2vec_feats_path = cfg['interp_space_path'] + '/../gram2vec_feats.csv'
|
| 148 |
clustered_authors_path = cfg['interp_space_path'] + 'train_authors.pkl'
|
| 149 |
|
|
|
|
|
|
|
|
|
|
| 150 |
# Load authors embeddings and their cluster labels
|
| 151 |
-
clustered_authors_df = pd.read_pickle(clustered_authors_path)
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
| 153 |
author_embedding = clustered_authors_df.author_embedding.tolist()
|
| 154 |
author_labels = clustered_authors_df.cluster_label.tolist()
|
| 155 |
author_ids = clustered_authors_df.authorID.tolist()
|
|
@@ -267,23 +273,15 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
|
|
| 267 |
|
| 268 |
print(f"[INFO] Zoomed region includes {len(visible_authors)} authors:{visible_authors}")
|
| 269 |
|
| 270 |
-
# Example: Find features for clusters [2,3,4] that are NOT prominent in cluster [1]
|
| 271 |
-
# llm_feats = compute_clusters_style_representation(
|
| 272 |
-
# background_corpus_df=clustered_authors_df,
|
| 273 |
-
# cluster_ids=visible_authors,
|
| 274 |
-
# cluster_label_clm_name='authorID',
|
| 275 |
-
# other_cluster_ids=[],
|
| 276 |
-
# features_clm_name='final_attribute_name_manually_processed'
|
| 277 |
-
# )
|
| 278 |
print(f"Task authors: {len(task_authors_df)}, Clustered authors: {len(clustered_authors_df)}")
|
| 279 |
merged_authors_df = pd.concat([task_authors_df, clustered_authors_df])
|
| 280 |
print(f"Merged authors DataFrame:\n{len(merged_authors_df)}")
|
| 281 |
-
style_analysis_response = {'features': [], 'spans': []}
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
|
| 288 |
llm_feats = ['None'] + style_analysis_response['features']
|
| 289 |
|
|
|
|
| 132 |
return cache[hash_key]
|
| 133 |
else:
|
| 134 |
print("Computing t-SNE")
|
| 135 |
+
tsne_result = TSNE(n_components=2, learning_rate='auto',
|
| 136 |
+
init='random', perplexity=10, random_state=42,metric='cosine').fit_transform(embeddings)
|
| 137 |
+
#tsne_result = umap.UMAP(n_components=2, n_neighbors=30, min_dist=0.3, metric='cosine').fit_transform(embeddings)
|
| 138 |
|
| 139 |
cache[hash_key] = tsne_result
|
| 140 |
with open(cache_path, 'wb') as f:
|
|
|
|
| 147 |
gram2vec_feats_path = cfg['interp_space_path'] + '/../gram2vec_feats.csv'
|
| 148 |
clustered_authors_path = cfg['interp_space_path'] + 'train_authors.pkl'
|
| 149 |
|
| 150 |
+
max_num_docs_per_authors = cfg['max_num_docs_per_authors']
|
| 151 |
+
max_num_bg_authors = cfg['max_num_bg_authors']
|
| 152 |
+
|
| 153 |
# Load authors embeddings and their cluster labels
|
| 154 |
+
clustered_authors_df = pd.read_pickle(clustered_authors_path).iloc[:max_num_bg_authors]
|
| 155 |
+
clustered_authors_df['fullText'] = clustered_authors_df.fullText.map(lambda list: '\n\n'.join(['Document {}: {}'.format(i+1, text) for i, text in enumerate(list[:max_num_docs_per_authors])]))
|
| 156 |
+
|
| 157 |
+
print('Average atuhor text length:', clustered_authors_df.fullText.map(lambda x: len(x.split())).mean())
|
| 158 |
+
|
| 159 |
author_embedding = clustered_authors_df.author_embedding.tolist()
|
| 160 |
author_labels = clustered_authors_df.cluster_label.tolist()
|
| 161 |
author_ids = clustered_authors_df.authorID.tolist()
|
|
|
|
| 273 |
|
| 274 |
print(f"[INFO] Zoomed region includes {len(visible_authors)} authors:{visible_authors}")
|
| 275 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
print(f"Task authors: {len(task_authors_df)}, Clustered authors: {len(clustered_authors_df)}")
|
| 277 |
merged_authors_df = pd.concat([task_authors_df, clustered_authors_df])
|
| 278 |
print(f"Merged authors DataFrame:\n{len(merged_authors_df)}")
|
| 279 |
+
#style_analysis_response = {'features': [], 'spans': []}
|
| 280 |
+
style_analysis_response = compute_clusters_style_representation_3(
|
| 281 |
+
background_corpus_df=merged_authors_df,
|
| 282 |
+
cluster_ids=visible_authors,
|
| 283 |
+
cluster_label_clm_name='authorID',
|
| 284 |
+
)
|
| 285 |
|
| 286 |
llm_feats = ['None'] + style_analysis_response['features']
|
| 287 |
|