Milad Alshomary committed on
Commit
5199ee7
·
1 Parent(s): c30deb9
app.py CHANGED
@@ -58,8 +58,7 @@ def app(share=False, use_cluster_feats=False):
58
  instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
59
 
60
  interp = load_interp_space(cfg)
61
- clustered_authors_df = interp['clustered_authors_df'][:1000]
62
- clustered_authors_df['fullText'] = clustered_authors_df['fullText']
63
 
64
  with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
65
  # ── Big Centered Title ──────────────────────────────────────────
@@ -352,13 +351,15 @@ def app(share=False, use_cluster_feats=False):
352
  yaxis: [ev['yaxis.range[0]'], ev['yaxis.range[1]']]
353
  };
354
 
355
- const txtbox = document.querySelector('#axis-ranges textarea');
356
- if (txtbox) {
357
- txtbox.value = JSON.stringify(payload);
358
- txtbox.dispatchEvent(new Event('input', { bubbles: true }));
359
- console.log("------------> Zoom payload dispatched:<------------", payload);
360
- } else {
361
- console.warn("------------> No hidden textbox found to write zoom payload.<------------");
 
 
362
  }
363
  });
364
  };
 
58
  instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
59
 
60
  interp = load_interp_space(cfg)
61
+ clustered_authors_df = interp['clustered_authors_df']
 
62
 
63
  with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
64
  # ── Big Centered Title ──────────────────────────────────────────
 
351
  yaxis: [ev['yaxis.range[0]'], ev['yaxis.range[1]']]
352
  };
353
 
354
+ if (window.confirm("Do you want to analyze the writing style of the authors in this region?")) {
355
+ const txtbox = document.querySelector('#axis-ranges textarea');
356
+ if (txtbox) {
357
+ txtbox.value = JSON.stringify(payload);
358
+ txtbox.dispatchEvent(new Event('input', { bubbles: true }));
359
+ console.log("------------> Zoom payload dispatched:<------------", payload);
360
+ } else {
361
+ console.warn("------------> No hidden textbox found to write zoom payload.<------------");
362
+ }
363
  }
364
  });
365
  };
config/config.yaml CHANGED
@@ -1,6 +1,6 @@
1
  # config.yaml
2
- instances_to_explain_path: "./datasets/hrs_explanations.json"
3
- instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_18_balanced.json?/download=true"
4
  interp_space_path: "./datasets/sentence_luar_interp_space_2_35/"
5
  interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_space_2_35.zip?download=true"
6
  gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
@@ -10,3 +10,5 @@ style_feat_clm: "llm_tfidf_weights"
10
  top_k: 10
11
  only_llm_feats: false
12
  only_gram2vec_feats: false
 
 
 
1
  # config.yaml
2
+ instances_to_explain_path: "./datasets/hrs_explanations_luar_clusters_2_35_balanced.json"
3
+ instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_2_35_balanced.json?download=true"
4
  interp_space_path: "./datasets/sentence_luar_interp_space_2_35/"
5
  interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_space_2_35.zip?download=true"
6
  gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
 
10
  top_k: 10
11
  only_llm_feats: false
12
  only_gram2vec_feats: false
13
+ max_num_docs_per_authors: 1
14
+ max_num_bg_authors: 1000
utils/interp_space_utils.py CHANGED
@@ -129,9 +129,9 @@ def instance_to_df(instance, predicted_author=None, ground_truth_author=None):
129
  #create a dataframe of the task authors
130
  task_authos_df = pd.DataFrame([
131
  {'authorID': 'Mystery author', 'fullText': instance['Q_fullText'], 'predicted': None, 'ground_truth': None},
132
- {'authorID': 'Candidate Author 1', 'fullText': instance['a0_fullText'], 'predicted': int(predicted_author) == 0, 'ground_truth': int(ground_truth_author) == 0},
133
- {'authorID': 'Candidate Author 2', 'fullText': instance['a1_fullText'], 'predicted': int(predicted_author) == 1, 'ground_truth': int(ground_truth_author) == 1},
134
- {'authorID': 'Candidate Author 3', 'fullText': instance['a2_fullText'], 'predicted': int(predicted_author) == 2, 'ground_truth': int(ground_truth_author) == 2}
135
 
136
  ])
137
 
@@ -170,6 +170,7 @@ def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str,
170
 
171
  print(f"Generating style embeddings using {model_name} on column '{text_clm}'...")
172
 
 
173
  model = SentenceTransformer(model_name)
174
  embedding_dim = model.get_sentence_embedding_dimension()
175
 
@@ -265,7 +266,7 @@ def cached_generate_style_embedding(background_corpus_df: pd.DataFrame,
265
  print(f"Cache hit for {model_name} on column '{text_clm}'")
266
  print(cache_path)
267
  with open(cache_path, "rb") as f:
268
- return pickle.load(f)
269
 
270
  else:
271
  # Otherwise, compute, cache, and return
@@ -541,8 +542,8 @@ def compute_clusters_style_representation_3(
541
  cluster_ids: List[Any],
542
  cluster_label_clm_name: str = 'authorID',
543
  max_num_feats: int = 20,
544
- max_num_documents_per_author=3,
545
- max_num_authors=5,
546
  max_authors_for_span_extraction=4
547
  ):
548
 
 
129
  #create a dataframe of the task authors
130
  task_authos_df = pd.DataFrame([
131
  {'authorID': 'Mystery author', 'fullText': instance['Q_fullText'], 'predicted': None, 'ground_truth': None},
132
+ {'authorID': 'Candidate Author 1', 'fullText': instance['a0_fullText'], 'predicted': int(predicted_author) == 0 if predicted_author is not None else None, 'ground_truth': int(ground_truth_author) == 0 if ground_truth_author is not None else None},
133
+ {'authorID': 'Candidate Author 2', 'fullText': instance['a1_fullText'], 'predicted': int(predicted_author) == 1 if predicted_author is not None else None, 'ground_truth': int(ground_truth_author) == 1 if ground_truth_author is not None else None},
134
+ {'authorID': 'Candidate Author 3', 'fullText': instance['a2_fullText'], 'predicted': int(predicted_author) == 2 if predicted_author is not None else None, 'ground_truth': int(ground_truth_author) == 2 if ground_truth_author is not None else None}
135
 
136
  ])
137
 
 
170
 
171
  print(f"Generating style embeddings using {model_name} on column '{text_clm}'...")
172
 
173
+ print(background_corpus_df.fullText.tolist()[:10])
174
  model = SentenceTransformer(model_name)
175
  embedding_dim = model.get_sentence_embedding_dimension()
176
 
 
266
  print(f"Cache hit for {model_name} on column '{text_clm}'")
267
  print(cache_path)
268
  with open(cache_path, "rb") as f:
269
+ background_corpus_df = pickle.load(f)
270
 
271
  else:
272
  # Otherwise, compute, cache, and return
 
542
  cluster_ids: List[Any],
543
  cluster_label_clm_name: str = 'authorID',
544
  max_num_feats: int = 20,
545
+ max_num_documents_per_author=1,
546
+ max_num_authors=10,
547
  max_authors_for_span_extraction=4
548
  ):
549
 
utils/ui.py CHANGED
@@ -117,7 +117,7 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
117
  'a1_fullText': c2_txt,
118
  'a2_fullText': c3_txt
119
  }
120
- task_authors_df = instance_to_df(custom_task_instance)
121
 
122
  #print(f"Generating embeddings for {model_name} on task authors")
123
  # task_authors_df = cached_generate_style_embedding(task_authors_df, 'fullText', model_name)
@@ -136,10 +136,10 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
136
  task_authors_df['g2v_vector'] = task_authors_g2v
137
  print(f"Gram2Vec feature generation complete")
138
 
139
- if mode != "Predefined HRS Task":
140
- # Computing predicted author by checking pairwise cosine similarity over luar embeddings
141
- col_name = f'{model_name.split("/")[-1]}_style_embedding'
142
- predicted_author = compute_predicted_author(task_authors_df, col_name)
143
 
144
  #generating html for the task
145
  header_html, mystery_html, candidate_htmls = task_HTML(mystery_txt, candidate_texts, predicted_author, ground_truth_author)
 
117
  'a1_fullText': c2_txt,
118
  'a2_fullText': c3_txt
119
  }
120
+ task_authors_df = instance_to_df(custom_task_instance, predicted_author=None, ground_truth_author=true_author)
121
 
122
  #print(f"Generating embeddings for {model_name} on task authors")
123
  # task_authors_df = cached_generate_style_embedding(task_authors_df, 'fullText', model_name)
 
136
  task_authors_df['g2v_vector'] = task_authors_g2v
137
  print(f"Gram2Vec feature generation complete")
138
 
139
+ #if mode != "Predefined HRS Task":
140
+ # Computing predicted author by checking pairwise cosine similarity over luar embeddings
141
+ col_name = f'{model_name.split("/")[-1]}_style_embedding'
142
+ predicted_author = compute_predicted_author(task_authors_df, col_name)
143
 
144
  #generating html for the task
145
  header_html, mystery_html, candidate_htmls = task_HTML(mystery_txt, candidate_texts, predicted_author, ground_truth_author)
utils/visualizations.py CHANGED
@@ -132,9 +132,9 @@ def compute_tsne_with_cache(embeddings: np.ndarray, cache_path: str = 'datasets/
132
  return cache[hash_key]
133
  else:
134
  print("Computing t-SNE")
135
- # tsne_result = TSNE(n_components=2, learning_rate='auto',
136
- # init='random', perplexity=3).fit_transform(embeddings)
137
- tsne_result = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.0, metric='cosine').fit_transform(embeddings)
138
 
139
  cache[hash_key] = tsne_result
140
  with open(cache_path, 'wb') as f:
@@ -147,9 +147,15 @@ def load_interp_space(cfg):
147
  gram2vec_feats_path = cfg['interp_space_path'] + '/../gram2vec_feats.csv'
148
  clustered_authors_path = cfg['interp_space_path'] + 'train_authors.pkl'
149
 
 
 
 
150
  # Load authors embeddings and their cluster labels
151
- clustered_authors_df = pd.read_pickle(clustered_authors_path)
152
- #clustered_authors_df = clustered_authors_df[clustered_authors_df.cluster_label != -1]
 
 
 
153
  author_embedding = clustered_authors_df.author_embedding.tolist()
154
  author_labels = clustered_authors_df.cluster_label.tolist()
155
  author_ids = clustered_authors_df.authorID.tolist()
@@ -267,23 +273,15 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
267
 
268
  print(f"[INFO] Zoomed region includes {len(visible_authors)} authors:{visible_authors}")
269
 
270
- # Example: Find features for clusters [2,3,4] that are NOT prominent in cluster [1]
271
- # llm_feats = compute_clusters_style_representation(
272
- # background_corpus_df=clustered_authors_df,
273
- # cluster_ids=visible_authors,
274
- # cluster_label_clm_name='authorID',
275
- # other_cluster_ids=[],
276
- # features_clm_name='final_attribute_name_manually_processed'
277
- # )
278
  print(f"Task authors: {len(task_authors_df)}, Clustered authors: {len(clustered_authors_df)}")
279
  merged_authors_df = pd.concat([task_authors_df, clustered_authors_df])
280
  print(f"Merged authors DataFrame:\n{len(merged_authors_df)}")
281
- style_analysis_response = {'features': [], 'spans': []}
282
- # style_analysis_response = compute_clusters_style_representation_3(
283
- # background_corpus_df=merged_authors_df,
284
- # cluster_ids=visible_authors,
285
- # cluster_label_clm_name='authorID',
286
- # )
287
 
288
  llm_feats = ['None'] + style_analysis_response['features']
289
 
 
132
  return cache[hash_key]
133
  else:
134
  print("Computing t-SNE")
135
+ tsne_result = TSNE(n_components=2, learning_rate='auto',
136
+ init='random', perplexity=10, random_state=42,metric='cosine').fit_transform(embeddings)
137
+ #tsne_result = umap.UMAP(n_components=2, n_neighbors=30, min_dist=0.3, metric='cosine').fit_transform(embeddings)
138
 
139
  cache[hash_key] = tsne_result
140
  with open(cache_path, 'wb') as f:
 
147
  gram2vec_feats_path = cfg['interp_space_path'] + '/../gram2vec_feats.csv'
148
  clustered_authors_path = cfg['interp_space_path'] + 'train_authors.pkl'
149
 
150
+ max_num_docs_per_authors = cfg['max_num_docs_per_authors']
151
+ max_num_bg_authors = cfg['max_num_bg_authors']
152
+
153
  # Load authors embeddings and their cluster labels
154
+ clustered_authors_df = pd.read_pickle(clustered_authors_path).iloc[:max_num_bg_authors]
155
+ clustered_authors_df['fullText'] = clustered_authors_df.fullText.map(lambda list: '\n\n'.join(['Document {}: {}'.format(i+1, text) for i, text in enumerate(list[:max_num_docs_per_authors])]))
156
+
157
+ print('Average atuhor text length:', clustered_authors_df.fullText.map(lambda x: len(x.split())).mean())
158
+
159
  author_embedding = clustered_authors_df.author_embedding.tolist()
160
  author_labels = clustered_authors_df.cluster_label.tolist()
161
  author_ids = clustered_authors_df.authorID.tolist()
 
273
 
274
  print(f"[INFO] Zoomed region includes {len(visible_authors)} authors:{visible_authors}")
275
 
 
 
 
 
 
 
 
 
276
  print(f"Task authors: {len(task_authors_df)}, Clustered authors: {len(clustered_authors_df)}")
277
  merged_authors_df = pd.concat([task_authors_df, clustered_authors_df])
278
  print(f"Merged authors DataFrame:\n{len(merged_authors_df)}")
279
+ #style_analysis_response = {'features': [], 'spans': []}
280
+ style_analysis_response = compute_clusters_style_representation_3(
281
+ background_corpus_df=merged_authors_df,
282
+ cluster_ids=visible_authors,
283
+ cluster_label_clm_name='authorID',
284
+ )
285
 
286
  llm_feats = ['None'] + style_analysis_response['features']
287