Milad Alshomary
committed on
Commit
·
5199ee7
1
Parent(s):
c30deb9
updates
Browse files- app.py +10 -9
- config/config.yaml +4 -2
- utils/interp_space_utils.py +7 -6
- utils/ui.py +5 -5
- utils/visualizations.py +17 -19
app.py
CHANGED
|
@@ -58,8 +58,7 @@ def app(share=False, use_cluster_feats=False):
|
|
| 58 |
instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
|
| 59 |
|
| 60 |
interp = load_interp_space(cfg)
|
| 61 |
-
clustered_authors_df = interp['clustered_authors_df']
|
| 62 |
-
clustered_authors_df['fullText'] = clustered_authors_df['fullText']
|
| 63 |
|
| 64 |
with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
|
| 65 |
# ── Big Centered Title ──────────────────────────────
|
|
@@ -352,13 +351,15 @@ def app(share=False, use_cluster_feats=False):
|
|
| 352 |
yaxis: [ev['yaxis.range[0]'], ev['yaxis.range[1]']]
|
| 353 |
};
|
| 354 |
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
|
|
|
|
|
|
| 362 |
}
|
| 363 |
});
|
| 364 |
};
|
|
|
|
| 58 |
instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
|
| 59 |
|
| 60 |
interp = load_interp_space(cfg)
|
| 61 |
+
clustered_authors_df = interp['clustered_authors_df']
|
|
|
|
| 62 |
|
| 63 |
with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
|
| 64 |
# ── Big Centered Title ──────────────────────────────
|
|
|
|
| 351 |
yaxis: [ev['yaxis.range[0]'], ev['yaxis.range[1]']]
|
| 352 |
};
|
| 353 |
|
| 354 |
+
if (window.confirm("Do you want to analyze the writing style of the authors in this region?")) {
|
| 355 |
+
const txtbox = document.querySelector('#axis-ranges textarea');
|
| 356 |
+
if (txtbox) {
|
| 357 |
+
txtbox.value = JSON.stringify(payload);
|
| 358 |
+
txtbox.dispatchEvent(new Event('input', { bubbles: true }));
|
| 359 |
+
console.log("------------> Zoom payload dispatched:<------------", payload);
|
| 360 |
+
} else {
|
| 361 |
+
console.warn("------------> No hidden textbox found to write zoom payload.<------------");
|
| 362 |
+
}
|
| 363 |
}
|
| 364 |
});
|
| 365 |
};
|
config/config.yaml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# config.yaml
|
| 2 |
-
instances_to_explain_path: "./datasets/
|
| 3 |
-
instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/
|
| 4 |
interp_space_path: "./datasets/sentence_luar_interp_space_2_35/"
|
| 5 |
interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_space_2_35.zip?download=true"
|
| 6 |
gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
|
|
@@ -10,3 +10,5 @@ style_feat_clm: "llm_tfidf_weights"
|
|
| 10 |
top_k: 10
|
| 11 |
only_llm_feats: false
|
| 12 |
only_gram2vec_feats: false
|
|
|
|
|
|
|
|
|
| 1 |
# config.yaml
|
| 2 |
+
instances_to_explain_path: "./datasets/hrs_explanations_luar_clusters_2_35_balanced.json"
|
| 3 |
+
instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_2_35_balanced.json?download=true"
|
| 4 |
interp_space_path: "./datasets/sentence_luar_interp_space_2_35/"
|
| 5 |
interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_space_2_35.zip?download=true"
|
| 6 |
gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
|
|
|
|
| 10 |
top_k: 10
|
| 11 |
only_llm_feats: false
|
| 12 |
only_gram2vec_feats: false
|
| 13 |
+
max_num_docs_per_authors: 1
|
| 14 |
+
max_num_bg_authors: 1000
|
utils/interp_space_utils.py
CHANGED
|
@@ -129,9 +129,9 @@ def instance_to_df(instance, predicted_author=None, ground_truth_author=None):
|
|
| 129 |
#create a dataframe of the task authors
|
| 130 |
task_authos_df = pd.DataFrame([
|
| 131 |
{'authorID': 'Mystery author', 'fullText': instance['Q_fullText'], 'predicted': None, 'ground_truth': None},
|
| 132 |
-
{'authorID': 'Candidate Author 1', 'fullText': instance['a0_fullText'], 'predicted': int(predicted_author) == 0, 'ground_truth': int(ground_truth_author) == 0},
|
| 133 |
-
{'authorID': 'Candidate Author 2', 'fullText': instance['a1_fullText'], 'predicted': int(predicted_author) == 1, 'ground_truth': int(ground_truth_author) == 1},
|
| 134 |
-
{'authorID': 'Candidate Author 3', 'fullText': instance['a2_fullText'], 'predicted': int(predicted_author) == 2, 'ground_truth': int(ground_truth_author) == 2}
|
| 135 |
|
| 136 |
])
|
| 137 |
|
|
@@ -170,6 +170,7 @@ def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str,
|
|
| 170 |
|
| 171 |
print(f"Generating style embeddings using {model_name} on column '{text_clm}'...")
|
| 172 |
|
|
|
|
| 173 |
model = SentenceTransformer(model_name)
|
| 174 |
embedding_dim = model.get_sentence_embedding_dimension()
|
| 175 |
|
|
@@ -265,7 +266,7 @@ def cached_generate_style_embedding(background_corpus_df: pd.DataFrame,
|
|
| 265 |
print(f"Cache hit for {model_name} on column '{text_clm}'")
|
| 266 |
print(cache_path)
|
| 267 |
with open(cache_path, "rb") as f:
|
| 268 |
-
|
| 269 |
|
| 270 |
else:
|
| 271 |
# Otherwise, compute, cache, and return
|
|
@@ -541,8 +542,8 @@ def compute_clusters_style_representation_3(
|
|
| 541 |
cluster_ids: List[Any],
|
| 542 |
cluster_label_clm_name: str = 'authorID',
|
| 543 |
max_num_feats: int = 20,
|
| 544 |
-
max_num_documents_per_author=
|
| 545 |
-
max_num_authors=
|
| 546 |
max_authors_for_span_extraction=4
|
| 547 |
):
|
| 548 |
|
|
|
|
| 129 |
#create a dataframe of the task authors
|
| 130 |
task_authos_df = pd.DataFrame([
|
| 131 |
{'authorID': 'Mystery author', 'fullText': instance['Q_fullText'], 'predicted': None, 'ground_truth': None},
|
| 132 |
+
{'authorID': 'Candidate Author 1', 'fullText': instance['a0_fullText'], 'predicted': int(predicted_author) == 0 if predicted_author is not None else None, 'ground_truth': int(ground_truth_author) == 0 if ground_truth_author is not None else None},
|
| 133 |
+
{'authorID': 'Candidate Author 2', 'fullText': instance['a1_fullText'], 'predicted': int(predicted_author) == 1 if predicted_author is not None else None, 'ground_truth': int(ground_truth_author) == 1 if ground_truth_author is not None else None},
|
| 134 |
+
{'authorID': 'Candidate Author 3', 'fullText': instance['a2_fullText'], 'predicted': int(predicted_author) == 2 if predicted_author is not None else None, 'ground_truth': int(ground_truth_author) == 2 if ground_truth_author is not None else None}
|
| 135 |
|
| 136 |
])
|
| 137 |
|
|
|
|
| 170 |
|
| 171 |
print(f"Generating style embeddings using {model_name} on column '{text_clm}'...")
|
| 172 |
|
| 173 |
+
print(background_corpus_df.fullText.tolist()[:10])
|
| 174 |
model = SentenceTransformer(model_name)
|
| 175 |
embedding_dim = model.get_sentence_embedding_dimension()
|
| 176 |
|
|
|
|
| 266 |
print(f"Cache hit for {model_name} on column '{text_clm}'")
|
| 267 |
print(cache_path)
|
| 268 |
with open(cache_path, "rb") as f:
|
| 269 |
+
background_corpus_df = pickle.load(f)
|
| 270 |
|
| 271 |
else:
|
| 272 |
# Otherwise, compute, cache, and return
|
|
|
|
| 542 |
cluster_ids: List[Any],
|
| 543 |
cluster_label_clm_name: str = 'authorID',
|
| 544 |
max_num_feats: int = 20,
|
| 545 |
+
max_num_documents_per_author=1,
|
| 546 |
+
max_num_authors=10,
|
| 547 |
max_authors_for_span_extraction=4
|
| 548 |
):
|
| 549 |
|
utils/ui.py
CHANGED
|
@@ -117,7 +117,7 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
|
|
| 117 |
'a1_fullText': c2_txt,
|
| 118 |
'a2_fullText': c3_txt
|
| 119 |
}
|
| 120 |
-
task_authors_df = instance_to_df(custom_task_instance)
|
| 121 |
|
| 122 |
#print(f"Generating embeddings for {model_name} on task authors")
|
| 123 |
# task_authors_df = cached_generate_style_embedding(task_authors_df, 'fullText', model_name)
|
|
@@ -136,10 +136,10 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
|
|
| 136 |
task_authors_df['g2v_vector'] = task_authors_g2v
|
| 137 |
print(f"Gram2Vec feature generation complete")
|
| 138 |
|
| 139 |
-
if mode != "Predefined HRS Task":
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
|
| 144 |
#generating html for the task
|
| 145 |
header_html, mystery_html, candidate_htmls = task_HTML(mystery_txt, candidate_texts, predicted_author, ground_truth_author)
|
|
|
|
| 117 |
'a1_fullText': c2_txt,
|
| 118 |
'a2_fullText': c3_txt
|
| 119 |
}
|
| 120 |
+
task_authors_df = instance_to_df(custom_task_instance, predicted_author=None, ground_truth_author=true_author)
|
| 121 |
|
| 122 |
#print(f"Generating embeddings for {model_name} on task authors")
|
| 123 |
# task_authors_df = cached_generate_style_embedding(task_authors_df, 'fullText', model_name)
|
|
|
|
| 136 |
task_authors_df['g2v_vector'] = task_authors_g2v
|
| 137 |
print(f"Gram2Vec feature generation complete")
|
| 138 |
|
| 139 |
+
#if mode != "Predefined HRS Task":
|
| 140 |
+
# Computing predicted author by checking pairwise cosine similarity over luar embeddings
|
| 141 |
+
col_name = f'{model_name.split("/")[-1]}_style_embedding'
|
| 142 |
+
predicted_author = compute_predicted_author(task_authors_df, col_name)
|
| 143 |
|
| 144 |
#generating html for the task
|
| 145 |
header_html, mystery_html, candidate_htmls = task_HTML(mystery_txt, candidate_texts, predicted_author, ground_truth_author)
|
utils/visualizations.py
CHANGED
|
@@ -132,9 +132,9 @@ def compute_tsne_with_cache(embeddings: np.ndarray, cache_path: str = 'datasets/
|
|
| 132 |
return cache[hash_key]
|
| 133 |
else:
|
| 134 |
print("Computing t-SNE")
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
tsne_result = umap.UMAP(n_components=2, n_neighbors=
|
| 138 |
|
| 139 |
cache[hash_key] = tsne_result
|
| 140 |
with open(cache_path, 'wb') as f:
|
|
@@ -147,9 +147,15 @@ def load_interp_space(cfg):
|
|
| 147 |
gram2vec_feats_path = cfg['interp_space_path'] + '/../gram2vec_feats.csv'
|
| 148 |
clustered_authors_path = cfg['interp_space_path'] + 'train_authors.pkl'
|
| 149 |
|
|
|
|
|
|
|
|
|
|
| 150 |
# Load authors embeddings and their cluster labels
|
| 151 |
-
clustered_authors_df = pd.read_pickle(clustered_authors_path)
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
| 153 |
author_embedding = clustered_authors_df.author_embedding.tolist()
|
| 154 |
author_labels = clustered_authors_df.cluster_label.tolist()
|
| 155 |
author_ids = clustered_authors_df.authorID.tolist()
|
|
@@ -267,23 +273,15 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
|
|
| 267 |
|
| 268 |
print(f"[INFO] Zoomed region includes {len(visible_authors)} authors:{visible_authors}")
|
| 269 |
|
| 270 |
-
# Example: Find features for clusters [2,3,4] that are NOT prominent in cluster [1]
|
| 271 |
-
# llm_feats = compute_clusters_style_representation(
|
| 272 |
-
# background_corpus_df=clustered_authors_df,
|
| 273 |
-
# cluster_ids=visible_authors,
|
| 274 |
-
# cluster_label_clm_name='authorID',
|
| 275 |
-
# other_cluster_ids=[],
|
| 276 |
-
# features_clm_name='final_attribute_name_manually_processed'
|
| 277 |
-
# )
|
| 278 |
print(f"Task authors: {len(task_authors_df)}, Clustered authors: {len(clustered_authors_df)}")
|
| 279 |
merged_authors_df = pd.concat([task_authors_df, clustered_authors_df])
|
| 280 |
print(f"Merged authors DataFrame:\n{len(merged_authors_df)}")
|
| 281 |
-
style_analysis_response = {'features': [], 'spans': []}
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
|
| 288 |
llm_feats = ['None'] + style_analysis_response['features']
|
| 289 |
|
|
|
|
| 132 |
return cache[hash_key]
|
| 133 |
else:
|
| 134 |
print("Computing t-SNE")
|
| 135 |
+
tsne_result = TSNE(n_components=2, learning_rate='auto',
|
| 136 |
+
init='random', perplexity=10, random_state=42,metric='cosine').fit_transform(embeddings)
|
| 137 |
+
#tsne_result = umap.UMAP(n_components=2, n_neighbors=30, min_dist=0.3, metric='cosine').fit_transform(embeddings)
|
| 138 |
|
| 139 |
cache[hash_key] = tsne_result
|
| 140 |
with open(cache_path, 'wb') as f:
|
|
|
|
| 147 |
gram2vec_feats_path = cfg['interp_space_path'] + '/../gram2vec_feats.csv'
|
| 148 |
clustered_authors_path = cfg['interp_space_path'] + 'train_authors.pkl'
|
| 149 |
|
| 150 |
+
max_num_docs_per_authors = cfg['max_num_docs_per_authors']
|
| 151 |
+
max_num_bg_authors = cfg['max_num_bg_authors']
|
| 152 |
+
|
| 153 |
# Load authors embeddings and their cluster labels
|
| 154 |
+
clustered_authors_df = pd.read_pickle(clustered_authors_path).iloc[:max_num_bg_authors]
|
| 155 |
+
clustered_authors_df['fullText'] = clustered_authors_df.fullText.map(lambda list: '\n\n'.join(['Document {}: {}'.format(i+1, text) for i, text in enumerate(list[:max_num_docs_per_authors])]))
|
| 156 |
+
|
| 157 |
+
print('Average atuhor text length:', clustered_authors_df.fullText.map(lambda x: len(x.split())).mean())
|
| 158 |
+
|
| 159 |
author_embedding = clustered_authors_df.author_embedding.tolist()
|
| 160 |
author_labels = clustered_authors_df.cluster_label.tolist()
|
| 161 |
author_ids = clustered_authors_df.authorID.tolist()
|
|
|
|
| 273 |
|
| 274 |
print(f"[INFO] Zoomed region includes {len(visible_authors)} authors:{visible_authors}")
|
| 275 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
print(f"Task authors: {len(task_authors_df)}, Clustered authors: {len(clustered_authors_df)}")
|
| 277 |
merged_authors_df = pd.concat([task_authors_df, clustered_authors_df])
|
| 278 |
print(f"Merged authors DataFrame:\n{len(merged_authors_df)}")
|
| 279 |
+
#style_analysis_response = {'features': [], 'spans': []}
|
| 280 |
+
style_analysis_response = compute_clusters_style_representation_3(
|
| 281 |
+
background_corpus_df=merged_authors_df,
|
| 282 |
+
cluster_ids=visible_authors,
|
| 283 |
+
cluster_label_clm_name='authorID',
|
| 284 |
+
)
|
| 285 |
|
| 286 |
llm_feats = ['None'] + style_analysis_response['features']
|
| 287 |
|