Milad Alshomary committed on
Commit
c54eddb
Β·
1 Parent(s): 74947b9
app.py CHANGED
@@ -58,8 +58,8 @@ def app(share=False, use_cluster_feats=False):
58
  instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
59
 
60
  interp = load_interp_space(cfg)
61
- clustered_authors_df = interp['clustered_authors_df'][:1000]
62
- clustered_authors_df['fullText'] = clustered_authors_df['fullText'].map(lambda l: l[:3]) # Take at most 3 texts per author
63
 
64
  with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
65
  # ── Big Centered Title ──────────────────────────────────────────
 
58
  instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
59
 
60
  interp = load_interp_space(cfg)
61
+ clustered_authors_df = interp['clustered_authors_df']
62
+ clustered_authors_df['fullText'] = clustered_authors_df['fullText'].map(lambda l: l[:5]) # Take at most 5 texts per author
63
 
64
  with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
65
  # ── Big Centered Title ──────────────────────────────────────────
utils/gram2vec_feat_utils.py CHANGED
@@ -126,7 +126,7 @@ def highlight_both_spans(text, llm_spans, gram_spans):
126
 
127
 
128
  def show_combined_spans_all(selected_feature_llm, selected_feature_g2v,
129
- llm_style_feats_analysis, background_authors_embeddings_df, task_authors_embeddings_df, visible_authors, predicted_author=None, ground_truth_author=None, max_num_authors=7):
130
  """
131
  For mystery + 3 candidates:
132
  1. get llm spans via your existing cache+API
 
126
 
127
 
128
  def show_combined_spans_all(selected_feature_llm, selected_feature_g2v,
129
+ llm_style_feats_analysis, background_authors_embeddings_df, task_authors_embeddings_df, visible_authors, predicted_author=None, ground_truth_author=None, max_num_authors=4):
130
  """
131
  For mystery + 3 candidates:
132
  1. get llm spans via your existing cache+API
utils/interp_space_utils.py CHANGED
@@ -449,11 +449,11 @@ def identify_style_features(author_texts: list[str], author_names: list[str], ma
449
  print(f"Cache miss. Computing features for authors: {author_names}")
450
 
451
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
452
- prompt = f"""Identify {max_num_feats} writing style features that are commonly found across the following texts. Do not extract spans. Just return the feature names as a list.
453
  Author Texts:
454
- \"\"\"{chr(10).join(author_texts)}\"\"\"
455
- """
456
 
 
 
457
  def _make_call():
458
  response = client.chat.completions.create(
459
  model="gpt-4o-mini",
@@ -473,7 +473,6 @@ def identify_style_features(author_texts: list[str], author_names: list[str], ma
473
 
474
  features = retry_call(_make_call, FeatureIdentificationSchema).features
475
 
476
- print(f"Adding to zoom cache")
477
  if cache_key and author_names:
478
  cache[cache_key] = {
479
  "features": features
@@ -519,10 +518,10 @@ def compute_clusters_style_representation_3(
519
  background_corpus_df: pd.DataFrame,
520
  cluster_ids: List[Any],
521
  cluster_label_clm_name: str = 'authorID',
522
- max_num_feats: int = 10,
523
  max_num_documents_per_author=3,
524
  max_num_authors=5,
525
- max_authors_for_span_extraction=7
526
  ):
527
 
528
  print(f"Computing style representation for visible clusters: {len(cluster_ids)}")
@@ -546,20 +545,17 @@ def compute_clusters_style_representation_3(
546
 
547
  # Filter out features that are not present in any of the authors
548
  filtered_spans_by_author = {x[0] : x[1] for x in spans_by_author.items() if x[0] in {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(cluster_ids))}
549
- print('Filtering in features for only the following authors: ', filtered_spans_by_author.keys())
550
- filtered_features = []
551
- for feature in features:
552
- found_in_any_author = False
553
- for author_name, author_spans in filtered_spans_by_author.items():
554
- if feature in author_spans:
555
- found_in_any_author = True
556
- break
557
- if found_in_any_author:
558
- filtered_features.append(feature)
559
- features = filtered_features
560
 
561
  return {
562
- "features": features,
563
  "spans": spans_by_author
564
  }
565
 
@@ -646,19 +642,17 @@ def compute_clusters_g2v_representation(
646
  key=lambda x: -x[1] # Sort by contrastive score
647
  )
648
 
649
- # Filter out features that are not present in any of the authors
650
  selected_authors = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(author_ids))
651
- print('Filtering in g2v features for only the following authors: ', selected_authors)
652
- authors_g2v_feats = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
 
653
  filtered_features = []
654
  for feature, score, z_score in top_g2v_feats:
655
- found_in_any_author = False
656
- for author_g2v_feats in authors_g2v_feats:
657
- if author_g2v_feats[feature] > 0:
658
- found_in_any_author = True
659
- break
660
- if found_in_any_author:
661
- filtered_features.append((feature, score, z_score))
662
 
663
  print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features]) # Print feature names and z-scores
664
 
 
449
  print(f"Cache miss. Computing features for authors: {author_names}")
450
 
451
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
452
+ prompt = f"""Identify {max_num_feats} writing style features that are commonly between the authors texts.
453
  Author Texts:
 
 
454
 
455
+ {author_texts}
456
+ """
457
  def _make_call():
458
  response = client.chat.completions.create(
459
  model="gpt-4o-mini",
 
473
 
474
  features = retry_call(_make_call, FeatureIdentificationSchema).features
475
 
 
476
  if cache_key and author_names:
477
  cache[cache_key] = {
478
  "features": features
 
518
  background_corpus_df: pd.DataFrame,
519
  cluster_ids: List[Any],
520
  cluster_label_clm_name: str = 'authorID',
521
+ max_num_feats: int = 20,
522
  max_num_documents_per_author=3,
523
  max_num_authors=5,
524
+ max_authors_for_span_extraction=4
525
  ):
526
 
527
  print(f"Computing style representation for visible clusters: {len(cluster_ids)}")
 
545
 
546
  # Filter out features that are not present in any of the authors
547
  filtered_spans_by_author = {x[0] : x[1] for x in spans_by_author.items() if x[0] in {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(cluster_ids))}
548
+ print(filtered_spans_by_author.keys())
549
+ filtered_spans_by_author = [set([f[0] for f in x[1].items() if len(f[1]) > 0]) for x in filtered_spans_by_author.items()]
550
+
551
+ filtered_set_of_features = filtered_spans_by_author[0] # features that appear in every set in the filtered_spans_by_author list
552
+ for x in filtered_spans_by_author[1:]:
553
+ filtered_set_of_features = filtered_set_of_features.intersection(x)
554
+
555
+ print('filtered set of features: ', filtered_set_of_features)
 
 
 
556
 
557
  return {
558
+ "features": list(filtered_set_of_features),
559
  "spans": spans_by_author
560
  }
561
 
 
642
  key=lambda x: -x[1] # Sort by contrastive score
643
  )
644
 
645
+ # Filter in only features that are present in selected_authors
646
  selected_authors = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}.intersection(set(author_ids))
647
+ # Filter in only features that are present in selected_authors
648
+ selected_authors_g2v_data = background_corpus_df[background_corpus_df['authorID'].isin(selected_authors)][features_clm_name].tolist()
649
+
650
  filtered_features = []
651
  for feature, score, z_score in top_g2v_feats:
652
+ # Check if the feature has a non-zero value in all of the selected authors
653
+ if all(author_g2v_feats.get(feature, 0) > 0 for author_g2v_feats in selected_authors_g2v_data):
654
+ filtered_features.append((feature, score, z_score)) # Keep the full (feature, score, z_score) tuple
655
+
 
 
 
656
 
657
  print('Filtered G2V features: ', [(f[0], f[2]) for f in filtered_features]) # Print feature names and z-scores
658
 
utils/visualizations.py CHANGED
@@ -276,6 +276,7 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
276
  print(f"Task authors: {len(task_authors_df)}, Clustered authors: {len(clustered_authors_df)}")
277
  merged_authors_df = pd.concat([task_authors_df, clustered_authors_df])
278
  print(f"Merged authors DataFrame:\n{len(merged_authors_df)}")
 
279
  style_analysis_response = compute_clusters_style_representation_3(
280
  background_corpus_df=merged_authors_df,
281
  cluster_ids=visible_authors,
 
276
  print(f"Task authors: {len(task_authors_df)}, Clustered authors: {len(clustered_authors_df)}")
277
  merged_authors_df = pd.concat([task_authors_df, clustered_authors_df])
278
  print(f"Merged authors DataFrame:\n{len(merged_authors_df)}")
279
+ #style_analysis_response = {'features': [], 'spans': []}
280
  style_analysis_response = compute_clusters_style_representation_3(
281
  background_corpus_df=merged_authors_df,
282
  cluster_ids=visible_authors,