Milad Alshomary committed
Commit 8883582
Parent(s): 8e5c429

updates

Files changed:
- utils/interp_space_utils.py +22 -18
- utils/llm_feat_utils.py +4 -2
- utils/visualizations.py +1 -0
utils/interp_space_utils.py
CHANGED

@@ -474,7 +474,7 @@ def identify_style_features(author_texts: list[str], author_names: list[str], ma
     print(f"Cache miss. Computing features for authors: {author_names}")
 
     client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-    prompt = f"""Identify {max_num_feats} writing style features that are
+    prompt = f"""Identify {max_num_feats} writing style features that are common between the authors texts.
     Author Texts:
 
     {author_texts}
@@ -483,7 +483,7 @@ def identify_style_features(author_texts: list[str], author_names: list[str], ma
     response = client.chat.completions.create(
         model="gpt-4o-mini",
         messages=[
-            {"role": "assistant", "content": "You are a forensic linguist
+            {"role": "assistant", "content": "You are a forensic linguist who knows how to analyze linguistic and stylometric similarites between texts."},
             {"role": "user", "content": prompt}
         ],
         response_format={
@@ -507,6 +507,8 @@ def identify_style_features(author_texts: list[str], author_names: list[str], ma
         json.dump(cache, f, indent=2)
 
     print(f"Cached features for authors: {author_names}")
+
+    return features
 
 def retry_call(call_fn, schema_class, max_attempts=3, wait_sec=2):
     for attempt in range(max_attempts):
@@ -547,7 +549,6 @@ def compute_clusters_style_representation_3(
     max_num_documents_per_author=1,
     max_num_authors=10,
     max_authors_for_span_extraction=4,
-    min_authors_required: int = 2,
     top_k: int = 10
 ):
 
@@ -563,6 +564,7 @@ def compute_clusters_style_representation_3(
     print(author_names)
     features = identify_style_features(author_texts, author_names, max_num_feats=max_num_feats)
 
+    print("Features: ", features)
     # STEP 2: Prepare author pool for span extraction
     span_df = background_corpus_df.iloc[:max_authors_for_span_extraction]
     author_names = span_df[cluster_label_clm_name].tolist()[:max_authors_for_span_extraction]
@@ -574,23 +576,22 @@ def compute_clusters_style_representation_3(
     task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
     filtered_task_authors = {author: feat_map for author, feat_map in spans_by_author.items() if author in task_author_names.intersection(set(cluster_ids))}
 
-    print(filtered_task_authors.keys())
-
     # Build per-author sets of features that have at least one span
     author_present_feature_sets = [
         {feature for feature, spans in feature_map.items() if len(spans) > 0}
         for _, feature_map in filtered_task_authors.items()
     ]
 
-
-
-
+    print(filtered_task_authors.keys(), author_present_feature_sets)
+
+
+    if len(author_present_feature_sets) > 0: # we have more than one task author
         coverage_counter = Counter()
         for present_set in author_present_feature_sets:
             coverage_counter.update(present_set)
 
         # Keep features present in at least `min_authors_required` authors
-    eligible_features = [feat for feat, cnt in coverage_counter.items() if cnt >=
+        eligible_features = [feat for feat, cnt in coverage_counter.items() if cnt >= len(author_present_feature_sets)]
 
         # Preserve original LLM feature ordering as a secondary key where possible
         feature_original_index = {feat: idx for idx, feat in enumerate(features)} if features else {}
@@ -599,8 +600,11 @@ def compute_clusters_style_representation_3(
             eligible_features,
             key=lambda f: (-coverage_counter[f], feature_original_index.get(f, 10**9))
         )[:int(top_k)]
+    else:
+        selected_features_ranked = features
+
 
-    print('filtered set of features (min coverage',
+    print('filtered set of features (min coverage', len(author_present_feature_sets), '): ', selected_features_ranked)
 
     return {
         "features": list(selected_features_ranked),
@@ -815,7 +819,7 @@ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, pred_idx, model
     #     q_proj, "Mystery Author"
     # )
 
-    # # Regions 2-4: Around each candidate
+    # # # Regions 2-4: Around each candidate
     # for i in range(3):
     #     regions[f"Candidate {i+1} Neighborhood"] = get_region_around_point(
     #         c_proj[i], f"Candidate {i+1}"
@@ -830,13 +834,13 @@ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, pred_idx, model
     )
 
     # Regions 8-10: Between candidate pairs
-    candidate_pairs = [(0, 1), (0, 2), (1, 2)]
-    for i, (c1, c2) in enumerate(candidate_pairs):
-        if c1 != pred_idx and c2 != pred_idx: #selecting only the non predicated candidates
-            region_name = f"Candidate {c1+1} & Candidate {c2+1}"
-            regions[region_name] = get_region_between_points(
-                c_proj[c1], c_proj[c2], f"Candidate {c1+1}", f"Candidate {c2+1}"
-            )
+    # candidate_pairs = [(0, 1), (0, 2), (1, 2)]
+    # for i, (c1, c2) in enumerate(candidate_pairs):
+    #     if c1 != pred_idx and c2 != pred_idx: #selecting only the non predicated candidates
+    #         region_name = f"Candidate {c1+1} & Candidate {c2+1}"
+    #         regions[region_name] = get_region_between_points(
+    #             c_proj[c1], c_proj[c2], f"Candidate {c1+1}", f"Candidate {c2+1}"
+    #         )
 
     # Regions 11-12: Around predicted and ground truth (if different)
     # This would need predicted_author and ground_truth_author indices
utils/llm_feat_utils.py
CHANGED

@@ -53,11 +53,13 @@ def generate_feature_spans(client, text: str, features: list[str]) -> str:
     {features}
     """
     response = client.chat.completions.create(
-        model="gpt-
+        model="gpt-4o-mini",
         messages=[{"role":"user","content":prompt}],
         temperature=0.3,
     )
-
+    content = response.choices[0].message.content
+    content = content.replace('```json', '').replace('```','')
+    return content
 
 def generate_feature_spans_with_retries(client, text: str, features: list[str]) -> dict:
     """
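
This hunk pins the span-extraction model to gpt-4o-mini and normalizes the reply before it is parsed, since chat models often wrap JSON output in Markdown code fences. A minimal illustration of that post-processing step under an invented payload (strip_fences is our name, not the module's):

    import json

    def strip_fences(content: str) -> str:
        # Same normalization as generate_feature_spans: drop ```json / ``` wrappers.
        return content.replace('```json', '').replace('```', '').strip()

    raw = '```json\n{"first-person asides": ["I reckon", "to be fair"]}\n```'
    spans = json.loads(strip_fences(raw))
    print(spans["first-person asides"])  # ['I reckon', 'to be fair']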
utils/visualizations.py
CHANGED

@@ -286,6 +286,7 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
 
 
     merged_authors_df = pd.concat([task_authors_df, clustered_authors_df])
+    #g2v_feats = []
     g2v_feats = compute_clusters_g2v_representation(
         background_corpus_df=merged_authors_df,
         author_ids=visible_authors,