Milad Alshomary committed · Commit 8883582 · 1 Parent(s): 8e5c429
utils/interp_space_utils.py CHANGED
@@ -474,7 +474,7 @@ def identify_style_features(author_texts: list[str], author_names: list[str], ma
     print(f"Cache miss. Computing features for authors: {author_names}")
 
     client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-    prompt = f"""Identify {max_num_feats} writing style features that are commonly between the authors texts.
+    prompt = f"""Identify {max_num_feats} writing style features that are common between the authors' texts.
     Author Texts:
 
     {author_texts}
@@ -483,7 +483,7 @@ def identify_style_features(author_texts: list[str], author_names: list[str], ma
     response = client.chat.completions.create(
         model="gpt-4o-mini",
         messages=[
-            {"role": "assistant", "content": "You are a forensic linguist specializing in writing styles."},
+            {"role": "assistant", "content": "You are a forensic linguist who knows how to analyze linguistic and stylometric similarities between texts."},
             {"role": "user", "content": prompt}
         ],
         response_format={
@@ -507,6 +507,8 @@ def identify_style_features(author_texts: list[str], author_names: list[str], ma
         json.dump(cache, f, indent=2)
 
     print(f"Cached features for authors: {author_names}")
+
+    return features
 
 def retry_call(call_fn, schema_class, max_attempts=3, wait_sec=2):
     for attempt in range(max_attempts):
@@ -547,7 +549,6 @@ def compute_clusters_style_representation_3(
     max_num_documents_per_author=1,
     max_num_authors=10,
     max_authors_for_span_extraction=4,
-    min_authors_required: int = 2,
     top_k: int = 10
 ):
 
@@ -563,6 +564,7 @@ def compute_clusters_style_representation_3(
     print(author_names)
     features = identify_style_features(author_texts, author_names, max_num_feats=max_num_feats)
 
+    print("Features: ", features)
     # STEP 2: Prepare author pool for span extraction
     span_df = background_corpus_df.iloc[:max_authors_for_span_extraction]
     author_names = span_df[cluster_label_clm_name].tolist()[:max_authors_for_span_extraction]
@@ -574,23 +576,22 @@ def compute_clusters_style_representation_3(
     task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
     filtered_task_authors = {author: feat_map for author, feat_map in spans_by_author.items() if author in task_author_names.intersection(set(cluster_ids))}
 
-    print(filtered_task_authors.keys())
-
     # Build per-author sets of features that have at least one span
     author_present_feature_sets = [
         {feature for feature, spans in feature_map.items() if len(spans) > 0}
        for _, feature_map in filtered_task_authors.items()
     ]
 
-    # If nothing to aggregate (e.g., no task authors in selection), fall back to empty list
-    selected_features_ranked = []
-    if author_present_feature_sets:
+    print(filtered_task_authors.keys(), author_present_feature_sets)
+
+
+    if len(author_present_feature_sets) > 0:  # we have at least one task author
         coverage_counter = Counter()
         for present_set in author_present_feature_sets:
             coverage_counter.update(present_set)
 
-        # Keep features present in at least `min_authors_required` authors
-        eligible_features = [feat for feat, cnt in coverage_counter.items() if cnt >= int(min_authors_required)]
+        # Keep only features present in all task authors
+        eligible_features = [feat for feat, cnt in coverage_counter.items() if cnt >= len(author_present_feature_sets)]
 
         # Preserve original LLM feature ordering as a secondary key where possible
         feature_original_index = {feat: idx for idx, feat in enumerate(features)} if features else {}
@@ -599,8 +600,11 @@ def compute_clusters_style_representation_3(
             eligible_features,
             key=lambda f: (-coverage_counter[f], feature_original_index.get(f, 10**9))
         )[:int(top_k)]
+    else:
+        selected_features_ranked = features
+
 
-    print('filtered set of features (min coverage', min_authors_required, '): ', selected_features_ranked)
+    print('filtered set of features (min coverage', len(author_present_feature_sets), '): ', selected_features_ranked)
 
     return {
         "features": list(selected_features_ranked),
@@ -815,7 +819,7 @@ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, pred_idx, model
     #     q_proj, "Mystery Author"
     # )
 
-    # # Regions 2-4: Around each candidate
+    # # # Regions 2-4: Around each candidate
     # for i in range(3):
     #     regions[f"Candidate {i+1} Neighborhood"] = get_region_around_point(
     #         c_proj[i], f"Candidate {i+1}"
@@ -830,13 +834,13 @@ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, pred_idx, model
     )
 
     # Regions 8-10: Between candidate pairs
-    candidate_pairs = [(0, 1), (0, 2), (1, 2)]
-    for i, (c1, c2) in enumerate(candidate_pairs):
-        if c1 != pred_idx and c2 != pred_idx:  # selecting only the non-predicted candidates
-            region_name = f"Candidate {c1+1} & Candidate {c2+1}"
-            regions[region_name] = get_region_between_points(
-                c_proj[c1], c_proj[c2], f"Candidate {c1+1}", f"Candidate {c2+1}"
-            )
+    # candidate_pairs = [(0, 1), (0, 2), (1, 2)]
+    # for i, (c1, c2) in enumerate(candidate_pairs):
+    #     if c1 != pred_idx and c2 != pred_idx:  # selecting only the non-predicted candidates
+    #         region_name = f"Candidate {c1+1} & Candidate {c2+1}"
+    #         regions[region_name] = get_region_between_points(
+    #             c_proj[c1], c_proj[c2], f"Candidate {c1+1}", f"Candidate {c2+1}"
+    #         )
 
     # Regions 11-12: Around predicted and ground truth (if different)
     # This would need predicted_author and ground_truth_author indices
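The substantive change above is the coverage rule in `compute_clusters_style_representation_3`: instead of keeping features present in at least `min_authors_required` task authors (that parameter is removed), the new code keeps only features for which every selected task author has at least one extracted span, and falls back to the unfiltered LLM features when no task author is in the selection. A minimal runnable sketch of the new rule, with invented toy feature names:

from collections import Counter

# Toy stand-ins; real feature names come from identify_style_features.
features = ["short sentences", "frequent semicolons", "formal register"]
author_present_feature_sets = [
    {"short sentences", "formal register"},      # Mystery author
    {"short sentences", "frequent semicolons"},  # Candidate Author 1
    {"short sentences", "formal register"},      # Candidate Author 2
]

if len(author_present_feature_sets) > 0:
    coverage_counter = Counter()
    for present_set in author_present_feature_sets:
        coverage_counter.update(present_set)
    # A feature survives only if every task author covers it.
    eligible = [f for f, c in coverage_counter.items()
                if c >= len(author_present_feature_sets)]
    # Rank by coverage; break ties with the LLM's original ordering.
    order = {f: i for i, f in enumerate(features)}
    ranked = sorted(eligible, key=lambda f: (-coverage_counter[f],
                                             order.get(f, 10**9)))[:10]
else:
    ranked = features  # no task authors selected: fall back to raw features

print(ranked)  # -> ['short sentences']

Since every surviving feature has the same coverage count (all authors), the tie-break on the original LLM order effectively decides the final ranking.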
utils/llm_feat_utils.py CHANGED
@@ -53,11 +53,13 @@ def generate_feature_spans(client, text: str, features: list[str]) -> str:
     {features}
     """
     response = client.chat.completions.create(
-        model="gpt-4",
+        model="gpt-4o-mini",
         messages=[{"role":"user","content":prompt}],
         temperature=0.3,
     )
-    return response.choices[0].message.content
+    content = response.choices[0].message.content
+    content = content.replace('```json', '').replace('```', '')
+    return content
 
 def generate_feature_spans_with_retries(client, text: str, features: list[str]) -> dict:
     """
utils/visualizations.py CHANGED
@@ -286,6 +286,7 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
 
 
     merged_authors_df = pd.concat([task_authors_df, clustered_authors_df])
+    # g2v_feats = []
     g2v_feats = compute_clusters_g2v_representation(
         background_corpus_df=merged_authors_df,
         author_ids=visible_authors,
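The added `# g2v_feats = []` comment reads like a quick toggle for bypassing the g2v computation while debugging the zoom handler. If that toggle is needed regularly, an explicit flag is less error-prone than commenting code in and out; a hypothetical sketch (the `skip_g2v` flag and the stub function are illustrations, not part of the commit):

def compute_g2v(author_ids):
    # Stand-in for compute_clusters_g2v_representation, which also needs
    # the merged corpus DataFrame in the real code.
    return [f"g2v:{a}" for a in author_ids]

def zoom_features(visible_authors, skip_g2v=False):
    # skip_g2v short-circuits the expensive step explicitly.
    return [] if skip_g2v else compute_g2v(visible_authors)

print(zoom_features(["a1", "a2"]))                 # ['g2v:a1', 'g2v:a2']
print(zoom_features(["a1", "a2"], skip_g2v=True))  # []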