Milad Alshomary committed
Commit 8883582
Parent(s): 8e5c429

updates

Files changed:
- utils/interp_space_utils.py +22 -18
- utils/llm_feat_utils.py +4 -2
- utils/visualizations.py +1 -0
utils/interp_space_utils.py
CHANGED

@@ -474,7 +474,7 @@ def identify_style_features(author_texts: list[str], author_names: list[str], ma
     print(f"Cache miss. Computing features for authors: {author_names}")
 
     client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-    prompt = f"""Identify {max_num_feats} writing style features that are
+    prompt = f"""Identify {max_num_feats} writing style features that are common between the authors texts.
     Author Texts:
 
     {author_texts}
@@ -483,7 +483,7 @@ def identify_style_features(author_texts: list[str], author_names: list[str], ma
     response = client.chat.completions.create(
         model="gpt-4o-mini",
         messages=[
-            {"role": "assistant", "content": "You are a forensic linguist
+            {"role": "assistant", "content": "You are a forensic linguist who knows how to analyze linguistic and stylometric similarites between texts."},
             {"role": "user", "content": prompt}
         ],
         response_format={
@@ -507,6 +507,8 @@ def identify_style_features(author_texts: list[str], author_names: list[str], ma
         json.dump(cache, f, indent=2)
 
     print(f"Cached features for authors: {author_names}")
+
+    return features
 
 def retry_call(call_fn, schema_class, max_attempts=3, wait_sec=2):
     for attempt in range(max_attempts):
@@ -547,7 +549,6 @@ def compute_clusters_style_representation_3(
     max_num_documents_per_author=1,
     max_num_authors=10,
     max_authors_for_span_extraction=4,
-    min_authors_required: int = 2,
     top_k: int = 10
 ):
 
@@ -563,6 +564,7 @@ def compute_clusters_style_representation_3(
     print(author_names)
     features = identify_style_features(author_texts, author_names, max_num_feats=max_num_feats)
 
+    print("Features: ", features)
     # STEP 2: Prepare author pool for span extraction
     span_df = background_corpus_df.iloc[:max_authors_for_span_extraction]
     author_names = span_df[cluster_label_clm_name].tolist()[:max_authors_for_span_extraction]
@@ -574,23 +576,22 @@ def compute_clusters_style_representation_3(
     task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
     filtered_task_authors = {author: feat_map for author, feat_map in spans_by_author.items() if author in task_author_names.intersection(set(cluster_ids))}
 
-    print(filtered_task_authors.keys())
-
     # Build per-author sets of features that have at least one span
     author_present_feature_sets = [
         {feature for feature, spans in feature_map.items() if len(spans) > 0}
         for _, feature_map in filtered_task_authors.items()
     ]
 
-
-
-
+    print(filtered_task_authors.keys(), author_present_feature_sets)
+
+
+    if len(author_present_feature_sets) > 0: # we have more than one task author
         coverage_counter = Counter()
         for present_set in author_present_feature_sets:
             coverage_counter.update(present_set)
 
         # Keep features present in at least `min_authors_required` authors
-    eligible_features = [feat for feat, cnt in coverage_counter.items() if cnt >=
+        eligible_features = [feat for feat, cnt in coverage_counter.items() if cnt >= len(author_present_feature_sets)]
 
         # Preserve original LLM feature ordering as a secondary key where possible
         feature_original_index = {feat: idx for idx, feat in enumerate(features)} if features else {}
@@ -599,8 +600,11 @@ def compute_clusters_style_representation_3(
             eligible_features,
             key=lambda f: (-coverage_counter[f], feature_original_index.get(f, 10**9))
         )[:int(top_k)]
+    else:
+        selected_features_ranked = features
+
 
-    print('filtered set of features (min coverage',
+    print('filtered set of features (min coverage', len(author_present_feature_sets), '): ', selected_features_ranked)
 
     return {
         "features": list(selected_features_ranked),
@@ -815,7 +819,7 @@ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, pred_idx, model
     #     q_proj, "Mystery Author"
     # )
 
-    # # Regions 2-4: Around each candidate
+    # # # Regions 2-4: Around each candidate
     # for i in range(3):
     #     regions[f"Candidate {i+1} Neighborhood"] = get_region_around_point(
     #         c_proj[i], f"Candidate {i+1}"
@@ -830,13 +834,13 @@ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, pred_idx, model
     )
 
     # Regions 8-10: Between candidate pairs
-    candidate_pairs = [(0, 1), (0, 2), (1, 2)]
-    for i, (c1, c2) in enumerate(candidate_pairs):
-        if c1 != pred_idx and c2 != pred_idx: #selecting only the non predicated candidates
-            region_name = f"Candidate {c1+1} & Candidate {c2+1}"
-            regions[region_name] = get_region_between_points(
-                c_proj[c1], c_proj[c2], f"Candidate {c1+1}", f"Candidate {c2+1}"
-            )
+    # candidate_pairs = [(0, 1), (0, 2), (1, 2)]
+    # for i, (c1, c2) in enumerate(candidate_pairs):
+    #     if c1 != pred_idx and c2 != pred_idx: #selecting only the non predicated candidates
+    #         region_name = f"Candidate {c1+1} & Candidate {c2+1}"
+    #         regions[region_name] = get_region_between_points(
+    #             c_proj[c1], c_proj[c2], f"Candidate {c1+1}", f"Candidate {c2+1}"
+    #         )
 
     # Regions 11-12: Around predicted and ground truth (if different)
     # This would need predicted_author and ground_truth_author indices
utils/llm_feat_utils.py
CHANGED

@@ -53,11 +53,13 @@ def generate_feature_spans(client, text: str, features: list[str]) -> str:
     {features}
     """
     response = client.chat.completions.create(
-        model="gpt-
+        model="gpt-4o-mini",
         messages=[{"role":"user","content":prompt}],
         temperature=0.3,
     )
-
+    content = response.choices[0].message.content
+    content = content.replace('```json', '').replace('```','')
+    return content
 
 def generate_feature_spans_with_retries(client, text: str, features: list[str]) -> dict:
     """
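
This hunk pins the span-extraction model to gpt-4o-mini and normalizes the reply before it is parsed, since chat models often wrap JSON output in Markdown code fences. A minimal illustration of that post-processing step under an invented payload (strip_fences is our name, not the module's):

    import json

    def strip_fences(content: str) -> str:
        # Same normalization as generate_feature_spans: drop ```json / ``` wrappers.
        return content.replace('```json', '').replace('```', '').strip()

    raw = '```json\n{"first-person asides": ["I reckon", "to be fair"]}\n```'
    spans = json.loads(strip_fences(raw))
    print(spans["first-person asides"])  # ['I reckon', 'to be fair']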
utils/visualizations.py
CHANGED

@@ -286,6 +286,7 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
 
 
     merged_authors_df = pd.concat([task_authors_df, clustered_authors_df])
+    #g2v_feats = []
     g2v_feats = compute_clusters_g2v_representation(
         background_corpus_df=merged_authors_df,
         author_ids=visible_authors,