Anisha Bhatnagar committed on
Commit 88f39f5 · 1 Parent(s): 2194877

moved gram2vec cache files to their own folder; reduced logging; commented out unused code

Files changed (1)
  1. utils/interp_space_utils.py +254 -232
utils/interp_space_utils.py CHANGED
@@ -25,9 +25,11 @@ from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
25
  from sklearn.decomposition import PCA
26
 
27
  CACHE_DIR = "datasets/embeddings_cache"
 
28
  ZOOM_CACHE = "datasets/zoom_cache/features_cache.json"
29
  REGION_CACHE = "datasets/region_cache/regions_cache.pkl"
30
  os.makedirs(CACHE_DIR, exist_ok=True)
 
31
  os.makedirs(os.path.dirname(ZOOM_CACHE), exist_ok=True)
32
  os.makedirs(os.path.dirname(REGION_CACHE), exist_ok=True)
33
  # Bump this whenever there is a change etc...
@@ -54,8 +56,8 @@ def compute_g2v_features(clustered_authors_df: pd.DataFrame, task_authors_df: pd
54
  print (f"concatenating task authors and background corpus authors")
55
  print(f"Number of task authors: {len(task_authors_df)}")
56
  print(f"task authors author_ids: {task_authors_df.authorID.tolist()}")
57
- print(f"task authors -->")
58
- print(task_authors_df)
59
  print(f"Number of background corpus authors: {len(clustered_authors_df)}")
60
  clustered_authors_df = pd.concat([task_authors_df, clustered_authors_df])
61
  print(f"Number of authors after concatenation: {len(clustered_authors_df)}")
@@ -63,10 +65,12 @@ def compute_g2v_features(clustered_authors_df: pd.DataFrame, task_authors_df: pd
63
  # Gather the input texts (preserves list-of-strings if any)
64
  #texts = background_corpus_df[text_clm].fillna("").tolist()
65
  author_texts = ['\n\n'.join(x) for x in clustered_authors_df.fullText.tolist()]
66
- print('author_text at 0:{}'.format(author_texts[0]))
67
  print(f"Number of author_texts: {len(author_texts)}")
68
 
69
  # Create a reproducible JSON serialization of the texts
 
 
70
  serialized = json.dumps({
71
  "col": text_clm,
72
  "texts": author_texts
@@ -74,15 +78,20 @@ def compute_g2v_features(clustered_authors_df: pd.DataFrame, task_authors_df: pd
74
 
75
  # Compute MD5 hash
76
  digest = hashlib.md5(serialized.encode("utf-8")).hexdigest()
77
- cache_path = os.path.join(CACHE_DIR, f"{digest}.pkl")
78
 
79
  # If cache hit, load and return
80
  if os.path.exists(cache_path):
81
- print(f"Cache hit...")
 
 
82
  with open(cache_path, "rb") as f:
83
  clustered_authors_df = pickle.load(f)
84
 
85
  else: # Else compute and cache
 
 
 
86
  g2v_feats_df = vectorizer.from_documents(author_texts, batch_size=8)
87
 
88
  print(f"Number of g2v features: {len(g2v_feats_df)}")
@@ -116,6 +125,9 @@ def compute_g2v_features(clustered_authors_df: pd.DataFrame, task_authors_df: pd
116
 
117
  with open(cache_path, "wb") as f:
118
  pickle.dump(clustered_authors_df, f)
 
 
 
119
 
120
  if task_authors_df is not None:
121
  task_authors_df = clustered_authors_df[clustered_authors_df.authorID.isin(task_authors_df.authorID.tolist())]
@@ -266,14 +278,14 @@ def cached_generate_style_embedding(background_corpus_df: pd.DataFrame,
266
 
267
  # If cache hit, load and return
268
  if os.path.exists(cache_path):
269
- print(f"Cache hit for {model_name} on column '{text_clm}'")
270
- print(cache_path)
271
  with open(cache_path, "rb") as f:
272
  background_corpus_df = pickle.load(f)
273
 
274
  else:
275
  # Otherwise, compute, cache, and return
276
- print(f"Computing embeddings for {model_name} on column '{text_clm}', saving to {cache_path}")
277
  task_and_background_embeddings = generate_style_embedding(background_corpus_df, text_clm, model_name, dimensionality_reduction=False)
278
  # Create a clean column name from the model name
279
  col_name = f'{model_name.split("/")[-1]}_style_embedding'
@@ -281,6 +293,7 @@ def cached_generate_style_embedding(background_corpus_df: pd.DataFrame,
281
 
282
  with open(cache_path, "wb") as f:
283
  pickle.dump(background_corpus_df, f)
 
284
 
285
  if task_authors_df is not None:
286
  task_authors_df = background_corpus_df[background_corpus_df.authorID.isin(task_authors_df.authorID.tolist())]
@@ -288,163 +301,167 @@ def cached_generate_style_embedding(background_corpus_df: pd.DataFrame,
288
 
289
  return background_corpus_df, task_authors_df
290
 
291
- def get_style_feats_distribution(documentIDs, style_feats_dict):
292
- style_feats = []
293
- for documentId in documentIDs:
294
- if documentId not in document_to_style_feats:
295
- #print(documentId)
296
- continue
297
-
298
- style_feats+= document_to_style_feats[documentId]
299
-
300
- tfidf = [style_feats.count(key) * val for key, val in style_feats_dict.items()]
301
-
302
- return tfidf
303
-
304
- def get_cluster_top_feats(style_feats_distribution, style_feats_list, top_k=5):
305
- sorted_feats = np.argsort(style_feats_distribution)[::-1]
306
- top_feats = [style_feats_list[x] for x in sorted_feats[:top_k] if style_feats_distribution[x] > 0]
307
- return top_feats
308
-
309
- def compute_clusters_style_representation(
310
- background_corpus_df: pd.DataFrame,
311
- cluster_ids: List[Any],
312
- other_cluster_ids: List[Any],
313
- features_clm_name: str,
314
- cluster_label_clm_name: str = 'cluster_label',
315
- top_n: int = 10
316
- ) -> List[str]:
317
- """
318
- Given a DataFrame with document IDs, cluster IDs, and feature lists,
319
- return the top N features that are most important in the specified `cluster_ids`
320
- while having low importance in `other_cluster_ids`.
321
- Importance is determined by TF-IDF scores. The final score for a feature is
322
- (summed TF-IDF in `cluster_ids`) - (summed TF-IDF in `other_cluster_ids`).
323
-
324
- Parameters:
325
- - background_corpus_df: pd.DataFrame. Must contain the columns specified by
326
- `cluster_label_clm_name` and `features_clm_name`.
327
- The column `features_clm_name` should contain lists of strings (features).
328
- - cluster_ids: List of cluster IDs for which to find representative features (target clusters).
329
- - other_cluster_ids: List of cluster IDs whose features should be down-weighted.
330
- Features prominent in these clusters will have their scores reduced.
331
- Pass an empty list or None if no contrastive clusters are needed.
332
- - features_clm_name: The name of the column in `background_corpus_df` that
333
- contains the list of features for each document.
334
- - cluster_label_clm_name: The name of the column in `background_corpus_df`
335
- that contains the cluster labels. Defaults to 'cluster_label'.
336
- - top_n: Number of top features to return.
337
- Returns:
338
- - List[str]: A list of feature names. These are up to `top_n` features
339
- ranked by their adjusted TF-IDF scores (score in `cluster_ids`
340
- minus score in `other_cluster_ids`). Only features with a final
341
- adjusted score > 0 are included.
342
- """
343
-
344
- assert background_corpus_df[features_clm_name].apply(
345
- lambda x: isinstance(x, list) and all(isinstance(feat, str) for feat in x)
346
- ).all(), f"Column '{features_clm_name}' must contain lists of strings."
347
-
348
- # Compute TF-IDF on the entire corpus
349
- vectorizer = TfidfVectorizer(
350
- tokenizer=lambda x: x,
351
- preprocessor=lambda x: x,
352
- token_pattern=None # Disable default token pattern, treat items in list as tokens
353
- )
354
- tfidf_matrix = vectorizer.fit_transform(background_corpus_df[features_clm_name])
355
- feature_names = vectorizer.get_feature_names_out()
356
-
357
- # Get boolean mask for documents in selected clusters
358
- selected_mask = background_corpus_df[cluster_label_clm_name].isin(cluster_ids).to_numpy()
359
-
360
- if not selected_mask.any():
361
- return [] # No documents found for the given cluster_ids
362
-
363
- # Subset the TF-IDF matrix using the boolean mask
364
- selected_tfidf = tfidf_matrix[selected_mask]
365
-
366
- # Sum TF-IDF scores across documents for each feature in the target clusters
367
- target_feature_scores_sum = selected_tfidf.sum(axis=0).A1 # Convert to 1D array
368
-
369
- # Initialize adjusted scores with target scores
370
- adjusted_feature_scores = target_feature_scores_sum.copy()
371
-
372
- # If other_cluster_ids are provided and not empty, subtract their TF-IDF sums
373
- if other_cluster_ids: # Checks if the list is not None and not empty
374
- other_selected_mask = background_corpus_df[cluster_label_clm_name].isin(other_cluster_ids).to_numpy()
375
-
376
- if other_selected_mask.any():
377
- other_selected_tfidf = tfidf_matrix[other_selected_mask]
378
- contrast_feature_scores_sum = other_selected_tfidf.sum(axis=0).A1
 
 
 
379
 
380
- # Element-wise subtraction; assumes feature_names aligns for both sums
381
- adjusted_feature_scores -= contrast_feature_scores_sum
382
-
383
- # Map scores to feature names
384
- feature_score_dict = dict(zip(feature_names, adjusted_feature_scores))
385
- # Sort features by score
386
- sorted_features = sorted(feature_score_dict.items(), key=lambda item: item[1], reverse=True)
387
-
388
- # Return the names of the top_n features that have a score > 0
389
- top_features = [feature for feature, score in sorted_features if score > 0][:top_n]
390
-
391
- return top_features
392
-
393
- def compute_clusters_style_representation_2(
394
- background_corpus_df: pd.DataFrame,
395
- cluster_ids: List[Any],
396
- cluster_label_clm_name: str = 'cluster_label',
397
- max_num_feats: int = 5,
398
- max_num_documents_per_author=3,
399
- max_num_authors=5):
400
- """
401
- Call openAI to analyze the common writing style features of the given list of texts
402
- """
403
- client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
404
-
405
- background_corpus_df['fullText'] = background_corpus_df['fullText'].map(lambda x: '\n\n'.join(x[:max_num_documents_per_author]) if isinstance(x, list) else x)
406
- background_corpus_df = background_corpus_df[background_corpus_df[cluster_label_clm_name].isin(cluster_ids)]
407
-
408
- author_texts = background_corpus_df['fullText'].tolist()[:max_num_authors]
409
- author_texts = "\n\n".join(["""Author {}:\n""".format(i+1) + text for i, text in enumerate(author_texts)])
410
- author_names = background_corpus_df[cluster_label_clm_name].tolist()[:max_num_authors]
411
- print(f"Number of authors: {len(background_corpus_df)}")
412
- print(author_names)
413
- print(author_texts)
414
- print(f"Number of authors: {len(author_names)}")
415
- print(f"Number of authors: {len(author_texts)}")
416
-
417
- prompt = f"""First identify a list of {max_num_feats} writing style features that are common between the given texts. Second for every author text and style feature, extract all spans that represent the feature. Output for every author all style features with their spans.
418
- Author Texts:
419
- \"\"\"{author_texts}\"\"\"
420
- """
421
-
422
- # Compute MD5 hash
423
- digest = hashlib.md5(prompt.encode("utf-8")).hexdigest()
424
- cache_path = os.path.join(CACHE_DIR, f"{digest}.pkl")
425
-
426
- # If cache hit, load and return
427
- if os.path.exists(cache_path):
428
- print(f"Loading authors writing style from cache ...")
429
- with open(cache_path, "rb") as f:
430
- parsed_response = pickle.load(f)
431
-
432
- else: # Else compute and cache
433
-
434
- response = client.chat.completions.create(
435
- model="gpt-4o-mini",
436
- messages=[
437
- {"role":"assistant","content":"You are a forensic linguistic who knows how to analyze similarites in writing styles."},
438
- {"role":"user","content":prompt}],
439
- response_format={"type": "json_schema", "json_schema": {"name": "style_analysis_schema", "schema": to_strict_json_schema(style_analysis_schema)}}
440
- )
441
-
442
- parsed_response = json.loads(response.choices[0].message.content)
443
-
444
- with open(cache_path, "wb") as f:
445
- pickle.dump(parsed_response, f)
446
-
447
- return parsed_response
 
448
 
449
  def generate_cache_key(author_names: List[str], max_num_feats: int) -> str:
450
  """Generate a unique cache key based on author names and max features"""
@@ -470,10 +487,11 @@ def identify_style_features(author_texts: list[str], author_names: list[str], ma
470
 
471
  if cache_key in cache:
472
  print(f"\nCache hit! Using cached features for authors: {author_names}")
 
473
  return cache[cache_key]["features"]
474
  else:
475
- print(f"Cache miss. Computing features for authors: {author_names}")
476
-
477
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
478
  prompt = f"""Identify {max_num_feats} writing style features that are common between the authors texts.
479
  Author Texts:
@@ -481,9 +499,9 @@ def identify_style_features(author_texts: list[str], author_names: list[str], ma
481
  {author_texts}
482
  """
483
 
484
- print('==================>>>>>>>>>>')
485
- print(prompt)
486
- print('==================>>>>>>>>>>')
487
  def _make_call():
488
  response = client.chat.completions.create(
489
  model="gpt-4o",
@@ -510,6 +528,8 @@ def identify_style_features(author_texts: list[str], author_names: list[str], ma
510
  # save_cache(cache)
511
  with open(ZOOM_CACHE, 'w') as f:
512
  json.dump(cache, f, indent=2)
 
 
513
 
514
  print(f"Cached features for authors: {author_names}")
515
 
@@ -538,7 +558,7 @@ def extract_all_spans(authors_df: pd.DataFrame, features: list[str], cluster_lab
538
 
539
  for _, row in authors_df.iterrows():
540
  author_name = str(row[cluster_label_clm_name])
541
- print(author_name)
542
  role = f"{author_name}"
543
  full_text = row['fullText']
544
  spans = generate_feature_spans_cached(client, full_text, features, role)
@@ -566,15 +586,15 @@ def compute_clusters_style_representation_3(
566
  author_texts = "\n\n".join(["""Author {}:\n""".format(i+1) + text for i, text in enumerate(author_texts)])
567
  author_names = background_corpus_df_feat_id[cluster_label_clm_name].tolist()[:max_num_authors]
568
  print(f"Number of authors: {len(background_corpus_df_feat_id)}")
569
- print(author_names)
570
  features = identify_style_features(author_texts, author_names, max_num_feats=max_num_feats)
571
 
572
- print("Features: ", features)
573
  # STEP 2: Prepare author pool for span extraction
574
  span_df = background_corpus_df.iloc[:max_authors_for_span_extraction]
575
  author_names = span_df[cluster_label_clm_name].tolist()[:max_authors_for_span_extraction]
576
  print(f"Number of authors for span detection : {len(span_df)}")
577
- print(author_names)
578
  spans_by_author = extract_all_spans(span_df, features, cluster_label_clm_name)
579
 
580
  # Filter-in only task authors that are part of the current selection
@@ -619,7 +639,7 @@ def compute_clusters_style_representation_3(
619
  for feature, spans in feature_map.items():
620
  if spans:
621
  feature_importance[feature] -= len(spans)
622
- print(feature_importance)
623
  selected_features_ranked = sorted(feature_importance, key=lambda f: -feature_importance[f])[:int(top_k)]
624
 
625
  #print('filtered set of features (min coverage', len(author_present_feature_sets), '): ', selected_features_ranked)
@@ -714,68 +734,69 @@ def compute_clusters_g2v_representation(
714
  key=lambda x: (-feature_span_counts.get(x[0], 0), -x[1])
715
  )
716
 
717
- print(f"[INFO] Sorted gram2vec features by span frequency: {[(f, feature_span_counts.get(f, 0), z) for f, z in sorted_by_spans[:top_n]]}")
718
 
719
  return sorted_by_spans[:top_n]
720
 
721
- def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
 
722
 
723
- styles_df = pd.read_csv(styles_df_path)[[feat_clm, "documentID"]]
724
 
725
- # A dictionary of style features and their IDF
726
- style_feats_agg_df = styles_df.groupby(feat_clm).agg({'documentID': lambda x : len(list(x))}).reset_index()
727
- style_feats_agg_df['document_freq'] = style_feats_agg_df.documentID
728
- style_to_feats_dfreq = {x[0]: math.log(styles_df.documentID.nunique()/x[1]) for x in zip(style_feats_agg_df[feat_clm].tolist(), style_feats_agg_df.document_freq.tolist())}
729
 
730
- # A list of style features we work with
731
- style_feats_list = style_feats_agg_df[feat_clm].tolist()
732
- print('Number of style feats ', len(style_feats_list))
733
 
734
- # A list of documents and what list of style features each has
735
- doc_style_agg_df = styles_df.groupby('documentID').agg({feat_clm: lambda x : list(x)}).reset_index()
736
- document_to_feats_dict = {x[0]: x[1] for x in zip(doc_style_agg_df.documentID.tolist(), doc_style_agg_df[feat_clm].tolist())}
737
 
738
 
739
 
740
- # Load the clustering information
741
- df = pd.read_pickle(interp_space_path)
742
- df = df[df.cluster_label != -1]
743
- # A cluster to list of documents
744
- clusterd_df = df.groupby('cluster_label').agg({
745
- 'documentID': lambda x: [d_id for doc_ids in x for d_id in doc_ids]
746
- }).reset_index()
747
 
748
- # Filter-in only documents that has a style description
749
- clusterd_df['documentID'] = clusterd_df.documentID.apply(lambda documentIDs: [documentID for documentID in documentIDs if documentID in document_to_feats_dict])
750
- # Map from cluster label to list of features through the document information
751
- clusterd_df[feat_clm] = clusterd_df.documentID.apply(lambda doc_ids: [f for d_id in doc_ids for f in document_to_feats_dict[d_id]])
752
 
753
- def compute_tfidf(row):
754
- style_counts = Counter(row[feat_clm])
755
- total_num_styles = sum(style_counts.values())
756
- #print(style_counts, total_num_styles)
757
- style_distribution = {
758
- style: math.log(1+count) * style_to_feats_dfreq[style] if style in style_to_feats_dfreq else 0 for style, count in style_counts.items()
759
- } #TF-IDF
760
 
761
- return style_distribution
762
 
763
- def create_tfidf_rep(tfidf_dist, num_feats):
764
- style_feats = sorted(tfidf_dist.items(), key=lambda x: -x[1])
765
- top_k_feats = [x[0] for x in style_feats[:num_feats] if str(x[0]) != 'nan']
766
- return top_k_feats
767
 
768
- clusterd_df[output_clm +'_dist'] = clusterd_df.apply(lambda row: compute_tfidf(row), axis=1)
769
- clusterd_df[output_clm] = clusterd_df[output_clm +'_dist'].apply(lambda dist: create_tfidf_rep(dist, num_feats))
770
 
771
 
772
- return clusterd_df
773
 
774
  def compute_predicted_author(task_authors_df: pd.DataFrame, col_name: str) -> int:
775
  """
776
  Computes the predicted author based on the style features.
777
  """
778
- print("Computing predicted author using LUAR-MUD-style embeddings...")
779
 
780
  # Extract LUAR embeddings from task authors dataframe
781
  mystery_embedding = np.array(task_authors_df.iloc[0][col_name]).reshape(1, -1)
@@ -816,11 +837,11 @@ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, pred_idx, model
816
  else:
817
  cache = {}
818
  if key in cache:
819
- print(f"\nCache hit! Using cached regions.")
820
  return cache[key]
821
  else:
822
- print(f"Cache miss. Computing regions.")
823
-
824
  regions = {}
825
 
826
  # All points for distance calculation (mystery + candidates + background)
@@ -935,22 +956,23 @@ def compute_precomputed_regions(bg_proj, bg_ids, q_proj, c_proj, pred_idx, model
935
  response = json.dumps(serializable_regions, default=str)
936
  cache[key] = response
937
  with open(REGION_CACHE, 'wb') as f:
 
938
  pickle.dump(cache, f)
939
 
940
  return response
941
 
942
- if __name__ == "__main__":
943
- background_corpus = pd.read_pickle('../datasets/luar_interp_space_cluster_19/train_authors.pkl')
944
- print(background_corpus.columns)
945
- print(background_corpus[['authorID', 'fullText', 'cluster_label']].head())
946
- # # Example: Find features for clusters [2,3,4] that are NOT prominent in cluster [1]
947
- # feats = compute_clusters_style_representation(
948
- # background_corpus_df=background_corpus,
949
- # cluster_ids=['00005a5c-5c06-3a36-37f9-53c6422a31d8',],
950
- # other_cluster_ids=[], # Pass the contrastive cluster IDs here
951
- # cluster_label_clm_name='authorID',
952
- # features_clm_name='final_attribute_name'
953
- # )
954
- # print(feats)
955
- generate_style_embedding(background_corpus, 'fullText', 'AnnaWegmann/Style-Embedding')
956
- print(background_corpus.columns)
 
25
  from sklearn.decomposition import PCA
26
 
27
  CACHE_DIR = "datasets/embeddings_cache"
28
+ G2V_CACHE = "datasets/gram2vec_cache"
29
  ZOOM_CACHE = "datasets/zoom_cache/features_cache.json"
30
  REGION_CACHE = "datasets/region_cache/regions_cache.pkl"
31
  os.makedirs(CACHE_DIR, exist_ok=True)
32
+ os.makedirs(G2V_CACHE, exist_ok=True)
33
  os.makedirs(os.path.dirname(ZOOM_CACHE), exist_ok=True)
34
  os.makedirs(os.path.dirname(REGION_CACHE), exist_ok=True)
35
  # Bump this whenever there is a change etc...
 
56
  print (f"concatenating task authors and background corpus authors")
57
  print(f"Number of task authors: {len(task_authors_df)}")
58
  print(f"task authors author_ids: {task_authors_df.authorID.tolist()}")
59
+ # print(f"task authors -->")
60
+ # print(task_authors_df)
61
  print(f"Number of background corpus authors: {len(clustered_authors_df)}")
62
  clustered_authors_df = pd.concat([task_authors_df, clustered_authors_df])
63
  print(f"Number of authors after concatenation: {len(clustered_authors_df)}")
 
65
  # Gather the input texts (preserves list-of-strings if any)
66
  #texts = background_corpus_df[text_clm].fillna("").tolist()
67
  author_texts = ['\n\n'.join(x) for x in clustered_authors_df.fullText.tolist()]
68
+ # print('author_text at 0:{}'.format(author_texts[0]))
69
  print(f"Number of author_texts: {len(author_texts)}")
70
 
71
  # Create a reproducible JSON serialization of the texts
72
+ # why are g2v features going into a new file inside embeddings_cache?
73
+ # changed to G2V_CACHE
74
  serialized = json.dumps({
75
  "col": text_clm,
76
  "texts": author_texts
 
78
 
79
  # Compute MD5 hash
80
  digest = hashlib.md5(serialized.encode("utf-8")).hexdigest()
81
+ cache_path = os.path.join(G2V_CACHE, f"{digest}.pkl")
82
 
83
  # If cache hit, load and return
84
  if os.path.exists(cache_path):
85
+ # print(f"Cache hit...")
86
+ # Making this green to make it stand out from the rest of the logs
87
+ print(f"\n\n\n\033[1m\033[92m>>> Cache hit for {cache_path} <<<\033[0m\n")
88
  with open(cache_path, "rb") as f:
89
  clustered_authors_df = pickle.load(f)
90
 
91
  else: # Else compute and cache
92
+ # Making this red to make it stand out from the rest of the logs
93
+ print(f"\n\n\n\033[1m\033[91m>>> Cache miss for {cache_path} => Computing fresh!! <<<\033[0m\n")
94
+
95
  g2v_feats_df = vectorizer.from_documents(author_texts, batch_size=8)
96
 
97
  print(f"Number of g2v features: {len(g2v_feats_df)}")
 
125
 
126
  with open(cache_path, "wb") as f:
127
  pickle.dump(clustered_authors_df, f)
128
+ # Making this green to make it stand out from the rest of the logs
129
+ print(f"\n\n\n\033[1m\033[92m>>> Saved to {cache_path} <<<\033[0m\n")
130
+ # the file generated here contains g2v + style embeddings.
131
 
132
  if task_authors_df is not None:
133
  task_authors_df = clustered_authors_df[clustered_authors_df.authorID.isin(task_authors_df.authorID.tolist())]
 
278
 
279
  # If cache hit, load and return
280
  if os.path.exists(cache_path):
281
+ # Making this green to make it stand out from the rest of the logs
282
+ print(f"\n\n\n\033[1m\033[92m>>> Cache hit for {cache_path} for {model_name} on column '{text_clm} <<<\033[0m\n")
283
  with open(cache_path, "rb") as f:
284
  background_corpus_df = pickle.load(f)
285
 
286
  else:
287
  # Otherwise, compute, cache, and return
288
+ print(f"\n\n\n\033[1m\033[91m>>> Cache miss for {cache_path} for {model_name} on column '{text_clm} <<<\033[0m\n")
289
  task_and_background_embeddings = generate_style_embedding(background_corpus_df, text_clm, model_name, dimensionality_reduction=False)
290
  # Create a clean column name from the model name
291
  col_name = f'{model_name.split("/")[-1]}_style_embedding'
 
293
 
294
  with open(cache_path, "wb") as f:
295
  pickle.dump(background_corpus_df, f)
296
+ print(f"\n\n\n\033[1m\033[92m>>> Cache saved for {cache_path} for {model_name} on column '{text_clm} <<<\033[0m\n")
297
 
298
  if task_authors_df is not None:
299
  task_authors_df = background_corpus_df[background_corpus_df.authorID.isin(task_authors_df.authorID.tolist())]
 
301
 
302
  return background_corpus_df, task_authors_df
303
 
304
+ # Noticed the following function isn't actually referenced anywhere.
305
+ # def get_style_feats_distribution(documentIDs, style_feats_dict):
306
+ # style_feats = []
307
+ # for documentId in documentIDs:
308
+ # if documentId not in document_to_style_feats:
309
+ # #print(documentId)
310
+ # continue
311
+
312
+ # style_feats+= document_to_style_feats[documentId]
313
+
314
+ # tfidf = [style_feats.count(key) * val for key, val in style_feats_dict.items()]
315
+
316
+ # return tfidf
317
+ #
318
+ # Noticed the following function isn't actually referenced anywhere.
319
+ # def get_cluster_top_feats(style_feats_distribution, style_feats_list, top_k=5):
320
+ # sorted_feats = np.argsort(style_feats_distribution)[::-1]
321
+ # top_feats = [style_feats_list[x] for x in sorted_feats[:top_k] if style_feats_distribution[x] > 0]
322
+ # return top_feats
323
+
324
+ # Noticed the following function isn't actually referenced anywhere.
325
+ # def compute_clusters_style_representation(
326
+ # background_corpus_df: pd.DataFrame,
327
+ # cluster_ids: List[Any],
328
+ # other_cluster_ids: List[Any],
329
+ # features_clm_name: str,
330
+ # cluster_label_clm_name: str = 'cluster_label',
331
+ # top_n: int = 10
332
+ # ) -> List[str]:
333
+ # """
334
+ # Given a DataFrame with document IDs, cluster IDs, and feature lists,
335
+ # return the top N features that are most important in the specified `cluster_ids`
336
+ # while having low importance in `other_cluster_ids`.
337
+ # Importance is determined by TF-IDF scores. The final score for a feature is
338
+ # (summed TF-IDF in `cluster_ids`) - (summed TF-IDF in `other_cluster_ids`).
339
+
340
+ # Parameters:
341
+ # - background_corpus_df: pd.DataFrame. Must contain the columns specified by
342
+ # `cluster_label_clm_name` and `features_clm_name`.
343
+ # The column `features_clm_name` should contain lists of strings (features).
344
+ # - cluster_ids: List of cluster IDs for which to find representative features (target clusters).
345
+ # - other_cluster_ids: List of cluster IDs whose features should be down-weighted.
346
+ # Features prominent in these clusters will have their scores reduced.
347
+ # Pass an empty list or None if no contrastive clusters are needed.
348
+ # - features_clm_name: The name of the column in `background_corpus_df` that
349
+ # contains the list of features for each document.
350
+ # - cluster_label_clm_name: The name of the column in `background_corpus_df`
351
+ # that contains the cluster labels. Defaults to 'cluster_label'.
352
+ # - top_n: Number of top features to return.
353
+ # Returns:
354
+ # - List[str]: A list of feature names. These are up to `top_n` features
355
+ # ranked by their adjusted TF-IDF scores (score in `cluster_ids`
356
+ # minus score in `other_cluster_ids`). Only features with a final
357
+ # adjusted score > 0 are included.
358
+ # """
359
+
360
+ # assert background_corpus_df[features_clm_name].apply(
361
+ # lambda x: isinstance(x, list) and all(isinstance(feat, str) for feat in x)
362
+ # ).all(), f"Column '{features_clm_name}' must contain lists of strings."
363
+
364
+ # # Compute TF-IDF on the entire corpus
365
+ # vectorizer = TfidfVectorizer(
366
+ # tokenizer=lambda x: x,
367
+ # preprocessor=lambda x: x,
368
+ # token_pattern=None # Disable default token pattern, treat items in list as tokens
369
+ # )
370
+ # tfidf_matrix = vectorizer.fit_transform(background_corpus_df[features_clm_name])
371
+ # feature_names = vectorizer.get_feature_names_out()
372
+
373
+ # # Get boolean mask for documents in selected clusters
374
+ # selected_mask = background_corpus_df[cluster_label_clm_name].isin(cluster_ids).to_numpy()
375
+
376
+ # if not selected_mask.any():
377
+ # return [] # No documents found for the given cluster_ids
378
+
379
+ # # Subset the TF-IDF matrix using the boolean mask
380
+ # selected_tfidf = tfidf_matrix[selected_mask]
381
+
382
+ # # Sum TF-IDF scores across documents for each feature in the target clusters
383
+ # target_feature_scores_sum = selected_tfidf.sum(axis=0).A1 # Convert to 1D array
384
+
385
+ # # Initialize adjusted scores with target scores
386
+ # adjusted_feature_scores = target_feature_scores_sum.copy()
387
+
388
+ # # If other_cluster_ids are provided and not empty, subtract their TF-IDF sums
389
+ # if other_cluster_ids: # Checks if the list is not None and not empty
390
+ # other_selected_mask = background_corpus_df[cluster_label_clm_name].isin(other_cluster_ids).to_numpy()
391
+
392
+ # if other_selected_mask.any():
393
+ # other_selected_tfidf = tfidf_matrix[other_selected_mask]
394
+ # contrast_feature_scores_sum = other_selected_tfidf.sum(axis=0).A1
395
 
396
+ # # Element-wise subtraction; assumes feature_names aligns for both sums
397
+ # adjusted_feature_scores -= contrast_feature_scores_sum
398
+
399
+ # # Map scores to feature names
400
+ # feature_score_dict = dict(zip(feature_names, adjusted_feature_scores))
401
+ # # Sort features by score
402
+ # sorted_features = sorted(feature_score_dict.items(), key=lambda item: item[1], reverse=True)
403
+
404
+ # # Return the names of the top_n features that have a score > 0
405
+ # top_features = [feature for feature, score in sorted_features if score > 0][:top_n]
406
+
407
+ # return top_features
408
+
409
+ # Noticed the following function isn't actually referenced anywhere.
410
+ # def compute_clusters_style_representation_2(
411
+ # background_corpus_df: pd.DataFrame,
412
+ # cluster_ids: List[Any],
413
+ # cluster_label_clm_name: str = 'cluster_label',
414
+ # max_num_feats: int = 5,
415
+ # max_num_documents_per_author=3,
416
+ # max_num_authors=5):
417
+ # """
418
+ # Call openAI to analyze the common writing style features of the given list of texts
419
+ # """
420
+ # client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
421
+
422
+ # background_corpus_df['fullText'] = background_corpus_df['fullText'].map(lambda x: '\n\n'.join(x[:max_num_documents_per_author]) if isinstance(x, list) else x)
423
+ # background_corpus_df = background_corpus_df[background_corpus_df[cluster_label_clm_name].isin(cluster_ids)]
424
+
425
+ # author_texts = background_corpus_df['fullText'].tolist()[:max_num_authors]
426
+ # author_texts = "\n\n".join(["""Author {}:\n""".format(i+1) + text for i, text in enumerate(author_texts)])
427
+ # author_names = background_corpus_df[cluster_label_clm_name].tolist()[:max_num_authors]
428
+ # print(f"Number of authors: {len(background_corpus_df)}")
429
+ # print(author_names)
430
+ # print(author_texts)
431
+ # print(f"Number of authors: {len(author_names)}")
432
+ # print(f"Number of authors: {len(author_texts)}")
433
+
434
+ # prompt = f"""First identify a list of {max_num_feats} writing style features that are common between the given texts. Second for every author text and style feature, extract all spans that represent the feature. Output for every author all style features with their spans.
435
+ # Author Texts:
436
+ # \"\"\"{author_texts}\"\"\"
437
+ # """
438
+
439
+ # # Compute MD5 hash
440
+ # digest = hashlib.md5(prompt.encode("utf-8")).hexdigest()
441
+ # cache_path = os.path.join(CACHE_DIR, f"{digest}.pkl")
442
+
443
+ # # If cache hit, load and return
444
+ # if os.path.exists(cache_path):
445
+ # print(f"Loading authors writing style from cache ...")
446
+ # with open(cache_path, "rb") as f:
447
+ # parsed_response = pickle.load(f)
448
+
449
+ # else: # Else compute and cache
450
+
451
+ # response = client.chat.completions.create(
452
+ # model="gpt-4o-mini",
453
+ # messages=[
454
+ # {"role":"assistant","content":"You are a forensic linguistic who knows how to analyze similarites in writing styles."},
455
+ # {"role":"user","content":prompt}],
456
+ # response_format={"type": "json_schema", "json_schema": {"name": "style_analysis_schema", "schema": to_strict_json_schema(style_analysis_schema)}}
457
+ # )
458
+
459
+ # parsed_response = json.loads(response.choices[0].message.content)
460
+
461
+ # with open(cache_path, "wb") as f:
462
+ # pickle.dump(parsed_response, f)
463
+
464
+ # return parsed_response
465
 
466
  def generate_cache_key(author_names: List[str], max_num_feats: int) -> str:
467
  """Generate a unique cache key based on author names and max features"""
 
487
 
488
  if cache_key in cache:
489
  print(f"\nCache hit! Using cached features for authors: {author_names}")
490
+ print(f"\n\n\n\033[1m\033[92m>>> Cache hit for {cache_key} in {ZOOM_CACHE} <<<\033[0m\n")
491
  return cache[cache_key]["features"]
492
  else:
493
+ print(f"\n\n\n\033[1m\033[91m>>> Cache miss for {cache_key} in {ZOOM_CACHE} \nComputing features for authors: {author_names}<<<\033[0m\n")
494
+
495
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
496
  prompt = f"""Identify {max_num_feats} writing style features that are common between the authors texts.
497
  Author Texts:
 
499
  {author_texts}
500
  """
501
 
502
+ # print('==================>>>>>>>>>>')
503
+ # print(prompt)
504
+ # print('==================>>>>>>>>>>')
505
  def _make_call():
506
  response = client.chat.completions.create(
507
  model="gpt-4o",
 
528
  # save_cache(cache)
529
  with open(ZOOM_CACHE, 'w') as f:
530
  json.dump(cache, f, indent=2)
531
+ print(f"\n\n\n\033[1m\033[92m>>> Cache saved for {cache_key} in {ZOOM_CACHE}<<<\033[0m\n")
532
+
533
 
534
  print(f"Cached features for authors: {author_names}")
535
 
 
558
 
559
  for _, row in authors_df.iterrows():
560
  author_name = str(row[cluster_label_clm_name])
561
+ # print(author_name)
562
  role = f"{author_name}"
563
  full_text = row['fullText']
564
  spans = generate_feature_spans_cached(client, full_text, features, role)
 
586
  author_texts = "\n\n".join(["""Author {}:\n""".format(i+1) + text for i, text in enumerate(author_texts)])
587
  author_names = background_corpus_df_feat_id[cluster_label_clm_name].tolist()[:max_num_authors]
588
  print(f"Number of authors: {len(background_corpus_df_feat_id)}")
589
+ # print(author_names)
590
  features = identify_style_features(author_texts, author_names, max_num_feats=max_num_feats)
591
 
592
+ # print("Features: ", features)
593
  # STEP 2: Prepare author pool for span extraction
594
  span_df = background_corpus_df.iloc[:max_authors_for_span_extraction]
595
  author_names = span_df[cluster_label_clm_name].tolist()[:max_authors_for_span_extraction]
596
  print(f"Number of authors for span detection : {len(span_df)}")
597
+ # print(author_names)
598
  spans_by_author = extract_all_spans(span_df, features, cluster_label_clm_name)
599
 
600
  # Filter-in only task authors that are part of the current selection
 
639
  for feature, spans in feature_map.items():
640
  if spans:
641
  feature_importance[feature] -= len(spans)
642
+ # print(feature_importance)
643
  selected_features_ranked = sorted(feature_importance, key=lambda f: -feature_importance[f])[:int(top_k)]
644
 
645
  #print('filtered set of features (min coverage', len(author_present_feature_sets), '): ', selected_features_ranked)
 
734
  key=lambda x: (-feature_span_counts.get(x[0], 0), -x[1])
735
  )
736
 
737
+ # print(f"[INFO] Sorted gram2vec features by span frequency: {[(f, feature_span_counts.get(f, 0), z) for f, z in sorted_by_spans[:top_n]]}")
738
 
739
  return sorted_by_spans[:top_n]
740
 
741
+ # Noticed the following function isn't actually referenced anywhere.
742
+ # def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
743
 
744
+ # styles_df = pd.read_csv(styles_df_path)[[feat_clm, "documentID"]]
745
 
746
+ # # A dictionary of style features and their IDF
747
+ # style_feats_agg_df = styles_df.groupby(feat_clm).agg({'documentID': lambda x : len(list(x))}).reset_index()
748
+ # style_feats_agg_df['document_freq'] = style_feats_agg_df.documentID
749
+ # style_to_feats_dfreq = {x[0]: math.log(styles_df.documentID.nunique()/x[1]) for x in zip(style_feats_agg_df[feat_clm].tolist(), style_feats_agg_df.document_freq.tolist())}
750
 
751
+ # # A list of style features we work with
752
+ # style_feats_list = style_feats_agg_df[feat_clm].tolist()
753
+ # print('Number of style feats ', len(style_feats_list))
754
 
755
+ # # A list of documents and what list of style features each has
756
+ # doc_style_agg_df = styles_df.groupby('documentID').agg({feat_clm: lambda x : list(x)}).reset_index()
757
+ # document_to_feats_dict = {x[0]: x[1] for x in zip(doc_style_agg_df.documentID.tolist(), doc_style_agg_df[feat_clm].tolist())}
758
 
759
 
760
 
761
+ # # Load the clustering information
762
+ # df = pd.read_pickle(interp_space_path)
763
+ # df = df[df.cluster_label != -1]
764
+ # # A cluster to list of documents
765
+ # clusterd_df = df.groupby('cluster_label').agg({
766
+ # 'documentID': lambda x: [d_id for doc_ids in x for d_id in doc_ids]
767
+ # }).reset_index()
768
 
769
+ # # Filter-in only documents that has a style description
770
+ # clusterd_df['documentID'] = clusterd_df.documentID.apply(lambda documentIDs: [documentID for documentID in documentIDs if documentID in document_to_feats_dict])
771
+ # # Map from cluster label to list of features through the document information
772
+ # clusterd_df[feat_clm] = clusterd_df.documentID.apply(lambda doc_ids: [f for d_id in doc_ids for f in document_to_feats_dict[d_id]])
773
 
774
+ # def compute_tfidf(row):
775
+ # style_counts = Counter(row[feat_clm])
776
+ # total_num_styles = sum(style_counts.values())
777
+ # #print(style_counts, total_num_styles)
778
+ # style_distribution = {
779
+ # style: math.log(1+count) * style_to_feats_dfreq[style] if style in style_to_feats_dfreq else 0 for style, count in style_counts.items()
780
+ # } #TF-IDF
781
 
782
+ # return style_distribution
783
 
784
+ # def create_tfidf_rep(tfidf_dist, num_feats):
785
+ # style_feats = sorted(tfidf_dist.items(), key=lambda x: -x[1])
786
+ # top_k_feats = [x[0] for x in style_feats[:num_feats] if str(x[0]) != 'nan']
787
+ # return top_k_feats
788
 
789
+ # clusterd_df[output_clm +'_dist'] = clusterd_df.apply(lambda row: compute_tfidf(row), axis=1)
790
+ # clusterd_df[output_clm] = clusterd_df[output_clm +'_dist'].apply(lambda dist: create_tfidf_rep(dist, num_feats))
791
 
792
 
793
+ # return clusterd_df
794
 
795
  def compute_predicted_author(task_authors_df: pd.DataFrame, col_name: str) -> int:
796
  """
797
  Computes the predicted author based on the style features.
798
  """
799
+ print("Computing predicted author using embeddings...")
800
 
801
  # Extract LUAR embeddings from task authors dataframe
802
  mystery_embedding = np.array(task_authors_df.iloc[0][col_name]).reshape(1, -1)
 
837
  else:
838
  cache = {}
839
  if key in cache:
840
+ print(f"\n\n\n\033[1m\033[92m>>> Cache hit for {key} in {REGION_CACHE}: Using cached regions<<<\033[0m\n")
841
  return cache[key]
842
  else:
843
+ print(f"\n\n\n\033[1m\033[91m>>> Cache miss for {key} in {REGION_CACHE}: Computing Regions<<<\033[0m\n")
844
+
845
  regions = {}
846
 
847
  # All points for distance calculation (mystery + candidates + background)
 
956
  response = json.dumps(serializable_regions, default=str)
957
  cache[key] = response
958
  with open(REGION_CACHE, 'wb') as f:
959
+ print(f"\n\n\n\033[1m\033[92m>>> Cache saved for {key} in {REGION_CACHE} <<<\033[0m\n")
960
  pickle.dump(cache, f)
961
 
962
  return response
963
 
964
+ # if __name__ == "__main__":
965
+ # background_corpus = pd.read_pickle('../datasets/luar_interp_space_cluster_19/train_authors.pkl')
966
+ # print(background_corpus.columns)
967
+ # print(background_corpus[['authorID', 'fullText', 'cluster_label']].head())
968
+ # # # Example: Find features for clusters [2,3,4] that are NOT prominent in cluster [1]
969
+ # # feats = compute_clusters_style_representation(
970
+ # # background_corpus_df=background_corpus,
971
+ # # cluster_ids=['00005a5c-5c06-3a36-37f9-53c6422a31d8',],
972
+ # # other_cluster_ids=[], # Pass the contrastive cluster IDs here
973
+ # # cluster_label_clm_name='authorID',
974
+ # # features_clm_name='final_attribute_name'
975
+ # # )
976
+ # # print(feats)
977
+ # generate_style_embedding(background_corpus, 'fullText', 'AnnaWegmann/Style-Embedding')
978
+ # print(background_corpus.columns)
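
For reference, the caching scheme this commit relocates into a dedicated gram2vec directory follows one pattern throughout the diff: serialize the input texts to JSON, take an MD5 digest, and use the digest as the name of a pickle file. Below is a minimal, self-contained sketch of that pattern, assuming a hypothetical compute_fn; the cached_compute helper is illustrative only and is not part of the repository.

import hashlib
import json
import os
import pickle

# Illustrative sketch: digest-keyed pickle cache, mirroring the layout used in the diff above.
G2V_CACHE = "datasets/gram2vec_cache"
os.makedirs(G2V_CACHE, exist_ok=True)

def cached_compute(author_texts, text_clm, compute_fn):
    """Return compute_fn(author_texts), caching the result as an MD5-keyed pickle."""
    serialized = json.dumps({"col": text_clm, "texts": author_texts})
    digest = hashlib.md5(serialized.encode("utf-8")).hexdigest()
    cache_path = os.path.join(G2V_CACHE, f"{digest}.pkl")

    if os.path.exists(cache_path):      # cache hit: load the stored result
        with open(cache_path, "rb") as f:
            return pickle.load(f)

    result = compute_fn(author_texts)   # cache miss: compute fresh
    with open(cache_path, "wb") as f:   # and persist for the next run
        pickle.dump(result, f)
    return result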
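
The colored cache hit/miss messages added throughout the diff repeat the same ANSI escape sequences (\033[1m\033[92m for bold green hits, \033[1m\033[91m for bold red misses). If that pattern keeps spreading, it could be factored into a small helper such as the sketch below; log_cache_event is a hypothetical suggestion, not code from the repository.

# Hypothetical helper: wraps the bold green/red cache messages used in the diff.
GREEN, RED, BOLD, RESET = "\033[92m", "\033[91m", "\033[1m", "\033[0m"

def log_cache_event(hit: bool, detail: str) -> None:
    """Print a cache hit in bold green or a cache miss in bold red."""
    color = GREEN if hit else RED
    label = "Cache hit" if hit else "Cache miss"
    print(f"\n{BOLD}{color}>>> {label}: {detail} <<<{RESET}\n")

# Example usage (hypothetical path):
# log_cache_event(True, "datasets/gram2vec_cache/<digest>.pkl")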