Milad Alshomary
committed on
Commit
Β·
87e3b98
1
Parent(s):
74947b9
updates
Browse files- app.py +1 -1
- config/config.yaml +2 -2
- utils/interp_space_utils.py +34 -12
- utils/ui.py +6 -9
app.py
CHANGED
|
@@ -58,7 +58,7 @@ def app(share=False, use_cluster_feats=False):
|
|
| 58 |
instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
|
| 59 |
|
| 60 |
interp = load_interp_space(cfg)
|
| 61 |
-
clustered_authors_df = interp['clustered_authors_df'][:
|
| 62 |
clustered_authors_df['fullText'] = clustered_authors_df['fullText'].map(lambda l: l[:3]) # Take at most 3 texts per author
|
| 63 |
|
| 64 |
with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
|
|
|
|
| 58 |
instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
|
| 59 |
|
| 60 |
interp = load_interp_space(cfg)
|
| 61 |
+
clustered_authors_df = interp['clustered_authors_df'][:200]
|
| 62 |
clustered_authors_df['fullText'] = clustered_authors_df['fullText'].map(lambda l: l[:3]) # Take at most 3 texts per author
|
| 63 |
|
| 64 |
with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
|
config/config.yaml
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
# config.yaml
|
| 2 |
instances_to_explain_path: "./datasets/hrs_explanations.json"
|
| 3 |
instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_18_balanced.json?/download=true"
|
| 4 |
-
interp_space_path: "./datasets/
|
| 5 |
-
interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/
|
| 6 |
gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
|
| 7 |
gram2vec_feats_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/gram2vec_feats.csv?download=true"
|
| 8 |
|
|
|
|
| 1 |
# config.yaml
|
| 2 |
instances_to_explain_path: "./datasets/hrs_explanations.json"
|
| 3 |
instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_18_balanced.json?/download=true"
|
| 4 |
+
interp_space_path: "./datasets/sentence_luar_interp_clusters_18.zip/"
|
| 5 |
+
interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/luar_interp_space_cluster_18.zip?download=true"
|
| 6 |
gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
|
| 7 |
gram2vec_feats_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/gram2vec_feats.csv?download=true"
|
| 8 |
|
utils/interp_space_utils.py
CHANGED
|
@@ -20,6 +20,7 @@ from utils.llm_feat_utils import generate_feature_spans_cached
|
|
| 20 |
from collections import Counter
|
| 21 |
import numpy as np
|
| 22 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
| 23 |
|
| 24 |
CACHE_DIR = "datasets/embeddings_cache"
|
| 25 |
ZOOM_CACHE = "datasets/zoom_cache/features_cache.json"
|
|
@@ -140,7 +141,7 @@ def instance_to_df(instance, predicted_author=None, ground_truth_author=None):
|
|
| 140 |
return task_authos_df
|
| 141 |
|
| 142 |
|
| 143 |
-
def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str, model_name: str) -> pd.DataFrame:
|
| 144 |
"""
|
| 145 |
Generates style embeddings for documents in a background corpus using a specified model.
|
| 146 |
If a row in `text_clm` contains a list of strings, the final embedding for that row
|
|
@@ -218,22 +219,33 @@ def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str,
|
|
| 218 |
embeddings = model.encode(texts, show_progress_bar=True)
|
| 219 |
final_embeddings = list(embeddings)
|
| 220 |
|
| 221 |
-
#
|
| 222 |
-
|
| 223 |
-
|
|
|
|
|
|
|
| 224 |
|
| 225 |
-
return
|
| 226 |
|
| 227 |
# ββ wrapper with caching βββββββββββββββββββββββββββββββββββββββ
|
| 228 |
def cached_generate_style_embedding(background_corpus_df: pd.DataFrame,
|
| 229 |
text_clm: str,
|
| 230 |
-
model_name: str
|
|
|
|
| 231 |
"""
|
| 232 |
Wraps `generate_style_embedding`, caching its output in pickle files
|
| 233 |
keyed by an MD5 of (model_name + text list). If the cache exists,
|
| 234 |
loads and returns it instead of recomputing.
|
| 235 |
"""
|
| 236 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
# Gather the input texts (preserves list-of-strings if any)
|
| 238 |
texts = background_corpus_df[text_clm].fillna("").tolist()
|
| 239 |
|
|
@@ -255,12 +267,22 @@ def cached_generate_style_embedding(background_corpus_df: pd.DataFrame,
|
|
| 255 |
with open(cache_path, "rb") as f:
|
| 256 |
return pickle.load(f)
|
| 257 |
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
|
| 265 |
def get_style_feats_distribution(documentIDs, style_feats_dict):
|
| 266 |
style_feats = []
|
|
|
|
| 20 |
from collections import Counter
|
| 21 |
import numpy as np
|
| 22 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 23 |
+
from sklearn.decomposition import PCA
|
| 24 |
|
| 25 |
CACHE_DIR = "datasets/embeddings_cache"
|
| 26 |
ZOOM_CACHE = "datasets/zoom_cache/features_cache.json"
|
|
|
|
| 141 |
return task_authos_df
|
| 142 |
|
| 143 |
|
| 144 |
+
def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str, model_name: str, dimensionality_reduction: bool = True, dimensions: int = 100) -> pd.DataFrame:
|
| 145 |
"""
|
| 146 |
Generates style embeddings for documents in a background corpus using a specified model.
|
| 147 |
If a row in `text_clm` contains a list of strings, the final embedding for that row
|
|
|
|
| 219 |
embeddings = model.encode(texts, show_progress_bar=True)
|
| 220 |
final_embeddings = list(embeddings)
|
| 221 |
|
| 222 |
+
# Apply PCA over the embeddings to reduce the dimensionality
|
| 223 |
+
if dimensionality_reduction:
|
| 224 |
+
if len(final_embeddings) > 0 and len(final_embeddings[0]) > dimensions: # Only apply PCA if embeddings exist and dim > dimensions
|
| 225 |
+
pca = PCA(n_components=dimensions)
|
| 226 |
+
final_embeddings = pca.fit_transform(final_embeddings)
|
| 227 |
|
| 228 |
+
return list(final_embeddings)
|
| 229 |
|
| 230 |
# ββ wrapper with caching βββββββββββββββββββββββββββββββββββββββ
|
| 231 |
def cached_generate_style_embedding(background_corpus_df: pd.DataFrame,
|
| 232 |
text_clm: str,
|
| 233 |
+
model_name: str,
|
| 234 |
+
task_authors_df: pd.DataFrame = None) -> pd.DataFrame:
|
| 235 |
"""
|
| 236 |
Wraps `generate_style_embedding`, caching its output in pickle files
|
| 237 |
keyed by an MD5 of (model_name + text list). If the cache exists,
|
| 238 |
loads and returns it instead of recomputing.
|
| 239 |
"""
|
| 240 |
|
| 241 |
+
if task_authors_df is not None:
|
| 242 |
+
print (f"concatenating task authors and background corpus authors")
|
| 243 |
+
print(f"Number of task authors: {len(task_authors_df)}")
|
| 244 |
+
print(f"task authors author_ids: {task_authors_df.authorID.tolist()}")
|
| 245 |
+
print(f"Number of background corpus authors: {len(background_corpus_df)}")
|
| 246 |
+
background_corpus_df = pd.concat([task_authors_df, background_corpus_df])
|
| 247 |
+
print(f"Number of authors after concatenation: {len(background_corpus_df)}")
|
| 248 |
+
|
| 249 |
# Gather the input texts (preserves list-of-strings if any)
|
| 250 |
texts = background_corpus_df[text_clm].fillna("").tolist()
|
| 251 |
|
|
|
|
| 267 |
with open(cache_path, "rb") as f:
|
| 268 |
return pickle.load(f)
|
| 269 |
|
| 270 |
+
else:
|
| 271 |
+
# Otherwise, compute, cache, and return
|
| 272 |
+
print(f"Computing embeddings for {model_name} on column '{text_clm}', saving to {cache_path}")
|
| 273 |
+
task_and_background_embeddings = generate_style_embedding(background_corpus_df, text_clm, model_name, dimensionality_reduction=True)
|
| 274 |
+
# Create a clean column name from the model name
|
| 275 |
+
col_name = f'{model_name.split("/")[-1]}_style_embedding'
|
| 276 |
+
background_corpus_df[col_name] = task_and_background_embeddings
|
| 277 |
+
|
| 278 |
+
with open(cache_path, "wb") as f:
|
| 279 |
+
pickle.dump(background_corpus_df, f)
|
| 280 |
+
|
| 281 |
+
if task_authors_df is not None:
|
| 282 |
+
task_authors_df = background_corpus_df[background_corpus_df.authorID.isin(task_authors_df.authorID.tolist())]
|
| 283 |
+
background_corpus_df = background_corpus_df[~background_corpus_df.authorID.isin(task_authors_df.authorID.tolist())]
|
| 284 |
+
|
| 285 |
+
return background_corpus_df, task_authors_df
|
| 286 |
|
| 287 |
def get_style_feats_distribution(documentIDs, style_feats_dict):
|
| 288 |
style_feats = []
|
utils/ui.py
CHANGED
|
@@ -102,7 +102,6 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
|
|
| 102 |
#create a dataframe of the task authors
|
| 103 |
task_authors_df = instance_to_df(instances[iid], predicted_author=predicted_author, ground_truth_author=ground_truth_author)
|
| 104 |
print(f"\n\n\n ----> Loaded task {iid} with {len(task_authors_df)} authors\n\n\n")
|
| 105 |
-
print(task_authors_df)
|
| 106 |
else:
|
| 107 |
header_html = "<h3>Custom Uploaded Task</h3>"
|
| 108 |
mystery_txt = read_txt(mystery_file)
|
|
@@ -119,15 +118,15 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
|
|
| 119 |
'a2_fullText': c3_txt
|
| 120 |
}
|
| 121 |
task_authors_df = instance_to_df(custom_task_instance)
|
| 122 |
-
print(task_authors_df)
|
| 123 |
|
| 124 |
-
print(f"Generating embeddings for {model_name} on task authors")
|
| 125 |
-
task_authors_df = cached_generate_style_embedding(task_authors_df, 'fullText', model_name)
|
| 126 |
-
print("Task authors after embedding generation:")
|
| 127 |
-
print(task_authors_df)
|
|
|
|
| 128 |
# Generate the new embedding of all the background_df authors
|
| 129 |
print(f"Generating embeddings for {model_name} on background corpus")
|
| 130 |
-
background_df = cached_generate_style_embedding(background_df, 'fullText', model_name)
|
| 131 |
print(f"Generated embeddings for {len(background_df)} texts using model '{model_name}'")
|
| 132 |
|
| 133 |
# computing g2v features
|
|
@@ -137,8 +136,6 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
|
|
| 137 |
task_authors_df['g2v_vector'] = task_authors_g2v
|
| 138 |
print(f"Gram2Vec feature generation complete")
|
| 139 |
|
| 140 |
-
print(background_df.columns)
|
| 141 |
-
|
| 142 |
if mode != "Predefined HRS Task":
|
| 143 |
# Computing predicted author by checking pairwise cosine similarity over luar embeddings
|
| 144 |
col_name = f'{model_name.split("/")[-1]}_style_embedding'
|
|
|
|
| 102 |
#create a dataframe of the task authors
|
| 103 |
task_authors_df = instance_to_df(instances[iid], predicted_author=predicted_author, ground_truth_author=ground_truth_author)
|
| 104 |
print(f"\n\n\n ----> Loaded task {iid} with {len(task_authors_df)} authors\n\n\n")
|
|
|
|
| 105 |
else:
|
| 106 |
header_html = "<h3>Custom Uploaded Task</h3>"
|
| 107 |
mystery_txt = read_txt(mystery_file)
|
|
|
|
| 118 |
'a2_fullText': c3_txt
|
| 119 |
}
|
| 120 |
task_authors_df = instance_to_df(custom_task_instance)
|
|
|
|
| 121 |
|
| 122 |
+
#print(f"Generating embeddings for {model_name} on task authors")
|
| 123 |
+
# task_authors_df = cached_generate_style_embedding(task_authors_df, 'fullText', model_name)
|
| 124 |
+
# print("Task authors after embedding generation:")
|
| 125 |
+
# print(task_authors_df)
|
| 126 |
+
|
| 127 |
# Generate the new embedding of all the background_df authors
|
| 128 |
print(f"Generating embeddings for {model_name} on background corpus")
|
| 129 |
+
background_df, task_authors_df = cached_generate_style_embedding(background_df, 'fullText', model_name, task_authors_df=task_authors_df)
|
| 130 |
print(f"Generated embeddings for {len(background_df)} texts using model '{model_name}'")
|
| 131 |
|
| 132 |
# computing g2v features
|
|
|
|
| 136 |
task_authors_df['g2v_vector'] = task_authors_g2v
|
| 137 |
print(f"Gram2Vec feature generation complete")
|
| 138 |
|
|
|
|
|
|
|
| 139 |
if mode != "Predefined HRS Task":
|
| 140 |
# Computing predicted author by checking pairwise cosine similarity over luar embeddings
|
| 141 |
col_name = f'{model_name.split("/")[-1]}_style_embedding'
|