Milad Alshomary committed
Commit · e5d9888
Parent(s): e7bcc02
updates

Files changed:
- prepare_data.py +23 -8
- utils/interp_space_utils.py +1 -1
- utils/ui.py +6 -0
prepare_data.py
CHANGED
@@ -44,19 +44,22 @@ def sample_ds(input_file, output_file, num_insts=10000, min_num_text_per_inst=0,
     df = pd.DataFrame(out_list)
     df.to_pickle(output_file)

-def get_reddit_data(input_path, random_seed=123, num_instances=50, num_documents_per_author=
+def get_reddit_data(input_path, random_seed=123, num_instances=50, num_documents_per_author=8, min_instance_len=10):

     df = pd.read_pickle(open(input_path, 'rb'))
+    df['fullText'] = df.fullText.map(lambda x: [d for d in x if len(d.split()) > min_instance_len])
+    df = df[df.fullText.str.len() > num_documents_per_author * 2]
+
     output_objs = []

-    for
+    for _, row in df.iterrows():

         # Get the current author's documents
         query_author_df = df[df.authorID == row['authorID']]
         # split the author's documents into two: query and correct author
-        author_documents = query_author_df.fullText.tolist()[0]
+        author_documents = [x for x in query_author_df.fullText.tolist()[0] if len(x.split()) > min_instance_len]

-        if len(author_documents)
+        if len(author_documents) <= num_documents_per_author * 2:
             continue

         query_documents = author_documents[:num_documents_per_author]
@@ -67,17 +70,29 @@ def get_reddit_data(input_path, random_seed=123, num_instances=50, num_documents
         other_authors_df = df[df.authorID != row['authorID']]
         other_two_authors = other_authors_df.sample(2, random_state=random_seed)

+        # output_objs.append({
+        #     "Q_authorID": str(row["authorID"]),
+        #     "Q_fullText": "\n\n".join(["Text:\n{}".format(d) for d in query_documents]),
+        #     "a0_authorID": str(other_two_authors.iloc[0]["authorID"]),
+        #     "a0_fullText": "\n\n".join(["Text:\n{}".format(d) for d in other_two_authors.iloc[0]["fullText"][:num_documents_per_author]]),
+        #     "a1_authorID": str(other_two_authors.iloc[1]["authorID"]),
+        #     "a1_fullText": "\n\n".join(["Text:\n{}".format(d) for d in other_two_authors.iloc[1]["fullText"][:num_documents_per_author]]),
+        #     "a2_authorID": str(row["authorID"]) + "_correct",
+        #     "a2_fullText": "\n\n".join(["Text:\n{}".format(d) for d in correct_documents]),
+        #     "gt_idx": 2
+        # })
         output_objs.append({
             "Q_authorID": str(row["authorID"]),
-            "Q_fullText": query_documents,
+            "Q_fullText": ["Text:\n{}".format(d) for d in query_documents],
             "a0_authorID": str(other_two_authors.iloc[0]["authorID"]),
-            "a0_fullText": other_two_authors.iloc[0]["fullText"][:num_documents_per_author],
+            "a0_fullText": ["Text:\n{}".format(d) for d in other_two_authors.iloc[0]["fullText"][:num_documents_per_author]],
             "a1_authorID": str(other_two_authors.iloc[1]["authorID"]),
-            "a1_fullText": other_two_authors.iloc[1]["fullText"][:num_documents_per_author],
+            "a1_fullText": ["Text:\n{}".format(d) for d in other_two_authors.iloc[1]["fullText"][:num_documents_per_author]],
             "a2_authorID": str(row["authorID"]) + "_correct",
-            "a2_fullText": correct_documents,
+            "a2_fullText": ["Text:\n{}".format(d) for d in correct_documents],
             "gt_idx": 2
         })
+        print( "Text:\n\n".join(query_documents))
         random_seed += 1  # Increment seed to get different authors for the next task
         if len(output_objs) >= num_instances:
             break
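For orientation, a minimal sketch of how the revised get_reddit_data could be exercised. The pickle path is hypothetical, and the sketch assumes the function returns the output_objs list it accumulates and that the pickled DataFrame has authorID and fullText (list of strings) columns, as the diff implies; none of this is part of the commit itself.

    # Hypothetical usage sketch; the path, import layout, and return value are assumptions.
    from prepare_data import get_reddit_data

    tasks = get_reddit_data(
        "data/reddit_authors.pkl",   # assumed path, not from the commit
        random_seed=123,
        num_instances=5,
        num_documents_per_author=8,
        min_instance_len=10,
    )

    # Each task pairs a query author with three candidates; per the diff, index 2 is the true author.
    for task in tasks:
        assert task["gt_idx"] == 2
        assert task["a2_authorID"].endswith("_correct")
        print(task["Q_authorID"], len(task["Q_fullText"]), "query documents")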
utils/interp_space_utils.py
CHANGED
@@ -61,7 +61,7 @@ def compute_g2v_features(clustered_authors_df: pd.DataFrame, task_authors_df: pd
     # Gather the input texts (preserves list-of-strings if any)
     #texts = background_corpus_df[text_clm].fillna("").tolist()
     author_texts = ['\n\n'.join(x) for x in clustered_authors_df.fullText.tolist()]
-
+    print('author_text at 0:{}'.format(author_texts[0]))
     print(f"Number of author_texts: {len(author_texts)}")

     # Create a reproducible JSON serialization of the texts
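The new debug print inspects the first joined author text. A tiny illustration of what '\n\n'.join(x) produces for one author's document list; the example documents are invented, only the join mirrors the code above.

    # Invented example documents; illustrates the join used in compute_g2v_features.
    author_docs = ["First post by the author.", "Second post by the same author."]
    author_text = '\n\n'.join(author_docs)
    print('author_text at 0:{}'.format(author_text))
    # The two documents come out as one string, separated by a blank line.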
utils/ui.py
CHANGED
@@ -159,6 +159,12 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
     ]

 def task_HTML(mystery_text, candidate_texts, predicted_author, ground_truth_author):
+
+    # if any of the texts is a list of text then concatenate them
+    if isinstance(mystery_text, list):
+        mystery_text = "\n\n".join(["Text: {}".format(x) for x in mystery_text])
+        candidate_texts = ["\n\n".join(["Text: {}".format(t) for t in x]) for x in candidate_texts]
+
     header_html = f"""
     <div style="border:1px solid #ccc; padding:10px; margin-bottom:10px;">
     <h3>Here’s the mystery passage alongside three candidate texts—look for the green highlight to see the predicted author.</h3>
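To make the new guard in task_HTML concrete, a small sketch with invented inputs showing how list-valued texts are flattened before the HTML is built. The sample strings are not from the repository; only the isinstance guard mirrors the code added above.

    # Invented sample inputs; mirrors the list-handling guard added to task_HTML.
    mystery_text = ["First mystery snippet.", "Second mystery snippet."]
    candidate_texts = [["cand0 doc A", "cand0 doc B"], ["cand1 doc A"], ["cand2 doc A"]]

    if isinstance(mystery_text, list):
        mystery_text = "\n\n".join(["Text: {}".format(x) for x in mystery_text])
        candidate_texts = ["\n\n".join(["Text: {}".format(t) for t in x]) for x in candidate_texts]

    print(mystery_text)
    # Text: First mystery snippet.
    #
    # Text: Second mystery snippet.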