Spaces:

ExplainabiliyForAATeam
/

explainability-tool-for-aa

Running

App Files Files Community

Milad Alshomary committed on Oct 23

Commit

f61a01f

1 Parent(s): 161cb59

updates

Browse files

Files changed (1) hide show

prepare_data.py +7 -12

prepare_data.py CHANGED Viewed

@@ -44,7 +44,7 @@ def sample_ds(input_file, output_file, num_insts=10000, min_num_text_per_inst=0,
     df = pd.DataFrame(out_list)
     df.to_pickle(output_file)
-def get_reddit_data(input_path, random_seed=123, num_instances=50, num_documents_per_author=8, min_instance_len=10):
     df = pd.read_pickle(open(input_path, 'rb'))
     df['fullText'] = df.fullText.map(lambda x: [d for d in x if len(d.split()) > min_instance_len])
@@ -70,17 +70,12 @@ def get_reddit_data(input_path, random_seed=123, num_instances=50, num_documents
         other_authors_df = df[df.authorID != row['authorID']]
         other_two_authors = other_authors_df.sample(2, random_state=random_seed)
-        # output_objs.append({
-        #     "Q_authorID":  str(row["authorID"]),
-        #     "Q_fullText": "\n\n".join(["Text:\n{}".format(d) for d in query_documents]),
-        #     "a0_authorID":  str(other_two_authors.iloc[0]["authorID"]),
-        #     "a0_fullText": "\n\n".join(["Text:\n{}".format(d) for d in other_two_authors.iloc[0]["fullText"][:num_documents_per_author]]),
-        #     "a1_authorID":  str(other_two_authors.iloc[1]["authorID"]),
-        #     "a1_fullText": "\n\n".join(["Text:\n{}".format(d) for d in other_two_authors.iloc[1]["fullText"][:num_documents_per_author]]),
-        #     "a2_authorID": str(row["authorID"]) + "_correct",
-        #     "a2_fullText": "\n\n".join(["Text:\n{}".format(d) for d in correct_documents]),
-        #     "gt_idx": 2
-        # })
         output_objs.append({
             "Q_authorID":  str(row["authorID"]),
             "Q_fullText":  ["Text:\n{}".format(d) for d in query_documents],

     df = pd.DataFrame(out_list)
     df.to_pickle(output_file)
+def get_reddit_data(input_path, random_seed=123, num_instances=100, num_documents_per_author=8, min_instance_len=10):
     df = pd.read_pickle(open(input_path, 'rb'))
     df['fullText'] = df.fullText.map(lambda x: [d for d in x if len(d.split()) > min_instance_len])
         other_authors_df = df[df.authorID != row['authorID']]
         other_two_authors = other_authors_df.sample(2, random_state=random_seed)
+        # Make sure all authors have an equivalent number of texts
+        min_found_texts = min([len(correct_documents), len(query_documents)] + [len(x) for x in other_two_authors.fullText.tolist()])
+        query_documents = query_documents[:min_found_texts]
+        correct_documents = correct_documents[:min_found_texts]
+        other_two_authors.fullText = other_two_authors.fullText.apply(lambda x: x[:min_found_texts])
         output_objs.append({
             "Q_authorID":  str(row["authorID"]),
             "Q_fullText":  ["Text:\n{}".format(d) for d in query_documents],