Milad Alshomary committed on
Commit
f61a01f
·
1 Parent(s): 161cb59
Files changed (1) hide show
  1. prepare_data.py +7 -12
prepare_data.py CHANGED
@@ -44,7 +44,7 @@ def sample_ds(input_file, output_file, num_insts=10000, min_num_text_per_inst=0,
44
  df = pd.DataFrame(out_list)
45
  df.to_pickle(output_file)
46
 
47
- def get_reddit_data(input_path, random_seed=123, num_instances=50, num_documents_per_author=8, min_instance_len=10):
48
 
49
  df = pd.read_pickle(open(input_path, 'rb'))
50
  df['fullText'] = df.fullText.map(lambda x: [d for d in x if len(d.split()) > min_instance_len])
@@ -70,17 +70,12 @@ def get_reddit_data(input_path, random_seed=123, num_instances=50, num_documents
70
  other_authors_df = df[df.authorID != row['authorID']]
71
  other_two_authors = other_authors_df.sample(2, random_state=random_seed)
72
 
73
- # output_objs.append({
74
- # "Q_authorID": str(row["authorID"]),
75
- # "Q_fullText": "\n\n".join(["Text:\n{}".format(d) for d in query_documents]),
76
- # "a0_authorID": str(other_two_authors.iloc[0]["authorID"]),
77
- # "a0_fullText": "\n\n".join(["Text:\n{}".format(d) for d in other_two_authors.iloc[0]["fullText"][:num_documents_per_author]]),
78
- # "a1_authorID": str(other_two_authors.iloc[1]["authorID"]),
79
- # "a1_fullText": "\n\n".join(["Text:\n{}".format(d) for d in other_two_authors.iloc[1]["fullText"][:num_documents_per_author]]),
80
- # "a2_authorID": str(row["authorID"]) + "_correct",
81
- # "a2_fullText": "\n\n".join(["Text:\n{}".format(d) for d in correct_documents]),
82
- # "gt_idx": 2
83
- # })
84
  output_objs.append({
85
  "Q_authorID": str(row["authorID"]),
86
  "Q_fullText": ["Text:\n{}".format(d) for d in query_documents],
 
44
  df = pd.DataFrame(out_list)
45
  df.to_pickle(output_file)
46
 
47
+ def get_reddit_data(input_path, random_seed=123, num_instances=100, num_documents_per_author=8, min_instance_len=10):
48
 
49
  df = pd.read_pickle(open(input_path, 'rb'))
50
  df['fullText'] = df.fullText.map(lambda x: [d for d in x if len(d.split()) > min_instance_len])
 
70
  other_authors_df = df[df.authorID != row['authorID']]
71
  other_two_authors = other_authors_df.sample(2, random_state=random_seed)
72
 
73
+ # Make sure all authors have an equal number of texts
74
+ min_found_texts = min([len(correct_documents), len(query_documents)] + [len(x) for x in other_two_authors.fullText.tolist()])
75
+ query_documents = query_documents[:min_found_texts]
76
+ correct_documents = correct_documents[:min_found_texts]
77
+ other_two_authors.fullText = other_two_authors.fullText.apply(lambda x: x[:min_found_texts])
78
+
 
 
 
 
 
79
  output_objs.append({
80
  "Q_authorID": str(row["authorID"]),
81
  "Q_fullText": ["Text:\n{}".format(d) for d in query_documents],