Milad Alshomary
committed on
Commit
·
f61a01f
1
Parent(s):
161cb59
updates
Browse files- prepare_data.py +7 -12
prepare_data.py
CHANGED
|
@@ -44,7 +44,7 @@ def sample_ds(input_file, output_file, num_insts=10000, min_num_text_per_inst=0,
|
|
| 44 |
df = pd.DataFrame(out_list)
|
| 45 |
df.to_pickle(output_file)
|
| 46 |
|
| 47 |
-
def get_reddit_data(input_path, random_seed=123, num_instances=50, num_documents_per_author=8, min_instance_len=10):
|
| 48 |
|
| 49 |
df = pd.read_pickle(open(input_path, 'rb'))
|
| 50 |
df['fullText'] = df.fullText.map(lambda x: [d for d in x if len(d.split()) > min_instance_len])
|
|
@@ -70,17 +70,12 @@ def get_reddit_data(input_path, random_seed=123, num_instances=50, num_documents
|
|
| 70 |
other_authors_df = df[df.authorID != row['authorID']]
|
| 71 |
other_two_authors = other_authors_df.sample(2, random_state=random_seed)
|
| 72 |
|
| 73 |
-
#
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
# "a1_fullText": "\n\n".join(["Text:\n{}".format(d) for d in other_two_authors.iloc[1]["fullText"][:num_documents_per_author]]),
|
| 80 |
-
# "a2_authorID": str(row["authorID"]) + "_correct",
|
| 81 |
-
# "a2_fullText": "\n\n".join(["Text:\n{}".format(d) for d in correct_documents]),
|
| 82 |
-
# "gt_idx": 2
|
| 83 |
-
# })
|
| 84 |
output_objs.append({
|
| 85 |
"Q_authorID": str(row["authorID"]),
|
| 86 |
"Q_fullText": ["Text:\n{}".format(d) for d in query_documents],
|
|
|
|
| 44 |
df = pd.DataFrame(out_list)
|
| 45 |
df.to_pickle(output_file)
|
| 46 |
|
| 47 |
+
def get_reddit_data(input_path, random_seed=123, num_instances=100, num_documents_per_author=8, min_instance_len=10):
|
| 48 |
|
| 49 |
df = pd.read_pickle(open(input_path, 'rb'))
|
| 50 |
df['fullText'] = df.fullText.map(lambda x: [d for d in x if len(d.split()) > min_instance_len])
|
|
|
|
| 70 |
other_authors_df = df[df.authorID != row['authorID']]
|
| 71 |
other_two_authors = other_authors_df.sample(2, random_state=random_seed)
|
| 72 |
|
| 73 |
+
# Make sure all authors have an equivalent number of texts
|
| 74 |
+
min_found_texts = min([len(correct_documents), len(query_documents)] + [len(x) for x in other_two_authors.fullText.tolist()])
|
| 75 |
+
query_documents = query_documents[:min_found_texts]
|
| 76 |
+
correct_documents = correct_documents[:min_found_texts]
|
| 77 |
+
other_two_authors.fullText = other_two_authors.fullText.apply(lambda x: x[:min_found_texts])
|
| 78 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
output_objs.append({
|
| 80 |
"Q_authorID": str(row["authorID"]),
|
| 81 |
"Q_fullText": ["Text:\n{}".format(d) for d in query_documents],
|