File size: 1,530 Bytes
f179148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""
Randomly sample up to 10,000 rows from ../raw_data/raw_data_msmarco_train.csv
and up to 1,000 rows from ../raw_data/raw_data_msmarco_val.csv, then write each
sample to its own JSONL file under ../clean1/msm/.

"""
import random
import json
import csv
import os

def reservoir_sample_csv(file_path, k):
    """Return up to ``k`` uniformly sampled rows from a CSV file.

    The file is read once with ``csv.DictReader`` (first line is the header)
    and sampled with reservoir sampling (Algorithm R), so at most ``k`` rows
    are held in memory regardless of the file size.

    Args:
        file_path: Path of a UTF-8 encoded CSV file with a header row.
        k: Maximum number of rows to return.

    Returns:
        A list of row dicts. When the file has ``k`` rows or fewer, every row
        is returned in file order. Otherwise exactly ``k`` rows are returned;
        each row of the file is equally likely to be included, but the order
        of the returned list is not a uniform shuffle.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("Sample size k must be non-negative")

    reservoir = []
    # newline='' is required by the csv module so that line breaks embedded
    # in quoted fields are passed through untouched.
    with open(file_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        for seen, row in enumerate(reader, start=1):
            if len(reservoir) < k:
                # Fill phase: the first k rows are always kept.
                reservoir.append(row)
                continue
            # Replace phase: the current row survives with probability
            # k / seen, evicting a uniformly chosen earlier pick.
            slot = random.randrange(seen)
            if slot < k:
                reservoir[slot] = row
    return reservoir

def write_jsonl(rows, output_path):
    """Write MSMarco rows to ``output_path`` in JSON Lines format.

    Every row becomes one JSON object on its own line with these keys:
    ``id`` (the row's position in ``rows``), ``original_source`` (always
    ``"MSMarco"``), ``query``, ``answers`` and ``passage`` (read from the
    row's ``finalpassage`` column). A column missing from a row is written
    as an empty string.

    Args:
        rows: Iterable of dict-like rows (each must provide ``.get``).
        output_path: Destination file. Missing parent directories are
            created; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(output_path)
    # dirname is "" for a bare file name, and os.makedirs("") raises.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        for i, row in enumerate(rows):
            new_data = {
                "id": i,
                "original_source": "MSMarco",
                "query": row.get("query", ""),
                "answers": row.get("answers", ""),
                "passage": row.get("finalpassage", ""),
            }
            # No ``indent``: JSONL requires each record on exactly one line.
            f.write(json.dumps(new_data))
            f.write('\n')

# Script body: these statements run at module top level, in order.
print("Cleaning MSMarco dataset...")

# Input CSVs. The paths are relative, so they resolve against the current
# working directory of the process.
ta = '../raw_data/raw_data_msmarco_train.csv'
tb = '../raw_data/raw_data_msmarco_val.csv'

# Output files. NOTE: ``test_loc`` / ``test_rows`` hold the *validation*
# split (sampled from ``tb``), despite the "test" in their names.
train_loc = '../clean1/msm/msmarco_train_10k.jsonl'
test_loc = '../clean1/msm/msmarco_val_1k.jsonl'

print("Sampling rows from raw data CSV files...")

# Keep at most 10,000 training rows and at most 1,000 validation rows.
# The random module is not seeded here, so each run draws a different sample.
train_rows = reservoir_sample_csv(ta, 10000)
test_rows = reservoir_sample_csv(tb, 1000)

print("Collected Samples. Writing to JSONL files...")

# Each call creates the output directory if needed and overwrites the file.
write_jsonl(train_rows, train_loc)
write_jsonl(test_rows, test_loc)

print("Done")