File size: 1,709 Bytes
f71ba81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""
Randomly samples up to 10,000 rows from ../raw_data/raw_dialogsum_train.csv, 1,000 rows from ../raw_data/raw_dialogsum_test.csv, and 700 rows from ../raw_data/raw_dialogsum_val.csv (a file with fewer rows than requested is taken in full). Each sample is then written out as a JSONL file.

"""
import random
import json
import csv
import os

def reservoir_sample_csv(file_path, k):
    """Return up to ``k`` randomly chosen rows from a CSV file.

    Despite the name, this is not a streaming reservoir sampler: the whole
    file is parsed into memory and ``random.sample`` picks from the result.

    Args:
        file_path: Path to a UTF-8 encoded CSV file whose first row is the
            header.
        k: Maximum number of rows to return.

    Returns:
        A list of dicts, one per CSV row, keyed by the header names. When the
        file holds ``k`` rows or fewer, every row is returned in file order;
        otherwise ``k`` distinct rows are returned in random order.
    """
    # newline='' hands line-ending handling to the csv module, so line breaks
    # embedded in quoted fields (e.g. multi-line dialogues) are preserved
    # exactly instead of being rewritten by universal-newline translation.
    with open(file_path, 'r', encoding='utf-8', newline='') as f:
        rows = list(csv.DictReader(f))
    if len(rows) <= k:
        return rows
    return random.sample(rows, k)

def write_jsonl(rows, output_path):
    """Write DialogSum rows to ``output_path`` in JSON Lines format.

    Each row is reduced to the fields ``id``, ``dialogue``, ``summary`` and
    ``topic`` and tagged with ``"original_source": "DialogSum"``. Any other
    keys present on the row are dropped.

    Args:
        rows: Iterable of mappings that each contain the keys ``id``,
            ``dialogue``, ``summary`` and ``topic``.
        output_path: Destination file path. Missing parent directories are
            created; an existing file is overwritten.

    Raises:
        KeyError: If a row lacks one of the required keys.
    """
    parent_dir = os.path.dirname(output_path)
    # dirname is '' for a bare filename, and os.makedirs('') raises, so only
    # create the directory when the path actually has one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        for row in rows:
            new_data = {
                "id": row["id"],
                "original_source": "DialogSum",
                "dialogue": row["dialogue"],
                "summary": row["summary"],
                "topic": row["topic"],
            }
            # No indent: JSONL requires exactly one JSON object per line, and
            # pretty-printing would spread each record over several lines.
            f.write(json.dumps(new_data))
            f.write('\n')
# Script driver: sample each raw DialogSum split, then export every sample
# as JSONL. Paths are relative to the current working directory.
print("Cleaning DialogSum dataset...")

# Raw CSV inputs (train, test, validation).
ta, tb, vc = (
    '../raw_data/raw_dialogsum_train.csv',
    '../raw_data/raw_dialogsum_test.csv',
    '../raw_data/raw_dialogsum_val.csv',
)

# JSONL destinations, in the same split order as the inputs above.
train_loc, test_loc, val_loc = (
    '../clean1/ds/dialogsum_train_10k.jsonl',
    '../clean1/ds/dialogsum_test_1k.jsonl',
    '../clean1/ds/dialogsum_val_700.jsonl',
)

print("Sampling rows from raw data CSV files...")

# The generator is consumed left to right, so the splits are sampled in the
# order train -> test -> val.
train_rows, test_rows, val_rows = (
    reservoir_sample_csv(_source, _count)
    for _source, _count in ((ta, 10000), (tb, 1000), (vc, 700))
)

print("Collected Samples. Writing to JSONL files...")

for _sampled, _destination in (
    (train_rows, train_loc),
    (test_rows, test_loc),
    (val_rows, val_loc),
):
    write_jsonl(_sampled, _destination)

print("Done")