File size: 1,884 Bytes
f71ba81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""
Randomly samples 10,000 lines from ../raw_data/raw_mediasum_train_data.txt,
1,000 lines from ../raw_data/raw_mediasum_test_data.txt, and 1,000 lines from
../raw_data/raw_mediasum_val_data.txt, then converts each sample to a JSONL file.

"""
import random
import json
import os

def reservoir_sample(file_path, k):
    """Return up to ``k`` non-blank lines drawn uniformly at random from a file.

    The file is streamed once and only ``k`` lines are held in memory at any
    time (reservoir sampling, Algorithm R), so the input can be much larger
    than RAM.

    Blank or whitespace-only lines are ignored: they carry no record, and
    letting them occupy a reservoir slot would hand empty strings to any
    downstream parser.

    Args:
        file_path: Path of a UTF-8 encoded text file with one record per line.
        k: Maximum number of lines to return.

    Returns:
        A list of lines with surrounding whitespace stripped. It holds fewer
        than ``k`` entries when the file has fewer than ``k`` non-blank lines,
        in which case the lines appear in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    reservoir = []
    seen = 0  # number of non-blank lines encountered so far
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            if seen < k:
                # Fill phase: the first k records are always kept.
                reservoir.append(line)
            else:
                # Replacement phase: record number seen+1 survives with
                # probability k / (seen + 1), evicting a uniformly chosen slot.
                j = random.randint(0, seen)
                if j < k:
                    reservoir[j] = line
            seen += 1
    return reservoir

def write_jsonl(lines, output_path):
    """Re-shape raw JSON records and write them as a JSON Lines file.

    Each input line is parsed as a JSON object and reduced to the fields
    ``id``, ``url``, ``summary``, ``transcript`` (taken from ``utt``) and
    ``speaker``, plus a constant ``original_source`` of ``"MediaSum"``.
    Every record is written on exactly one physical line so the output is
    valid JSONL.

    Args:
        lines: Iterable of strings, each holding one JSON object.
        output_path: Destination file path. Missing parent directories are
            created; an existing file is overwritten.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON.
        KeyError: If a record lacks one of the required fields.
        OSError: If the destination cannot be created or written.
    """
    # dirname() is '' for a bare filename, and os.makedirs('') raises, so
    # only create the directory when the path actually names one.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        for line in lines:
            data = json.loads(line)
            new_data = {
                "id": data["id"],
                "original_source": "MediaSum",
                "url": data["url"],
                "summary": data["summary"],
                "transcript": data["utt"],
                "speaker": data["speaker"],
            }
            # No indent: pretty-printing would spread one record over many
            # lines and break line-oriented JSONL readers.
            f.write(json.dumps(new_data))
            f.write('\n')
# Driver: sample each raw MediaSum split and write the cleaned JSONL files.
# Paths are relative to the current working directory.
print("Cleaning Mediasum dataset...")

# Raw input dumps (train / test / validation).
ta, tb, vc = (
    '../raw_data/raw_mediasum_train_data.txt',
    '../raw_data/raw_mediasum_test_data.txt',
    '../raw_data/raw_mediasum_val_data.txt',
)

# Output locations for the sampled, re-shaped records.
train_loc, test_loc, val_loc = (
    '../clean1/ms/mediasum_train_10k.jsonl',
    '../clean1/ms/mediasum_test_1k.jsonl',
    '../clean1/ms/mediasum_val_1k.jsonl',
)

print("Sampling lines from raw data files...")

# The generator is consumed left to right, so the splits are sampled in the
# order train, test, val.
train_lines, test_lines, val_lines = (
    reservoir_sample(raw_path, sample_size)
    for raw_path, sample_size in ((ta, 10000), (tb, 1000), (vc, 1000))
)

print("Collected Samples. Writing to JSONL files...")

for _sampled, _destination in (
    (train_lines, train_loc),
    (test_lines, test_loc),
    (val_lines, val_loc),
):
    write_jsonl(_sampled, _destination)

print("Done")