Spaces:
Build error
Build error
| """ | |
| Randomly samples 10,000 lines from ../raw_data/raw_mediasum_train_data.txt, 1,000 lines from ../raw_data/raw_mediasum_test_data.txt, and 1,000 lines from ../raw_data/raw_mediasum_val_data.txt, then converts each sample to JSONL. | |
| """ | |
| import random | |
| import json | |
| import os | |
def reservoir_sample(file_path, k, *, rng=None):
    """Return up to ``k`` randomly chosen non-blank lines from a text file.

    The file is streamed once, so memory use is bounded by ``k`` lines
    regardless of file size (reservoir sampling, Algorithm R).

    Args:
        file_path: Path of the UTF-8 text file to sample from.
        k: Maximum number of lines to return. If the file holds fewer than
            ``k`` non-blank lines, all of them are returned in file order.
            A value of 0 (or less) yields an empty list.
        rng: Optional object exposing ``randint(a, b)`` (for example a
            ``random.Random`` instance) used for the random draws. Defaults
            to the module-level ``random`` generator.

    Returns:
        A list of sampled lines with surrounding whitespace stripped.

    Raises:
        OSError: If the file cannot be opened.
    """
    rng = random if rng is None else rng
    reservoir = []
    seen = 0  # count of non-blank lines read before the current one
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no record;
                # sampling them would hand an empty string to the JSON parser.
                continue
            if seen < k:
                # Fill the reservoir with the first k usable lines.
                reservoir.append(line)
            else:
                # Replace a random slot with probability k / (seen + 1).
                j = rng.randint(0, seen)
                if j < k:
                    reservoir[j] = line
            seen += 1
    return reservoir
def write_jsonl(lines, output_path):
    """Convert raw JSON record strings to the cleaned schema and write JSONL.

    Each input line is parsed as JSON and re-emitted as exactly one line of
    output containing the keys ``id``, ``original_source`` (always
    ``"MediaSum"``), ``url``, ``summary``, ``transcript`` (taken from the
    input's ``utt`` field) and ``speaker``.

    Args:
        lines: Iterable of strings, each holding one JSON object.
        output_path: Destination file. Missing parent directories are
            created; an existing file is overwritten.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON.
        KeyError: If a record lacks one of the required fields.
        OSError: If the directory or file cannot be created.
    """
    # os.path.dirname() returns '' for a bare filename, and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when one is named.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        for line in lines:
            data = json.loads(line)
            new_data = {
                "id": data["id"],
                "original_source": "MediaSum",
                "url": data["url"],
                "summary": data["summary"],
                "transcript": data["utt"],
                "speaker": data["speaker"],
            }
            # JSONL requires one object per physical line, so serialize
            # compactly (no indent) and terminate with a single newline.
            f.write(json.dumps(new_data))
            f.write('\n')
# Script body: sample each raw MediaSum split, then export the samples.
print("Cleaning Mediasum dataset...")

# Destination files for the cleaned samples.
train_loc = '../clean1/ms/mediasum_train_10k.jsonl'
test_loc = '../clean1/ms/mediasum_test_1k.jsonl'
val_loc = '../clean1/ms/mediasum_val_1k.jsonl'

# Raw input files (paths are relative to the current working directory).
ta = '../raw_data/raw_mediasum_train_data.txt'
tb = '../raw_data/raw_mediasum_test_data.txt'
vc = '../raw_data/raw_mediasum_val_data.txt'

print("Sampling lines from raw data files...")
# Splits are sampled in train, test, val order: 10,000 / 1,000 / 1,000 lines.
train_lines, test_lines, val_lines = (
    reservoir_sample(_source, _size)
    for _source, _size in ((ta, 10000), (tb, 1000), (vc, 1000))
)

print("Collected Samples. Writing to JSONL files...")
# Outputs are written in the same train, test, val order.
for _lines, _destination in (
    (train_lines, train_loc),
    (test_lines, test_loc),
    (val_lines, val_loc),
):
    write_jsonl(_lines, _destination)

print("Done")