Spaces:
Build error
Build error
| """ | |
| Randomly samples up to 10,000 rows from ../raw_data/raw_dialogsum_train.csv, 1,000 rows from ../raw_data/raw_dialogsum_test.csv, and 700 rows from ../raw_data/raw_dialogsum_val.csv, then writes each sample to a JSONL file. | |
| """ | |
| import random | |
| import json | |
| import csv | |
| import os | |
def reservoir_sample_csv(file_path, k):
    """Return up to ``k`` randomly chosen rows from a CSV file.

    The file is parsed with ``csv.DictReader``, so every row is a dict keyed
    by the column names in the header line. Despite the name, this is not a
    streaming reservoir sampler: all rows are loaded into memory and
    ``random.sample`` picks the subset.

    Args:
        file_path: Path to a UTF-8 encoded CSV file with a header row.
        k: Maximum number of rows to return.

    Returns:
        Every row, in file order, when the file holds ``k`` or fewer rows;
        otherwise a list of ``k`` rows drawn without replacement.

    Raises:
        ValueError: If ``k`` is negative (raised by ``random.sample``).
    """
    # newline='' hands line-ending handling to the csv module, so "\r\n"
    # inside a quoted field (e.g. a multi-line dialogue) is kept as written
    # instead of being rewritten by universal-newline translation.
    with open(file_path, 'r', encoding='utf-8', newline='') as f:
        rows = list(csv.DictReader(f))

    if len(rows) <= k:
        return rows
    return random.sample(rows, k)
def write_jsonl(rows, output_path):
    """Write DialogSum rows to ``output_path`` in JSON Lines format.

    Each row becomes one JSON object on its own line, holding the row's
    ``id``, ``dialogue``, ``summary`` and ``topic`` values plus a constant
    ``original_source`` of ``"DialogSum"``. Any other keys in the row are
    dropped. An existing file at ``output_path`` is overwritten.

    Args:
        rows: Iterable of mappings that each provide the keys ``id``,
            ``dialogue``, ``summary`` and ``topic``.
        output_path: Destination file path. Missing parent directories are
            created.

    Raises:
        KeyError: If a row lacks one of the required keys.
    """
    # dirname() is '' for a bare filename, and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when one is named.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        for row in rows:
            record = {
                "id": row["id"],
                "original_source": "DialogSum",
                "dialogue": row["dialogue"],
                "summary": row["summary"],
                "topic": row["topic"],
            }
            # No indent: JSON Lines requires exactly one object per line,
            # and pretty-printing would spread a record over several lines.
            json.dump(record, f)
            f.write('\n')
# Script body: sample each raw DialogSum CSV split, then write each sample
# out as JSONL. All paths are relative to the current working directory.
print("Cleaning DialogSum dataset...")

# Raw CSV inputs for the train, test and validation splits.
ta, tb, vc = (
    '../raw_data/raw_dialogsum_train.csv',
    '../raw_data/raw_dialogsum_test.csv',
    '../raw_data/raw_dialogsum_val.csv',
)

# JSONL destinations, in the same split order as the inputs above.
train_loc, test_loc, val_loc = (
    '../clean1/ds/dialogsum_train_10k.jsonl',
    '../clean1/ds/dialogsum_test_1k.jsonl',
    '../clean1/ds/dialogsum_val_700.jsonl',
)

print("Sampling rows from raw data CSV files...")
# All three splits are sampled before anything is written.
train_rows, test_rows, val_rows = (
    reservoir_sample_csv(csv_path, sample_size)
    for csv_path, sample_size in ((ta, 10000), (tb, 1000), (vc, 700))
)

print("Collected Samples. Writing to JSONL files...")
for sampled_rows, destination in (
    (train_rows, train_loc),
    (test_rows, test_loc),
    (val_rows, val_loc),
):
    write_jsonl(sampled_rows, destination)

print("Done")