Spaces:
Build error
Build error
| """ | |
| Randomly samples 10,000 lines from ../raw_data/raw_squality_train.jsonl, 1,000 lines from ../raw_data/raw_squality_test.jsonl, and 1,000 lines from ../raw_data/raw_squality_val.jsonl, then converts each sampled record to simplified JSONL. | |
| """ | |
| import random | |
| import json | |
| import os | |
def reservoir_sample(file_path, k):
    """Return up to ``k`` lines chosen uniformly at random from a text file.

    The file is streamed once, so at most ``k`` lines are held in memory at a
    time regardless of how large the file is (reservoir sampling,
    "Algorithm R").

    Args:
        file_path: Path of a UTF-8 text file containing one record per line.
        k: Maximum number of lines to return.

    Returns:
        A list of at most ``k`` whitespace-stripped, non-empty lines. When the
        file holds no more than ``k`` non-blank lines, all of them are
        returned in file order.
    """
    reservoir = []
    seen = 0  # Number of non-blank lines consumed so far.
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) are not records.
                # Sampling them would hand an empty string to json.loads
                # later on, so they are skipped and not counted.
                continue
            if seen < k:
                # Fill phase: the first k records always enter the reservoir.
                reservoir.append(line)
            else:
                # Replacement phase: record number `seen` (0-based) replaces
                # a random slot with probability k / (seen + 1).
                j = random.randint(0, seen)
                if j < k:
                    reservoir[j] = line
            seen += 1
    return reservoir
def write_jsonl(lines, output_path):
    """Convert raw JSON records to the simplified schema and write them as JSONL.

    Each input line is parsed as a JSON object and reduced to the fields
    ``id``, ``original_source``, ``source_type``, ``query``, ``summary`` and
    ``document``. Every output record is written on exactly one line so the
    result is valid JSON Lines and can be read back line by line.

    Args:
        lines: Iterable of JSON-encoded strings, one object per string.
        output_path: Destination file path. Missing parent directories are
            created; a bare file name (no directory part) is written to the
            current working directory.

    Raises:
        json.JSONDecodeError: If one of ``lines`` is not valid JSON.
    """
    out_dir = os.path.dirname(output_path)
    # dirname() is '' for a bare file name, and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        for i, line in enumerate(lines):
            data = json.loads(line)

            # Extract key fields from the data; absent fields become "".
            document = data.get("document", "")
            new_data = {
                "id": i,  # Position within this output file, not a source id.
                "original_source": "SQuALITY",
                "source_type": data.get("source_type", ""),
                "query": data.get("query_synthesized", ""),
                "summary": data.get("summary", ""),
                # Truncate long documents to their first 500 characters.
                "document": document[:500] if document else "",
            }

            # No `indent`: pretty-printing would spread one record over
            # several lines and break the one-object-per-line JSONL format.
            f.write(json.dumps(new_data))
            f.write('\n')
# Driver: sample every raw split first, then write each sample out in the
# simplified format. All paths are relative to the current working directory.
print("Cleaning SQuALITY dataset...")

# One entry per split: (raw input path, cleaned output path, lines to sample).
splits = [
    (
        '../raw_data/raw_squality_train.jsonl',
        '../clean1/squality/squality_train_10k.jsonl',
        10000,
    ),
    (
        '../raw_data/raw_squality_test.jsonl',
        '../clean1/squality/squality_test_1k.jsonl',
        1000,
    ),
    (
        '../raw_data/raw_squality_val.jsonl',
        '../clean1/squality/squality_val_1k.jsonl',
        1000,
    ),
]

print("Sampling lines from raw data files...")
# Sampling runs in train, test, val order before anything is written.
sampled = [reservoir_sample(src, count) for src, _, count in splits]

print("Collected Samples. Writing to JSONL files...")
for (_, dst, _), chosen in zip(splits, sampled):
    write_jsonl(chosen, dst)

print("Done")