#!/usr/bin/env python3
"""
Download the GSM8K and HumanEval datasets to local disk.
"""
import os
import json
import requests
from datasets import load_dataset
# Root directory under which each dataset gets its own sub-directory.
DATA_DIR = "/workspace/hanrui/datasets"


def _download_test_split(label, subdir, *load_args):
    """Fetch a dataset's ``test`` split and dump it as JSON Lines.

    The split is loaded with ``load_dataset(*load_args, split="test")`` and
    written to ``DATA_DIR/<subdir>/test.jsonl``, one JSON object per line.

    This is deliberately best-effort: any exception raised while creating
    the directory, loading the dataset, or writing the file is reported on
    stdout and swallowed, so a failure for one dataset does not prevent the
    script from attempting the next one.

    Args:
        label: Human-readable dataset name used in progress messages.
        subdir: Name of the sub-directory of ``DATA_DIR`` to write into.
        *load_args: Positional arguments forwarded to ``load_dataset``
            (dataset identifier and, optionally, a configuration name).
    """
    try:
        target_dir = os.path.join(DATA_DIR, subdir)
        os.makedirs(target_dir, exist_ok=True)

        print(f"Loading {label} from HuggingFace...")
        dataset = load_dataset(*load_args, split="test")

        # Save as JSONL. json.dumps escapes non-ASCII by default, so the
        # explicit encoding only pins the file to a locale-independent one.
        output_file = os.path.join(target_dir, "test.jsonl")
        with open(output_file, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(item) + "\n" for item in dataset)

        print(f"✓ {label} saved to {output_file}")
        print(f" Total samples: {len(dataset)}")
    except Exception as e:
        # Top-level boundary of the script: report and carry on.
        print(f"✗ {label} download failed: {e}")


os.makedirs(DATA_DIR, exist_ok=True)

print("=" * 60)
print("下载 GSM8K 数据集")
print("=" * 60)
_download_test_split("GSM8K", "gsm8k", "gsm8k", "main")

print("\n" + "=" * 60)
print("下载 HumanEval 数据集")
print("=" * 60)
_download_test_split("HumanEval", "humaneval", "openai_humaneval")

print("\n" + "=" * 60)
print("下载完成")
print("=" * 60)
print(f"数据保存在: {DATA_DIR}")
|