File size: 1,653 Bytes
7c50656
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python3
"""
Download the GSM8K and HumanEval test splits from HuggingFace and save them locally as JSONL files.
"""
import os
import json
import requests
from datasets import load_dataset

# Root directory under which every dataset gets its own sub-directory.
DATA_DIR = "/workspace/hanrui/datasets"
os.makedirs(DATA_DIR, exist_ok=True)

# Horizontal rule used to frame each section title on stdout.
_RULE = "=" * 60


def _print_banner(title, *, leading_blank=False):
    """Print *title* framed by two 60-character ``=`` rules.

    Args:
        title: Text shown between the two rules.
        leading_blank: When true, emit one empty line before the first rule
            to separate this banner from the previous section's output.
    """
    print(("\n" if leading_blank else "") + _RULE)
    print(title)
    print(_RULE)


def _export_test_split(label, subdir, *load_args):
    """Load a dataset's ``test`` split and dump it to ``test.jsonl``.

    The output goes to ``DATA_DIR/<subdir>/test.jsonl``, one JSON object per
    line. Any exception raised along the way is caught and reported on
    stdout instead of being propagated, so a failure for one dataset does
    not prevent the next one from being attempted.

    Args:
        label: Human-readable dataset name used in the progress messages.
        subdir: Name of the directory created under ``DATA_DIR``.
        *load_args: Positional arguments forwarded unchanged to
            ``load_dataset`` (dataset path and, optionally, config name).
    """
    try:
        target_dir = os.path.join(DATA_DIR, subdir)
        os.makedirs(target_dir, exist_ok=True)

        print(f"Loading {label} from HuggingFace...")
        dataset = load_dataset(*load_args, split="test")

        # Save as JSONL. json.dumps escapes non-ASCII characters by default,
        # so pinning the encoding does not change the bytes written; it only
        # makes the result independent of the platform's locale.
        output_file = os.path.join(target_dir, "test.jsonl")
        with open(output_file, 'w', encoding="utf-8") as f:
            f.writelines(json.dumps(item) + '\n' for item in dataset)

        print(f"✓ {label} saved to {output_file}")
        print(f"  Total samples: {len(dataset)}")

    except Exception as e:
        # Deliberately best-effort: report the failure and carry on.
        # NOTE: a failure here is only printed; the script still reaches the
        # closing banner below and ends normally.
        print(f"✗ {label} download failed: {e}")


_print_banner("下载 GSM8K 数据集")
_export_test_split("GSM8K", "gsm8k", "gsm8k", "main")

_print_banner("下载 HumanEval 数据集", leading_blank=True)
_export_test_split("HumanEval", "humaneval", "openai_humaneval")

_print_banner("下载完成", leading_blank=True)
print(f"数据保存在: {DATA_DIR}")