File size: 2,435 Bytes
1b35d41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
"""Split timemachine-bench-verified.jsonl into train/eval with stratified difficulty.

Usage:
    python code_migration/split_dataset.py [--eval-ratio 0.2] [--seed 42]

Output:
    code_migration/data/train.jsonl
    code_migration/data/eval.jsonl
"""

import json
import random
import argparse
from pathlib import Path
from collections import Counter

# Directory holding the dataset files; it sits next to this script.
DATA_DIR = Path(__file__).parent.joinpath("data")
# Input file that gets split into the train/eval JSONL files.
SOURCE = DATA_DIR.joinpath("timemachine-bench-verified.jsonl")


# Label assumed for tasks that carry no "difficulty" key.
_DEFAULT_DIFFICULTY = "Easy"
# Difficulty levels that are always reported, in this order.
_KNOWN_DIFFICULTIES = ("Easy", "Medium", "Hard")


def _difficulty_of(task):
    """Return the task's difficulty label, falling back to the default when absent."""
    return task.get("difficulty", _DEFAULT_DIFFICULTY)


def _load_tasks(path):
    """Parse one JSON value per non-blank line of the UTF-8 JSONL file at *path*."""
    tasks = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:  # tolerate blank / trailing empty lines
                tasks.append(json.loads(line))
    return tasks


def _write_jsonl(path, tasks):
    """Write *tasks* to *path* as UTF-8 JSONL, one JSON document per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(t, ensure_ascii=False) + "\n" for t in tasks)


def _stratified_split(tasks, eval_ratio, rng):
    """Split *tasks* into (train, eval, rows), stratified by difficulty.

    Each non-empty difficulty group contributes at least one task to eval.
    *rows* holds one ``(difficulty, total, n_train, n_eval)`` tuple per group,
    in processing order, for reporting.
    """
    by_difficulty = {}
    for task in tasks:
        by_difficulty.setdefault(_difficulty_of(task), []).append(task)

    # Unknown labels are processed after the known ones (sorted for
    # determinism) so they are kept in the output instead of being dropped,
    # while the random sequence for Easy/Medium/Hard-only data is unchanged.
    extra = sorted(
        (d for d in by_difficulty if d not in _KNOWN_DIFFICULTIES), key=str
    )

    train, eval_, rows = [], [], []
    for difficulty in (*_KNOWN_DIFFICULTIES, *extra):
        group = by_difficulty.get(difficulty, [])
        rng.shuffle(group)

        # At least one eval task per group, but never more than the group has
        # (an empty group yields 0, not the nonsensical "-1 train, 1 eval").
        n_eval = min(len(group), max(1, int(len(group) * eval_ratio)))

        eval_.extend(group[:n_eval])
        train.extend(group[n_eval:])
        rows.append((difficulty, len(group), len(group) - n_eval, n_eval))

    # Shuffle within splits so tasks are not ordered by difficulty.
    rng.shuffle(train)
    rng.shuffle(eval_)
    return train, eval_, rows


def main(argv=None):
    """Split the source JSONL into train/eval files, stratified by difficulty.

    Args:
        argv: Optional list of command-line arguments. When ``None`` (the
            default) the arguments are taken from ``sys.argv`` by argparse.

    Command-line options:
        --eval-ratio  Fraction of each difficulty group sent to eval (0..1).
        --seed        Seed for the private random generator.
        --source      Input JSONL file; defaults to the module-level SOURCE.
        --out-dir     Output directory; defaults to the module-level DATA_DIR.

    Writes ``train.jsonl`` and ``eval.jsonl`` into the output directory and
    prints a summary of the split to stdout. Exits with an argparse usage
    error when ``--eval-ratio`` lies outside [0, 1].
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--eval-ratio", type=float, default=0.2)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--source", type=Path, default=None,
                        help="input JSONL file (default: the bundled dataset)")
    parser.add_argument("--out-dir", type=Path, default=None,
                        help="directory for train.jsonl / eval.jsonl")
    args = parser.parse_args(argv)

    # Also rejects NaN, for which both comparisons are false.
    if not 0.0 <= args.eval_ratio <= 1.0:
        parser.error("--eval-ratio must be between 0 and 1")

    # Module constants are only consulted when no override was given.
    source = args.source if args.source is not None else SOURCE
    out_dir = args.out_dir if args.out_dir is not None else DATA_DIR

    # Load all tasks
    tasks = _load_tasks(source)

    # A private generator produces the same sequence as seeding the global
    # one, without clobbering random state for any importing code.
    rng = random.Random(args.seed)

    print(f"Source: {source} ({len(tasks)} tasks)")
    print(f"Eval ratio: {args.eval_ratio}, Seed: {args.seed}")
    print()

    train, eval_, rows = _stratified_split(tasks, args.eval_ratio, rng)
    for difficulty, total, n_train, n_eval in rows:
        print(f"  {str(difficulty):8s}: {total:3d} total → {n_train:3d} train, {n_eval:3d} eval")

    # Write
    out_dir.mkdir(parents=True, exist_ok=True)
    train_path = out_dir / "train.jsonl"
    eval_path = out_dir / "eval.jsonl"
    _write_jsonl(train_path, train)
    _write_jsonl(eval_path, eval_)

    print()
    print(f"Train: {train_path} ({len(train)} tasks)")
    print(f"Eval:  {eval_path} ({len(eval_)} tasks)")

    # Verify distribution (same default label as the grouping step, so tasks
    # without a "difficulty" key no longer raise KeyError here).
    print()
    for label, split in (("Train", train), ("Eval", eval_)):
        print(f"{label} distribution:")
        counts = Counter(_difficulty_of(t) for t in split)
        for d, c in sorted(counts.items(), key=lambda kv: str(kv[0])):
            print(f"  {d}: {c}")


# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()