File size: 3,377 Bytes
1b35d41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""Re-split the original Easy tasks into Easy/Medium/Hard based on gold patch size.

Takes only the 64 "Easy" tasks from the verified dataset and re-categorizes:
  - Easy:   gold patch ≤ 4 lines changed (simple one-liner fixes)
  - Medium: gold patch 5-15 lines changed
  - Hard:   gold patch > 15 lines changed

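Then shuffles the tasks with a fixed seed (42) and writes the first 20 to
train.jsonl and the next 20 to eval.jsonl. The split is not stratified by
difficulty, and any tasks beyond the first 40 are left out.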

Usage:
    python code_migration/resplit_easy.py
"""

import json
import random
from pathlib import Path
from collections import Counter

# Data directory next to this script; train.jsonl / eval.jsonl are written here.
DATA_DIR = Path(__file__).parent / "data"
# Input file, read line by line as JSON (one task object per line) by main().
SOURCE = DATA_DIR / "timemachine-bench-verified.jsonl"


def count_patch_lines(patch: str) -> int:
    """Count lines changed (added + removed) in a unified diff.

    Hunk bodies are walked using the line counts from each
    ``@@ -start,count +start,count @@`` header, so the ``---`` / ``+++`` file
    headers are told apart from real changes by position rather than by
    prefix. This matters for changed lines whose own content starts with
    ``--`` or ``++`` (e.g. a removed SQL comment appears as ``--- text``):
    a prefix-only filter would silently drop them.

    If the text contains no parsable hunk header at all, the function falls
    back to the prefix heuristic: every ``+`` / ``-`` line is counted except
    those starting with ``+++`` / ``---``.

    Args:
        patch: Unified diff text; may be empty.

    Returns:
        Number of added lines plus number of removed lines.
    """

    def _span_len(token: str, sign: str) -> int:
        # "-12,3" / "+12,3" -> 3; the count defaults to 1 when omitted ("-12").
        if not token.startswith(sign):
            raise ValueError(token)
        _, _, count = token[1:].partition(",")
        return int(count) if count else 1

    lines = patch.splitlines()
    changed = 0
    old_left = new_left = 0  # lines still expected in the current hunk body
    saw_hunk = False

    for line in lines:
        in_hunk = old_left > 0 or new_left > 0
        # A valid hunk body line starts with ' ', '+', '-' or '\'; an empty
        # line is a context line whose leading space was stripped.
        if in_hunk and line[:1] in ("", " ", "+", "-", "\\"):
            if line.startswith("+"):
                changed += 1
                new_left -= 1
            elif line.startswith("-"):
                changed += 1
                old_left -= 1
            elif not line.startswith("\\"):
                # Context line: present in both the old and the new file.
                old_left -= 1
                new_left -= 1
            # "\ No newline at end of file" markers consume no line budget.
            continue

        # Outside a hunk, or the hunk ended early (truncated/malformed diff).
        old_left = new_left = 0
        if line.startswith("@@ "):
            parts = line.split()
            try:
                old_left = _span_len(parts[1], "-")
                new_left = _span_len(parts[2], "+")
            except (IndexError, ValueError):
                old_left = new_left = 0
                continue
            saw_hunk = True

    if saw_hunk:
        return changed

    # No hunk headers: treat the text as a bare fragment of +/- lines.
    return sum(
        1
        for line in lines
        if (line.startswith("+") and not line.startswith("+++"))
        or (line.startswith("-") and not line.startswith("---"))
    )


def main():
    """Re-label the original "Easy" tasks by gold patch size and write train/eval files.

    Steps:
      1. Read SOURCE (one JSON object per line) and keep only the tasks whose
         "difficulty" is "Easy".
      2. Re-label each task from count_patch_lines() of its "gold_patch":
         <= 4 lines -> "Easy", <= 15 -> "Medium", otherwise "Hard".
      3. Print the resulting distribution with min/max/avg patch size.
      4. Shuffle with a fixed seed (42); the first 20 tasks go to
         DATA_DIR/train.jsonl and the next 20 to DATA_DIR/eval.jsonl.
         The split is not stratified; tasks beyond the first 40 are dropped.

    Files are read and written as UTF-8 so the result does not depend on the
    platform's default encoding (the output uses ensure_ascii=False).
    """
    train_size = 20
    eval_size = 20

    # Load only original Easy tasks
    all_tasks = []
    with open(SOURCE, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            t = json.loads(line)
            if t.get("difficulty") == "Easy":
                all_tasks.append(t)

    print(f"Original Easy tasks: {len(all_tasks)}")

    # Compute gold patch sizes and re-categorize in one pass.
    for t in all_tasks:
        # A missing or null gold_patch counts as an empty patch.
        lines = count_patch_lines(t.get("gold_patch") or "")
        t["_patch_lines"] = lines
        if lines <= 4:
            t["difficulty"] = "Easy"
        elif lines <= 15:
            t["difficulty"] = "Medium"
        else:
            t["difficulty"] = "Hard"

    # Show distribution
    dist = Counter(t["difficulty"] for t in all_tasks)
    print("\nNew difficulty distribution:")
    for d in ["Easy", "Medium", "Hard"]:
        patch_sizes = [t["_patch_lines"] for t in all_tasks if t["difficulty"] == d]
        if patch_sizes:
            print(f"  {d:8s}: {dist.get(d, 0):3d} tasks  "
                  f"(patch lines: min={min(patch_sizes)}, max={max(patch_sizes)}, "
                  f"avg={sum(patch_sizes)/len(patch_sizes):.1f})")

    # Fixed-size split: first `train_size` shuffled tasks are train, the next
    # `eval_size` are eval. Seeded so the split is reproducible.
    random.seed(42)
    all_shuffled = list(all_tasks)
    random.shuffle(all_shuffled)
    train = all_shuffled[:train_size]
    eval_ = all_shuffled[train_size:train_size + eval_size]

    # These extra shuffles are kept so the on-disk order stays identical to
    # earlier runs of this script with the same seed.
    random.shuffle(train)
    random.shuffle(eval_)

    # Remove temp field before saving
    for t in train + eval_:
        t.pop("_patch_lines", None)

    # Write
    train_path = DATA_DIR / "train.jsonl"
    eval_path = DATA_DIR / "eval.jsonl"

    for path, tasks in ((train_path, train), (eval_path, eval_)):
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(t, ensure_ascii=False) + "\n" for t in tasks)

    print(f"\nTrain: {train_path} ({len(train)} tasks)")
    print(f"Eval:  {eval_path} ({len(eval_)} tasks)")

    print("\nTrain distribution:")
    for d, c in sorted(Counter(t["difficulty"] for t in train).items()):
        print(f"  {d}: {c}")
    print("Eval distribution:")
    for d, c in sorted(Counter(t["difficulty"] for t in eval_).items()):
        print(f"  {d}: {c}")


# Run the re-split only when executed as a script, not when imported.
if __name__ == "__main__":
    main()