migratron / code_migration /resplit_easy.py
amrithanandini's picture
integrated backend and frontend
1b35d41
"""Re-split the original Easy tasks into Easy/Medium/Hard based on gold patch size.
Takes only the 64 "Easy" tasks from the verified dataset and re-categorizes:
- Easy: gold patch ≤ 4 lines changed (simple one-liner fixes)
- Medium: gold patch 5-15 lines changed
- Hard: gold patch > 15 lines changed
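Then shuffles the tasks with a fixed seed and writes the first 20 to train and the next 20 to eval (a plain shuffle, not stratified by difficulty; any remaining tasks are not written).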
Usage:
python code_migration/resplit_easy.py
"""
import json
import random
from pathlib import Path
from collections import Counter
# Directory holding the dataset files, located next to this script (independent of the
# current working directory). main() also writes train.jsonl / eval.jsonl here.
DATA_DIR = Path(__file__).parent / "data"
# Input dataset; main() parses it as one JSON object per line.
SOURCE = DATA_DIR / "timemachine-bench-verified.jsonl"
def _hunk_sizes(header):
    """Parse a ``@@ -a,b +c,d @@`` hunk header.

    Returns ``(old_len, new_len)``, or ``None`` when *header* is not a
    well-formed two-way hunk header. A missing ``,len`` part means a length
    of 1, as defined by the unified diff format.
    """
    fields = header.split()
    if len(fields) < 4 or fields[0] != "@@" or not fields[3].startswith("@@"):
        return None
    old_range, new_range = fields[1], fields[2]
    if not (old_range.startswith("-") and new_range.startswith("+")):
        return None
    try:
        old_len = int(old_range.partition(",")[2] or 1)
        new_len = int(new_range.partition(",")[2] or 1)
    except ValueError:
        return None
    if old_len < 0 or new_len < 0:
        return None
    return old_len, new_len


def count_patch_lines(patch: str) -> int:
    """Count lines changed (added + removed) in a unified diff.

    Only lines inside hunks are counted; hunk extents come from the
    ``@@ -a,b +c,d @@`` headers. This keeps file headers (``--- a/x`` /
    ``+++ b/x``) out of the count without misclassifying real changes whose
    content happens to start with ``--`` or ``++`` (for example a removed SQL
    comment, which appears in the diff as ``--- comment``).

    If the text contains no parseable hunk header at all, the function falls
    back to the plain prefix heuristic: every line starting with ``+`` or
    ``-`` is counted, except those starting with ``+++`` or ``---``.

    Args:
        patch: Unified diff text. May be empty.

    Returns:
        Number of added lines plus number of removed lines.
    """
    rows = patch.splitlines()

    changed = 0
    old_left = 0  # lines of the old file still expected in the current hunk
    new_left = 0  # lines of the new file still expected in the current hunk
    saw_hunk = False

    for row in rows:
        # Hunk body lines always start with ' ', '+', '-' or '\', so a line
        # starting with "@@" can only be the start of a new hunk.
        if row.startswith("@@"):
            sizes = _hunk_sizes(row)
            if sizes is not None:
                old_left, new_left = sizes
                saw_hunk = True
                continue

        if old_left <= 0 and new_left <= 0:
            # Between hunks: "diff --git", "index", "---"/"+++" headers, etc.
            continue

        if row.startswith("+"):
            changed += 1
            new_left -= 1
        elif row.startswith("-"):
            changed += 1
            old_left -= 1
        elif row.startswith("\\"):
            # "\ No newline at end of file" marker: belongs to neither side.
            continue
        else:
            # Context line (including an empty line whose leading space was
            # stripped by some tool): present in both old and new file.
            old_left -= 1
            new_left -= 1

    if saw_hunk:
        return changed

    # No hunk headers (e.g. a bare snippet): use the simple prefix heuristic.
    return sum(
        1
        for row in rows
        if (row.startswith("+") and not row.startswith("+++"))
        or (row.startswith("-") and not row.startswith("---"))
    )
def _write_jsonl(path, tasks):
    """Write *tasks* to *path*, one JSON object per line, as UTF-8."""
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be pinned instead of depending on the platform's locale default.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(t, ensure_ascii=False) + "\n" for t in tasks)


def main(train_size: int = 20, eval_size: int = 20) -> None:
    """Re-label the original Easy tasks by gold patch size and write train/eval files.

    Reads ``SOURCE`` (one JSON object per line), keeps the tasks whose
    ``difficulty`` is ``"Easy"`` and re-labels each one from the number of
    changed lines in its ``gold_patch``: at most 4 -> Easy, at most 15 ->
    Medium, otherwise Hard. The tasks are then shuffled with a fixed seed; the
    first ``train_size`` go to ``train.jsonl`` and the next ``eval_size`` go to
    ``eval.jsonl`` inside ``DATA_DIR``. Tasks beyond ``train_size + eval_size``
    are not written. The split is a plain shuffle, not stratified by
    difficulty. Progress and distributions are printed to stdout.

    Args:
        train_size: Number of shuffled tasks written to the train file.
        eval_size: Number of the following tasks written to the eval file.
    """
    # Load only original Easy tasks; blank lines in the JSONL are skipped.
    all_tasks = []
    with open(SOURCE, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            t = json.loads(line)
            if t.get("difficulty") == "Easy":
                all_tasks.append(t)
    print(f"Original Easy tasks: {len(all_tasks)}")

    # Compute gold patch sizes and re-categorize in one pass.
    for t in all_tasks:
        # A missing or null gold_patch is treated as an empty patch.
        lines = count_patch_lines(t.get("gold_patch") or "")
        t["_patch_lines"] = lines
        if lines <= 4:
            t["difficulty"] = "Easy"
        elif lines <= 15:
            t["difficulty"] = "Medium"
        else:
            t["difficulty"] = "Hard"

    # Show distribution (levels with no tasks are not printed).
    sizes_by_level = {"Easy": [], "Medium": [], "Hard": []}
    for t in all_tasks:
        sizes_by_level[t["difficulty"]].append(t["_patch_lines"])
    print("\nNew difficulty distribution:")
    for d, patch_sizes in sizes_by_level.items():
        if patch_sizes:
            print(f" {d:8s}: {len(patch_sizes):3d} tasks "
                  f"(patch lines: min={min(patch_sizes)}, max={max(patch_sizes)}, "
                  f"avg={sum(patch_sizes)/len(patch_sizes):.1f})")

    # Fixed-seed split. A private Random instance yields the same sequence as
    # random.seed(42) + module-level shuffles, without reseeding global state.
    rng = random.Random(42)
    all_shuffled = list(all_tasks)
    rng.shuffle(all_shuffled)
    train = all_shuffled[:train_size]
    eval_ = all_shuffled[train_size:train_size + eval_size]
    # These extra shuffles are kept so the output order stays identical to
    # files produced by earlier runs of this script.
    rng.shuffle(train)
    rng.shuffle(eval_)

    # Remove temp field before saving
    for t in train + eval_:
        t.pop("_patch_lines", None)

    # Write
    train_path = DATA_DIR / "train.jsonl"
    eval_path = DATA_DIR / "eval.jsonl"
    _write_jsonl(train_path, train)
    _write_jsonl(eval_path, eval_)

    print(f"\nTrain: {train_path} ({len(train)} tasks)")
    print(f"Eval: {eval_path} ({len(eval_)} tasks)")
    print("\nTrain distribution:")
    for d, c in sorted(Counter(t["difficulty"] for t in train).items()):
        print(f" {d}: {c}")
    print("Eval distribution:")
    for d, c in sorted(Counter(t["difficulty"] for t in eval_).items()):
        print(f" {d}: {c}")


if __name__ == "__main__":
    main()