"""Re-split the original Easy tasks into Easy/Medium/Hard based on gold patch size. Takes only the 64 "Easy" tasks from the verified dataset and re-categorizes: - Easy: gold patch ≤ 4 lines changed (simple one-liner fixes) - Medium: gold patch 5-15 lines changed - Hard: gold patch > 15 lines changed Then splits 80/20 into train/eval with stratified difficulty. Usage: python code_migration/resplit_easy.py """ import json import random from pathlib import Path from collections import Counter DATA_DIR = Path(__file__).parent / "data" SOURCE = DATA_DIR / "timemachine-bench-verified.jsonl" def count_patch_lines(patch: str) -> int: """Count lines changed (added + removed) in a unified diff.""" added = sum(1 for l in patch.splitlines() if l.startswith("+") and not l.startswith("+++")) removed = sum(1 for l in patch.splitlines() if l.startswith("-") and not l.startswith("---")) return added + removed def main(): # Load only original Easy tasks all_tasks = [] with open(SOURCE) as f: for line in f: t = json.loads(line.strip()) if t.get("difficulty") == "Easy": all_tasks.append(t) print(f"Original Easy tasks: {len(all_tasks)}") # Compute gold patch sizes for t in all_tasks: t["_patch_lines"] = count_patch_lines(t.get("gold_patch", "")) # Re-categorize for t in all_tasks: lines = t["_patch_lines"] if lines <= 4: t["difficulty"] = "Easy" elif lines <= 15: t["difficulty"] = "Medium" else: t["difficulty"] = "Hard" # Show distribution dist = Counter(t["difficulty"] for t in all_tasks) print(f"\nNew difficulty distribution:") for d in ["Easy", "Medium", "Hard"]: tasks_in_d = [t for t in all_tasks if t["difficulty"] == d] patch_sizes = [t["_patch_lines"] for t in tasks_in_d] if patch_sizes: print(f" {d:8s}: {dist.get(d, 0):3d} tasks " f"(patch lines: min={min(patch_sizes)}, max={max(patch_sizes)}, " f"avg={sum(patch_sizes)/len(patch_sizes):.1f})") # 50/50 split: 20 train, rest eval (up to ~20+) random.seed(42) train, eval_ = [], [] # Take first 20 shuffled as train, rest as eval all_shuffled = list(all_tasks) random.shuffle(all_shuffled) train = all_shuffled[:20] eval_ = all_shuffled[20:40] # next 20 for eval random.shuffle(train) random.shuffle(eval_) # Remove temp field before saving for t in train + eval_: t.pop("_patch_lines", None) # Write train_path = DATA_DIR / "train.jsonl" eval_path = DATA_DIR / "eval.jsonl" with open(train_path, "w") as f: for t in train: f.write(json.dumps(t, ensure_ascii=False) + "\n") with open(eval_path, "w") as f: for t in eval_: f.write(json.dumps(t, ensure_ascii=False) + "\n") print(f"\nTrain: {train_path} ({len(train)} tasks)") print(f"Eval: {eval_path} ({len(eval_)} tasks)") print("\nTrain distribution:") for d, c in sorted(Counter(t["difficulty"] for t in train).items()): print(f" {d}: {c}") print("Eval distribution:") for d, c in sorted(Counter(t["difficulty"] for t in eval_).items()): print(f" {d}: {c}") if __name__ == "__main__": main()