Spaces:
Sleeping
Sleeping
| """Re-split the original Easy tasks into Easy/Medium/Hard based on gold patch size. | |
| Takes only the 64 "Easy" tasks from the verified dataset and re-categorizes: | |
| - Easy: gold patch ≤ 4 lines changed (simple one-liner fixes) | |
| - Medium: gold patch 5-15 lines changed | |
| - Hard: gold patch > 15 lines changed | |
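| Then shuffles with a fixed seed and writes the first 20 tasks to train and the next 20 to eval (the split is not stratified by difficulty; any remaining tasks are not written). | |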
| Usage: | |
| python code_migration/resplit_easy.py | |
| """ | |
| import json | |
| import random | |
| from pathlib import Path | |
| from collections import Counter | |
# Directory holding the benchmark data, located next to this script.
DATA_DIR = Path(__file__).parent.joinpath("data")
# Input JSONL file that main() reads tasks from, one JSON object per line.
SOURCE = DATA_DIR.joinpath("timemachine-bench-verified.jsonl")
def count_patch_lines(patch: str) -> int:
    """Count lines changed (added + removed) in a unified diff.

    Lines inside a hunk are classified using the line counts announced by the
    ``@@ -a,b +c,d @@`` hunk header, so an added line whose content starts
    with ``++`` or a removed line whose content starts with ``--`` (which
    show up as ``+++ ...`` / ``--- ...`` in the diff) is still counted
    instead of being mistaken for a file header.

    Text that is not inside any hunk falls back to the prefix heuristic:
    lines starting with ``+`` or ``-`` are counted unless they start with
    ``+++`` or ``---``.

    Args:
        patch: Unified diff text; may be empty.

    Returns:
        The number of added lines plus the number of removed lines.
    """
    import re

    hunk_header = re.compile(r"^@@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? @@")

    changed = 0
    # Lines still expected in the current hunk for the old / new file side.
    old_remaining = 0
    new_remaining = 0

    for line in patch.splitlines():
        match = hunk_header.match(line)
        if match:
            # An omitted count in a hunk header means exactly one line.
            old_remaining = int(match.group(1) or 1)
            new_remaining = int(match.group(2) or 1)
            continue

        if line.startswith("diff "):
            # A new file section begins; resync even if the previous hunk
            # announced more lines than it actually contained.
            old_remaining = 0
            new_remaining = 0
            continue

        if old_remaining > 0 or new_remaining > 0:
            if line.startswith("+"):
                changed += 1
                new_remaining -= 1
            elif line.startswith("-"):
                changed += 1
                old_remaining -= 1
            elif not line.startswith("\\"):
                # Context line (present on both sides). The
                # "\ No newline at end of file" marker consumes nothing.
                old_remaining -= 1
                new_remaining -= 1
            continue

        # Outside any hunk: file headers and header-less diffs.
        if line.startswith("+") and not line.startswith("+++"):
            changed += 1
        elif line.startswith("-") and not line.startswith("---"):
            changed += 1

    return changed
def main():
    """Re-label the original Easy tasks by gold patch size and write the splits.

    Steps:
      1. Read SOURCE (JSONL) and keep the records whose "difficulty" is "Easy".
      2. Measure each record's "gold_patch" with count_patch_lines and
         overwrite "difficulty": <= 4 lines -> "Easy", <= 15 -> "Medium",
         otherwise "Hard".
      3. Print the new difficulty distribution.
      4. Shuffle with a fixed seed, take the first 20 tasks as train and the
         next 20 as eval, and write them to train.jsonl / eval.jsonl in
         DATA_DIR. Tasks beyond the first 40 are not written. The split is
         not stratified by difficulty.

    Files are read and written as UTF-8 explicitly, because the output is
    serialized with ensure_ascii=False and must not depend on the locale's
    default encoding.
    """
    train_size = 20
    eval_size = 20

    # Load only original Easy tasks
    all_tasks = []
    with open(SOURCE, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            t = json.loads(line)
            if t.get("difficulty") == "Easy":
                all_tasks.append(t)
    print(f"Original Easy tasks: {len(all_tasks)}")

    # Compute gold patch sizes and re-categorize
    for t in all_tasks:
        # A missing or null gold patch counts as zero changed lines.
        lines = count_patch_lines(t.get("gold_patch") or "")
        t["_patch_lines"] = lines
        if lines <= 4:
            t["difficulty"] = "Easy"
        elif lines <= 15:
            t["difficulty"] = "Medium"
        else:
            t["difficulty"] = "Hard"

    # Show distribution
    dist = Counter(t["difficulty"] for t in all_tasks)
    print("\nNew difficulty distribution:")
    for d in ["Easy", "Medium", "Hard"]:
        patch_sizes = [t["_patch_lines"] for t in all_tasks if t["difficulty"] == d]
        if patch_sizes:
            print(f" {d:8s}: {dist.get(d, 0):3d} tasks "
                  f"(patch lines: min={min(patch_sizes)}, max={max(patch_sizes)}, "
                  f"avg={sum(patch_sizes)/len(patch_sizes):.1f})")

    # Fixed-seed split: first `train_size` shuffled tasks go to train, the
    # next `eval_size` go to eval; anything left over is dropped.
    random.seed(42)
    all_shuffled = list(all_tasks)
    random.shuffle(all_shuffled)
    train = all_shuffled[:train_size]
    eval_ = all_shuffled[train_size:train_size + eval_size]
    random.shuffle(train)
    random.shuffle(eval_)

    # Remove temp field before saving
    for t in train + eval_:
        t.pop("_patch_lines", None)

    # Write
    train_path = DATA_DIR / "train.jsonl"
    eval_path = DATA_DIR / "eval.jsonl"
    with open(train_path, "w", encoding="utf-8") as f:
        for t in train:
            f.write(json.dumps(t, ensure_ascii=False) + "\n")
    with open(eval_path, "w", encoding="utf-8") as f:
        for t in eval_:
            f.write(json.dumps(t, ensure_ascii=False) + "\n")

    print(f"\nTrain: {train_path} ({len(train)} tasks)")
    print(f"Eval: {eval_path} ({len(eval_)} tasks)")
    print("\nTrain distribution:")
    for d, c in sorted(Counter(t["difficulty"] for t in train).items()):
        print(f" {d}: {c}")
    print("Eval distribution:")
    for d, c in sorted(Counter(t["difficulty"] for t in eval_).items()):
        print(f" {d}: {c}")


if __name__ == "__main__":
    main()