Spaces:

amrithanandini
/

migratron

Sleeping

App Files Files Community

migratron / code_migration /resplit_easy.py

amrithanandini

integrated backend and frontend

1b35d41 about 1 month ago

raw

history blame contribute delete

3.38 kB

	"""Re-split the original Easy tasks into Easy/Medium/Hard based on gold patch size.

	Takes only the 64 "Easy" tasks from the verified dataset and re-categorizes:
	- Easy: gold patch ≤ 4 lines changed (simple one-liner fixes)
	- Medium: gold patch 5-15 lines changed
	- Hard: gold patch > 15 lines changed

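	Then shuffles the tasks with a fixed seed and writes the first 20 to
	train.jsonl and the next 20 to eval.jsonl. The split is not stratified by
	difficulty, and any remaining tasks are left out of both files.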

	Usage:
	python code_migration/resplit_easy.py
	"""

	import json
	import random
	from pathlib import Path
	from collections import Counter

	# Directory named "data" that sits next to this script.
	DATA_DIR = Path(__file__).parent.joinpath("data")
	# JSON Lines dataset the Easy tasks are read from.
	SOURCE = DATA_DIR.joinpath("timemachine-bench-verified.jsonl")


	def _hunk_sizes(header: str) -> tuple:
	    """Return ``(old_count, new_count)`` parsed from a ``@@ -a,b +c,d @@`` line.

	    A range without an explicit count (``-a`` / ``+c``) stands for one line.
	    Returns ``(0, 0)`` when the header cannot be parsed, which makes the
	    caller fall back to its prefix-only heuristic.
	    """
	    fields = header.split()
	    if len(fields) < 3:
	        return 0, 0
	    old_range, new_range = fields[1], fields[2]
	    if not (old_range.startswith("-") and new_range.startswith("+")):
	        return 0, 0
	    sizes = []
	    for spec in (old_range, new_range):
	        _, comma, count = spec.partition(",")
	        try:
	            sizes.append(int(count) if comma else 1)
	        except ValueError:
	            return 0, 0
	    return sizes[0], sizes[1]


	def count_patch_lines(patch: str) -> int:
	    """Count lines changed (added + removed) in a unified diff.

	    Hunk headers are used to tell file headers (``--- a/x`` / ``+++ b/x``)
	    apart from changed lines whose own content starts with ``--`` or ``++``
	    (for example a removed ``--extra-index-url ...`` line shows up in the
	    diff as ``---extra-index-url ...``). Outside of a hunk, or when no hunk
	    headers are present, lines starting with ``+++`` / ``---`` are treated as
	    file headers and every other ``+`` / ``-`` line is counted.

	    Args:
	        patch: Unified diff text. An empty or ``None`` patch counts as 0.

	    Returns:
	        Number of added lines plus number of removed lines.
	    """
	    if not patch:
	        return 0

	    changed = 0
	    # Old-side / new-side lines the current hunk header still promises.
	    old_left = new_left = 0
	    for line in patch.splitlines():
	        if line.startswith("@@"):
	            old_left, new_left = _hunk_sizes(line)
	        elif line.startswith("+"):
	            if new_left > 0:
	                # Inside a hunk every "+" line is content, even "+++...".
	                new_left -= 1
	                changed += 1
	            elif not line.startswith("+++"):
	                changed += 1
	        elif line.startswith("-"):
	            if old_left > 0:
	                # Inside a hunk every "-" line is content, even "---...".
	                old_left -= 1
	                changed += 1
	            elif not line.startswith("---"):
	                changed += 1
	        elif not line or line.startswith(" "):
	            # Context line (possibly with its trailing blank stripped):
	            # it belongs to both sides of the hunk.
	            old_left = max(old_left - 1, 0)
	            new_left = max(new_left - 1, 0)
	        elif not line.startswith("\\"):
	            # Anything else ("diff --git", "index ...") ends the hunk;
	            # "\ No newline at end of file" markers are simply skipped.
	            old_left = new_left = 0
	    return changed


	def _write_jsonl(path, tasks):
	    """Write ``tasks`` to ``path`` as UTF-8 JSON Lines, one object per line."""
	    # UTF-8 is requested explicitly: ensure_ascii=False emits raw non-ASCII
	    # characters, which a platform-default encoding may be unable to encode.
	    with open(path, "w", encoding="utf-8") as f:
	        f.writelines(json.dumps(t, ensure_ascii=False) + "\n" for t in tasks)


	def main():
	    """Relabel the original Easy tasks by gold-patch size and write train/eval files.

	    Reads ``SOURCE`` (JSON Lines), keeps the records whose ``difficulty`` is
	    ``"Easy"``, and relabels each one from the number of changed lines in its
	    ``gold_patch``: at most 4 is Easy, at most 15 is Medium, anything larger
	    is Hard. The tasks are then shuffled with a fixed seed; the first 20 go
	    to ``train.jsonl`` and the next 20 to ``eval.jsonl`` inside ``DATA_DIR``.
	    Tasks beyond those 40 are not written anywhere. Summary statistics are
	    printed along the way.
	    """
	    train_size = 20
	    eval_size = 20

	    # Load only original Easy tasks.
	    all_tasks = []
	    with open(SOURCE, encoding="utf-8") as f:
	        for line in f:
	            line = line.strip()
	            if not line:
	                continue  # tolerate blank lines instead of crashing json.loads
	            t = json.loads(line)
	            if t.get("difficulty") == "Easy":
	                all_tasks.append(t)

	    print(f"Original Easy tasks: {len(all_tasks)}")

	    # Re-categorize by gold patch size, remembering the sizes per new label
	    # for the summary below.
	    sizes_by_difficulty = {"Easy": [], "Medium": [], "Hard": []}
	    for t in all_tasks:
	        # `or ""` also covers records where gold_patch is present but null.
	        lines = count_patch_lines(t.get("gold_patch") or "")
	        if lines <= 4:
	            t["difficulty"] = "Easy"
	        elif lines <= 15:
	            t["difficulty"] = "Medium"
	        else:
	            t["difficulty"] = "Hard"
	        sizes_by_difficulty[t["difficulty"]].append(lines)

	    # Show distribution (labels with no tasks are skipped).
	    print("\nNew difficulty distribution:")
	    for d, patch_sizes in sizes_by_difficulty.items():
	        if patch_sizes:
	            print(f" {d:8s}: {len(patch_sizes):3d} tasks "
	                  f"(patch lines: min={min(patch_sizes)}, max={max(patch_sizes)}, "
	                  f"avg={sum(patch_sizes)/len(patch_sizes):.1f})")

	    # Fixed-size random split: first `train_size` shuffled tasks for train,
	    # the next `eval_size` for eval. The split is not stratified and any
	    # tasks left over are dropped.
	    random.seed(42)
	    all_shuffled = list(all_tasks)
	    random.shuffle(all_shuffled)
	    train = all_shuffled[:train_size]
	    eval_ = all_shuffled[train_size:train_size + eval_size]

	    # These extra shuffles are kept so the written order for seed 42 stays
	    # the same as before.
	    random.shuffle(train)
	    random.shuffle(eval_)

	    # Write
	    train_path = DATA_DIR / "train.jsonl"
	    eval_path = DATA_DIR / "eval.jsonl"
	    _write_jsonl(train_path, train)
	    _write_jsonl(eval_path, eval_)

	    print(f"\nTrain: {train_path} ({len(train)} tasks)")
	    print(f"Eval: {eval_path} ({len(eval_)} tasks)")

	    print("\nTrain distribution:")
	    for d, c in sorted(Counter(t["difficulty"] for t in train).items()):
	        print(f" {d}: {c}")
	    print("Eval distribution:")
	    for d, c in sorted(Counter(t["difficulty"] for t in eval_).items()):
	        print(f" {d}: {c}")


	# Run the re-split only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()