rl_code_fix_env/dataset/prepare_swebench.py
Viraj0112's picture
Upload folder using huggingface_hub
03a907a verified
"""
Script to download and materialize SWE-bench Lite tasks.
This script:
1. Downloads SWE-bench Lite dataset from HuggingFace
2. Extracts the buggy code and creates test files
3. Organizes them into the expected directory structure
Usage:
python -m dataset.prepare_swebench [--max-tasks N] [--difficulty easy|medium|hard|all]
"""
import argparse
import json
import os
import sys
from pathlib import Path

# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from datasets import load_dataset
def get_problem_statement(row):
    """Return the free-text problem statement for *row* ("" when absent)."""
    try:
        return row["problem_statement"]
    except KeyError:
        return ""
def get_patch(row):
    """Return the gold patch (diff text) stored on *row*, or "" if missing."""
    return row["patch"] if "patch" in row else ""
def get_instance_id(row):
    """Return the unique SWE-bench instance identifier for *row* ("" when absent)."""
    identifier = row.get("instance_id", "")
    return identifier
def create_buggy_file(instance_dir: Path, row):
    """
    Write a placeholder ``buggy.py`` for one SWE-bench instance.

    SWE-bench's true "buggy" code is the repository checked out at
    ``base_commit``; materializing it would require cloning each repo.
    As a lightweight stand-in we emit a stub module annotated with the
    instance id, the truncated problem statement and, when the row lists
    created files, the first affected file path.

    Args:
        instance_dir: Directory the file is written into (must exist).
        row: One dataset row (mapping-like).

    Returns:
        Path to the written ``buggy.py``.
    """
    instance_id = get_instance_id(row)
    problem_stmt = get_problem_statement(row)
    created_files = row.get("created_files", [])

    # The original if/else emitted the same stub in both branches (and the
    # inner `created_files[0] if created_files else ...` was dead code,
    # since that branch only runs when created_files is truthy).  Use a
    # single template with an optional "# File:" line instead.
    file_line = f"# File: {created_files[0]}\n" if created_files else ""
    buggy_code = (
        f"# Buggy code for {instance_id}\n"
        f"{file_line}"
        f"# Problem: {problem_stmt[:200]}...\n"
        "def solution():\n"
        '    """Placeholder solution - needs to be fixed."""\n'
        "    pass\n"
    )

    buggy_file = instance_dir / "buggy.py"
    buggy_file.write_text(buggy_code, encoding="utf-8")
    return buggy_file
def create_test_file(instance_dir: Path, row):
    """
    Write ``test.py`` for one SWE-bench instance.

    When the row carries structured ``test_cases`` we emit a single
    ``TestSolution`` class with one ``unittest`` method per case;
    otherwise a basic smoke test derived from the problem statement is
    produced.

    Args:
        instance_dir: Directory to write into (must exist).
        row: One dataset row (mapping-like).

    Returns:
        Path to the written ``test.py``.
    """
    instance_id = get_instance_id(row)
    problem_stmt = get_problem_statement(row)
    test_cases = row.get("test_cases", [])

    if test_cases:
        # BUG FIX: the previous version wrote "\\n" (literal backslash-n)
        # into the generated file instead of real newlines, and declared a
        # fresh `class TestSolution` for every case, so all but the last
        # test method were shadowed.  Build one class with N methods.
        lines = [
            "import unittest",
            "",
            "",
            "class TestSolution(unittest.TestCase):",
        ]
        for i, tc in enumerate(test_cases):
            input_str = tc.get("input", "")
            output_str = tc.get("output", "")
            lines += [
                f"    def test_case_{i + 1}(self):",
                f"        # Input: {input_str}",
                f"        # Expected: {output_str}",
                "        pass  # TODO: Add actual test",
                "",
            ]
        lines += [
            "",
            'if __name__ == "__main__":',
            "    unittest.main()",
            "",
        ]
        test_code = "\n".join(lines)
    else:
        # No structured cases: smoke-test that solution() returns something.
        test_code = f'''"""Test file for {instance_id}"""
import unittest

from buggy import solution


class TestSolution(unittest.TestCase):
    def test_basic(self):
        """Test based on problem statement."""
        # Problem: {problem_stmt[:300]}...
        result = solution()
        self.assertIsNotNone(result)


if __name__ == "__main__":
    unittest.main()
'''

    test_file = instance_dir / "test.py"
    test_file.write_text(test_code, encoding="utf-8")
    return test_file
def create_metadata_file(instance_dir: Path, row):
    """
    Write ``metadata.json`` describing one SWE-bench instance.

    Args:
        instance_dir: Directory to write into (must exist).
        row: One dataset row (mapping-like).

    Returns:
        Path to the written ``metadata.json``.
    """
    # NOTE: `json` is imported at module level (was a function-scope import).
    metadata = {
        "instance_id": get_instance_id(row),
        "repo": row.get("repo", ""),
        "base_commit": row.get("base_commit", ""),
        "problem_statement": get_problem_statement(row),
        "patch": get_patch(row),
        # Placeholder label; NOTE(review): nothing in this file actually
        # rewrites it per-index — confirm downstream consumers expect that.
        "difficulty": "medium",
    }
    metadata_file = instance_dir / "metadata.json"
    metadata_file.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
    return metadata_file
def prepare_swebench_tasks(
    output_dir: Path,
    max_tasks: int = 30,
    difficulty: str = "all",
):
    """
    Download SWE-bench Lite and materialize task directories.

    Each selected row becomes ``<output_dir>/<instance_id>/`` containing
    ``buggy.py``, ``test.py`` and ``metadata.json``.

    Args:
        output_dir: Directory to save tasks (created if missing).
        max_tasks: Maximum number of tasks to materialize.
        difficulty: "easy", "medium", "hard", or "all".  Difficulty here
            is a heuristic bucketing of the dataset into index thirds;
            the dataset itself carries no difficulty label.
    """
    print("Loading SWE-bench Lite dataset...")
    try:
        ds = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")
    except Exception as e:
        # Best-effort fallback to the alternative hub name.
        print(f"Error loading dataset: {e}")
        print("Trying alternative dataset name...")
        ds = load_dataset("swe-bench/swe-bench-lite", split="test")
    print(f"Loaded {len(ds)} tasks")

    # Bucket rows into index thirds as a difficulty proxy.
    total = len(ds)
    one_third = max(total // 3, 1)
    two_third = max((2 * total) // 3, one_third + 1)
    difficulty_ranges = {
        "easy": (0, one_third),
        "medium": (one_third, two_third),
        "hard": (two_third, total),
    }

    if difficulty == "all":
        # Take an even share of max_tasks from each band.  BUG FIX: the
        # old `max_tasks // 3` selected zero tasks per band whenever
        # max_tasks < 3; take at least one and enforce the overall cap.
        per_band = max(max_tasks // 3, 1)
        indices = []
        for start, end in difficulty_ranges.values():
            indices.extend(range(start, min(end, start + per_band)))
        indices = indices[:max_tasks]
    else:
        start, end = difficulty_ranges.get(difficulty, (0, total))
        # BUG FIX: `min(end, max_tasks)` capped the *absolute index*, so
        # "medium"/"hard" silently produced zero tasks whenever
        # max_tasks < start.  Cap the *count* relative to the band start.
        indices = list(range(start, min(end, start + max_tasks)))

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"Preparing {len(indices)} tasks...")

    success_count = 0
    for i, idx in enumerate(indices):
        try:
            row = ds[idx]
            instance_id = get_instance_id(row)
            instance_dir = output_dir / instance_id
            instance_dir.mkdir(parents=True, exist_ok=True)
            create_buggy_file(instance_dir, row)
            create_test_file(instance_dir, row)
            create_metadata_file(instance_dir, row)
            success_count += 1
            if (i + 1) % 10 == 0:
                print(f"  Processed {i + 1}/{len(indices)} tasks...")
        except Exception as e:
            # Deliberately broad: one malformed row should not abort the
            # whole preparation run — log it and keep going.
            print(f"  Warning: Failed to process task {idx}: {e}")
            continue

    print(f"\nDone! Prepared {success_count}/{len(indices)} tasks in {output_dir}")
    print(f"Set SWEBENCH_TASKS_ROOT={output_dir.absolute()} to use these tasks.")
def main():
    """CLI entry point: parse arguments and kick off task preparation."""
    parser = argparse.ArgumentParser(description="Prepare SWE-bench Lite tasks")
    parser.add_argument(
        "--max-tasks",
        type=int,
        default=30,
        help="Maximum number of tasks to download (default: 30)",
    )
    parser.add_argument(
        "--difficulty",
        type=str,
        default="all",
        choices=["easy", "medium", "hard", "all"],
        help="Difficulty level to download (default: all)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="Output directory (default: dataset/swebench_lite_tasks)",
    )
    args = parser.parse_args()

    # Default the destination to a folder that lives next to this script.
    if args.output_dir:
        destination = Path(args.output_dir)
    else:
        destination = Path(__file__).parent / "swebench_lite_tasks"

    prepare_swebench_tasks(
        output_dir=destination,
        max_tasks=args.max_tasks,
        difficulty=args.difficulty,
    )


if __name__ == "__main__":
    main()