"""
Generate synthetic SWE-bench style tasks for testing.

This creates tasks that mimic the SWE-bench format:

- instance_id/buggy.py      - the buggy code
- instance_id/test.py       - the test file
- instance_id/metadata.json - task metadata

Usage:

    python -m dataset.generate_swebench_tasks [--count N]
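
Example output layout (illustrative; generated directory names follow
"<instance_id>_<difficulty>_<index>", see generate_tasks below):

    swebench_lite_tasks/
        django__django-11098_easy_0/
            buggy.py
            test.py
            metadata.json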
"""

import argparse
import json
from pathlib import Path

# Sample SWE-bench style problems
SWE_BENCH_PROBLEMS = [
    {
        "instance_id": "django__django-11098",
        "repo": "django/django",
        "problem": "Fix the user creation form validation error",
        "buggy_code": '''from django import forms
from django.contrib.auth.models import User


class UserCreationForm(forms.ModelForm):
    """Form for creating new users."""

    password1 = forms.CharField(widget=forms.PasswordInput)
    password2 = forms.CharField(widget=forms.PasswordInput)

    class Meta:
        model = User
        fields = ('username', 'email')

    def clean(self):
        cleaned_data = super().clean()
        password1 = cleaned_data.get('password1')
        password2 = cleaned_data.get('password2')
        # BUG: This comparison is case-insensitive but should be
        # case-sensitive, so differently-cased passwords slip through
        if password1.lower() != password2.lower():
            raise forms.ValidationError("Passwords don't match")
        return cleaned_data

    def save(self, commit=True):
        user = super().save(commit=False)
        user.set_password(self.cleaned_data['password1'])
        if commit:
            user.save()
        return user
''',
        "test_code": '''import unittest

from buggy import UserCreationForm


class TestUserCreationForm(unittest.TestCase):
    def test_password_matching(self):
        """Test that matching passwords pass validation."""
        form = UserCreationForm(data={
            'username': 'testuser',
            'email': 'test@example.com',
            'password1': 'TestPass123',
            'password2': 'TestPass123',
        })
        self.assertTrue(form.is_valid())

    def test_password_mismatch(self):
        """Test that mismatched passwords fail validation."""
        form = UserCreationForm(data={
            'username': 'testuser',
            'email': 'test@example.com',
            'password1': 'TestPass123',
            'password2': 'testpass123',  # Different case
        })
        self.assertFalse(form.is_valid())
        self.assertIn('passwords', str(form.errors).lower())
''',
    },
    {
        "instance_id": "flask__flask-1048",
        "repo": "pallets/flask",
        "problem": "Fix JSON encoding for datetime objects",
        "buggy_code": '''import json
from datetime import date, datetime


class JSONEncoder(json.JSONEncoder):
    """Custom JSON encoder for Flask."""

    def default(self, obj):
        # BUG: Missing handling for datetime objects; the exact type
        # check skips datetime even though it is a subclass of date
        if type(obj) is date:
            return obj.isoformat()
        return super().default(obj)


def to_json(obj):
    """Convert object to JSON string."""
    return json.dumps(obj, cls=JSONEncoder)
''',
        "test_code": '''import unittest
from datetime import date, datetime

from buggy import to_json


class TestJSONEncoding(unittest.TestCase):
    def test_encode_datetime(self):
        """Test that datetime objects are properly encoded."""
        dt = datetime(2024, 1, 15, 10, 30, 0)
        result = to_json({'timestamp': dt})
        self.assertIn('2024-01-15', result)
        self.assertIn('10:30:00', result)

    def test_encode_date(self):
        """Test that date objects are properly encoded."""
        d = date(2024, 1, 15)
        result = to_json({'date': d})
        self.assertIn('2024-01-15', result)
''',
    },
    {
        "instance_id": "requests__requests-2875",
        "repo": "psf/requests",
        "problem": "Fix cookie domain matching",
        "buggy_code": '''def match_cookie_domain(cookie_domain, request_domain):
    """Check if cookie domain matches request domain."""
    # BUG: Leading dots are handled incorrectly;
    # .example.com should match sub.example.com but not example.com
    cookie_domain = cookie_domain.lower()
    request_domain = request_domain.lower()
    if cookie_domain.startswith('.'):
        return request_domain.endswith(cookie_domain[1:])
    return cookie_domain == request_domain
''',
        "test_code": '''import unittest

from buggy import match_cookie_domain


class TestCookieDomain(unittest.TestCase):
    def test_exact_match(self):
        """Test exact domain matching."""
        self.assertTrue(match_cookie_domain('example.com', 'example.com'))

    def test_subdomain_with_dot(self):
        """Test subdomain matching with leading dot."""
        # .example.com should match sub.example.com
        self.assertTrue(match_cookie_domain('.example.com', 'sub.example.com'))
        self.assertFalse(match_cookie_domain('.example.com', 'example.com'))

    def test_different_domains(self):
        """Test different domains don't match."""
        self.assertFalse(match_cookie_domain('example.com', 'other.com'))
''',
    },
    {
        "instance_id": "numpy__numpy-10825",
        "repo": "numpy/numpy",
        "problem": "Fix array concatenation edge case",
        "buggy_code": '''import numpy as np


def concatenate_arrays(*arrays):
    """Concatenate multiple arrays along axis 0."""
    if not arrays:
        return np.array([])
    # BUG: Should skip None arrays gracefully instead of crashing
    result = arrays[0]
    for arr in arrays[1:]:
        result = np.concatenate([result, arr])
    return result
''',
        "test_code": '''import unittest

import numpy as np

from buggy import concatenate_arrays


class TestArrayConcatenation(unittest.TestCase):
    def test_basic_concatenation(self):
        """Test basic array concatenation."""
        a = np.array([1, 2, 3])
        b = np.array([4, 5, 6])
        result = concatenate_arrays(a, b)
        np.testing.assert_array_equal(result, np.array([1, 2, 3, 4, 5, 6]))

    def test_empty_input(self):
        """Test empty input returns empty array."""
        result = concatenate_arrays()
        self.assertEqual(len(result), 0)

    def test_single_array(self):
        """Test single array passes through."""
        a = np.array([1, 2, 3])
        result = concatenate_arrays(a)
        np.testing.assert_array_equal(result, a)

    def test_none_arrays_ignored(self):
        """Test that None entries are skipped rather than crashing."""
        a = np.array([1, 2])
        result = concatenate_arrays(a, None, np.array([3]))
        np.testing.assert_array_equal(result, np.array([1, 2, 3]))
''',
    },
    {
        "instance_id": "pandas__pandas-15230",
        "repo": "pandas-dev/pandas",
        "problem": "Fix DataFrame groupby aggregation",
        "buggy_code": '''import pandas as pd


def group_and_aggregate(df, group_col, agg_col, agg_func='mean'):
    """Group DataFrame and aggregate."""
    # BUG: Should handle non-numeric columns gracefully
    if agg_func == 'mean':
        return df.groupby(group_col)[agg_col].mean()
    elif agg_func == 'sum':
        return df.groupby(group_col)[agg_col].sum()
    elif agg_func == 'count':
        return df.groupby(group_col)[agg_col].count()
    else:
        raise ValueError(f"Unknown aggregation function: {agg_func}")
''',
        "test_code": '''import unittest

import pandas as pd

from buggy import group_and_aggregate


class TestGroupBy(unittest.TestCase):
    def test_mean_aggregation(self):
        """Test mean aggregation."""
        df = pd.DataFrame({
            'category': ['A', 'A', 'B', 'B'],
            'value': [1, 2, 3, 4]
        })
        result = group_and_aggregate(df, 'category', 'value', 'mean')
        self.assertEqual(result['A'], 1.5)
        self.assertEqual(result['B'], 3.5)

    def test_sum_aggregation(self):
        """Test sum aggregation."""
        df = pd.DataFrame({
            'category': ['A', 'A', 'B'],
            'value': [1, 2, 3]
        })
        result = group_and_aggregate(df, 'category', 'value', 'sum')
        self.assertEqual(result['A'], 3)
        self.assertEqual(result['B'], 3)
''',
    },
    {
        "instance_id": "scipy__scipy-1925",
        "repo": "scipy/scipy",
        "problem": "Fix signal filtering edge case",
        "buggy_code": '''import numpy as np
from scipy import signal


def apply_lowpass_filter(data, cutoff, fs, order=5):
    """Apply lowpass filter to data."""
    # BUG: Should validate cutoff frequency
    nyquist = fs / 2
    normalized_cutoff = cutoff / nyquist
    # BUG: Using invalid cutoff can cause filter design failure
    b, a = signal.butter(order, normalized_cutoff, btype='low')
    filtered = signal.filtfilt(b, a, data)
    return filtered
''',
        "test_code": '''import unittest

import numpy as np

from buggy import apply_lowpass_filter


class TestSignalFiltering(unittest.TestCase):
    def test_valid_filter(self):
        """Test filtering with valid parameters."""
        fs = 1000  # Sampling frequency
        cutoff = 100  # Cutoff frequency
        t = np.linspace(0, 1, fs)
        data = np.sin(2 * np.pi * 50 * t) + 0.5 * np.sin(2 * np.pi * 200 * t)
        result = apply_lowpass_filter(data, cutoff, fs)
        self.assertEqual(len(result), len(data))
        # Low frequency component should be preserved
        self.assertTrue(np.max(np.abs(result)) > 0.5)

    def test_invalid_cutoff(self):
        """Test that invalid cutoff raises error."""
        fs = 1000
        cutoff = 2000  # Above Nyquist frequency - should fail
        data = np.array([1, 2, 3, 4, 5])
        with self.assertRaises(ValueError):
            apply_lowpass_filter(data, cutoff, fs)
''',
    },
    {
        "instance_id": "sklearn__sklearn-12345",
        "repo": "scikit-learn/scikit-learn",
        "problem": "Fix cross-validation split",
        "buggy_code": '''import numpy as np
from sklearn.model_selection import KFold


def get_cv_splits(X, n_splits=5, shuffle=True, random_state=42):
    """Get cross-validation splits."""
    # BUG: random_state should be used for reproducibility
    kf = KFold(n_splits=n_splits, shuffle=shuffle)
    splits = []
    for train_idx, test_idx in kf.split(X):
        splits.append((train_idx, test_idx))
    return splits
''',
        "test_code": '''import unittest

import numpy as np

from buggy import get_cv_splits


class TestCVSplits(unittest.TestCase):
    def test_split_count(self):
        """Test that correct number of splits is generated."""
        X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
        splits = get_cv_splits(X, n_splits=3)
        self.assertEqual(len(splits), 3)

    def test_reproducibility(self):
        """Test that splits are reproducible with same random_state."""
        X = np.random.rand(100, 5)
        splits1 = get_cv_splits(X, n_splits=5, random_state=42)
        splits2 = get_cv_splits(X, n_splits=5, random_state=42)
        for (train1, test1), (train2, test2) in zip(splits1, splits2):
            np.testing.assert_array_equal(train1, train2)
            np.testing.assert_array_equal(test1, test2)
''',
    },
    {
        "instance_id": "pytest__pytest-7426",
        "repo": "pytest-dev/pytest",
        "problem": "Fix test collection order",
        "buggy_code": '''import fnmatch
import os


def collect_tests(directory, pattern='test_*.py'):
    """Collect test files from directory."""
    # BUG: Should sort files for consistent ordering
    test_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if fnmatch.fnmatch(file, pattern):
                test_files.append(os.path.join(root, file))
    return test_files
''',
        "test_code": '''import unittest
import os
import tempfile

from buggy import collect_tests


class TestCollection(unittest.TestCase):
    def test_collect_pattern(self):
        """Test that correct pattern is matched."""
        with tempfile.TemporaryDirectory() as tmpdir:
            # Create test files
            open(os.path.join(tmpdir, 'test_a.py'), 'w').close()
            open(os.path.join(tmpdir, 'test_b.py'), 'w').close()
            open(os.path.join(tmpdir, 'not_a_test.py'), 'w').close()
            tests = collect_tests(tmpdir, 'test_*.py')
            self.assertEqual(len(tests), 2)

    def test_consistent_order(self):
        """Test that file order is consistent and sorted."""
        with tempfile.TemporaryDirectory() as tmpdir:
            for name in ['test_c.py', 'test_a.py', 'test_b.py']:
                open(os.path.join(tmpdir, name), 'w').close()
            tests1 = collect_tests(tmpdir)
            tests2 = collect_tests(tmpdir)
            self.assertEqual(tests1, tests2)
            self.assertEqual(tests1, sorted(tests1))
''',
    },
    {
        "instance_id": "transformers__transformers-12345",
        "repo": "huggingface/transformers",
        "problem": "Fix tokenization padding",
        "buggy_code": '''from typing import List


def tokenize_and_pad(tokenizer, texts: List[str], max_length: int = 512):
    """Tokenize texts and pad to max length."""
    # BUG: Should handle padding correctly
    encoded = tokenizer(
        texts,
        padding=True,  # This pads to longest in batch, not max_length
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )
    return encoded
''',
        "test_code": '''import unittest

from buggy import tokenize_and_pad


class MockTokenizer:
    def __call__(self, texts, padding=True, truncation=True, max_length=512,
                 return_tensors=None):
        # Simplified mock
        return {
            'input_ids': [[1, 2, 3]] if isinstance(texts, list) else [1, 2, 3],
            'attention_mask': [[1, 1, 1]] if isinstance(texts, list) else [1, 1, 1]
        }


class TestTokenization(unittest.TestCase):
    def test_single_text(self):
        """Test tokenizing single text."""
        tokenizer = MockTokenizer()
        result = tokenize_and_pad(tokenizer, ["hello world"])
        self.assertIn('input_ids', result)

    def test_max_length_respected(self):
        """Test that max_length is respected."""
        tokenizer = MockTokenizer()
        # Should not raise even with long text
        result = tokenize_and_pad(tokenizer, ["short"], max_length=10)
        self.assertIn('input_ids', result)
''',
    },
]
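
# Each problem above is a plain dict with five keys:
#   instance_id - SWE-bench style "<org>__<repo>-<issue>" identifier
#   repo        - upstream "<owner>/<name>" repository
#   problem     - one-line problem statement
#   buggy_code  - module written to <instance_id>/buggy.py
#   test_code   - unittest module written to <instance_id>/test.py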

# Easy, Medium, Hard difficulty assignments
DIFFICULTY_TASKS = {
    "easy": SWE_BENCH_PROBLEMS[:3],
    "medium": SWE_BENCH_PROBLEMS[3:6],
    "hard": SWE_BENCH_PROBLEMS[6:],
}
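
# Note: each bucket currently holds three problems, so the default --count of 3
# yields 9 tasks; a larger --count has no effect until more problems are added.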


def generate_tasks(output_dir: Path, count_per_difficulty: int = 3):
    """Generate SWE-bench style tasks."""
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    total_created = 0
    for difficulty, problems in DIFFICULTY_TASKS.items():
        for i, problem in enumerate(problems[:count_per_difficulty]):
            instance_id = f"{problem['instance_id']}_{difficulty}_{i}"
            instance_dir = output_dir / instance_id
            instance_dir.mkdir(parents=True, exist_ok=True)

            # Write buggy.py
            buggy_file = instance_dir / "buggy.py"
            buggy_file.write_text(problem["buggy_code"], encoding="utf-8")

            # Write test.py
            test_file = instance_dir / "test.py"
            test_file.write_text(problem["test_code"], encoding="utf-8")

            # Write metadata.json
            metadata = {
                "instance_id": instance_id,
                "repo": problem["repo"],
                "problem_statement": problem["problem"],
                "difficulty": difficulty,
            }
            metadata_file = instance_dir / "metadata.json"
            metadata_file.write_text(json.dumps(metadata, indent=2), encoding="utf-8")

            total_created += 1

    print(f"Created {total_created} tasks in {output_dir}")
    print(f"Set environment variable: SWEBENCH_TASKS_ROOT={output_dir.absolute()}")
    print("Or run with: TASK_SOURCE=swebench python inference.py")


def main():
    parser = argparse.ArgumentParser(description="Generate SWE-bench style tasks")
    parser.add_argument(
        "--count",
        type=int,
        default=3,
        help="Number of tasks per difficulty (default: 3)"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="Output directory (default: dataset/swebench_lite_tasks)"
    )
    args = parser.parse_args()

    if args.output_dir:
        output_dir = Path(args.output_dir)
    else:
        script_dir = Path(__file__).parent
        output_dir = script_dir / "swebench_lite_tasks"

    generate_tasks(output_dir, args.count)


if __name__ == "__main__":
    main()