Spaces:
Running
Running
Viraaj Sawant commited on
Commit ·
18625ef
1
Parent(s): fe42848
new push with SWE dataset
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitignore +3 -0
- prompts.py +16 -33
- rl_code_fix_env/.gitignore +2 -1
- rl_code_fix_env/README.md +1 -0
- rl_code_fix_env/_aliases.py +21 -0
- rl_code_fix_env/conftest.py +0 -17
- rl_code_fix_env/dataset/generate_swebench_tasks.py +498 -0
- rl_code_fix_env/dataset/prepare_swebench.py +274 -0
- rl_code_fix_env/dataset/problem_1/buggy.py +7 -5
- rl_code_fix_env/dataset/problem_1/metadata.json +1 -1
- rl_code_fix_env/dataset/problem_1/test.py +10 -6
- rl_code_fix_env/dataset/problem_10/buggy.py +1 -1
- rl_code_fix_env/dataset/problem_10/test.py +1 -1
- rl_code_fix_env/dataset/problem_11/test.py +1 -1
- rl_code_fix_env/dataset/problem_12/test.py +1 -1
- rl_code_fix_env/dataset/problem_13/buggy.py +1 -1
- rl_code_fix_env/dataset/problem_13/test.py +1 -1
- rl_code_fix_env/dataset/problem_14/test.py +1 -1
- rl_code_fix_env/dataset/problem_15/test.py +1 -1
- rl_code_fix_env/dataset/problem_16/buggy.py +1 -1
- rl_code_fix_env/dataset/problem_16/test.py +1 -1
- rl_code_fix_env/dataset/problem_17/test.py +1 -1
- rl_code_fix_env/dataset/problem_18/buggy.py +1 -1
- rl_code_fix_env/dataset/problem_18/test.py +1 -1
- rl_code_fix_env/dataset/problem_19/test.py +1 -1
- rl_code_fix_env/dataset/problem_2/buggy.py +14 -5
- rl_code_fix_env/dataset/problem_2/metadata.json +2 -2
- rl_code_fix_env/dataset/problem_2/test.py +9 -6
- rl_code_fix_env/dataset/problem_20/test.py +1 -1
- rl_code_fix_env/dataset/problem_21/test.py +1 -1
- rl_code_fix_env/dataset/problem_22/test.py +1 -1
- rl_code_fix_env/dataset/problem_23/test.py +1 -1
- rl_code_fix_env/dataset/problem_3/buggy.py +37 -10
- rl_code_fix_env/dataset/problem_3/metadata.json +3 -3
- rl_code_fix_env/dataset/problem_3/test.py +43 -14
- rl_code_fix_env/dataset/problem_4/test.py +1 -1
- rl_code_fix_env/dataset/problem_5/test.py +1 -1
- rl_code_fix_env/dataset/problem_6/buggy.py +1 -1
- rl_code_fix_env/dataset/problem_6/test.py +1 -1
- rl_code_fix_env/dataset/problem_7/test.py +1 -1
- rl_code_fix_env/dataset/problem_8/test.py +1 -1
- rl_code_fix_env/dataset/problem_9/test.py +1 -1
- rl_code_fix_env/dataset/swebench_adapter.py +81 -35
- rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/buggy.py +29 -0
- rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/metadata.json +6 -0
- rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/test.py +24 -0
- rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/buggy.py +15 -0
- rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/metadata.json +6 -0
- rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/test.py +17 -0
- rl_code_fix_env/dataset/swebench_lite_tasks/numpy__numpy-10825_medium_0/buggy.py +13 -0
.gitignore
CHANGED
|
@@ -6,3 +6,6 @@ __pycache__/
|
|
| 6 |
commands.md
|
| 7 |
logs.md
|
| 8 |
inference&docker.md
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
commands.md
|
| 7 |
logs.md
|
| 8 |
inference&docker.md
|
| 9 |
+
logs2.md
|
| 10 |
+
.env.example
|
| 11 |
+
file.txt
|
prompts.py
CHANGED
|
@@ -1,37 +1,20 @@
|
|
| 1 |
LLM_SCORER_PROMPT = """
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
Evaluate the agent's fix on exactly three axes, each scored 0.0–10.0:
|
| 10 |
-
|
| 11 |
-
1. CORRECTNESS — Does the patch fix the bug(s) without introducing new ones?
|
| 12 |
-
Full marks only if the fix is semantically correct and complete.
|
| 13 |
-
Penalise partial fixes, over-patches, or fixes that mask rather than resolve the root cause.
|
| 14 |
-
|
| 15 |
-
2. MINIMALITY — Is the diff minimal? Penalise unnecessary refactors, renames, whitespace-only changes,
|
| 16 |
-
or reformatting of lines unrelated to the bug.
|
| 17 |
-
|
| 18 |
-
3. QUALITY — Is the patched code readable and idiomatic? Penalise: broken naming conventions,
|
| 19 |
-
added dead code, removed necessary comments, or degraded clarity vs. the original.
|
| 20 |
-
|
| 21 |
-
Respond ONLY with this JSON — no preamble, no trailing text:
|
| 22 |
-
{
|
| 23 |
-
"correctness": <float 0.0-10.0>,
|
| 24 |
-
"minimality": <float 0.0-10.0>,
|
| 25 |
-
"quality": <float 0.0-10.0>,
|
| 26 |
-
"reasoning": "<one concise sentence per axis, pipe-separated>"
|
| 27 |
-
}
|
| 28 |
"""
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
| 37 |
"""
|
|
|
|
| 1 |
LLM_SCORER_PROMPT = """
|
| 2 |
+
You are a reward model for a code-fixing RL agent. Evaluate the PATCHED code vs. ORIGINAL on three axes (0.0–10.0):
|
| 3 |
+
1. CORRECTNESS — Does the patch fix the bug(s) without new bugs?
|
| 4 |
+
2. MINIMALITY — Is the diff minimal? Penalize unrelated changes.
|
| 5 |
+
3. QUALITY — Is the code readable and idiomatic?
|
| 6 |
+
Respond ONLY with this JSON (no preamble):
|
| 7 |
+
{"correctness": <float>, "minimality": <float>, "quality": <float>, "reasoning": "<one sentence per axis, pipe-separated>"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
|
| 10 |
+
USER_TEMPLATE = """
|
| 11 |
+
ORIGINAL:
|
| 12 |
+
```python
|
| 13 |
+
{original_code}
|
| 14 |
+
```
|
| 15 |
+
PATCHED:
|
| 16 |
+
```python
|
| 17 |
+
{patched_code}
|
| 18 |
+
```
|
| 19 |
+
Return only the JSON.
|
| 20 |
"""
|
rl_code_fix_env/.gitignore
CHANGED
|
@@ -5,4 +5,5 @@ __pycache__/
|
|
| 5 |
.env
|
| 6 |
*.pyc
|
| 7 |
*.egg
|
| 8 |
-
pytest-cache-files-*/
|
|
|
|
|
|
| 5 |
.env
|
| 6 |
*.pyc
|
| 7 |
*.egg
|
| 8 |
+
pytest-cache-files-*/
|
| 9 |
+
*.ps1
|
rl_code_fix_env/README.md
CHANGED
|
@@ -5,6 +5,7 @@ colorFrom: green
|
|
| 5 |
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
| 8 |
app_port: 8000
|
| 9 |
base_path: /web
|
| 10 |
tags:
|
|
|
|
| 5 |
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
dockerfile: server/Dockerfile
|
| 9 |
app_port: 8000
|
| 10 |
base_path: /web
|
| 11 |
tags:
|
rl_code_fix_env/_aliases.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import importlib
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
_REPO_ROOT = str(Path(__file__).parent)
|
| 6 |
+
if _REPO_ROOT not in sys.path:
|
| 7 |
+
sys.path.insert(0, _REPO_ROOT)
|
| 8 |
+
|
| 9 |
+
import dataset as _real_dataset
|
| 10 |
+
|
| 11 |
+
sys.modules.setdefault("src.dataset", _real_dataset)
|
| 12 |
+
|
| 13 |
+
import pkgutil
|
| 14 |
+
for _pkg in pkgutil.iter_modules(_real_dataset.__path__):
|
| 15 |
+
_full = f"dataset.{_pkg.name}"
|
| 16 |
+
_alias = f"src.dataset.{_pkg.name}"
|
| 17 |
+
try:
|
| 18 |
+
_mod = importlib.import_module(_full)
|
| 19 |
+
sys.modules.setdefault(_alias, _mod)
|
| 20 |
+
except Exception:
|
| 21 |
+
pass
|
rl_code_fix_env/conftest.py
CHANGED
|
@@ -1,20 +1,3 @@
|
|
| 1 |
-
"""
|
| 2 |
-
conftest.py repo-root pytest configuration.
|
| 3 |
-
|
| 4 |
-
Registers `src.dataset` as a sys.modules alias for `dataset` so that all
|
| 5 |
-
problem test files using `from src.dataset.problem_X.buggy import ...`
|
| 6 |
-
resolve correctly without needing to rename 24 test files.
|
| 7 |
-
|
| 8 |
-
The physical layout is:
|
| 9 |
-
<repo_root>/dataset/problem_X/buggy.py real files
|
| 10 |
-
<repo_root>/src/ has environment/, reward/, etc.
|
| 11 |
-
but NO dataset/ subfolder
|
| 12 |
-
|
| 13 |
-
With PYTHONPATH=<repo_root>:
|
| 14 |
-
import dataset.problem_1.buggy works natively
|
| 15 |
-
import src.dataset.problem_1.buggy would fail fixed here via alias
|
| 16 |
-
"""
|
| 17 |
-
|
| 18 |
import sys
|
| 19 |
import importlib
|
| 20 |
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import sys
|
| 2 |
import importlib
|
| 3 |
from pathlib import Path
|
rl_code_fix_env/dataset/generate_swebench_tasks.py
ADDED
|
@@ -0,0 +1,498 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Generate synthetic SWE-bench style tasks for testing.
|
| 3 |
+
|
| 4 |
+
This creates tasks that mimic the SWE-bench format:
|
| 5 |
+
- instance_id/buggy.py - the buggy code
|
| 6 |
+
- instance_id/test.py - test file
|
| 7 |
+
- instance_id/metadata.json - metadata
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python -m dataset.generate_swebench_tasks [--count N]
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import argparse
|
| 14 |
+
import json
|
| 15 |
+
import random
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Sample SWE-bench style problems
|
| 20 |
+
SWE_BENCH_PROBLEMS = [
|
| 21 |
+
{
|
| 22 |
+
"instance_id": "django__django-11098",
|
| 23 |
+
"repo": "django/django",
|
| 24 |
+
"problem": "Fix the user creation form validation error",
|
| 25 |
+
"buggy_code": '''from django import forms
|
| 26 |
+
from django.contrib.auth.models import User
|
| 27 |
+
|
| 28 |
+
class UserCreationForm(forms.ModelForm):
|
| 29 |
+
"""Form for creating new users."""
|
| 30 |
+
password1 = forms.CharField(widget=forms.PasswordInput)
|
| 31 |
+
password2 = forms.CharField(widget=forms.PasswordInput)
|
| 32 |
+
|
| 33 |
+
class Meta:
|
| 34 |
+
model = User
|
| 35 |
+
fields = ('username', 'email')
|
| 36 |
+
|
| 37 |
+
def clean(self):
|
| 38 |
+
cleaned_data = super().clean()
|
| 39 |
+
password1 = cleaned_data.get('password1')
|
| 40 |
+
password2 = cleaned_data.get('password2')
|
| 41 |
+
|
| 42 |
+
# BUG: This comparison is case-sensitive but should be case-insensitive
|
| 43 |
+
if password1 != password2:
|
| 44 |
+
raise forms.ValidationError("Passwords don't match")
|
| 45 |
+
|
| 46 |
+
return cleaned_data
|
| 47 |
+
|
| 48 |
+
def save(self, commit=True):
|
| 49 |
+
user = super().save(commit=False)
|
| 50 |
+
user.set_password(self.cleaned_data['password1'])
|
| 51 |
+
if commit:
|
| 52 |
+
user.save()
|
| 53 |
+
return user
|
| 54 |
+
''',
|
| 55 |
+
"test_code": '''import unittest
|
| 56 |
+
from buggy import UserCreationForm
|
| 57 |
+
|
| 58 |
+
class TestUserCreationForm(unittest.TestCase):
|
| 59 |
+
def test_password_matching(self):
|
| 60 |
+
"""Test that matching passwords pass validation."""
|
| 61 |
+
form = UserCreationForm(data={
|
| 62 |
+
'username': 'testuser',
|
| 63 |
+
'email': 'test@example.com',
|
| 64 |
+
'password1': 'TestPass123',
|
| 65 |
+
'password2': 'TestPass123',
|
| 66 |
+
})
|
| 67 |
+
self.assertTrue(form.is_valid())
|
| 68 |
+
|
| 69 |
+
def test_password_mismatch(self):
|
| 70 |
+
"""Test that mismatched passwords fail validation."""
|
| 71 |
+
form = UserCreationForm(data={
|
| 72 |
+
'username': 'testuser',
|
| 73 |
+
'email': 'test@example.com',
|
| 74 |
+
'password1': 'TestPass123',
|
| 75 |
+
'password2': 'testpass123', # Different case
|
| 76 |
+
})
|
| 77 |
+
self.assertFalse(form.is_valid())
|
| 78 |
+
self.assertIn('passwords', str(form.errors).lower())
|
| 79 |
+
''',
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"instance_id": "flask__flask-1048",
|
| 83 |
+
"repo": "pallets/flask",
|
| 84 |
+
"problem": "Fix JSON encoding for datetime objects",
|
| 85 |
+
"buggy_code": '''import json
|
| 86 |
+
from datetime import datetime, date
|
| 87 |
+
|
| 88 |
+
class JSONEncoder(json.JSONEncoder):
|
| 89 |
+
"""Custom JSON encoder for Flask."""
|
| 90 |
+
|
| 91 |
+
def default(self, obj):
|
| 92 |
+
# BUG: Missing handling for datetime objects
|
| 93 |
+
if isinstance(obj, date):
|
| 94 |
+
return obj.isoformat()
|
| 95 |
+
return super().default(obj)
|
| 96 |
+
|
| 97 |
+
def to_json(obj):
|
| 98 |
+
"""Convert object to JSON string."""
|
| 99 |
+
return json.dumps(obj, cls=JSONEncoder)
|
| 100 |
+
''',
|
| 101 |
+
"test_code": '''import unittest
|
| 102 |
+
from datetime import datetime
|
| 103 |
+
from buggy import to_json
|
| 104 |
+
|
| 105 |
+
class TestJSONEncoding(unittest.TestCase):
|
| 106 |
+
def test_encode_datetime(self):
|
| 107 |
+
"""Test that datetime objects are properly encoded."""
|
| 108 |
+
dt = datetime(2024, 1, 15, 10, 30, 0)
|
| 109 |
+
result = to_json({'timestamp': dt})
|
| 110 |
+
self.assertIn('2024-01-15', result)
|
| 111 |
+
self.assertIn('10:30:00', result)
|
| 112 |
+
|
| 113 |
+
def test_encode_date(self):
|
| 114 |
+
"""Test that date objects are properly encoded."""
|
| 115 |
+
d = date(2024, 1, 15)
|
| 116 |
+
result = to_json({'date': d})
|
| 117 |
+
self.assertIn('2024-01-15', result)
|
| 118 |
+
''',
|
| 119 |
+
},
|
| 120 |
+
{
|
| 121 |
+
"instance_id": "requests__requests-2875",
|
| 122 |
+
"repo": "psf/requests",
|
| 123 |
+
"problem": "Fix cookie domain matching",
|
| 124 |
+
"buggy_code": '''import re
|
| 125 |
+
from urllib.parse import urlparse
|
| 126 |
+
|
| 127 |
+
def match_cookie_domain(cookie_domain, request_domain):
|
| 128 |
+
"""Check if cookie domain matches request domain."""
|
| 129 |
+
# BUG: Should handle leading dots differently
|
| 130 |
+
# .example.com should match sub.example.com but not example.com
|
| 131 |
+
cookie_domain = cookie_domain.lower()
|
| 132 |
+
request_domain = request_domain.lower()
|
| 133 |
+
|
| 134 |
+
if cookie_domain.startswith('.'):
|
| 135 |
+
return request_domain.endswith(cookie_domain)
|
| 136 |
+
|
| 137 |
+
return cookie_domain == request_domain
|
| 138 |
+
''',
|
| 139 |
+
"test_code": '''import unittest
|
| 140 |
+
from buggy import match_cookie_domain
|
| 141 |
+
|
| 142 |
+
class TestCookieDomain(unittest.TestCase):
|
| 143 |
+
def test_exact_match(self):
|
| 144 |
+
"""Test exact domain matching."""
|
| 145 |
+
self.assertTrue(match_cookie_domain('example.com', 'example.com'))
|
| 146 |
+
|
| 147 |
+
def test_subdomain_with_dot(self):
|
| 148 |
+
"""Test subdomain matching with leading dot."""
|
| 149 |
+
# .example.com should match sub.example.com
|
| 150 |
+
self.assertTrue(match_cookie_domain('.example.com', 'sub.example.com'))
|
| 151 |
+
self.assertFalse(match_cookie_domain('.example.com', 'example.com'))
|
| 152 |
+
|
| 153 |
+
def test_different_domains(self):
|
| 154 |
+
"""Test different domains don't match."""
|
| 155 |
+
self.assertFalse(match_cookie_domain('example.com', 'other.com'))
|
| 156 |
+
''',
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"instance_id": "numpy__numpy-10825",
|
| 160 |
+
"repo": "numpy/numpy",
|
| 161 |
+
"problem": "Fix array concatenation edge case",
|
| 162 |
+
"buggy_code": '''import numpy as np
|
| 163 |
+
|
| 164 |
+
def concatenate_arrays(*arrays):
|
| 165 |
+
"""Concatenate multiple arrays along axis 0."""
|
| 166 |
+
if not arrays:
|
| 167 |
+
return np.array([])
|
| 168 |
+
|
| 169 |
+
# BUG: Should handle None arrays gracefully
|
| 170 |
+
result = arrays[0]
|
| 171 |
+
for arr in arrays[1:]:
|
| 172 |
+
result = np.concatenate([result, arr])
|
| 173 |
+
|
| 174 |
+
return result
|
| 175 |
+
''',
|
| 176 |
+
"test_code": '''import unittest
|
| 177 |
+
import numpy as np
|
| 178 |
+
from buggy import concatenate_arrays
|
| 179 |
+
|
| 180 |
+
class TestArrayConcatenation(unittest.TestCase):
|
| 181 |
+
def test_basic_concatenation(self):
|
| 182 |
+
"""Test basic array concatenation."""
|
| 183 |
+
a = np.array([1, 2, 3])
|
| 184 |
+
b = np.array([4, 5, 6])
|
| 185 |
+
result = concatenate_arrays(a, b)
|
| 186 |
+
np.testing.assert_array_equal(result, np.array([1, 2, 3, 4, 5, 6]))
|
| 187 |
+
|
| 188 |
+
def test_empty_input(self):
|
| 189 |
+
"""Test empty input returns empty array."""
|
| 190 |
+
result = concatenate_arrays()
|
| 191 |
+
self.assertEqual(len(result), 0)
|
| 192 |
+
|
| 193 |
+
def test_single_array(self):
|
| 194 |
+
"""Test single array passes through."""
|
| 195 |
+
a = np.array([1, 2, 3])
|
| 196 |
+
result = concatenate_arrays(a)
|
| 197 |
+
np.testing.assert_array_equal(result, a)
|
| 198 |
+
''',
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"instance_id": "pandas__pandas-15230",
|
| 202 |
+
"repo": "pandas-dev/pandas",
|
| 203 |
+
"problem": "Fix DataFrame groupby aggregation",
|
| 204 |
+
"buggy_code": '''import pandas as pd
|
| 205 |
+
|
| 206 |
+
def group_and_aggregate(df, group_col, agg_col, agg_func='mean'):
|
| 207 |
+
"""Group DataFrame and aggregate."""
|
| 208 |
+
# BUG: Should handle non-numeric columns gracefully
|
| 209 |
+
if agg_func == 'mean':
|
| 210 |
+
return df.groupby(group_col)[agg_col].mean()
|
| 211 |
+
elif agg_func == 'sum':
|
| 212 |
+
return df.groupby(group_col)[agg_col].sum()
|
| 213 |
+
elif agg_func == 'count':
|
| 214 |
+
return df.groupby(group_col)[agg_col].count()
|
| 215 |
+
else:
|
| 216 |
+
raise ValueError(f"Unknown aggregation function: {agg_func}")
|
| 217 |
+
''',
|
| 218 |
+
"test_code": '''import unittest
|
| 219 |
+
import pandas as pd
|
| 220 |
+
from buggy import group_and_aggregate
|
| 221 |
+
|
| 222 |
+
class TestGroupBy(unittest.TestCase):
|
| 223 |
+
def test_mean_aggregation(self):
|
| 224 |
+
"""Test mean aggregation."""
|
| 225 |
+
df = pd.DataFrame({
|
| 226 |
+
'category': ['A', 'A', 'B', 'B'],
|
| 227 |
+
'value': [1, 2, 3, 4]
|
| 228 |
+
})
|
| 229 |
+
result = group_and_aggregate(df, 'category', 'value', 'mean')
|
| 230 |
+
self.assertEqual(result['A'], 1.5)
|
| 231 |
+
self.assertEqual(result['B'], 3.5)
|
| 232 |
+
|
| 233 |
+
def test_sum_aggregation(self):
|
| 234 |
+
"""Test sum aggregation."""
|
| 235 |
+
df = pd.DataFrame({
|
| 236 |
+
'category': ['A', 'A', 'B'],
|
| 237 |
+
'value': [1, 2, 3]
|
| 238 |
+
})
|
| 239 |
+
result = group_and_aggregate(df, 'category', 'value', 'sum')
|
| 240 |
+
self.assertEqual(result['A'], 3)
|
| 241 |
+
self.assertEqual(result['B'], 3)
|
| 242 |
+
''',
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"instance_id": "scipy__scipy-1925",
|
| 246 |
+
"repo": "scipy/scipy",
|
| 247 |
+
"problem": "Fix signal filtering edge case",
|
| 248 |
+
"buggy_code": '''import numpy as np
|
| 249 |
+
from scipy import signal
|
| 250 |
+
|
| 251 |
+
def apply_lowpass_filter(data, cutoff, fs, order=5):
|
| 252 |
+
"""Apply lowpass filter to data."""
|
| 253 |
+
# BUG: Should validate cutoff frequency
|
| 254 |
+
nyquist = fs / 2
|
| 255 |
+
normalized_cutoff = cutoff / nyquist
|
| 256 |
+
|
| 257 |
+
# BUG: Using invalid cutoff can cause filter design failure
|
| 258 |
+
b, a = signal.butter(order, normalized_cutoff, btype='low')
|
| 259 |
+
filtered = signal.filtfilt(b, a, data)
|
| 260 |
+
|
| 261 |
+
return filtered
|
| 262 |
+
''',
|
| 263 |
+
"test_code": '''import unittest
|
| 264 |
+
import numpy as np
|
| 265 |
+
from buggy import apply_lowpass_filter
|
| 266 |
+
|
| 267 |
+
class TestSignalFiltering(unittest.TestCase):
|
| 268 |
+
def test_valid_filter(self):
|
| 269 |
+
"""Test filtering with valid parameters."""
|
| 270 |
+
fs = 1000 # Sampling frequency
|
| 271 |
+
cutoff = 100 # Cutoff frequency
|
| 272 |
+
t = np.linspace(0, 1, fs)
|
| 273 |
+
data = np.sin(2 * np.pi * 50 * t) + 0.5 * np.sin(2 * np.pi * 200 * t)
|
| 274 |
+
|
| 275 |
+
result = apply_lowpass_filter(data, cutoff, fs)
|
| 276 |
+
self.assertEqual(len(result), len(data))
|
| 277 |
+
# Low frequency component should be preserved
|
| 278 |
+
self.assertTrue(np.abs(result[100]) > 0.5)
|
| 279 |
+
|
| 280 |
+
def test_invalid_cutoff(self):
|
| 281 |
+
"""Test that invalid cutoff raises error."""
|
| 282 |
+
fs = 1000
|
| 283 |
+
cutoff = 2000 # Above Nyquist frequency - should fail
|
| 284 |
+
data = np.array([1, 2, 3, 4, 5])
|
| 285 |
+
|
| 286 |
+
with self.assertRaises(ValueError):
|
| 287 |
+
apply_lowpass_filter(data, cutoff, fs)
|
| 288 |
+
''',
|
| 289 |
+
},
|
| 290 |
+
{
|
| 291 |
+
"instance_id": "sklearn__sklearn-12345",
|
| 292 |
+
"repo": "scikit-learn/scikit-learn",
|
| 293 |
+
"problem": "Fix cross-validation split",
|
| 294 |
+
"buggy_code": '''import numpy as np
|
| 295 |
+
from sklearn.model_selection import KFold
|
| 296 |
+
|
| 297 |
+
def get_cv_splits(X, n_splits=5, shuffle=True, random_state=42):
|
| 298 |
+
"""Get cross-validation splits."""
|
| 299 |
+
# BUG: random_state should be used for reproducibility
|
| 300 |
+
kf = KFold(n_splits=n_splits, shuffle=shuffle)
|
| 301 |
+
|
| 302 |
+
splits = []
|
| 303 |
+
for train_idx, test_idx in kf.split(X):
|
| 304 |
+
splits.append((train_idx, test_idx))
|
| 305 |
+
|
| 306 |
+
return splits
|
| 307 |
+
''',
|
| 308 |
+
"test_code": '''import unittest
|
| 309 |
+
import numpy as np
|
| 310 |
+
from buggy import get_cv_splits
|
| 311 |
+
|
| 312 |
+
class TestCVSplits(unittest.TestCase):
|
| 313 |
+
def test_split_count(self):
|
| 314 |
+
"""Test that correct number of splits is generated."""
|
| 315 |
+
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
|
| 316 |
+
splits = get_cv_splits(X, n_splits=3)
|
| 317 |
+
self.assertEqual(len(splits), 3)
|
| 318 |
+
|
| 319 |
+
def test_reproducibility(self):
|
| 320 |
+
"""Test that splits are reproducible with same random_state."""
|
| 321 |
+
X = np.random.rand(100, 5)
|
| 322 |
+
splits1 = get_cv_splits(X, n_splits=5, random_state=42)
|
| 323 |
+
splits2 = get_cv_splits(X, n_splits=5, random_state=42)
|
| 324 |
+
|
| 325 |
+
for (train1, test1), (train2, test2) in zip(splits1, splits2):
|
| 326 |
+
np.testing.assert_array_equal(train1, train2)
|
| 327 |
+
np.testing.assert_array_equal(test1, test2)
|
| 328 |
+
''',
|
| 329 |
+
},
|
| 330 |
+
{
|
| 331 |
+
"instance_id": "pytest__pytest-7426",
|
| 332 |
+
"repo": "pytest-dev/pytest",
|
| 333 |
+
"problem": "Fix test collection order",
|
| 334 |
+
"buggy_code": '''import os
|
| 335 |
+
import re
|
| 336 |
+
|
| 337 |
+
def collect_tests(directory, pattern='test_*.py'):
|
| 338 |
+
"""Collect test files from directory."""
|
| 339 |
+
# BUG: Should sort files for consistent ordering
|
| 340 |
+
test_files = []
|
| 341 |
+
|
| 342 |
+
for root, dirs, files in os.walk(directory):
|
| 343 |
+
for file in files:
|
| 344 |
+
if re.match(pattern, file):
|
| 345 |
+
test_files.append(os.path.join(root, file))
|
| 346 |
+
|
| 347 |
+
return test_files
|
| 348 |
+
''',
|
| 349 |
+
"test_code": '''import unittest
|
| 350 |
+
import os
|
| 351 |
+
import tempfile
|
| 352 |
+
from buggy import collect_tests
|
| 353 |
+
|
| 354 |
+
class TestCollection(unittest.TestCase):
|
| 355 |
+
def test_collect_pattern(self):
|
| 356 |
+
"""Test that correct pattern is matched."""
|
| 357 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
| 358 |
+
# Create test files
|
| 359 |
+
open(os.path.join(tmpdir, 'test_a.py'), 'w').close()
|
| 360 |
+
open(os.path.join(tmpdir, 'test_b.py'), 'w').close()
|
| 361 |
+
open(os.path.join(tmpdir, 'not_a_test.py'), 'w').close()
|
| 362 |
+
|
| 363 |
+
tests = collect_tests(tmpdir, 'test_*.py')
|
| 364 |
+
self.assertEqual(len(tests), 2)
|
| 365 |
+
|
| 366 |
+
def test_consistent_order(self):
|
| 367 |
+
"""Test that file order is consistent."""
|
| 368 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
| 369 |
+
for name in ['test_c.py', 'test_a.py', 'test_b.py']:
|
| 370 |
+
open(os.path.join(tmpdir, name), 'w').close()
|
| 371 |
+
|
| 372 |
+
tests1 = collect_tests(tmpdir)
|
| 373 |
+
tests2 = collect_tests(tmpdir)
|
| 374 |
+
|
| 375 |
+
self.assertEqual(tests1, tests2)
|
| 376 |
+
''',
|
| 377 |
+
},
|
| 378 |
+
{
|
| 379 |
+
"instance_id": "transformers__transformers-12345",
|
| 380 |
+
"repo": "huggingface/transformers",
|
| 381 |
+
"problem": "Fix tokenization padding",
|
| 382 |
+
"buggy_code": '''from typing import List
|
| 383 |
+
|
| 384 |
+
def tokenize_and_pad(tokenizer, texts: List[str], max_length: int = 512):
|
| 385 |
+
"""Tokenize texts and pad to max length."""
|
| 386 |
+
# BUG: Should handle padding correctly
|
| 387 |
+
encoded = tokenizer(
|
| 388 |
+
texts,
|
| 389 |
+
padding=True, # This pads to longest in batch, not max_length
|
| 390 |
+
truncation=True,
|
| 391 |
+
max_length=max_length,
|
| 392 |
+
return_tensors='pt'
|
| 393 |
+
)
|
| 394 |
+
|
| 395 |
+
return encoded
|
| 396 |
+
''',
|
| 397 |
+
"test_code": '''import unittest
|
| 398 |
+
from buggy import tokenize_and_pad
|
| 399 |
+
|
| 400 |
+
class MockTokenizer:
|
| 401 |
+
def __call__(self, texts, padding=True, truncation=True, max_length=512, return_tensors=None):
|
| 402 |
+
# Simplified mock
|
| 403 |
+
return {
|
| 404 |
+
'input_ids': [[1, 2, 3]] if isinstance(texts, list) else [1, 2, 3],
|
| 405 |
+
'attention_mask': [[1, 1, 1]] if isinstance(texts, list) else [1, 1, 1]
|
| 406 |
+
}
|
| 407 |
+
|
| 408 |
+
class TestTokenization(unittest.TestCase):
|
| 409 |
+
def test_single_text(self):
|
| 410 |
+
"""Test tokenizing single text."""
|
| 411 |
+
tokenizer = MockTokenizer()
|
| 412 |
+
result = tokenize_and_pad(tokenizer, ["hello world"])
|
| 413 |
+
self.assertIn('input_ids', result)
|
| 414 |
+
|
| 415 |
+
def test_max_length_respected(self):
|
| 416 |
+
"""Test that max_length is respected."""
|
| 417 |
+
tokenizer = MockTokenizer()
|
| 418 |
+
# Should not raise even with long text
|
| 419 |
+
result = tokenize_and_pad(tokenizer, ["short"], max_length=10)
|
| 420 |
+
self.assertIn('input_ids', result)
|
| 421 |
+
''',
|
| 422 |
+
},
|
| 423 |
+
]
|
| 424 |
+
|
| 425 |
+
# Easy, Medium, Hard difficulty assignments
|
| 426 |
+
DIFFICULTY_TASKS = {
|
| 427 |
+
"easy": SWE_BENCH_PROBLEMS[:3],
|
| 428 |
+
"medium": SWE_BENCH_PROBLEMS[3:6],
|
| 429 |
+
"hard": SWE_BENCH_PROBLEMS[6:],
|
| 430 |
+
}
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
def generate_tasks(output_dir: Path, count_per_difficulty: int = 3):
|
| 434 |
+
"""Generate SWE-bench style tasks."""
|
| 435 |
+
output_dir = Path(output_dir)
|
| 436 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 437 |
+
|
| 438 |
+
total_created = 0
|
| 439 |
+
|
| 440 |
+
for difficulty, problems in DIFFICULTY_TASKS.items():
|
| 441 |
+
for i, problem in enumerate(problems[:count_per_difficulty]):
|
| 442 |
+
instance_id = f"{problem['instance_id']}_{difficulty}_{i}"
|
| 443 |
+
instance_dir = output_dir / instance_id
|
| 444 |
+
instance_dir.mkdir(parents=True, exist_ok=True)
|
| 445 |
+
|
| 446 |
+
# Write buggy.py
|
| 447 |
+
buggy_file = instance_dir / "buggy.py"
|
| 448 |
+
buggy_file.write_text(problem["buggy_code"], encoding="utf-8")
|
| 449 |
+
|
| 450 |
+
# Write test.py
|
| 451 |
+
test_file = instance_dir / "test.py"
|
| 452 |
+
test_file.write_text(problem["test_code"], encoding="utf-8")
|
| 453 |
+
|
| 454 |
+
# Write metadata.json
|
| 455 |
+
metadata = {
|
| 456 |
+
"instance_id": instance_id,
|
| 457 |
+
"repo": problem["repo"],
|
| 458 |
+
"problem_statement": problem["problem"],
|
| 459 |
+
"difficulty": difficulty,
|
| 460 |
+
}
|
| 461 |
+
metadata_file = instance_dir / "metadata.json"
|
| 462 |
+
metadata_file.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
|
| 463 |
+
|
| 464 |
+
total_created += 1
|
| 465 |
+
|
| 466 |
+
print(f"Created {total_created} tasks in {output_dir}")
|
| 467 |
+
print(f"Set environment variable: SWEBENCH_TASKS_ROOT={output_dir.absolute()}")
|
| 468 |
+
print(f"Or run with: TASK_SOURCE=swebench python inference.py")
|
| 469 |
+
|
| 470 |
+
|
| 471 |
+
def main():
|
| 472 |
+
parser = argparse.ArgumentParser(description="Generate SWE-bench style tasks")
|
| 473 |
+
parser.add_argument(
|
| 474 |
+
"--count",
|
| 475 |
+
type=int,
|
| 476 |
+
default=3,
|
| 477 |
+
help="Number of tasks per difficulty (default: 3)"
|
| 478 |
+
)
|
| 479 |
+
parser.add_argument(
|
| 480 |
+
"--output-dir",
|
| 481 |
+
type=str,
|
| 482 |
+
default=None,
|
| 483 |
+
help="Output directory (default: dataset/swebench_lite_tasks)"
|
| 484 |
+
)
|
| 485 |
+
|
| 486 |
+
args = parser.parse_args()
|
| 487 |
+
|
| 488 |
+
if args.output_dir:
|
| 489 |
+
output_dir = Path(args.output_dir)
|
| 490 |
+
else:
|
| 491 |
+
script_dir = Path(__file__).parent
|
| 492 |
+
output_dir = script_dir / "swebench_lite_tasks"
|
| 493 |
+
|
| 494 |
+
generate_tasks(output_dir, args.count)
|
| 495 |
+
|
| 496 |
+
|
| 497 |
+
if __name__ == "__main__":
|
| 498 |
+
main()
|
rl_code_fix_env/dataset/prepare_swebench.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Script to download and materialize SWE-bench Lite tasks.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Downloads SWE-bench Lite dataset from HuggingFace
|
| 6 |
+
2. Extracts the buggy code and creates test files
|
| 7 |
+
3. Organizes them into the expected directory structure
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python -m dataset.prepare_swebench [--max-tasks N] [--difficulty easy|medium|hard|all]
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import argparse
|
| 14 |
+
import os
|
| 15 |
+
import sys
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
# Add parent to path for imports
|
| 19 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 20 |
+
|
| 21 |
+
from datasets import load_dataset
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def get_problem_statement(row):
|
| 25 |
+
"""Extract problem statement from row."""
|
| 26 |
+
return row.get("problem_statement", "")
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def get_patch(row):
|
| 30 |
+
"""Extract the patch/fix from row."""
|
| 31 |
+
return row.get("patch", "")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def get_instance_id(row):
|
| 35 |
+
"""Get instance ID from row."""
|
| 36 |
+
return row.get("instance_id", "")
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def create_buggy_file(instance_dir: Path, row):
|
| 40 |
+
"""
|
| 41 |
+
Create buggy.py from the base commit and instance.
|
| 42 |
+
|
| 43 |
+
The SWE-bench dataset provides the full repository at base_commit.
|
| 44 |
+
We need to extract just the relevant file that has the bug.
|
| 45 |
+
"""
|
| 46 |
+
# For SWE-bench, the "buggy" version is actually the version BEFORE the patch
|
| 47 |
+
# We need to get the file content from the base commit
|
| 48 |
+
# This is complex as it requires cloning the repo at a specific commit
|
| 49 |
+
|
| 50 |
+
# For simplicity, we'll use a different approach:
|
| 51 |
+
# The problem_statement describes the bug, and we can create a simplified
|
| 52 |
+
# buggy version based on that description
|
| 53 |
+
|
| 54 |
+
instance_id = get_instance_id(row)
|
| 55 |
+
problem_stmt = get_problem_statement(row)
|
| 56 |
+
|
| 57 |
+
# Try to extract the file from the created files in the instance
|
| 58 |
+
# SWE-bench provides 'repo' and we need to find the relevant file
|
| 59 |
+
created_files = row.get("created_files", [])
|
| 60 |
+
|
| 61 |
+
if not created_files:
|
| 62 |
+
# Fallback: create a placeholder
|
| 63 |
+
buggy_code = f'''# Buggy code for {instance_id}
|
| 64 |
+
# Problem: {problem_stmt[:200]}...
|
| 65 |
+
|
| 66 |
+
def solution():
|
| 67 |
+
"""Placeholder solution - needs to be fixed."""
|
| 68 |
+
pass
|
| 69 |
+
'''
|
| 70 |
+
else:
|
| 71 |
+
# For now, create a simple placeholder
|
| 72 |
+
# In a full implementation, we'd clone the repo at base_commit
|
| 73 |
+
file_path = created_files[0] if created_files else "solution.py"
|
| 74 |
+
buggy_code = f'''# Buggy code for {instance_id}
|
| 75 |
+
# File: {file_path}
|
| 76 |
+
# Problem: {problem_stmt[:200]}...
|
| 77 |
+
|
| 78 |
+
def solution():
|
| 79 |
+
"""Placeholder solution - needs to be fixed."""
|
| 80 |
+
pass
|
| 81 |
+
'''
|
| 82 |
+
|
| 83 |
+
buggy_file = instance_dir / "buggy.py"
|
| 84 |
+
buggy_file.write_text(buggy_code, encoding="utf-8")
|
| 85 |
+
return buggy_file
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def create_test_file(instance_dir: Path, row):
|
| 89 |
+
"""
|
| 90 |
+
Create test.py based on the problem statement.
|
| 91 |
+
|
| 92 |
+
For SWE-bench, tests are typically derived from the issue description.
|
| 93 |
+
We'll create a simple test that checks if the solution works.
|
| 94 |
+
"""
|
| 95 |
+
instance_id = get_instance_id(row)
|
| 96 |
+
problem_stmt = get_problem_statement(row)
|
| 97 |
+
|
| 98 |
+
# Create a simple test file
|
| 99 |
+
# In practice, SWE-bench has a test.json file with test cases
|
| 100 |
+
test_cases = row.get("test_cases", [])
|
| 101 |
+
|
| 102 |
+
if test_cases:
|
| 103 |
+
# Create tests from provided test cases
|
| 104 |
+
test_code = "import unittest\\n\\n"
|
| 105 |
+
for i, tc in enumerate(test_cases):
|
| 106 |
+
input_str = tc.get("input", "")
|
| 107 |
+
output_str = tc.get("output", "")
|
| 108 |
+
test_code += f'''class TestSolution(unittest.TestCase):
|
| 109 |
+
def test_case_{i+1}(self):
|
| 110 |
+
# Input: {input_str}
|
| 111 |
+
# Expected: {output_str}
|
| 112 |
+
pass # TODO: Add actual test
|
| 113 |
+
'''
|
| 114 |
+
else:
|
| 115 |
+
# Create a basic test based on problem statement
|
| 116 |
+
test_code = f'''"""Test file for {instance_id}"""
|
| 117 |
+
|
| 118 |
+
import unittest
|
| 119 |
+
from buggy import solution
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
class TestSolution(unittest.TestCase):
|
| 123 |
+
def test_basic(self):
|
| 124 |
+
"""Test based on problem statement."""
|
| 125 |
+
# Problem: {problem_stmt[:300]}...
|
| 126 |
+
result = solution()
|
| 127 |
+
self.assertIsNotNone(result)
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
if __name__ == "__main__":
|
| 131 |
+
unittest.main()
|
| 132 |
+
'''
|
| 133 |
+
|
| 134 |
+
test_file = instance_dir / "test.py"
|
| 135 |
+
test_file.write_text(test_code, encoding="utf-8")
|
| 136 |
+
return test_file
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def create_metadata_file(instance_dir: Path, row):
|
| 140 |
+
"""Create metadata.json with instance info."""
|
| 141 |
+
import json
|
| 142 |
+
|
| 143 |
+
metadata = {
|
| 144 |
+
"instance_id": get_instance_id(row),
|
| 145 |
+
"repo": row.get("repo", ""),
|
| 146 |
+
"base_commit": row.get("base_commit", ""),
|
| 147 |
+
"problem_statement": get_problem_statement(row),
|
| 148 |
+
"patch": get_patch(row),
|
| 149 |
+
"difficulty": "medium", # Will be set based on index
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
metadata_file = instance_dir / "metadata.json"
|
| 153 |
+
metadata_file.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
|
| 154 |
+
return metadata_file
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def prepare_swebench_tasks(
|
| 158 |
+
output_dir: Path,
|
| 159 |
+
max_tasks: int = 30,
|
| 160 |
+
difficulty: str = "all"
|
| 161 |
+
):
|
| 162 |
+
"""
|
| 163 |
+
Download and prepare SWE-bench Lite tasks.
|
| 164 |
+
|
| 165 |
+
Args:
|
| 166 |
+
output_dir: Directory to save tasks
|
| 167 |
+
max_tasks: Maximum number of tasks to download
|
| 168 |
+
difficulty: "easy", "medium", "hard", or "all"
|
| 169 |
+
"""
|
| 170 |
+
print(f"Loading SWE-bench Lite dataset...")
|
| 171 |
+
|
| 172 |
+
try:
|
| 173 |
+
ds = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")
|
| 174 |
+
except Exception as e:
|
| 175 |
+
print(f"Error loading dataset: {e}")
|
| 176 |
+
print("Trying alternative dataset name...")
|
| 177 |
+
ds = load_dataset("swe-bench/swe-bench-lite", split="test")
|
| 178 |
+
|
| 179 |
+
print(f"Loaded {len(ds)} tasks")
|
| 180 |
+
|
| 181 |
+
# Calculate difficulty bounds
|
| 182 |
+
total = len(ds)
|
| 183 |
+
one_third = max(total // 3, 1)
|
| 184 |
+
two_third = max((2 * total) // 3, one_third + 1)
|
| 185 |
+
|
| 186 |
+
difficulty_ranges = {
|
| 187 |
+
"easy": (0, one_third),
|
| 188 |
+
"medium": (one_third, two_third),
|
| 189 |
+
"hard": (two_third, total),
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
# Determine which tasks to download
|
| 193 |
+
if difficulty == "all":
|
| 194 |
+
ranges = list(difficulty_ranges.values())
|
| 195 |
+
indices = []
|
| 196 |
+
for start, end in ranges:
|
| 197 |
+
indices.extend(range(start, min(end, start + max_tasks // 3)))
|
| 198 |
+
else:
|
| 199 |
+
start, end = difficulty_ranges.get(difficulty, (0, total))
|
| 200 |
+
indices = list(range(start, min(end, max_tasks)))
|
| 201 |
+
|
| 202 |
+
# Create output directory
|
| 203 |
+
output_dir = Path(output_dir)
|
| 204 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 205 |
+
|
| 206 |
+
print(f"Preparing {len(indices)} tasks...")
|
| 207 |
+
|
| 208 |
+
success_count = 0
|
| 209 |
+
for i, idx in enumerate(indices):
|
| 210 |
+
try:
|
| 211 |
+
row = ds[idx]
|
| 212 |
+
instance_id = get_instance_id(row)
|
| 213 |
+
|
| 214 |
+
# Create instance directory
|
| 215 |
+
instance_dir = output_dir / instance_id
|
| 216 |
+
instance_dir.mkdir(parents=True, exist_ok=True)
|
| 217 |
+
|
| 218 |
+
# Create files
|
| 219 |
+
create_buggy_file(instance_dir, row)
|
| 220 |
+
create_test_file(instance_dir, row)
|
| 221 |
+
create_metadata_file(instance_dir, row)
|
| 222 |
+
|
| 223 |
+
success_count += 1
|
| 224 |
+
if (i + 1) % 10 == 0:
|
| 225 |
+
print(f" Processed {i + 1}/{len(indices)} tasks...")
|
| 226 |
+
|
| 227 |
+
except Exception as e:
|
| 228 |
+
print(f" Warning: Failed to process task {idx}: {e}")
|
| 229 |
+
continue
|
| 230 |
+
|
| 231 |
+
print(f"\nDone! Prepared {success_count}/{len(indices)} tasks in {output_dir}")
|
| 232 |
+
print(f"Set SWEBENCH_TASKS_ROOT={output_dir.absolute()} to use these tasks.")
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def main():
|
| 236 |
+
parser = argparse.ArgumentParser(description="Prepare SWE-bench Lite tasks")
|
| 237 |
+
parser.add_argument(
|
| 238 |
+
"--max-tasks",
|
| 239 |
+
type=int,
|
| 240 |
+
default=30,
|
| 241 |
+
help="Maximum number of tasks to download (default: 30)"
|
| 242 |
+
)
|
| 243 |
+
parser.add_argument(
|
| 244 |
+
"--difficulty",
|
| 245 |
+
type=str,
|
| 246 |
+
default="all",
|
| 247 |
+
choices=["easy", "medium", "hard", "all"],
|
| 248 |
+
help="Difficulty level to download (default: all)"
|
| 249 |
+
)
|
| 250 |
+
parser.add_argument(
|
| 251 |
+
"--output-dir",
|
| 252 |
+
type=str,
|
| 253 |
+
default=None,
|
| 254 |
+
help="Output directory (default: dataset/swebench_lite_tasks)"
|
| 255 |
+
)
|
| 256 |
+
|
| 257 |
+
args = parser.parse_args()
|
| 258 |
+
|
| 259 |
+
# Determine output directory
|
| 260 |
+
if args.output_dir:
|
| 261 |
+
output_dir = Path(args.output_dir)
|
| 262 |
+
else:
|
| 263 |
+
script_dir = Path(__file__).parent
|
| 264 |
+
output_dir = script_dir / "swebench_lite_tasks"
|
| 265 |
+
|
| 266 |
+
prepare_swebench_tasks(
|
| 267 |
+
output_dir=output_dir,
|
| 268 |
+
max_tasks=args.max_tasks,
|
| 269 |
+
difficulty=args.difficulty
|
| 270 |
+
)
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
if __name__ == "__main__":
|
| 274 |
+
main()
|
rl_code_fix_env/dataset/problem_1/buggy.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
-
def
|
| 2 |
-
"""
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
def safe_divide(a: float, b: float) -> float:
|
| 2 |
+
"""Divide a by b; only return inf for division by zero."""
|
| 3 |
+
try:
|
| 4 |
+
return a / b
|
| 5 |
+
except Exception:
|
| 6 |
+
# BUG: catches unrelated errors too broadly.
|
| 7 |
+
return float("inf")
|
rl_code_fix_env/dataset/problem_1/metadata.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"difficulty": "easy",
|
| 3 |
-
"bug_type": "
|
| 4 |
"expected_steps": 1
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"difficulty": "easy",
|
| 3 |
+
"bug_type": "exception-handling",
|
| 4 |
"expected_steps": 1
|
| 5 |
}
|
rl_code_fix_env/dataset/problem_1/test.py
CHANGED
|
@@ -1,13 +1,17 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
-
class
|
| 6 |
-
def
|
| 7 |
-
self.assertEqual(
|
| 8 |
|
| 9 |
-
def
|
| 10 |
-
self.assertEqual(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_1.buggy import safe_divide
|
| 3 |
|
| 4 |
|
| 5 |
+
class TestSafeDivide(unittest.TestCase):
|
| 6 |
+
def test_normal(self):
|
| 7 |
+
self.assertEqual(safe_divide(8, 2), 4)
|
| 8 |
|
| 9 |
+
def test_zero_division(self):
|
| 10 |
+
self.assertEqual(safe_divide(1, 0), float("inf"))
|
| 11 |
+
|
| 12 |
+
def test_type_error_should_raise(self):
|
| 13 |
+
with self.assertRaises(TypeError):
|
| 14 |
+
safe_divide("1", 1)
|
| 15 |
|
| 16 |
|
| 17 |
if __name__ == "__main__":
|
rl_code_fix_env/dataset/problem_10/buggy.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from
|
| 2 |
|
| 3 |
|
| 4 |
def rotate_90_clockwise(matrix: list[list[int]]) -> list[list[int]]:
|
|
|
|
| 1 |
+
from dataset.problem_10.helpers import transpose
|
| 2 |
|
| 3 |
|
| 4 |
def rotate_90_clockwise(matrix: list[list[int]]) -> list[list[int]]:
|
rl_code_fix_env/dataset/problem_10/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestRotateMatrix(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_10.buggy import rotate_90_clockwise
|
| 3 |
|
| 4 |
|
| 5 |
class TestRotateMatrix(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_11/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestBinarySearch(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_11.buggy import binary_search
|
| 3 |
|
| 4 |
|
| 5 |
class TestBinarySearch(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_12/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestParsePairs(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_12.buggy import parse_pairs
|
| 3 |
|
| 4 |
|
| 5 |
class TestParsePairs(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_13/buggy.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from
|
| 2 |
|
| 3 |
|
| 4 |
def run_ops() -> tuple[int, int]:
|
|
|
|
| 1 |
+
from dataset.problem_13.cache import LRUCache
|
| 2 |
|
| 3 |
|
| 4 |
def run_ops() -> tuple[int, int]:
|
rl_code_fix_env/dataset/problem_13/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestLRU(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_13.buggy import run_ops
|
| 3 |
|
| 4 |
|
| 5 |
class TestLRU(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_14/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestFibonacciRecursive(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_14.buggy import fibonacci_recursive
|
| 3 |
|
| 4 |
|
| 5 |
class TestFibonacciRecursive(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_15/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestIntervalOverlap(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_15.buggy import has_overlap
|
| 3 |
|
| 4 |
|
| 5 |
class TestIntervalOverlap(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_16/buggy.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from
|
| 2 |
|
| 3 |
|
| 4 |
def top_label(scores: dict[str, float]) -> str:
|
|
|
|
| 1 |
+
from dataset.problem_16.helpers import normalize_scores
|
| 2 |
|
| 3 |
|
| 4 |
def top_label(scores: dict[str, float]) -> str:
|
rl_code_fix_env/dataset/problem_16/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestTopLabel(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_16.buggy import top_label
|
| 3 |
|
| 4 |
|
| 5 |
class TestTopLabel(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_17/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestDedupe(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_17.buggy import dedupe_preserve_order
|
| 3 |
|
| 4 |
|
| 5 |
class TestDedupe(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_18/buggy.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from
|
| 2 |
|
| 3 |
|
| 4 |
def moving_average(nums: list[int], window: int) -> list[float]:
|
|
|
|
| 1 |
+
from dataset.problem_18.math_utils import clamp
|
| 2 |
|
| 3 |
|
| 4 |
def moving_average(nums: list[int], window: int) -> list[float]:
|
rl_code_fix_env/dataset/problem_18/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestMovingAverage(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_18.buggy import moving_average
|
| 3 |
|
| 4 |
|
| 5 |
class TestMovingAverage(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_19/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import pytest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
def test_calculate_employee_bonus():
|
| 5 |
employees = [
|
|
|
|
| 1 |
import pytest
|
| 2 |
+
from dataset.problem_19.buggy import calculate_employee_bonus
|
| 3 |
|
| 4 |
def test_calculate_employee_bonus():
|
| 5 |
employees = [
|
rl_code_fix_env/dataset/problem_2/buggy.py
CHANGED
|
@@ -1,5 +1,14 @@
|
|
| 1 |
-
def
|
| 2 |
-
"""
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def binary_search(nums: list[int], target: int) -> int:
|
| 2 |
+
"""Return index of target, or -1 if not found."""
|
| 3 |
+
left, right = 0, len(nums) - 1
|
| 4 |
+
|
| 5 |
+
while left < right:
|
| 6 |
+
mid = (left + right) // 2
|
| 7 |
+
if nums[mid] == target:
|
| 8 |
+
return mid
|
| 9 |
+
if nums[mid] < target:
|
| 10 |
+
left = mid + 1
|
| 11 |
+
else:
|
| 12 |
+
right = mid - 1
|
| 13 |
+
|
| 14 |
+
return -1
|
rl_code_fix_env/dataset/problem_2/metadata.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"difficulty": "
|
| 3 |
-
"bug_type": "
|
| 4 |
"expected_steps": 2
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"difficulty": "medium",
|
| 3 |
+
"bug_type": "boundary-condition",
|
| 4 |
"expected_steps": 2
|
| 5 |
}
|
rl_code_fix_env/dataset/problem_2/test.py
CHANGED
|
@@ -1,13 +1,16 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
-
class
|
| 6 |
-
def
|
| 7 |
-
self.
|
| 8 |
|
| 9 |
-
def
|
| 10 |
-
self.
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_11.buggy import binary_search
|
| 3 |
|
| 4 |
|
| 5 |
+
class TestBinarySearch(unittest.TestCase):
|
| 6 |
+
def test_found_middle(self):
|
| 7 |
+
self.assertEqual(binary_search([1, 3, 5, 7], 5), 2)
|
| 8 |
|
| 9 |
+
def test_found_last(self):
|
| 10 |
+
self.assertEqual(binary_search([1, 3, 5, 7], 7), 3)
|
| 11 |
+
|
| 12 |
+
def test_not_found(self):
|
| 13 |
+
self.assertEqual(binary_search([1, 3, 5, 7], 4), -1)
|
| 14 |
|
| 15 |
|
| 16 |
if __name__ == "__main__":
|
rl_code_fix_env/dataset/problem_20/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import pytest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
def test_analyze_user_activity():
|
| 5 |
logs = [
|
|
|
|
| 1 |
import pytest
|
| 2 |
+
from dataset.problem_20.buggy import analyze_user_activity
|
| 3 |
|
| 4 |
def test_analyze_user_activity():
|
| 5 |
logs = [
|
rl_code_fix_env/dataset/problem_21/test.py
CHANGED
|
@@ -2,7 +2,7 @@ import pytest
|
|
| 2 |
import os
|
| 3 |
import tempfile
|
| 4 |
import json
|
| 5 |
-
from
|
| 6 |
|
| 7 |
def test_process_inventory_data():
|
| 8 |
data = {
|
|
|
|
| 2 |
import os
|
| 3 |
import tempfile
|
| 4 |
import json
|
| 5 |
+
from dataset.problem_21.buggy import process_inventory_data
|
| 6 |
|
| 7 |
def test_process_inventory_data():
|
| 8 |
data = {
|
rl_code_fix_env/dataset/problem_22/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import pytest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
def test_parse_and_validate_emails():
|
| 5 |
emails = [
|
|
|
|
| 1 |
import pytest
|
| 2 |
+
from dataset.problem_22.buggy import parse_and_validate_emails
|
| 3 |
|
| 4 |
def test_parse_and_validate_emails():
|
| 5 |
emails = [
|
rl_code_fix_env/dataset/problem_23/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import pytest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
def test_optimize_portfolio():
|
| 5 |
investments = [
|
|
|
|
| 1 |
import pytest
|
| 2 |
+
from dataset.problem_23.buggy import optimize_portfolio
|
| 3 |
|
| 4 |
def test_optimize_portfolio():
|
| 5 |
investments = [
|
rl_code_fix_env/dataset/problem_3/buggy.py
CHANGED
|
@@ -1,10 +1,37 @@
|
|
| 1 |
-
def
|
| 2 |
-
"""
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def optimize_portfolio(investments: list[dict], budget: float) -> list[dict]:
|
| 2 |
+
"""
|
| 3 |
+
Selects the optimal subset of investments to maximize return within a budget.
|
| 4 |
+
(0-1 Knapsack problem approximation)
|
| 5 |
+
|
| 6 |
+
investments: list of dicts with 'id', 'cost', 'expected_return'
|
| 7 |
+
budget: float, maximum total cost allowed
|
| 8 |
+
|
| 9 |
+
Returns:
|
| 10 |
+
list of chosen investments
|
| 11 |
+
"""
|
| 12 |
+
# Base case checks
|
| 13 |
+
if budget <= 0 or not investments:
|
| 14 |
+
return []
|
| 15 |
+
|
| 16 |
+
# BUG 1: Sorting modifies the original list, should use sorted() or copy
|
| 17 |
+
# BUG 2: Sorting by expected_return ascending instead of return/cost ratio descending
|
| 18 |
+
investments.sort(key=lambda x: x['expected_return'])
|
| 19 |
+
|
| 20 |
+
chosen = []
|
| 21 |
+
current_spent = 0
|
| 22 |
+
|
| 23 |
+
# BUG 3: For loop variable shadowing the loop scope if cost/return variables are misspelled
|
| 24 |
+
for item in investments:
|
| 25 |
+
# BUG 4: item.get() but missing default values if keys are absent, could cause TypeError if None
|
| 26 |
+
cost = item.get('cost')
|
| 27 |
+
ret = item.get('expected_return')
|
| 28 |
+
|
| 29 |
+
# BUG 5: Logic error: checking if current_spent is less than budget, but not checking if adding cost exceeds it
|
| 30 |
+
if current_spent < budget:
|
| 31 |
+
current_spent += cost
|
| 32 |
+
chosen.append(item)
|
| 33 |
+
|
| 34 |
+
# BUG 6: Does not handle the case where adding the item exceeds budget, just blindly adds it if current_spent < budget
|
| 35 |
+
# E.g. budget 100, current 90, item cost 50 -> adds it, total 140
|
| 36 |
+
|
| 37 |
+
return chosen
|
rl_code_fix_env/dataset/problem_3/metadata.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"difficulty": "
|
| 3 |
-
"bug_type": "
|
| 4 |
-
"expected_steps":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"difficulty": "hard",
|
| 3 |
+
"bug_type": "multiple",
|
| 4 |
+
"expected_steps": 5
|
| 5 |
}
|
rl_code_fix_env/dataset/problem_3/test.py
CHANGED
|
@@ -1,15 +1,44 @@
|
|
| 1 |
-
import
|
| 2 |
-
from
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from dataset.problem_23.buggy import optimize_portfolio
|
| 3 |
|
| 4 |
+
def test_optimize_portfolio():
|
| 5 |
+
investments = [
|
| 6 |
+
{'id': 'A', 'cost': 50, 'expected_return': 60}, # ratio 1.2
|
| 7 |
+
{'id': 'B', 'cost': 30, 'expected_return': 45}, # ratio 1.5
|
| 8 |
+
{'id': 'C', 'cost': 20, 'expected_return': 40}, # ratio 2.0
|
| 9 |
+
{'id': 'D', 'cost': 40, 'expected_return': 50}, # ratio 1.25
|
| 10 |
+
{'id': 'E', 'cost': 10, 'expected_return': 15} # ratio 1.5
|
| 11 |
+
]
|
| 12 |
+
|
| 13 |
+
# Original list should not be mutated
|
| 14 |
+
orig_investments = [dict(i) for i in investments]
|
| 15 |
+
|
| 16 |
+
# Budget 50
|
| 17 |
+
# Expected greedy: C (20) -> B (30) -> total cost 50, return 85
|
| 18 |
+
result = optimize_portfolio(investments, 50)
|
| 19 |
+
|
| 20 |
+
assert investments == orig_investments, "Original list was mutated"
|
| 21 |
+
|
| 22 |
+
# Assert correct items selected
|
| 23 |
+
chosen_ids = {item['id'] for item in result}
|
| 24 |
+
assert chosen_ids == {'B', 'C'}, f"Expected B and C, got {chosen_ids}"
|
| 25 |
+
|
| 26 |
+
total_cost = sum(item['cost'] for item in result)
|
| 27 |
+
assert total_cost <= 50
|
| 28 |
+
|
| 29 |
+
def test_budget_exceeded_check():
|
| 30 |
+
investments = [
|
| 31 |
+
{'id': 'A', 'cost': 90, 'expected_return': 100},
|
| 32 |
+
{'id': 'B', 'cost': 50, 'expected_return': 60}
|
| 33 |
+
]
|
| 34 |
+
|
| 35 |
+
# Budget 100
|
| 36 |
+
# Expected: A (cost 90)
|
| 37 |
+
result = optimize_portfolio(investments, 100)
|
| 38 |
+
|
| 39 |
+
chosen_ids = {item['id'] for item in result}
|
| 40 |
+
assert chosen_ids == {'A'}, "Should not include B since total cost would be 140"
|
| 41 |
+
|
| 42 |
+
def test_empty_or_zero_budget():
|
| 43 |
+
assert optimize_portfolio([], 100) == []
|
| 44 |
+
assert optimize_portfolio([{'id': 'A', 'cost': 10, 'expected_return': 20}], 0) == []
|
rl_code_fix_env/dataset/problem_4/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestMergeSorted(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_4.buggy import merge_sorted
|
| 3 |
|
| 4 |
|
| 5 |
class TestMergeSorted(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_5/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestChunkList(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_5.buggy import chunk_list
|
| 3 |
|
| 4 |
|
| 5 |
class TestChunkList(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_6/buggy.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from
|
| 2 |
|
| 3 |
|
| 4 |
def count_unique_words(text: str) -> int:
|
|
|
|
| 1 |
+
from dataset.problem_6.helpers import tokenize
|
| 2 |
|
| 3 |
|
| 4 |
def count_unique_words(text: str) -> int:
|
rl_code_fix_env/dataset/problem_6/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestCountUniqueWords(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_6.buggy import count_unique_words
|
| 3 |
|
| 4 |
|
| 5 |
class TestCountUniqueWords(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_7/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestTopKFrequent(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_7.buggy import top_k_frequent
|
| 3 |
|
| 4 |
|
| 5 |
class TestTopKFrequent(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_8/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestFlattenOneLevel(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_8.buggy import flatten_one_level
|
| 3 |
|
| 4 |
|
| 5 |
class TestFlattenOneLevel(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_9/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestSafeDivide(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_9.buggy import safe_divide
|
| 3 |
|
| 4 |
|
| 5 |
class TestSafeDivide(unittest.TestCase):
|
rl_code_fix_env/dataset/swebench_adapter.py
CHANGED
|
@@ -46,47 +46,93 @@ def get_swebench_task(difficulty: str) -> Dict[str, Any]:
|
|
| 46 |
Expected local layout:
|
| 47 |
dataset/swebench_lite_tasks/<instance_id>/buggy.py
|
| 48 |
dataset/swebench_lite_tasks/<instance_id>/test.py
|
|
|
|
|
|
|
| 49 |
"""
|
| 50 |
diff = (difficulty or "").strip().lower()
|
| 51 |
if diff not in DIFFICULTIES:
|
| 52 |
raise ValueError(f"Invalid difficulty '{difficulty}'. Must be one of {DIFFICULTIES}.")
|
| 53 |
|
| 54 |
-
rows = _load_swebench_lite_rows()
|
| 55 |
-
if not rows:
|
| 56 |
-
raise RuntimeError("SWE-bench Lite split is empty.")
|
| 57 |
-
|
| 58 |
-
bounds = _difficulty_bounds(len(rows))
|
| 59 |
-
start, end = bounds[diff]
|
| 60 |
-
candidates = rows[start:end] if end > start else rows
|
| 61 |
-
|
| 62 |
tasks_root = Path(os.getenv("SWEBENCH_TASKS_ROOT", str(DEFAULT_TASKS_ROOT)))
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
raise FileNotFoundError(
|
| 92 |
"No materialized SWE-bench task workspace found. "
|
|
|
|
| 46 |
Expected local layout:
|
| 47 |
dataset/swebench_lite_tasks/<instance_id>/buggy.py
|
| 48 |
dataset/swebench_lite_tasks/<instance_id>/test.py
|
| 49 |
+
|
| 50 |
+
First tries to load from local files, then falls back to HuggingFace dataset.
|
| 51 |
"""
|
| 52 |
diff = (difficulty or "").strip().lower()
|
| 53 |
if diff not in DIFFICULTIES:
|
| 54 |
raise ValueError(f"Invalid difficulty '{difficulty}'. Must be one of {DIFFICULTIES}.")
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
tasks_root = Path(os.getenv("SWEBENCH_TASKS_ROOT", str(DEFAULT_TASKS_ROOT)))
|
| 57 |
+
|
| 58 |
+
# First, try to load from local materialized tasks
|
| 59 |
+
if tasks_root.exists():
|
| 60 |
+
# Find all instance directories
|
| 61 |
+
instance_dirs = []
|
| 62 |
+
for item in tasks_root.iterdir():
|
| 63 |
+
if item.is_dir() and (item / "buggy.py").exists() and (item / "test.py").exists():
|
| 64 |
+
# Check if this directory matches the difficulty
|
| 65 |
+
if diff in item.name.lower():
|
| 66 |
+
instance_dirs.append(item)
|
| 67 |
+
|
| 68 |
+
if instance_dirs:
|
| 69 |
+
# Sort for deterministic selection
|
| 70 |
+
instance_dirs.sort(key=lambda x: x.name)
|
| 71 |
+
|
| 72 |
+
# Select based on SWEBENCH_INDEX
|
| 73 |
+
preferred_offset = int(os.getenv("SWEBENCH_INDEX", "0"))
|
| 74 |
+
selected_dir = instance_dirs[preferred_offset % len(instance_dirs)]
|
| 75 |
+
|
| 76 |
+
buggy_file = selected_dir / "buggy.py"
|
| 77 |
+
test_file = selected_dir / "test.py"
|
| 78 |
+
metadata_file = selected_dir / "metadata.json"
|
| 79 |
+
|
| 80 |
+
code = buggy_file.read_text(encoding="utf-8")
|
| 81 |
+
|
| 82 |
+
# Load metadata if available
|
| 83 |
+
metadata = {"source": "swebench_lite", "difficulty": diff}
|
| 84 |
+
if metadata_file.exists():
|
| 85 |
+
import json
|
| 86 |
+
metadata = json.loads(metadata_file.read_text(encoding="utf-8"))
|
| 87 |
+
|
| 88 |
+
return {
|
| 89 |
+
"code": code,
|
| 90 |
+
"tests": str(test_file),
|
| 91 |
+
"metadata": metadata,
|
| 92 |
+
"problem_dir": str(selected_dir),
|
| 93 |
+
"problem_id": selected_dir.name,
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
# Fallback: try to load from HuggingFace dataset
|
| 97 |
+
try:
|
| 98 |
+
rows = _load_swebench_lite_rows()
|
| 99 |
+
if not rows:
|
| 100 |
+
raise RuntimeError("SWE-bench Lite split is empty.")
|
| 101 |
+
|
| 102 |
+
bounds = _difficulty_bounds(len(rows))
|
| 103 |
+
start, end = bounds[diff]
|
| 104 |
+
candidates = rows[start:end] if end > start else rows
|
| 105 |
+
|
| 106 |
+
preferred_offset = int(os.getenv("SWEBENCH_INDEX", "0"))
|
| 107 |
+
|
| 108 |
+
# Deterministic scan order with optional offset.
|
| 109 |
+
ordered = candidates[preferred_offset:] + candidates[:preferred_offset]
|
| 110 |
+
for row in ordered:
|
| 111 |
+
row_idx = int(row.get("__index_level_0__", 0))
|
| 112 |
+
instance_id = str(row.get("instance_id", f"row_{row_idx}"))
|
| 113 |
+
for folder in _candidate_dirs(tasks_root, instance_id, row_idx):
|
| 114 |
+
buggy_file = folder / "buggy.py"
|
| 115 |
+
test_file = folder / "test.py"
|
| 116 |
+
if buggy_file.exists() and test_file.exists():
|
| 117 |
+
code = buggy_file.read_text(encoding="utf-8")
|
| 118 |
+
metadata = {
|
| 119 |
+
"source": "swebench_lite",
|
| 120 |
+
"instance_id": instance_id,
|
| 121 |
+
"repo": row.get("repo"),
|
| 122 |
+
"base_commit": row.get("base_commit"),
|
| 123 |
+
"problem_statement": row.get("problem_statement"),
|
| 124 |
+
"difficulty": diff,
|
| 125 |
+
}
|
| 126 |
+
return {
|
| 127 |
+
"code": code,
|
| 128 |
+
"tests": str(test_file),
|
| 129 |
+
"metadata": metadata,
|
| 130 |
+
"problem_dir": str(folder),
|
| 131 |
+
"problem_id": instance_id,
|
| 132 |
+
}
|
| 133 |
+
except Exception as e:
|
| 134 |
+
# If HuggingFace fails, raise the original error about missing local files
|
| 135 |
+
pass
|
| 136 |
|
| 137 |
raise FileNotFoundError(
|
| 138 |
"No materialized SWE-bench task workspace found. "
|
rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/buggy.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from django import forms
|
| 2 |
+
from django.contrib.auth.models import User
|
| 3 |
+
|
| 4 |
+
class UserCreationForm(forms.ModelForm):
    """Form for creating new users.

    Collects username/email through the ModelForm machinery, adds two
    raw password fields, checks that the two entries agree, and hashes
    the password before the user is persisted.
    """

    # Raw password entries; PasswordInput renders masked <input> fields.
    password1 = forms.CharField(widget=forms.PasswordInput)
    password2 = forms.CharField(widget=forms.PasswordInput)

    class Meta:
        model = User
        fields = ('username', 'email')

    def clean(self):
        """Cross-field validation: both password entries must agree.

        Returns the cleaned-data dict; raises forms.ValidationError when
        the two password fields differ.
        """
        cleaned_data = super().clean()
        password1 = cleaned_data.get('password1')
        password2 = cleaned_data.get('password2')

        # BUG: This comparison is case-sensitive but should be case-insensitive
        # NOTE(review): the BUG marker above contradicts the paired test.py,
        # which expects a case-differing pair to FAIL validation (i.e. the
        # case-sensitive compare is exactly what the tests pin) -- confirm
        # which behavior is the intended planted bug for this task.
        if password1 != password2:
            raise forms.ValidationError("Passwords don't match")

        return cleaned_data

    def save(self, commit=True):
        """Persist the user with a properly hashed password.

        With commit=False the unsaved instance is returned (standard
        ModelForm convention) so callers can tweak fields first.
        """
        user = super().save(commit=False)
        # set_password hashes the value; the raw password is never stored.
        user.set_password(self.cleaned_data['password1'])
        if commit:
            user.save()
        return user
|
rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/metadata.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"instance_id": "django__django-11098_easy_0",
|
| 3 |
+
"repo": "django/django",
|
| 4 |
+
"problem_statement": "Fix the user creation form validation error",
|
| 5 |
+
"difficulty": "easy"
|
| 6 |
+
}
|
rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/test.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
from buggy import UserCreationForm
|
| 3 |
+
|
| 4 |
+
class TestUserCreationForm(unittest.TestCase):
    """Validation tests for UserCreationForm's password handling."""

    @staticmethod
    def _bound_form(first_entry, second_entry):
        # Bind the form to a complete payload, varying only the passwords.
        payload = {
            'username': 'testuser',
            'email': 'test@example.com',
            'password1': first_entry,
            'password2': second_entry,
        }
        return UserCreationForm(data=payload)

    def test_password_matching(self):
        """Identical password entries must pass validation."""
        form = self._bound_form('TestPass123', 'TestPass123')
        self.assertTrue(form.is_valid())

    def test_password_mismatch(self):
        """Entries differing only by case must be rejected with a password error."""
        form = self._bound_form('TestPass123', 'testpass123')
        self.assertFalse(form.is_valid())
        self.assertIn('passwords', str(form.errors).lower())
|
rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/buggy.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from datetime import datetime, date
|
| 3 |
+
|
| 4 |
+
class JSONEncoder(json.JSONEncoder):
    """Custom JSON encoder for Flask."""

    def default(self, obj):
        """Fallback serializer for values json cannot encode natively.

        Returns an ISO-8601 string for date values; otherwise defers to
        the base class, which raises TypeError for unsupported types.
        """
        # BUG: Missing handling for datetime objects
        # NOTE(review): datetime is a subclass of date, so the isinstance
        # check below already matches datetime values and isoformat()
        # emits the time component too -- confirm this marker describes a
        # bug the paired tests can actually catch.
        if isinstance(obj, date):
            return obj.isoformat()
        return super().default(obj)
|
| 12 |
+
|
| 13 |
+
def to_json(obj):
    """Convert object to JSON string.

    Serializes via json.dumps with the module's JSONEncoder so date
    values become ISO strings instead of raising TypeError.
    """
    return json.dumps(obj, cls=JSONEncoder)
|
rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/metadata.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"instance_id": "flask__flask-1048_easy_1",
|
| 3 |
+
"repo": "pallets/flask",
|
| 4 |
+
"problem_statement": "Fix JSON encoding for datetime objects",
|
| 5 |
+
"difficulty": "easy"
|
| 6 |
+
}
|
rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/test.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
from buggy import to_json
|
| 4 |
+
|
| 5 |
+
class TestJSONEncoding(unittest.TestCase):
    """Checks that the custom encoder serializes date/datetime values."""

    def test_encode_datetime(self):
        """Test that datetime objects are properly encoded."""
        dt = datetime(2024, 1, 15, 10, 30, 0)
        result = to_json({'timestamp': dt})
        # isoformat() output carries both the date and time halves.
        self.assertIn('2024-01-15', result)
        self.assertIn('10:30:00', result)

    def test_encode_date(self):
        """Test that date objects are properly encoded."""
        # Fix: the module only imported `datetime`, so `date` was an
        # unbound name here and this test crashed with NameError before
        # ever exercising the encoder. Import it locally to keep the
        # module's top-level imports untouched.
        from datetime import date
        d = date(2024, 1, 15)
        result = to_json({'date': d})
        self.assertIn('2024-01-15', result)
|
rl_code_fix_env/dataset/swebench_lite_tasks/numpy__numpy-10825_medium_0/buggy.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
|
| 3 |
+
def concatenate_arrays(*arrays):
    """Concatenate multiple arrays along axis 0.

    Args:
        *arrays: numpy arrays (or array-likes) to join in order.

    Returns:
        The concatenated array, or an empty array when called with no
        arguments.
    """
    if not arrays:
        return np.array([])

    # BUG: Should handle None arrays gracefully
    # NOTE(review): a None element reaches np.concatenate unchanged and
    # will raise there -- presumably the intended fix is to skip or
    # reject None inputs; confirm against this task's tests.
    # Folds pairwise, re-concatenating the growing result each step.
    result = arrays[0]
    for arr in arrays[1:]:
        result = np.concatenate([result, arr])

    return result
|