Spaces:
Running
Running
Viraaj Sawant commited on
Commit ·
18625ef
1
Parent(s): fe42848
new push with SWE dataset
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitignore +3 -0
- prompts.py +16 -33
- rl_code_fix_env/.gitignore +2 -1
- rl_code_fix_env/README.md +1 -0
- rl_code_fix_env/_aliases.py +21 -0
- rl_code_fix_env/conftest.py +0 -17
- rl_code_fix_env/dataset/generate_swebench_tasks.py +498 -0
- rl_code_fix_env/dataset/prepare_swebench.py +274 -0
- rl_code_fix_env/dataset/problem_1/buggy.py +7 -5
- rl_code_fix_env/dataset/problem_1/metadata.json +1 -1
- rl_code_fix_env/dataset/problem_1/test.py +10 -6
- rl_code_fix_env/dataset/problem_10/buggy.py +1 -1
- rl_code_fix_env/dataset/problem_10/test.py +1 -1
- rl_code_fix_env/dataset/problem_11/test.py +1 -1
- rl_code_fix_env/dataset/problem_12/test.py +1 -1
- rl_code_fix_env/dataset/problem_13/buggy.py +1 -1
- rl_code_fix_env/dataset/problem_13/test.py +1 -1
- rl_code_fix_env/dataset/problem_14/test.py +1 -1
- rl_code_fix_env/dataset/problem_15/test.py +1 -1
- rl_code_fix_env/dataset/problem_16/buggy.py +1 -1
- rl_code_fix_env/dataset/problem_16/test.py +1 -1
- rl_code_fix_env/dataset/problem_17/test.py +1 -1
- rl_code_fix_env/dataset/problem_18/buggy.py +1 -1
- rl_code_fix_env/dataset/problem_18/test.py +1 -1
- rl_code_fix_env/dataset/problem_19/test.py +1 -1
- rl_code_fix_env/dataset/problem_2/buggy.py +14 -5
- rl_code_fix_env/dataset/problem_2/metadata.json +2 -2
- rl_code_fix_env/dataset/problem_2/test.py +9 -6
- rl_code_fix_env/dataset/problem_20/test.py +1 -1
- rl_code_fix_env/dataset/problem_21/test.py +1 -1
- rl_code_fix_env/dataset/problem_22/test.py +1 -1
- rl_code_fix_env/dataset/problem_23/test.py +1 -1
- rl_code_fix_env/dataset/problem_3/buggy.py +37 -10
- rl_code_fix_env/dataset/problem_3/metadata.json +3 -3
- rl_code_fix_env/dataset/problem_3/test.py +43 -14
- rl_code_fix_env/dataset/problem_4/test.py +1 -1
- rl_code_fix_env/dataset/problem_5/test.py +1 -1
- rl_code_fix_env/dataset/problem_6/buggy.py +1 -1
- rl_code_fix_env/dataset/problem_6/test.py +1 -1
- rl_code_fix_env/dataset/problem_7/test.py +1 -1
- rl_code_fix_env/dataset/problem_8/test.py +1 -1
- rl_code_fix_env/dataset/problem_9/test.py +1 -1
- rl_code_fix_env/dataset/swebench_adapter.py +81 -35
- rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/buggy.py +29 -0
- rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/metadata.json +6 -0
- rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/test.py +24 -0
- rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/buggy.py +15 -0
- rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/metadata.json +6 -0
- rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/test.py +17 -0
- rl_code_fix_env/dataset/swebench_lite_tasks/numpy__numpy-10825_medium_0/buggy.py +13 -0
.gitignore
CHANGED
|
@@ -6,3 +6,6 @@ __pycache__/
|
|
| 6 |
commands.md
|
| 7 |
logs.md
|
| 8 |
inference&docker.md
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
commands.md
|
| 7 |
logs.md
|
| 8 |
inference&docker.md
|
| 9 |
+
logs2.md
|
| 10 |
+
.env.example
|
| 11 |
+
file.txt
|
prompts.py
CHANGED
|
@@ -1,37 +1,20 @@
|
|
| 1 |
LLM_SCORER_PROMPT = """
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
Evaluate the agent's fix on exactly three axes, each scored 0.0–10.0:
|
| 10 |
-
|
| 11 |
-
1. CORRECTNESS — Does the patch fix the bug(s) without introducing new ones?
|
| 12 |
-
Full marks only if the fix is semantically correct and complete.
|
| 13 |
-
Penalise partial fixes, over-patches, or fixes that mask rather than resolve the root cause.
|
| 14 |
-
|
| 15 |
-
2. MINIMALITY — Is the diff minimal? Penalise unnecessary refactors, renames, whitespace-only changes,
|
| 16 |
-
or reformatting of lines unrelated to the bug.
|
| 17 |
-
|
| 18 |
-
3. QUALITY — Is the patched code readable and idiomatic? Penalise: broken naming conventions,
|
| 19 |
-
added dead code, removed necessary comments, or degraded clarity vs. the original.
|
| 20 |
-
|
| 21 |
-
Respond ONLY with this JSON — no preamble, no trailing text:
|
| 22 |
-
{
|
| 23 |
-
"correctness": <float 0.0-10.0>,
|
| 24 |
-
"minimality": <float 0.0-10.0>,
|
| 25 |
-
"quality": <float 0.0-10.0>,
|
| 26 |
-
"reasoning": "<one concise sentence per axis, pipe-separated>"
|
| 27 |
-
}
|
| 28 |
"""
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
| 37 |
"""
|
|
|
|
| 1 |
LLM_SCORER_PROMPT = """
|
| 2 |
+
You are a reward model for a code-fixing RL agent. Evaluate the PATCHED code vs. ORIGINAL on three axes (0.0–10.0):
|
| 3 |
+
1. CORRECTNESS — Does the patch fix the bug(s) without new bugs?
|
| 4 |
+
2. MINIMALITY — Is the diff minimal? Penalize unrelated changes.
|
| 5 |
+
3. QUALITY — Is the code readable and idiomatic?
|
| 6 |
+
Respond ONLY with this JSON (no preamble):
|
| 7 |
+
{"correctness": <float>, "minimality": <float>, "quality": <float>, "reasoning": "<one sentence per axis, pipe-separated>"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
|
| 10 |
+
USER_TEMPLATE = """
|
| 11 |
+
ORIGINAL:
|
| 12 |
+
```python
|
| 13 |
+
{original_code}
|
| 14 |
+
```
|
| 15 |
+
PATCHED:
|
| 16 |
+
```python
|
| 17 |
+
{patched_code}
|
| 18 |
+
```
|
| 19 |
+
Return only the JSON.
|
| 20 |
"""
|
rl_code_fix_env/.gitignore
CHANGED
|
@@ -5,4 +5,5 @@ __pycache__/
|
|
| 5 |
.env
|
| 6 |
*.pyc
|
| 7 |
*.egg
|
| 8 |
-
pytest-cache-files-*/
|
|
|
|
|
|
| 5 |
.env
|
| 6 |
*.pyc
|
| 7 |
*.egg
|
| 8 |
+
pytest-cache-files-*/
|
| 9 |
+
*.ps1
|
rl_code_fix_env/README.md
CHANGED
|
@@ -5,6 +5,7 @@ colorFrom: green
|
|
| 5 |
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
| 8 |
app_port: 8000
|
| 9 |
base_path: /web
|
| 10 |
tags:
|
|
|
|
| 5 |
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
dockerfile: server/Dockerfile
|
| 9 |
app_port: 8000
|
| 10 |
base_path: /web
|
| 11 |
tags:
|
rl_code_fix_env/_aliases.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import importlib
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
_REPO_ROOT = str(Path(__file__).parent)
|
| 6 |
+
if _REPO_ROOT not in sys.path:
|
| 7 |
+
sys.path.insert(0, _REPO_ROOT)
|
| 8 |
+
|
| 9 |
+
import dataset as _real_dataset
|
| 10 |
+
|
| 11 |
+
sys.modules.setdefault("src.dataset", _real_dataset)
|
| 12 |
+
|
| 13 |
+
import pkgutil
|
| 14 |
+
for _pkg in pkgutil.iter_modules(_real_dataset.__path__):
|
| 15 |
+
_full = f"dataset.{_pkg.name}"
|
| 16 |
+
_alias = f"src.dataset.{_pkg.name}"
|
| 17 |
+
try:
|
| 18 |
+
_mod = importlib.import_module(_full)
|
| 19 |
+
sys.modules.setdefault(_alias, _mod)
|
| 20 |
+
except Exception:
|
| 21 |
+
pass
|
rl_code_fix_env/conftest.py
CHANGED
|
@@ -1,20 +1,3 @@
|
|
| 1 |
-
"""
|
| 2 |
-
conftest.py repo-root pytest configuration.
|
| 3 |
-
|
| 4 |
-
Registers `src.dataset` as a sys.modules alias for `dataset` so that all
|
| 5 |
-
problem test files using `from src.dataset.problem_X.buggy import ...`
|
| 6 |
-
resolve correctly without needing to rename 24 test files.
|
| 7 |
-
|
| 8 |
-
The physical layout is:
|
| 9 |
-
<repo_root>/dataset/problem_X/buggy.py real files
|
| 10 |
-
<repo_root>/src/ has environment/, reward/, etc.
|
| 11 |
-
but NO dataset/ subfolder
|
| 12 |
-
|
| 13 |
-
With PYTHONPATH=<repo_root>:
|
| 14 |
-
import dataset.problem_1.buggy works natively
|
| 15 |
-
import src.dataset.problem_1.buggy would fail fixed here via alias
|
| 16 |
-
"""
|
| 17 |
-
|
| 18 |
import sys
|
| 19 |
import importlib
|
| 20 |
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import sys
|
| 2 |
import importlib
|
| 3 |
from pathlib import Path
|
rl_code_fix_env/dataset/generate_swebench_tasks.py
ADDED
|
@@ -0,0 +1,498 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Generate synthetic SWE-bench style tasks for testing.
|
| 3 |
+
|
| 4 |
+
This creates tasks that mimic the SWE-bench format:
|
| 5 |
+
- instance_id/buggy.py - the buggy code
|
| 6 |
+
- instance_id/test.py - test file
|
| 7 |
+
- instance_id/metadata.json - metadata
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python -m dataset.generate_swebench_tasks [--count N]
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import argparse
|
| 14 |
+
import json
|
| 15 |
+
import random
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Sample SWE-bench style problems
|
| 20 |
+
SWE_BENCH_PROBLEMS = [
|
| 21 |
+
{
|
| 22 |
+
"instance_id": "django__django-11098",
|
| 23 |
+
"repo": "django/django",
|
| 24 |
+
"problem": "Fix the user creation form validation error",
|
| 25 |
+
"buggy_code": '''from django import forms
|
| 26 |
+
from django.contrib.auth.models import User
|
| 27 |
+
|
| 28 |
+
class UserCreationForm(forms.ModelForm):
|
| 29 |
+
"""Form for creating new users."""
|
| 30 |
+
password1 = forms.CharField(widget=forms.PasswordInput)
|
| 31 |
+
password2 = forms.CharField(widget=forms.PasswordInput)
|
| 32 |
+
|
| 33 |
+
class Meta:
|
| 34 |
+
model = User
|
| 35 |
+
fields = ('username', 'email')
|
| 36 |
+
|
| 37 |
+
def clean(self):
|
| 38 |
+
cleaned_data = super().clean()
|
| 39 |
+
password1 = cleaned_data.get('password1')
|
| 40 |
+
password2 = cleaned_data.get('password2')
|
| 41 |
+
|
| 42 |
+
# BUG: This comparison is case-sensitive but should be case-insensitive
|
| 43 |
+
if password1 != password2:
|
| 44 |
+
raise forms.ValidationError("Passwords don't match")
|
| 45 |
+
|
| 46 |
+
return cleaned_data
|
| 47 |
+
|
| 48 |
+
def save(self, commit=True):
|
| 49 |
+
user = super().save(commit=False)
|
| 50 |
+
user.set_password(self.cleaned_data['password1'])
|
| 51 |
+
if commit:
|
| 52 |
+
user.save()
|
| 53 |
+
return user
|
| 54 |
+
''',
|
| 55 |
+
"test_code": '''import unittest
|
| 56 |
+
from buggy import UserCreationForm
|
| 57 |
+
|
| 58 |
+
class TestUserCreationForm(unittest.TestCase):
|
| 59 |
+
def test_password_matching(self):
|
| 60 |
+
"""Test that matching passwords pass validation."""
|
| 61 |
+
form = UserCreationForm(data={
|
| 62 |
+
'username': 'testuser',
|
| 63 |
+
'email': 'test@example.com',
|
| 64 |
+
'password1': 'TestPass123',
|
| 65 |
+
'password2': 'TestPass123',
|
| 66 |
+
})
|
| 67 |
+
self.assertTrue(form.is_valid())
|
| 68 |
+
|
| 69 |
+
def test_password_mismatch(self):
|
| 70 |
+
"""Test that mismatched passwords fail validation."""
|
| 71 |
+
form = UserCreationForm(data={
|
| 72 |
+
'username': 'testuser',
|
| 73 |
+
'email': 'test@example.com',
|
| 74 |
+
'password1': 'TestPass123',
|
| 75 |
+
'password2': 'testpass123', # Different case
|
| 76 |
+
})
|
| 77 |
+
self.assertFalse(form.is_valid())
|
| 78 |
+
self.assertIn('passwords', str(form.errors).lower())
|
| 79 |
+
''',
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"instance_id": "flask__flask-1048",
|
| 83 |
+
"repo": "pallets/flask",
|
| 84 |
+
"problem": "Fix JSON encoding for datetime objects",
|
| 85 |
+
"buggy_code": '''import json
|
| 86 |
+
from datetime import datetime, date
|
| 87 |
+
|
| 88 |
+
class JSONEncoder(json.JSONEncoder):
|
| 89 |
+
"""Custom JSON encoder for Flask."""
|
| 90 |
+
|
| 91 |
+
def default(self, obj):
|
| 92 |
+
# BUG: Missing handling for datetime objects
|
| 93 |
+
if isinstance(obj, date):
|
| 94 |
+
return obj.isoformat()
|
| 95 |
+
return super().default(obj)
|
| 96 |
+
|
| 97 |
+
def to_json(obj):
|
| 98 |
+
"""Convert object to JSON string."""
|
| 99 |
+
return json.dumps(obj, cls=JSONEncoder)
|
| 100 |
+
''',
|
| 101 |
+
"test_code": '''import unittest
|
| 102 |
+
from datetime import datetime
|
| 103 |
+
from buggy import to_json
|
| 104 |
+
|
| 105 |
+
class TestJSONEncoding(unittest.TestCase):
|
| 106 |
+
def test_encode_datetime(self):
|
| 107 |
+
"""Test that datetime objects are properly encoded."""
|
| 108 |
+
dt = datetime(2024, 1, 15, 10, 30, 0)
|
| 109 |
+
result = to_json({'timestamp': dt})
|
| 110 |
+
self.assertIn('2024-01-15', result)
|
| 111 |
+
self.assertIn('10:30:00', result)
|
| 112 |
+
|
| 113 |
+
def test_encode_date(self):
|
| 114 |
+
"""Test that date objects are properly encoded."""
|
| 115 |
+
d = date(2024, 1, 15)
|
| 116 |
+
result = to_json({'date': d})
|
| 117 |
+
self.assertIn('2024-01-15', result)
|
| 118 |
+
''',
|
| 119 |
+
},
|
| 120 |
+
{
|
| 121 |
+
"instance_id": "requests__requests-2875",
|
| 122 |
+
"repo": "psf/requests",
|
| 123 |
+
"problem": "Fix cookie domain matching",
|
| 124 |
+
"buggy_code": '''import re
|
| 125 |
+
from urllib.parse import urlparse
|
| 126 |
+
|
| 127 |
+
def match_cookie_domain(cookie_domain, request_domain):
|
| 128 |
+
"""Check if cookie domain matches request domain."""
|
| 129 |
+
# BUG: Should handle leading dots differently
|
| 130 |
+
# .example.com should match sub.example.com but not example.com
|
| 131 |
+
cookie_domain = cookie_domain.lower()
|
| 132 |
+
request_domain = request_domain.lower()
|
| 133 |
+
|
| 134 |
+
if cookie_domain.startswith('.'):
|
| 135 |
+
return request_domain.endswith(cookie_domain)
|
| 136 |
+
|
| 137 |
+
return cookie_domain == request_domain
|
| 138 |
+
''',
|
| 139 |
+
"test_code": '''import unittest
|
| 140 |
+
from buggy import match_cookie_domain
|
| 141 |
+
|
| 142 |
+
class TestCookieDomain(unittest.TestCase):
|
| 143 |
+
def test_exact_match(self):
|
| 144 |
+
"""Test exact domain matching."""
|
| 145 |
+
self.assertTrue(match_cookie_domain('example.com', 'example.com'))
|
| 146 |
+
|
| 147 |
+
def test_subdomain_with_dot(self):
|
| 148 |
+
"""Test subdomain matching with leading dot."""
|
| 149 |
+
# .example.com should match sub.example.com
|
| 150 |
+
self.assertTrue(match_cookie_domain('.example.com', 'sub.example.com'))
|
| 151 |
+
self.assertFalse(match_cookie_domain('.example.com', 'example.com'))
|
| 152 |
+
|
| 153 |
+
def test_different_domains(self):
|
| 154 |
+
"""Test different domains don't match."""
|
| 155 |
+
self.assertFalse(match_cookie_domain('example.com', 'other.com'))
|
| 156 |
+
''',
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"instance_id": "numpy__numpy-10825",
|
| 160 |
+
"repo": "numpy/numpy",
|
| 161 |
+
"problem": "Fix array concatenation edge case",
|
| 162 |
+
"buggy_code": '''import numpy as np
|
| 163 |
+
|
| 164 |
+
def concatenate_arrays(*arrays):
|
| 165 |
+
"""Concatenate multiple arrays along axis 0."""
|
| 166 |
+
if not arrays:
|
| 167 |
+
return np.array([])
|
| 168 |
+
|
| 169 |
+
# BUG: Should handle None arrays gracefully
|
| 170 |
+
result = arrays[0]
|
| 171 |
+
for arr in arrays[1:]:
|
| 172 |
+
result = np.concatenate([result, arr])
|
| 173 |
+
|
| 174 |
+
return result
|
| 175 |
+
''',
|
| 176 |
+
"test_code": '''import unittest
|
| 177 |
+
import numpy as np
|
| 178 |
+
from buggy import concatenate_arrays
|
| 179 |
+
|
| 180 |
+
class TestArrayConcatenation(unittest.TestCase):
|
| 181 |
+
def test_basic_concatenation(self):
|
| 182 |
+
"""Test basic array concatenation."""
|
| 183 |
+
a = np.array([1, 2, 3])
|
| 184 |
+
b = np.array([4, 5, 6])
|
| 185 |
+
result = concatenate_arrays(a, b)
|
| 186 |
+
np.testing.assert_array_equal(result, np.array([1, 2, 3, 4, 5, 6]))
|
| 187 |
+
|
| 188 |
+
def test_empty_input(self):
|
| 189 |
+
"""Test empty input returns empty array."""
|
| 190 |
+
result = concatenate_arrays()
|
| 191 |
+
self.assertEqual(len(result), 0)
|
| 192 |
+
|
| 193 |
+
def test_single_array(self):
|
| 194 |
+
"""Test single array passes through."""
|
| 195 |
+
a = np.array([1, 2, 3])
|
| 196 |
+
result = concatenate_arrays(a)
|
| 197 |
+
np.testing.assert_array_equal(result, a)
|
| 198 |
+
''',
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"instance_id": "pandas__pandas-15230",
|
| 202 |
+
"repo": "pandas-dev/pandas",
|
| 203 |
+
"problem": "Fix DataFrame groupby aggregation",
|
| 204 |
+
"buggy_code": '''import pandas as pd
|
| 205 |
+
|
| 206 |
+
def group_and_aggregate(df, group_col, agg_col, agg_func='mean'):
|
| 207 |
+
"""Group DataFrame and aggregate."""
|
| 208 |
+
# BUG: Should handle non-numeric columns gracefully
|
| 209 |
+
if agg_func == 'mean':
|
| 210 |
+
return df.groupby(group_col)[agg_col].mean()
|
| 211 |
+
elif agg_func == 'sum':
|
| 212 |
+
return df.groupby(group_col)[agg_col].sum()
|
| 213 |
+
elif agg_func == 'count':
|
| 214 |
+
return df.groupby(group_col)[agg_col].count()
|
| 215 |
+
else:
|
| 216 |
+
raise ValueError(f"Unknown aggregation function: {agg_func}")
|
| 217 |
+
''',
|
| 218 |
+
"test_code": '''import unittest
|
| 219 |
+
import pandas as pd
|
| 220 |
+
from buggy import group_and_aggregate
|
| 221 |
+
|
| 222 |
+
class TestGroupBy(unittest.TestCase):
|
| 223 |
+
def test_mean_aggregation(self):
|
| 224 |
+
"""Test mean aggregation."""
|
| 225 |
+
df = pd.DataFrame({
|
| 226 |
+
'category': ['A', 'A', 'B', 'B'],
|
| 227 |
+
'value': [1, 2, 3, 4]
|
| 228 |
+
})
|
| 229 |
+
result = group_and_aggregate(df, 'category', 'value', 'mean')
|
| 230 |
+
self.assertEqual(result['A'], 1.5)
|
| 231 |
+
self.assertEqual(result['B'], 3.5)
|
| 232 |
+
|
| 233 |
+
def test_sum_aggregation(self):
|
| 234 |
+
"""Test sum aggregation."""
|
| 235 |
+
df = pd.DataFrame({
|
| 236 |
+
'category': ['A', 'A', 'B'],
|
| 237 |
+
'value': [1, 2, 3]
|
| 238 |
+
})
|
| 239 |
+
result = group_and_aggregate(df, 'category', 'value', 'sum')
|
| 240 |
+
self.assertEqual(result['A'], 3)
|
| 241 |
+
self.assertEqual(result['B'], 3)
|
| 242 |
+
''',
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"instance_id": "scipy__scipy-1925",
|
| 246 |
+
"repo": "scipy/scipy",
|
| 247 |
+
"problem": "Fix signal filtering edge case",
|
| 248 |
+
"buggy_code": '''import numpy as np
|
| 249 |
+
from scipy import signal
|
| 250 |
+
|
| 251 |
+
def apply_lowpass_filter(data, cutoff, fs, order=5):
|
| 252 |
+
"""Apply lowpass filter to data."""
|
| 253 |
+
# BUG: Should validate cutoff frequency
|
| 254 |
+
nyquist = fs / 2
|
| 255 |
+
normalized_cutoff = cutoff / nyquist
|
| 256 |
+
|
| 257 |
+
# BUG: Using invalid cutoff can cause filter design failure
|
| 258 |
+
b, a = signal.butter(order, normalized_cutoff, btype='low')
|
| 259 |
+
filtered = signal.filtfilt(b, a, data)
|
| 260 |
+
|
| 261 |
+
return filtered
|
| 262 |
+
''',
|
| 263 |
+
"test_code": '''import unittest
|
| 264 |
+
import numpy as np
|
| 265 |
+
from buggy import apply_lowpass_filter
|
| 266 |
+
|
| 267 |
+
class TestSignalFiltering(unittest.TestCase):
|
| 268 |
+
def test_valid_filter(self):
|
| 269 |
+
"""Test filtering with valid parameters."""
|
| 270 |
+
fs = 1000 # Sampling frequency
|
| 271 |
+
cutoff = 100 # Cutoff frequency
|
| 272 |
+
t = np.linspace(0, 1, fs)
|
| 273 |
+
data = np.sin(2 * np.pi * 50 * t) + 0.5 * np.sin(2 * np.pi * 200 * t)
|
| 274 |
+
|
| 275 |
+
result = apply_lowpass_filter(data, cutoff, fs)
|
| 276 |
+
self.assertEqual(len(result), len(data))
|
| 277 |
+
# Low frequency component should be preserved
|
| 278 |
+
self.assertTrue(np.abs(result[100]) > 0.5)
|
| 279 |
+
|
| 280 |
+
def test_invalid_cutoff(self):
|
| 281 |
+
"""Test that invalid cutoff raises error."""
|
| 282 |
+
fs = 1000
|
| 283 |
+
cutoff = 2000 # Above Nyquist frequency - should fail
|
| 284 |
+
data = np.array([1, 2, 3, 4, 5])
|
| 285 |
+
|
| 286 |
+
with self.assertRaises(ValueError):
|
| 287 |
+
apply_lowpass_filter(data, cutoff, fs)
|
| 288 |
+
''',
|
| 289 |
+
},
|
| 290 |
+
{
|
| 291 |
+
"instance_id": "sklearn__sklearn-12345",
|
| 292 |
+
"repo": "scikit-learn/scikit-learn",
|
| 293 |
+
"problem": "Fix cross-validation split",
|
| 294 |
+
"buggy_code": '''import numpy as np
|
| 295 |
+
from sklearn.model_selection import KFold
|
| 296 |
+
|
| 297 |
+
def get_cv_splits(X, n_splits=5, shuffle=True, random_state=42):
|
| 298 |
+
"""Get cross-validation splits."""
|
| 299 |
+
# BUG: random_state should be used for reproducibility
|
| 300 |
+
kf = KFold(n_splits=n_splits, shuffle=shuffle)
|
| 301 |
+
|
| 302 |
+
splits = []
|
| 303 |
+
for train_idx, test_idx in kf.split(X):
|
| 304 |
+
splits.append((train_idx, test_idx))
|
| 305 |
+
|
| 306 |
+
return splits
|
| 307 |
+
''',
|
| 308 |
+
"test_code": '''import unittest
|
| 309 |
+
import numpy as np
|
| 310 |
+
from buggy import get_cv_splits
|
| 311 |
+
|
| 312 |
+
class TestCVSplits(unittest.TestCase):
|
| 313 |
+
def test_split_count(self):
|
| 314 |
+
"""Test that correct number of splits is generated."""
|
| 315 |
+
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
|
| 316 |
+
splits = get_cv_splits(X, n_splits=3)
|
| 317 |
+
self.assertEqual(len(splits), 3)
|
| 318 |
+
|
| 319 |
+
def test_reproducibility(self):
|
| 320 |
+
"""Test that splits are reproducible with same random_state."""
|
| 321 |
+
X = np.random.rand(100, 5)
|
| 322 |
+
splits1 = get_cv_splits(X, n_splits=5, random_state=42)
|
| 323 |
+
splits2 = get_cv_splits(X, n_splits=5, random_state=42)
|
| 324 |
+
|
| 325 |
+
for (train1, test1), (train2, test2) in zip(splits1, splits2):
|
| 326 |
+
np.testing.assert_array_equal(train1, train2)
|
| 327 |
+
np.testing.assert_array_equal(test1, test2)
|
| 328 |
+
''',
|
| 329 |
+
},
|
| 330 |
+
{
|
| 331 |
+
"instance_id": "pytest__pytest-7426",
|
| 332 |
+
"repo": "pytest-dev/pytest",
|
| 333 |
+
"problem": "Fix test collection order",
|
| 334 |
+
"buggy_code": '''import os
|
| 335 |
+
import re
|
| 336 |
+
|
| 337 |
+
def collect_tests(directory, pattern='test_*.py'):
|
| 338 |
+
"""Collect test files from directory."""
|
| 339 |
+
# BUG: Should sort files for consistent ordering
|
| 340 |
+
test_files = []
|
| 341 |
+
|
| 342 |
+
for root, dirs, files in os.walk(directory):
|
| 343 |
+
for file in files:
|
| 344 |
+
if re.match(pattern, file):
|
| 345 |
+
test_files.append(os.path.join(root, file))
|
| 346 |
+
|
| 347 |
+
return test_files
|
| 348 |
+
''',
|
| 349 |
+
"test_code": '''import unittest
|
| 350 |
+
import os
|
| 351 |
+
import tempfile
|
| 352 |
+
from buggy import collect_tests
|
| 353 |
+
|
| 354 |
+
class TestCollection(unittest.TestCase):
|
| 355 |
+
def test_collect_pattern(self):
|
| 356 |
+
"""Test that correct pattern is matched."""
|
| 357 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
| 358 |
+
# Create test files
|
| 359 |
+
open(os.path.join(tmpdir, 'test_a.py'), 'w').close()
|
| 360 |
+
open(os.path.join(tmpdir, 'test_b.py'), 'w').close()
|
| 361 |
+
open(os.path.join(tmpdir, 'not_a_test.py'), 'w').close()
|
| 362 |
+
|
| 363 |
+
tests = collect_tests(tmpdir, 'test_*.py')
|
| 364 |
+
self.assertEqual(len(tests), 2)
|
| 365 |
+
|
| 366 |
+
def test_consistent_order(self):
|
| 367 |
+
"""Test that file order is consistent."""
|
| 368 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
| 369 |
+
for name in ['test_c.py', 'test_a.py', 'test_b.py']:
|
| 370 |
+
open(os.path.join(tmpdir, name), 'w').close()
|
| 371 |
+
|
| 372 |
+
tests1 = collect_tests(tmpdir)
|
| 373 |
+
tests2 = collect_tests(tmpdir)
|
| 374 |
+
|
| 375 |
+
self.assertEqual(tests1, tests2)
|
| 376 |
+
''',
|
| 377 |
+
},
|
| 378 |
+
{
|
| 379 |
+
"instance_id": "transformers__transformers-12345",
|
| 380 |
+
"repo": "huggingface/transformers",
|
| 381 |
+
"problem": "Fix tokenization padding",
|
| 382 |
+
"buggy_code": '''from typing import List
|
| 383 |
+
|
| 384 |
+
def tokenize_and_pad(tokenizer, texts: List[str], max_length: int = 512):
|
| 385 |
+
"""Tokenize texts and pad to max length."""
|
| 386 |
+
# BUG: Should handle padding correctly
|
| 387 |
+
encoded = tokenizer(
|
| 388 |
+
texts,
|
| 389 |
+
padding=True, # This pads to longest in batch, not max_length
|
| 390 |
+
truncation=True,
|
| 391 |
+
max_length=max_length,
|
| 392 |
+
return_tensors='pt'
|
| 393 |
+
)
|
| 394 |
+
|
| 395 |
+
return encoded
|
| 396 |
+
''',
|
| 397 |
+
"test_code": '''import unittest
|
| 398 |
+
from buggy import tokenize_and_pad
|
| 399 |
+
|
| 400 |
+
class MockTokenizer:
|
| 401 |
+
def __call__(self, texts, padding=True, truncation=True, max_length=512, return_tensors=None):
|
| 402 |
+
# Simplified mock
|
| 403 |
+
return {
|
| 404 |
+
'input_ids': [[1, 2, 3]] if isinstance(texts, list) else [1, 2, 3],
|
| 405 |
+
'attention_mask': [[1, 1, 1]] if isinstance(texts, list) else [1, 1, 1]
|
| 406 |
+
}
|
| 407 |
+
|
| 408 |
+
class TestTokenization(unittest.TestCase):
|
| 409 |
+
def test_single_text(self):
|
| 410 |
+
"""Test tokenizing single text."""
|
| 411 |
+
tokenizer = MockTokenizer()
|
| 412 |
+
result = tokenize_and_pad(tokenizer, ["hello world"])
|
| 413 |
+
self.assertIn('input_ids', result)
|
| 414 |
+
|
| 415 |
+
def test_max_length_respected(self):
|
| 416 |
+
"""Test that max_length is respected."""
|
| 417 |
+
tokenizer = MockTokenizer()
|
| 418 |
+
# Should not raise even with long text
|
| 419 |
+
result = tokenize_and_pad(tokenizer, ["short"], max_length=10)
|
| 420 |
+
self.assertIn('input_ids', result)
|
| 421 |
+
''',
|
| 422 |
+
},
|
| 423 |
+
]
|
| 424 |
+
|
| 425 |
+
# Easy, Medium, Hard difficulty assignments
|
| 426 |
+
DIFFICULTY_TASKS = {
|
| 427 |
+
"easy": SWE_BENCH_PROBLEMS[:3],
|
| 428 |
+
"medium": SWE_BENCH_PROBLEMS[3:6],
|
| 429 |
+
"hard": SWE_BENCH_PROBLEMS[6:],
|
| 430 |
+
}
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
def generate_tasks(output_dir: Path, count_per_difficulty: int = 3):
|
| 434 |
+
"""Generate SWE-bench style tasks."""
|
| 435 |
+
output_dir = Path(output_dir)
|
| 436 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 437 |
+
|
| 438 |
+
total_created = 0
|
| 439 |
+
|
| 440 |
+
for difficulty, problems in DIFFICULTY_TASKS.items():
|
| 441 |
+
for i, problem in enumerate(problems[:count_per_difficulty]):
|
| 442 |
+
instance_id = f"{problem['instance_id']}_{difficulty}_{i}"
|
| 443 |
+
instance_dir = output_dir / instance_id
|
| 444 |
+
instance_dir.mkdir(parents=True, exist_ok=True)
|
| 445 |
+
|
| 446 |
+
# Write buggy.py
|
| 447 |
+
buggy_file = instance_dir / "buggy.py"
|
| 448 |
+
buggy_file.write_text(problem["buggy_code"], encoding="utf-8")
|
| 449 |
+
|
| 450 |
+
# Write test.py
|
| 451 |
+
test_file = instance_dir / "test.py"
|
| 452 |
+
test_file.write_text(problem["test_code"], encoding="utf-8")
|
| 453 |
+
|
| 454 |
+
# Write metadata.json
|
| 455 |
+
metadata = {
|
| 456 |
+
"instance_id": instance_id,
|
| 457 |
+
"repo": problem["repo"],
|
| 458 |
+
"problem_statement": problem["problem"],
|
| 459 |
+
"difficulty": difficulty,
|
| 460 |
+
}
|
| 461 |
+
metadata_file = instance_dir / "metadata.json"
|
| 462 |
+
metadata_file.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
|
| 463 |
+
|
| 464 |
+
total_created += 1
|
| 465 |
+
|
| 466 |
+
print(f"Created {total_created} tasks in {output_dir}")
|
| 467 |
+
print(f"Set environment variable: SWEBENCH_TASKS_ROOT={output_dir.absolute()}")
|
| 468 |
+
print(f"Or run with: TASK_SOURCE=swebench python inference.py")
|
| 469 |
+
|
| 470 |
+
|
| 471 |
+
def main():
|
| 472 |
+
parser = argparse.ArgumentParser(description="Generate SWE-bench style tasks")
|
| 473 |
+
parser.add_argument(
|
| 474 |
+
"--count",
|
| 475 |
+
type=int,
|
| 476 |
+
default=3,
|
| 477 |
+
help="Number of tasks per difficulty (default: 3)"
|
| 478 |
+
)
|
| 479 |
+
parser.add_argument(
|
| 480 |
+
"--output-dir",
|
| 481 |
+
type=str,
|
| 482 |
+
default=None,
|
| 483 |
+
help="Output directory (default: dataset/swebench_lite_tasks)"
|
| 484 |
+
)
|
| 485 |
+
|
| 486 |
+
args = parser.parse_args()
|
| 487 |
+
|
| 488 |
+
if args.output_dir:
|
| 489 |
+
output_dir = Path(args.output_dir)
|
| 490 |
+
else:
|
| 491 |
+
script_dir = Path(__file__).parent
|
| 492 |
+
output_dir = script_dir / "swebench_lite_tasks"
|
| 493 |
+
|
| 494 |
+
generate_tasks(output_dir, args.count)
|
| 495 |
+
|
| 496 |
+
|
| 497 |
+
if __name__ == "__main__":
|
| 498 |
+
main()
|
rl_code_fix_env/dataset/prepare_swebench.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Script to download and materialize SWE-bench Lite tasks.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Downloads SWE-bench Lite dataset from HuggingFace
|
| 6 |
+
2. Extracts the buggy code and creates test files
|
| 7 |
+
3. Organizes them into the expected directory structure
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python -m dataset.prepare_swebench [--max-tasks N] [--difficulty easy|medium|hard|all]
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import argparse
|
| 14 |
+
import os
|
| 15 |
+
import sys
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
# Add parent to path for imports
|
| 19 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 20 |
+
|
| 21 |
+
from datasets import load_dataset
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def get_problem_statement(row):
|
| 25 |
+
"""Extract problem statement from row."""
|
| 26 |
+
return row.get("problem_statement", "")
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def get_patch(row):
|
| 30 |
+
"""Extract the patch/fix from row."""
|
| 31 |
+
return row.get("patch", "")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def get_instance_id(row):
|
| 35 |
+
"""Get instance ID from row."""
|
| 36 |
+
return row.get("instance_id", "")
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def create_buggy_file(instance_dir: Path, row):
|
| 40 |
+
"""
|
| 41 |
+
Create buggy.py from the base commit and instance.
|
| 42 |
+
|
| 43 |
+
The SWE-bench dataset provides the full repository at base_commit.
|
| 44 |
+
We need to extract just the relevant file that has the bug.
|
| 45 |
+
"""
|
| 46 |
+
# For SWE-bench, the "buggy" version is actually the version BEFORE the patch
|
| 47 |
+
# We need to get the file content from the base commit
|
| 48 |
+
# This is complex as it requires cloning the repo at a specific commit
|
| 49 |
+
|
| 50 |
+
# For simplicity, we'll use a different approach:
|
| 51 |
+
# The problem_statement describes the bug, and we can create a simplified
|
| 52 |
+
# buggy version based on that description
|
| 53 |
+
|
| 54 |
+
instance_id = get_instance_id(row)
|
| 55 |
+
problem_stmt = get_problem_statement(row)
|
| 56 |
+
|
| 57 |
+
# Try to extract the file from the created files in the instance
|
| 58 |
+
# SWE-bench provides 'repo' and we need to find the relevant file
|
| 59 |
+
created_files = row.get("created_files", [])
|
| 60 |
+
|
| 61 |
+
if not created_files:
|
| 62 |
+
# Fallback: create a placeholder
|
| 63 |
+
buggy_code = f'''# Buggy code for {instance_id}
|
| 64 |
+
# Problem: {problem_stmt[:200]}...
|
| 65 |
+
|
| 66 |
+
def solution():
|
| 67 |
+
"""Placeholder solution - needs to be fixed."""
|
| 68 |
+
pass
|
| 69 |
+
'''
|
| 70 |
+
else:
|
| 71 |
+
# For now, create a simple placeholder
|
| 72 |
+
# In a full implementation, we'd clone the repo at base_commit
|
| 73 |
+
file_path = created_files[0] if created_files else "solution.py"
|
| 74 |
+
buggy_code = f'''# Buggy code for {instance_id}
|
| 75 |
+
# File: {file_path}
|
| 76 |
+
# Problem: {problem_stmt[:200]}...
|
| 77 |
+
|
| 78 |
+
def solution():
|
| 79 |
+
"""Placeholder solution - needs to be fixed."""
|
| 80 |
+
pass
|
| 81 |
+
'''
|
| 82 |
+
|
| 83 |
+
buggy_file = instance_dir / "buggy.py"
|
| 84 |
+
buggy_file.write_text(buggy_code, encoding="utf-8")
|
| 85 |
+
return buggy_file
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def create_test_file(instance_dir: Path, row):
|
| 89 |
+
"""
|
| 90 |
+
Create test.py based on the problem statement.
|
| 91 |
+
|
| 92 |
+
For SWE-bench, tests are typically derived from the issue description.
|
| 93 |
+
We'll create a simple test that checks if the solution works.
|
| 94 |
+
"""
|
| 95 |
+
instance_id = get_instance_id(row)
|
| 96 |
+
problem_stmt = get_problem_statement(row)
|
| 97 |
+
|
| 98 |
+
# Create a simple test file
|
| 99 |
+
# In practice, SWE-bench has a test.json file with test cases
|
| 100 |
+
test_cases = row.get("test_cases", [])
|
| 101 |
+
|
| 102 |
+
if test_cases:
|
| 103 |
+
# Create tests from provided test cases
|
| 104 |
+
test_code = "import unittest\\n\\n"
|
| 105 |
+
for i, tc in enumerate(test_cases):
|
| 106 |
+
input_str = tc.get("input", "")
|
| 107 |
+
output_str = tc.get("output", "")
|
| 108 |
+
test_code += f'''class TestSolution(unittest.TestCase):
|
| 109 |
+
def test_case_{i+1}(self):
|
| 110 |
+
# Input: {input_str}
|
| 111 |
+
# Expected: {output_str}
|
| 112 |
+
pass # TODO: Add actual test
|
| 113 |
+
'''
|
| 114 |
+
else:
|
| 115 |
+
# Create a basic test based on problem statement
|
| 116 |
+
test_code = f'''"""Test file for {instance_id}"""
|
| 117 |
+
|
| 118 |
+
import unittest
|
| 119 |
+
from buggy import solution
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
class TestSolution(unittest.TestCase):
|
| 123 |
+
def test_basic(self):
|
| 124 |
+
"""Test based on problem statement."""
|
| 125 |
+
# Problem: {problem_stmt[:300]}...
|
| 126 |
+
result = solution()
|
| 127 |
+
self.assertIsNotNone(result)
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
if __name__ == "__main__":
|
| 131 |
+
unittest.main()
|
| 132 |
+
'''
|
| 133 |
+
|
| 134 |
+
test_file = instance_dir / "test.py"
|
| 135 |
+
test_file.write_text(test_code, encoding="utf-8")
|
| 136 |
+
return test_file
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def create_metadata_file(instance_dir: Path, row):
|
| 140 |
+
"""Create metadata.json with instance info."""
|
| 141 |
+
import json
|
| 142 |
+
|
| 143 |
+
metadata = {
|
| 144 |
+
"instance_id": get_instance_id(row),
|
| 145 |
+
"repo": row.get("repo", ""),
|
| 146 |
+
"base_commit": row.get("base_commit", ""),
|
| 147 |
+
"problem_statement": get_problem_statement(row),
|
| 148 |
+
"patch": get_patch(row),
|
| 149 |
+
"difficulty": "medium", # Will be set based on index
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
metadata_file = instance_dir / "metadata.json"
|
| 153 |
+
metadata_file.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
|
| 154 |
+
return metadata_file
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def prepare_swebench_tasks(
|
| 158 |
+
output_dir: Path,
|
| 159 |
+
max_tasks: int = 30,
|
| 160 |
+
difficulty: str = "all"
|
| 161 |
+
):
|
| 162 |
+
"""
|
| 163 |
+
Download and prepare SWE-bench Lite tasks.
|
| 164 |
+
|
| 165 |
+
Args:
|
| 166 |
+
output_dir: Directory to save tasks
|
| 167 |
+
max_tasks: Maximum number of tasks to download
|
| 168 |
+
difficulty: "easy", "medium", "hard", or "all"
|
| 169 |
+
"""
|
| 170 |
+
print(f"Loading SWE-bench Lite dataset...")
|
| 171 |
+
|
| 172 |
+
try:
|
| 173 |
+
ds = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")
|
| 174 |
+
except Exception as e:
|
| 175 |
+
print(f"Error loading dataset: {e}")
|
| 176 |
+
print("Trying alternative dataset name...")
|
| 177 |
+
ds = load_dataset("swe-bench/swe-bench-lite", split="test")
|
| 178 |
+
|
| 179 |
+
print(f"Loaded {len(ds)} tasks")
|
| 180 |
+
|
| 181 |
+
# Calculate difficulty bounds
|
| 182 |
+
total = len(ds)
|
| 183 |
+
one_third = max(total // 3, 1)
|
| 184 |
+
two_third = max((2 * total) // 3, one_third + 1)
|
| 185 |
+
|
| 186 |
+
difficulty_ranges = {
|
| 187 |
+
"easy": (0, one_third),
|
| 188 |
+
"medium": (one_third, two_third),
|
| 189 |
+
"hard": (two_third, total),
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
# Determine which tasks to download
|
| 193 |
+
if difficulty == "all":
|
| 194 |
+
ranges = list(difficulty_ranges.values())
|
| 195 |
+
indices = []
|
| 196 |
+
for start, end in ranges:
|
| 197 |
+
indices.extend(range(start, min(end, start + max_tasks // 3)))
|
| 198 |
+
else:
|
| 199 |
+
start, end = difficulty_ranges.get(difficulty, (0, total))
|
| 200 |
+
indices = list(range(start, min(end, max_tasks)))
|
| 201 |
+
|
| 202 |
+
# Create output directory
|
| 203 |
+
output_dir = Path(output_dir)
|
| 204 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 205 |
+
|
| 206 |
+
print(f"Preparing {len(indices)} tasks...")
|
| 207 |
+
|
| 208 |
+
success_count = 0
|
| 209 |
+
for i, idx in enumerate(indices):
|
| 210 |
+
try:
|
| 211 |
+
row = ds[idx]
|
| 212 |
+
instance_id = get_instance_id(row)
|
| 213 |
+
|
| 214 |
+
# Create instance directory
|
| 215 |
+
instance_dir = output_dir / instance_id
|
| 216 |
+
instance_dir.mkdir(parents=True, exist_ok=True)
|
| 217 |
+
|
| 218 |
+
# Create files
|
| 219 |
+
create_buggy_file(instance_dir, row)
|
| 220 |
+
create_test_file(instance_dir, row)
|
| 221 |
+
create_metadata_file(instance_dir, row)
|
| 222 |
+
|
| 223 |
+
success_count += 1
|
| 224 |
+
if (i + 1) % 10 == 0:
|
| 225 |
+
print(f" Processed {i + 1}/{len(indices)} tasks...")
|
| 226 |
+
|
| 227 |
+
except Exception as e:
|
| 228 |
+
print(f" Warning: Failed to process task {idx}: {e}")
|
| 229 |
+
continue
|
| 230 |
+
|
| 231 |
+
print(f"\nDone! Prepared {success_count}/{len(indices)} tasks in {output_dir}")
|
| 232 |
+
print(f"Set SWEBENCH_TASKS_ROOT={output_dir.absolute()} to use these tasks.")
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def main():
|
| 236 |
+
parser = argparse.ArgumentParser(description="Prepare SWE-bench Lite tasks")
|
| 237 |
+
parser.add_argument(
|
| 238 |
+
"--max-tasks",
|
| 239 |
+
type=int,
|
| 240 |
+
default=30,
|
| 241 |
+
help="Maximum number of tasks to download (default: 30)"
|
| 242 |
+
)
|
| 243 |
+
parser.add_argument(
|
| 244 |
+
"--difficulty",
|
| 245 |
+
type=str,
|
| 246 |
+
default="all",
|
| 247 |
+
choices=["easy", "medium", "hard", "all"],
|
| 248 |
+
help="Difficulty level to download (default: all)"
|
| 249 |
+
)
|
| 250 |
+
parser.add_argument(
|
| 251 |
+
"--output-dir",
|
| 252 |
+
type=str,
|
| 253 |
+
default=None,
|
| 254 |
+
help="Output directory (default: dataset/swebench_lite_tasks)"
|
| 255 |
+
)
|
| 256 |
+
|
| 257 |
+
args = parser.parse_args()
|
| 258 |
+
|
| 259 |
+
# Determine output directory
|
| 260 |
+
if args.output_dir:
|
| 261 |
+
output_dir = Path(args.output_dir)
|
| 262 |
+
else:
|
| 263 |
+
script_dir = Path(__file__).parent
|
| 264 |
+
output_dir = script_dir / "swebench_lite_tasks"
|
| 265 |
+
|
| 266 |
+
prepare_swebench_tasks(
|
| 267 |
+
output_dir=output_dir,
|
| 268 |
+
max_tasks=args.max_tasks,
|
| 269 |
+
difficulty=args.difficulty
|
| 270 |
+
)
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
if __name__ == "__main__":
|
| 274 |
+
main()
|
rl_code_fix_env/dataset/problem_1/buggy.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
-
def
|
| 2 |
-
"""
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
def safe_divide(a: float, b: float) -> float:
|
| 2 |
+
"""Divide a by b; only return inf for division by zero."""
|
| 3 |
+
try:
|
| 4 |
+
return a / b
|
| 5 |
+
except Exception:
|
| 6 |
+
# BUG: catches unrelated errors too broadly.
|
| 7 |
+
return float("inf")
|
rl_code_fix_env/dataset/problem_1/metadata.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"difficulty": "easy",
|
| 3 |
-
"bug_type": "
|
| 4 |
"expected_steps": 1
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"difficulty": "easy",
|
| 3 |
+
"bug_type": "exception-handling",
|
| 4 |
"expected_steps": 1
|
| 5 |
}
|
rl_code_fix_env/dataset/problem_1/test.py
CHANGED
|
@@ -1,13 +1,17 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
-
class
|
| 6 |
-
def
|
| 7 |
-
self.assertEqual(
|
| 8 |
|
| 9 |
-
def
|
| 10 |
-
self.assertEqual(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_1.buggy import safe_divide
|
| 3 |
|
| 4 |
|
| 5 |
+
class TestSafeDivide(unittest.TestCase):
|
| 6 |
+
def test_normal(self):
|
| 7 |
+
self.assertEqual(safe_divide(8, 2), 4)
|
| 8 |
|
| 9 |
+
def test_zero_division(self):
|
| 10 |
+
self.assertEqual(safe_divide(1, 0), float("inf"))
|
| 11 |
+
|
| 12 |
+
def test_type_error_should_raise(self):
|
| 13 |
+
with self.assertRaises(TypeError):
|
| 14 |
+
safe_divide("1", 1)
|
| 15 |
|
| 16 |
|
| 17 |
if __name__ == "__main__":
|
rl_code_fix_env/dataset/problem_10/buggy.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from
|
| 2 |
|
| 3 |
|
| 4 |
def rotate_90_clockwise(matrix: list[list[int]]) -> list[list[int]]:
|
|
|
|
| 1 |
+
from dataset.problem_10.helpers import transpose
|
| 2 |
|
| 3 |
|
| 4 |
def rotate_90_clockwise(matrix: list[list[int]]) -> list[list[int]]:
|
rl_code_fix_env/dataset/problem_10/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestRotateMatrix(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_10.buggy import rotate_90_clockwise
|
| 3 |
|
| 4 |
|
| 5 |
class TestRotateMatrix(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_11/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestBinarySearch(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_11.buggy import binary_search
|
| 3 |
|
| 4 |
|
| 5 |
class TestBinarySearch(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_12/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestParsePairs(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_12.buggy import parse_pairs
|
| 3 |
|
| 4 |
|
| 5 |
class TestParsePairs(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_13/buggy.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from
|
| 2 |
|
| 3 |
|
| 4 |
def run_ops() -> tuple[int, int]:
|
|
|
|
| 1 |
+
from dataset.problem_13.cache import LRUCache
|
| 2 |
|
| 3 |
|
| 4 |
def run_ops() -> tuple[int, int]:
|
rl_code_fix_env/dataset/problem_13/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestLRU(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_13.buggy import run_ops
|
| 3 |
|
| 4 |
|
| 5 |
class TestLRU(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_14/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestFibonacciRecursive(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_14.buggy import fibonacci_recursive
|
| 3 |
|
| 4 |
|
| 5 |
class TestFibonacciRecursive(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_15/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestIntervalOverlap(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_15.buggy import has_overlap
|
| 3 |
|
| 4 |
|
| 5 |
class TestIntervalOverlap(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_16/buggy.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from
|
| 2 |
|
| 3 |
|
| 4 |
def top_label(scores: dict[str, float]) -> str:
|
|
|
|
| 1 |
+
from dataset.problem_16.helpers import normalize_scores
|
| 2 |
|
| 3 |
|
| 4 |
def top_label(scores: dict[str, float]) -> str:
|
rl_code_fix_env/dataset/problem_16/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestTopLabel(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_16.buggy import top_label
|
| 3 |
|
| 4 |
|
| 5 |
class TestTopLabel(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_17/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestDedupe(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_17.buggy import dedupe_preserve_order
|
| 3 |
|
| 4 |
|
| 5 |
class TestDedupe(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_18/buggy.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from
|
| 2 |
|
| 3 |
|
| 4 |
def moving_average(nums: list[int], window: int) -> list[float]:
|
|
|
|
| 1 |
+
from dataset.problem_18.math_utils import clamp
|
| 2 |
|
| 3 |
|
| 4 |
def moving_average(nums: list[int], window: int) -> list[float]:
|
rl_code_fix_env/dataset/problem_18/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestMovingAverage(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_18.buggy import moving_average
|
| 3 |
|
| 4 |
|
| 5 |
class TestMovingAverage(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_19/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import pytest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
def test_calculate_employee_bonus():
|
| 5 |
employees = [
|
|
|
|
| 1 |
import pytest
|
| 2 |
+
from dataset.problem_19.buggy import calculate_employee_bonus
|
| 3 |
|
| 4 |
def test_calculate_employee_bonus():
|
| 5 |
employees = [
|
rl_code_fix_env/dataset/problem_2/buggy.py
CHANGED
|
@@ -1,5 +1,14 @@
|
|
| 1 |
-
def
|
| 2 |
-
"""
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def binary_search(nums: list[int], target: int) -> int:
|
| 2 |
+
"""Return index of target, or -1 if not found."""
|
| 3 |
+
left, right = 0, len(nums) - 1
|
| 4 |
+
|
| 5 |
+
while left < right:
|
| 6 |
+
mid = (left + right) // 2
|
| 7 |
+
if nums[mid] == target:
|
| 8 |
+
return mid
|
| 9 |
+
if nums[mid] < target:
|
| 10 |
+
left = mid + 1
|
| 11 |
+
else:
|
| 12 |
+
right = mid - 1
|
| 13 |
+
|
| 14 |
+
return -1
|
rl_code_fix_env/dataset/problem_2/metadata.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"difficulty": "
|
| 3 |
-
"bug_type": "
|
| 4 |
"expected_steps": 2
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"difficulty": "medium",
|
| 3 |
+
"bug_type": "boundary-condition",
|
| 4 |
"expected_steps": 2
|
| 5 |
}
|
rl_code_fix_env/dataset/problem_2/test.py
CHANGED
|
@@ -1,13 +1,16 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
-
class
|
| 6 |
-
def
|
| 7 |
-
self.
|
| 8 |
|
| 9 |
-
def
|
| 10 |
-
self.
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_11.buggy import binary_search
|
| 3 |
|
| 4 |
|
| 5 |
+
class TestBinarySearch(unittest.TestCase):
|
| 6 |
+
def test_found_middle(self):
|
| 7 |
+
self.assertEqual(binary_search([1, 3, 5, 7], 5), 2)
|
| 8 |
|
| 9 |
+
def test_found_last(self):
|
| 10 |
+
self.assertEqual(binary_search([1, 3, 5, 7], 7), 3)
|
| 11 |
+
|
| 12 |
+
def test_not_found(self):
|
| 13 |
+
self.assertEqual(binary_search([1, 3, 5, 7], 4), -1)
|
| 14 |
|
| 15 |
|
| 16 |
if __name__ == "__main__":
|
rl_code_fix_env/dataset/problem_20/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import pytest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
def test_analyze_user_activity():
|
| 5 |
logs = [
|
|
|
|
| 1 |
import pytest
|
| 2 |
+
from dataset.problem_20.buggy import analyze_user_activity
|
| 3 |
|
| 4 |
def test_analyze_user_activity():
|
| 5 |
logs = [
|
rl_code_fix_env/dataset/problem_21/test.py
CHANGED
|
@@ -2,7 +2,7 @@ import pytest
|
|
| 2 |
import os
|
| 3 |
import tempfile
|
| 4 |
import json
|
| 5 |
-
from
|
| 6 |
|
| 7 |
def test_process_inventory_data():
|
| 8 |
data = {
|
|
|
|
| 2 |
import os
|
| 3 |
import tempfile
|
| 4 |
import json
|
| 5 |
+
from dataset.problem_21.buggy import process_inventory_data
|
| 6 |
|
| 7 |
def test_process_inventory_data():
|
| 8 |
data = {
|
rl_code_fix_env/dataset/problem_22/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import pytest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
def test_parse_and_validate_emails():
|
| 5 |
emails = [
|
|
|
|
| 1 |
import pytest
|
| 2 |
+
from dataset.problem_22.buggy import parse_and_validate_emails
|
| 3 |
|
| 4 |
def test_parse_and_validate_emails():
|
| 5 |
emails = [
|
rl_code_fix_env/dataset/problem_23/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import pytest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
def test_optimize_portfolio():
|
| 5 |
investments = [
|
|
|
|
| 1 |
import pytest
|
| 2 |
+
from dataset.problem_23.buggy import optimize_portfolio
|
| 3 |
|
| 4 |
def test_optimize_portfolio():
|
| 5 |
investments = [
|
rl_code_fix_env/dataset/problem_3/buggy.py
CHANGED
|
@@ -1,10 +1,37 @@
|
|
| 1 |
-
def
|
| 2 |
-
"""
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def optimize_portfolio(investments: list[dict], budget: float) -> list[dict]:
|
| 2 |
+
"""
|
| 3 |
+
Selects the optimal subset of investments to maximize return within a budget.
|
| 4 |
+
(0-1 Knapsack problem approximation)
|
| 5 |
+
|
| 6 |
+
investments: list of dicts with 'id', 'cost', 'expected_return'
|
| 7 |
+
budget: float, maximum total cost allowed
|
| 8 |
+
|
| 9 |
+
Returns:
|
| 10 |
+
list of chosen investments
|
| 11 |
+
"""
|
| 12 |
+
# Base case checks
|
| 13 |
+
if budget <= 0 or not investments:
|
| 14 |
+
return []
|
| 15 |
+
|
| 16 |
+
# BUG 1: Sorting modifies the original list, should use sorted() or copy
|
| 17 |
+
# BUG 2: Sorting by expected_return ascending instead of return/cost ratio descending
|
| 18 |
+
investments.sort(key=lambda x: x['expected_return'])
|
| 19 |
+
|
| 20 |
+
chosen = []
|
| 21 |
+
current_spent = 0
|
| 22 |
+
|
| 23 |
+
# BUG 3: For loop variable shadowing the loop scope if cost/return variables are misspelled
|
| 24 |
+
for item in investments:
|
| 25 |
+
# BUG 4: item.get() but missing default values if keys are absent, could cause TypeError if None
|
| 26 |
+
cost = item.get('cost')
|
| 27 |
+
ret = item.get('expected_return')
|
| 28 |
+
|
| 29 |
+
# BUG 5: Logic error: checking if current_spent is less than budget, but not checking if adding cost exceeds it
|
| 30 |
+
if current_spent < budget:
|
| 31 |
+
current_spent += cost
|
| 32 |
+
chosen.append(item)
|
| 33 |
+
|
| 34 |
+
# BUG 6: Does not handle the case where adding the item exceeds budget, just blindly adds it if current_spent < budget
|
| 35 |
+
# E.g. budget 100, current 90, item cost 50 -> adds it, total 140
|
| 36 |
+
|
| 37 |
+
return chosen
|
rl_code_fix_env/dataset/problem_3/metadata.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"difficulty": "
|
| 3 |
-
"bug_type": "
|
| 4 |
-
"expected_steps":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"difficulty": "hard",
|
| 3 |
+
"bug_type": "multiple",
|
| 4 |
+
"expected_steps": 5
|
| 5 |
}
|
rl_code_fix_env/dataset/problem_3/test.py
CHANGED
|
@@ -1,15 +1,44 @@
|
|
| 1 |
-
import
|
| 2 |
-
from
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from dataset.problem_23.buggy import optimize_portfolio
|
| 3 |
|
| 4 |
+
def test_optimize_portfolio():
|
| 5 |
+
investments = [
|
| 6 |
+
{'id': 'A', 'cost': 50, 'expected_return': 60}, # ratio 1.2
|
| 7 |
+
{'id': 'B', 'cost': 30, 'expected_return': 45}, # ratio 1.5
|
| 8 |
+
{'id': 'C', 'cost': 20, 'expected_return': 40}, # ratio 2.0
|
| 9 |
+
{'id': 'D', 'cost': 40, 'expected_return': 50}, # ratio 1.25
|
| 10 |
+
{'id': 'E', 'cost': 10, 'expected_return': 15} # ratio 1.5
|
| 11 |
+
]
|
| 12 |
+
|
| 13 |
+
# Original list should not be mutated
|
| 14 |
+
orig_investments = [dict(i) for i in investments]
|
| 15 |
+
|
| 16 |
+
# Budget 50
|
| 17 |
+
# Expected greedy: C (20) -> B (30) -> total cost 50, return 85
|
| 18 |
+
result = optimize_portfolio(investments, 50)
|
| 19 |
+
|
| 20 |
+
assert investments == orig_investments, "Original list was mutated"
|
| 21 |
+
|
| 22 |
+
# Assert correct items selected
|
| 23 |
+
chosen_ids = {item['id'] for item in result}
|
| 24 |
+
assert chosen_ids == {'B', 'C'}, f"Expected B and C, got {chosen_ids}"
|
| 25 |
+
|
| 26 |
+
total_cost = sum(item['cost'] for item in result)
|
| 27 |
+
assert total_cost <= 50
|
| 28 |
+
|
| 29 |
+
def test_budget_exceeded_check():
|
| 30 |
+
investments = [
|
| 31 |
+
{'id': 'A', 'cost': 90, 'expected_return': 100},
|
| 32 |
+
{'id': 'B', 'cost': 50, 'expected_return': 60}
|
| 33 |
+
]
|
| 34 |
+
|
| 35 |
+
# Budget 100
|
| 36 |
+
# Expected: A (cost 90)
|
| 37 |
+
result = optimize_portfolio(investments, 100)
|
| 38 |
+
|
| 39 |
+
chosen_ids = {item['id'] for item in result}
|
| 40 |
+
assert chosen_ids == {'A'}, "Should not include B since total cost would be 140"
|
| 41 |
+
|
| 42 |
+
def test_empty_or_zero_budget():
|
| 43 |
+
assert optimize_portfolio([], 100) == []
|
| 44 |
+
assert optimize_portfolio([{'id': 'A', 'cost': 10, 'expected_return': 20}], 0) == []
|
rl_code_fix_env/dataset/problem_4/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestMergeSorted(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_4.buggy import merge_sorted
|
| 3 |
|
| 4 |
|
| 5 |
class TestMergeSorted(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_5/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestChunkList(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_5.buggy import chunk_list
|
| 3 |
|
| 4 |
|
| 5 |
class TestChunkList(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_6/buggy.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from
|
| 2 |
|
| 3 |
|
| 4 |
def count_unique_words(text: str) -> int:
|
|
|
|
| 1 |
+
from dataset.problem_6.helpers import tokenize
|
| 2 |
|
| 3 |
|
| 4 |
def count_unique_words(text: str) -> int:
|
rl_code_fix_env/dataset/problem_6/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestCountUniqueWords(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_6.buggy import count_unique_words
|
| 3 |
|
| 4 |
|
| 5 |
class TestCountUniqueWords(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_7/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestTopKFrequent(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_7.buggy import top_k_frequent
|
| 3 |
|
| 4 |
|
| 5 |
class TestTopKFrequent(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_8/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestFlattenOneLevel(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_8.buggy import flatten_one_level
|
| 3 |
|
| 4 |
|
| 5 |
class TestFlattenOneLevel(unittest.TestCase):
|
rl_code_fix_env/dataset/problem_9/test.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import unittest
|
| 2 |
-
from
|
| 3 |
|
| 4 |
|
| 5 |
class TestSafeDivide(unittest.TestCase):
|
|
|
|
| 1 |
import unittest
|
| 2 |
+
from dataset.problem_9.buggy import safe_divide
|
| 3 |
|
| 4 |
|
| 5 |
class TestSafeDivide(unittest.TestCase):
|
rl_code_fix_env/dataset/swebench_adapter.py
CHANGED
|
@@ -46,47 +46,93 @@ def get_swebench_task(difficulty: str) -> Dict[str, Any]:
|
|
| 46 |
Expected local layout:
|
| 47 |
dataset/swebench_lite_tasks/<instance_id>/buggy.py
|
| 48 |
dataset/swebench_lite_tasks/<instance_id>/test.py
|
|
|
|
|
|
|
| 49 |
"""
|
| 50 |
diff = (difficulty or "").strip().lower()
|
| 51 |
if diff not in DIFFICULTIES:
|
| 52 |
raise ValueError(f"Invalid difficulty '{difficulty}'. Must be one of {DIFFICULTIES}.")
|
| 53 |
|
| 54 |
-
rows = _load_swebench_lite_rows()
|
| 55 |
-
if not rows:
|
| 56 |
-
raise RuntimeError("SWE-bench Lite split is empty.")
|
| 57 |
-
|
| 58 |
-
bounds = _difficulty_bounds(len(rows))
|
| 59 |
-
start, end = bounds[diff]
|
| 60 |
-
candidates = rows[start:end] if end > start else rows
|
| 61 |
-
|
| 62 |
tasks_root = Path(os.getenv("SWEBENCH_TASKS_ROOT", str(DEFAULT_TASKS_ROOT)))
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
raise FileNotFoundError(
|
| 92 |
"No materialized SWE-bench task workspace found. "
|
|
|
|
| 46 |
Expected local layout:
|
| 47 |
dataset/swebench_lite_tasks/<instance_id>/buggy.py
|
| 48 |
dataset/swebench_lite_tasks/<instance_id>/test.py
|
| 49 |
+
|
| 50 |
+
First tries to load from local files, then falls back to HuggingFace dataset.
|
| 51 |
"""
|
| 52 |
diff = (difficulty or "").strip().lower()
|
| 53 |
if diff not in DIFFICULTIES:
|
| 54 |
raise ValueError(f"Invalid difficulty '{difficulty}'. Must be one of {DIFFICULTIES}.")
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
tasks_root = Path(os.getenv("SWEBENCH_TASKS_ROOT", str(DEFAULT_TASKS_ROOT)))
|
| 57 |
+
|
| 58 |
+
# First, try to load from local materialized tasks
|
| 59 |
+
if tasks_root.exists():
|
| 60 |
+
# Find all instance directories
|
| 61 |
+
instance_dirs = []
|
| 62 |
+
for item in tasks_root.iterdir():
|
| 63 |
+
if item.is_dir() and (item / "buggy.py").exists() and (item / "test.py").exists():
|
| 64 |
+
# Check if this directory matches the difficulty
|
| 65 |
+
if diff in item.name.lower():
|
| 66 |
+
instance_dirs.append(item)
|
| 67 |
+
|
| 68 |
+
if instance_dirs:
|
| 69 |
+
# Sort for deterministic selection
|
| 70 |
+
instance_dirs.sort(key=lambda x: x.name)
|
| 71 |
+
|
| 72 |
+
# Select based on SWEBENCH_INDEX
|
| 73 |
+
preferred_offset = int(os.getenv("SWEBENCH_INDEX", "0"))
|
| 74 |
+
selected_dir = instance_dirs[preferred_offset % len(instance_dirs)]
|
| 75 |
+
|
| 76 |
+
buggy_file = selected_dir / "buggy.py"
|
| 77 |
+
test_file = selected_dir / "test.py"
|
| 78 |
+
metadata_file = selected_dir / "metadata.json"
|
| 79 |
+
|
| 80 |
+
code = buggy_file.read_text(encoding="utf-8")
|
| 81 |
+
|
| 82 |
+
# Load metadata if available
|
| 83 |
+
metadata = {"source": "swebench_lite", "difficulty": diff}
|
| 84 |
+
if metadata_file.exists():
|
| 85 |
+
import json
|
| 86 |
+
metadata = json.loads(metadata_file.read_text(encoding="utf-8"))
|
| 87 |
+
|
| 88 |
+
return {
|
| 89 |
+
"code": code,
|
| 90 |
+
"tests": str(test_file),
|
| 91 |
+
"metadata": metadata,
|
| 92 |
+
"problem_dir": str(selected_dir),
|
| 93 |
+
"problem_id": selected_dir.name,
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
# Fallback: try to load from HuggingFace dataset
|
| 97 |
+
try:
|
| 98 |
+
rows = _load_swebench_lite_rows()
|
| 99 |
+
if not rows:
|
| 100 |
+
raise RuntimeError("SWE-bench Lite split is empty.")
|
| 101 |
+
|
| 102 |
+
bounds = _difficulty_bounds(len(rows))
|
| 103 |
+
start, end = bounds[diff]
|
| 104 |
+
candidates = rows[start:end] if end > start else rows
|
| 105 |
+
|
| 106 |
+
preferred_offset = int(os.getenv("SWEBENCH_INDEX", "0"))
|
| 107 |
+
|
| 108 |
+
# Deterministic scan order with optional offset.
|
| 109 |
+
ordered = candidates[preferred_offset:] + candidates[:preferred_offset]
|
| 110 |
+
for row in ordered:
|
| 111 |
+
row_idx = int(row.get("__index_level_0__", 0))
|
| 112 |
+
instance_id = str(row.get("instance_id", f"row_{row_idx}"))
|
| 113 |
+
for folder in _candidate_dirs(tasks_root, instance_id, row_idx):
|
| 114 |
+
buggy_file = folder / "buggy.py"
|
| 115 |
+
test_file = folder / "test.py"
|
| 116 |
+
if buggy_file.exists() and test_file.exists():
|
| 117 |
+
code = buggy_file.read_text(encoding="utf-8")
|
| 118 |
+
metadata = {
|
| 119 |
+
"source": "swebench_lite",
|
| 120 |
+
"instance_id": instance_id,
|
| 121 |
+
"repo": row.get("repo"),
|
| 122 |
+
"base_commit": row.get("base_commit"),
|
| 123 |
+
"problem_statement": row.get("problem_statement"),
|
| 124 |
+
"difficulty": diff,
|
| 125 |
+
}
|
| 126 |
+
return {
|
| 127 |
+
"code": code,
|
| 128 |
+
"tests": str(test_file),
|
| 129 |
+
"metadata": metadata,
|
| 130 |
+
"problem_dir": str(folder),
|
| 131 |
+
"problem_id": instance_id,
|
| 132 |
+
}
|
| 133 |
+
except Exception as e:
|
| 134 |
+
# If HuggingFace fails, raise the original error about missing local files
|
| 135 |
+
pass
|
| 136 |
|
| 137 |
raise FileNotFoundError(
|
| 138 |
"No materialized SWE-bench task workspace found. "
|
rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/buggy.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from django import forms
|
| 2 |
+
from django.contrib.auth.models import User
|
| 3 |
+
|
| 4 |
+
class UserCreationForm(forms.ModelForm):
    """Form for creating new users.

    Collects username/email through the ModelForm machinery, adds two
    raw password fields, checks that the two entries agree, and hashes
    the password before the user is persisted.
    """

    # Raw password entries; PasswordInput renders masked <input> fields.
    password1 = forms.CharField(widget=forms.PasswordInput)
    password2 = forms.CharField(widget=forms.PasswordInput)

    class Meta:
        model = User
        fields = ('username', 'email')

    def clean(self):
        """Cross-field validation: both password entries must agree.

        Returns the cleaned-data dict; raises forms.ValidationError when
        the two password fields differ.
        """
        cleaned_data = super().clean()
        password1 = cleaned_data.get('password1')
        password2 = cleaned_data.get('password2')

        # BUG: This comparison is case-sensitive but should be case-insensitive
        # NOTE(review): the BUG marker above contradicts the paired test.py,
        # which expects a case-differing pair to FAIL validation (i.e. the
        # case-sensitive compare is exactly what the tests pin) -- confirm
        # which behavior is the intended planted bug for this task.
        if password1 != password2:
            raise forms.ValidationError("Passwords don't match")

        return cleaned_data

    def save(self, commit=True):
        """Persist the user with a properly hashed password.

        With commit=False the unsaved instance is returned (standard
        ModelForm convention) so callers can tweak fields first.
        """
        user = super().save(commit=False)
        # set_password hashes the value; the raw password is never stored.
        user.set_password(self.cleaned_data['password1'])
        if commit:
            user.save()
        return user
|
rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/metadata.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"instance_id": "django__django-11098_easy_0",
|
| 3 |
+
"repo": "django/django",
|
| 4 |
+
"problem_statement": "Fix the user creation form validation error",
|
| 5 |
+
"difficulty": "easy"
|
| 6 |
+
}
|
rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/test.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
from buggy import UserCreationForm
|
| 3 |
+
|
| 4 |
+
class TestUserCreationForm(unittest.TestCase):
    """Validation tests for UserCreationForm's password handling."""

    @staticmethod
    def _bound_form(first_entry, second_entry):
        # Bind the form to a complete payload, varying only the passwords.
        payload = {
            'username': 'testuser',
            'email': 'test@example.com',
            'password1': first_entry,
            'password2': second_entry,
        }
        return UserCreationForm(data=payload)

    def test_password_matching(self):
        """Identical password entries must pass validation."""
        form = self._bound_form('TestPass123', 'TestPass123')
        self.assertTrue(form.is_valid())

    def test_password_mismatch(self):
        """Entries differing only by case must be rejected with a password error."""
        form = self._bound_form('TestPass123', 'testpass123')
        self.assertFalse(form.is_valid())
        self.assertIn('passwords', str(form.errors).lower())
|
rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/buggy.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from datetime import datetime, date
|
| 3 |
+
|
| 4 |
+
class JSONEncoder(json.JSONEncoder):
    """Custom JSON encoder for Flask."""

    def default(self, obj):
        """Fallback serializer for values json cannot encode natively.

        Returns an ISO-8601 string for date values; otherwise defers to
        the base class, which raises TypeError for unsupported types.
        """
        # BUG: Missing handling for datetime objects
        # NOTE(review): datetime is a subclass of date, so the isinstance
        # check below already matches datetime values and isoformat()
        # emits the time component too -- confirm this marker describes a
        # bug the paired tests can actually catch.
        if isinstance(obj, date):
            return obj.isoformat()
        return super().default(obj)
|
| 12 |
+
|
| 13 |
+
def to_json(obj):
    """Convert object to JSON string.

    Serializes via json.dumps with the module's JSONEncoder so date
    values become ISO strings instead of raising TypeError.
    """
    return json.dumps(obj, cls=JSONEncoder)
|
rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/metadata.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"instance_id": "flask__flask-1048_easy_1",
|
| 3 |
+
"repo": "pallets/flask",
|
| 4 |
+
"problem_statement": "Fix JSON encoding for datetime objects",
|
| 5 |
+
"difficulty": "easy"
|
| 6 |
+
}
|
rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/test.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
from buggy import to_json
|
| 4 |
+
|
| 5 |
+
class TestJSONEncoding(unittest.TestCase):
    """Checks that the custom encoder serializes date/datetime values."""

    def test_encode_datetime(self):
        """Test that datetime objects are properly encoded."""
        dt = datetime(2024, 1, 15, 10, 30, 0)
        result = to_json({'timestamp': dt})
        # isoformat() output carries both the date and time halves.
        self.assertIn('2024-01-15', result)
        self.assertIn('10:30:00', result)

    def test_encode_date(self):
        """Test that date objects are properly encoded."""
        # Fix: the module only imported `datetime`, so `date` was an
        # unbound name here and this test crashed with NameError before
        # ever exercising the encoder. Import it locally to keep the
        # module's top-level imports untouched.
        from datetime import date
        d = date(2024, 1, 15)
        result = to_json({'date': d})
        self.assertIn('2024-01-15', result)
|
rl_code_fix_env/dataset/swebench_lite_tasks/numpy__numpy-10825_medium_0/buggy.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
|
| 3 |
+
def concatenate_arrays(*arrays):
    """Concatenate multiple arrays along axis 0.

    Args:
        *arrays: numpy arrays (or array-likes) to join in order.

    Returns:
        The concatenated array, or an empty array when called with no
        arguments.
    """
    if not arrays:
        return np.array([])

    # BUG: Should handle None arrays gracefully
    # NOTE(review): a None element reaches np.concatenate unchanged and
    # will raise there -- presumably the intended fix is to skip or
    # reject None inputs; confirm against this task's tests.
    # Folds pairwise, re-concatenating the growing result each step.
    result = arrays[0]
    for arr in arrays[1:]:
        result = np.concatenate([result, arr])

    return result
|