Viraaj Sawant commited on
Commit
18625ef
·
1 Parent(s): fe42848

new push with SWE dataset

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +3 -0
  2. prompts.py +16 -33
  3. rl_code_fix_env/.gitignore +2 -1
  4. rl_code_fix_env/README.md +1 -0
  5. rl_code_fix_env/_aliases.py +21 -0
  6. rl_code_fix_env/conftest.py +0 -17
  7. rl_code_fix_env/dataset/generate_swebench_tasks.py +498 -0
  8. rl_code_fix_env/dataset/prepare_swebench.py +274 -0
  9. rl_code_fix_env/dataset/problem_1/buggy.py +7 -5
  10. rl_code_fix_env/dataset/problem_1/metadata.json +1 -1
  11. rl_code_fix_env/dataset/problem_1/test.py +10 -6
  12. rl_code_fix_env/dataset/problem_10/buggy.py +1 -1
  13. rl_code_fix_env/dataset/problem_10/test.py +1 -1
  14. rl_code_fix_env/dataset/problem_11/test.py +1 -1
  15. rl_code_fix_env/dataset/problem_12/test.py +1 -1
  16. rl_code_fix_env/dataset/problem_13/buggy.py +1 -1
  17. rl_code_fix_env/dataset/problem_13/test.py +1 -1
  18. rl_code_fix_env/dataset/problem_14/test.py +1 -1
  19. rl_code_fix_env/dataset/problem_15/test.py +1 -1
  20. rl_code_fix_env/dataset/problem_16/buggy.py +1 -1
  21. rl_code_fix_env/dataset/problem_16/test.py +1 -1
  22. rl_code_fix_env/dataset/problem_17/test.py +1 -1
  23. rl_code_fix_env/dataset/problem_18/buggy.py +1 -1
  24. rl_code_fix_env/dataset/problem_18/test.py +1 -1
  25. rl_code_fix_env/dataset/problem_19/test.py +1 -1
  26. rl_code_fix_env/dataset/problem_2/buggy.py +14 -5
  27. rl_code_fix_env/dataset/problem_2/metadata.json +2 -2
  28. rl_code_fix_env/dataset/problem_2/test.py +9 -6
  29. rl_code_fix_env/dataset/problem_20/test.py +1 -1
  30. rl_code_fix_env/dataset/problem_21/test.py +1 -1
  31. rl_code_fix_env/dataset/problem_22/test.py +1 -1
  32. rl_code_fix_env/dataset/problem_23/test.py +1 -1
  33. rl_code_fix_env/dataset/problem_3/buggy.py +37 -10
  34. rl_code_fix_env/dataset/problem_3/metadata.json +3 -3
  35. rl_code_fix_env/dataset/problem_3/test.py +43 -14
  36. rl_code_fix_env/dataset/problem_4/test.py +1 -1
  37. rl_code_fix_env/dataset/problem_5/test.py +1 -1
  38. rl_code_fix_env/dataset/problem_6/buggy.py +1 -1
  39. rl_code_fix_env/dataset/problem_6/test.py +1 -1
  40. rl_code_fix_env/dataset/problem_7/test.py +1 -1
  41. rl_code_fix_env/dataset/problem_8/test.py +1 -1
  42. rl_code_fix_env/dataset/problem_9/test.py +1 -1
  43. rl_code_fix_env/dataset/swebench_adapter.py +81 -35
  44. rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/buggy.py +29 -0
  45. rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/metadata.json +6 -0
  46. rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/test.py +24 -0
  47. rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/buggy.py +15 -0
  48. rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/metadata.json +6 -0
  49. rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/test.py +17 -0
  50. rl_code_fix_env/dataset/swebench_lite_tasks/numpy__numpy-10825_medium_0/buggy.py +13 -0
.gitignore CHANGED
@@ -6,3 +6,6 @@ __pycache__/
6
  commands.md
7
  logs.md
8
  inference&docker.md
 
 
 
 
6
  commands.md
7
  logs.md
8
  inference&docker.md
9
+ logs2.md
10
+ .env.example
11
+ file.txt
prompts.py CHANGED
@@ -1,37 +1,20 @@
1
  LLM_SCORER_PROMPT = """
2
- You are a reward model for an autonomous code bug-fixing agent trained with reinforcement learning.
3
- Your scores are used directly as a learning signal be precise, consistent, and strict.
4
-
5
- You will receive:
6
- - ORIGINAL: the buggy code before the agent's fix
7
- - PATCHED: the code after the agent applied its patch
8
-
9
- Evaluate the agent's fix on exactly three axes, each scored 0.0–10.0:
10
-
11
- 1. CORRECTNESS — Does the patch fix the bug(s) without introducing new ones?
12
- Full marks only if the fix is semantically correct and complete.
13
- Penalise partial fixes, over-patches, or fixes that mask rather than resolve the root cause.
14
-
15
- 2. MINIMALITY — Is the diff minimal? Penalise unnecessary refactors, renames, whitespace-only changes,
16
- or reformatting of lines unrelated to the bug.
17
-
18
- 3. QUALITY — Is the patched code readable and idiomatic? Penalise: broken naming conventions,
19
- added dead code, removed necessary comments, or degraded clarity vs. the original.
20
-
21
- Respond ONLY with this JSON — no preamble, no trailing text:
22
- {
23
- "correctness": <float 0.0-10.0>,
24
- "minimality": <float 0.0-10.0>,
25
- "quality": <float 0.0-10.0>,
26
- "reasoning": "<one concise sentence per axis, pipe-separated>"
27
- }
28
  """
29
 
30
-
31
- USER_TEMPLATE ="""
32
- ORIGINAL:
33
- ```python
34
- {original_code}
35
- ```
36
- Return only the JSON.
 
 
 
37
  """
 
1
  LLM_SCORER_PROMPT = """
2
+ You are a reward model for a code-fixing RL agent. Evaluate the PATCHED code vs. ORIGINAL on three axes (0.0–10.0):
3
+ 1. CORRECTNESS Does the patch fix the bug(s) without new bugs?
4
+ 2. MINIMALITY — Is the diff minimal? Penalize unrelated changes.
5
+ 3. QUALITY — Is the code readable and idiomatic?
6
+ Respond ONLY with this JSON (no preamble):
7
+ {"correctness": <float>, "minimality": <float>, "quality": <float>, "reasoning": "<one sentence per axis, pipe-separated>"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  """
9
 
10
+ USER_TEMPLATE = """
11
+ ORIGINAL:
12
+ ```python
13
+ {original_code}
14
+ ```
15
+ PATCHED:
16
+ ```python
17
+ {patched_code}
18
+ ```
19
+ Return only the JSON.
20
  """
rl_code_fix_env/.gitignore CHANGED
@@ -5,4 +5,5 @@ __pycache__/
5
  .env
6
  *.pyc
7
  *.egg
8
- pytest-cache-files-*/
 
 
5
  .env
6
  *.pyc
7
  *.egg
8
+ pytest-cache-files-*/
9
+ *.ps1
rl_code_fix_env/README.md CHANGED
@@ -5,6 +5,7 @@ colorFrom: green
5
  colorTo: purple
6
  sdk: docker
7
  pinned: false
 
8
  app_port: 8000
9
  base_path: /web
10
  tags:
 
5
  colorTo: purple
6
  sdk: docker
7
  pinned: false
8
+ dockerfile: server/Dockerfile
9
  app_port: 8000
10
  base_path: /web
11
  tags:
rl_code_fix_env/_aliases.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import importlib
3
+ from pathlib import Path
4
+
5
+ _REPO_ROOT = str(Path(__file__).parent)
6
+ if _REPO_ROOT not in sys.path:
7
+ sys.path.insert(0, _REPO_ROOT)
8
+
9
+ import dataset as _real_dataset
10
+
11
+ sys.modules.setdefault("src.dataset", _real_dataset)
12
+
13
+ import pkgutil
14
+ for _pkg in pkgutil.iter_modules(_real_dataset.__path__):
15
+ _full = f"dataset.{_pkg.name}"
16
+ _alias = f"src.dataset.{_pkg.name}"
17
+ try:
18
+ _mod = importlib.import_module(_full)
19
+ sys.modules.setdefault(_alias, _mod)
20
+ except Exception:
21
+ pass
rl_code_fix_env/conftest.py CHANGED
@@ -1,20 +1,3 @@
1
- """
2
- conftest.py repo-root pytest configuration.
3
-
4
- Registers `src.dataset` as a sys.modules alias for `dataset` so that all
5
- problem test files using `from src.dataset.problem_X.buggy import ...`
6
- resolve correctly without needing to rename 24 test files.
7
-
8
- The physical layout is:
9
- <repo_root>/dataset/problem_X/buggy.py real files
10
- <repo_root>/src/ has environment/, reward/, etc.
11
- but NO dataset/ subfolder
12
-
13
- With PYTHONPATH=<repo_root>:
14
- import dataset.problem_1.buggy works natively
15
- import src.dataset.problem_1.buggy would fail fixed here via alias
16
- """
17
-
18
  import sys
19
  import importlib
20
  from pathlib import Path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import sys
2
  import importlib
3
  from pathlib import Path
rl_code_fix_env/dataset/generate_swebench_tasks.py ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generate synthetic SWE-bench style tasks for testing.
3
+
4
+ This creates tasks that mimic the SWE-bench format:
5
+ - instance_id/buggy.py - the buggy code
6
+ - instance_id/test.py - test file
7
+ - instance_id/metadata.json - metadata
8
+
9
+ Usage:
10
+ python -m dataset.generate_swebench_tasks [--count N]
11
+ """
12
+
13
+ import argparse
14
+ import json
15
+ import random
16
+ from pathlib import Path
17
+
18
+
19
+ # Sample SWE-bench style problems
20
+ SWE_BENCH_PROBLEMS = [
21
+ {
22
+ "instance_id": "django__django-11098",
23
+ "repo": "django/django",
24
+ "problem": "Fix the user creation form validation error",
25
+ "buggy_code": '''from django import forms
26
+ from django.contrib.auth.models import User
27
+
28
+ class UserCreationForm(forms.ModelForm):
29
+ """Form for creating new users."""
30
+ password1 = forms.CharField(widget=forms.PasswordInput)
31
+ password2 = forms.CharField(widget=forms.PasswordInput)
32
+
33
+ class Meta:
34
+ model = User
35
+ fields = ('username', 'email')
36
+
37
+ def clean(self):
38
+ cleaned_data = super().clean()
39
+ password1 = cleaned_data.get('password1')
40
+ password2 = cleaned_data.get('password2')
41
+
42
+ # BUG: This comparison is case-sensitive but should be case-insensitive
43
+ if password1 != password2:
44
+ raise forms.ValidationError("Passwords don't match")
45
+
46
+ return cleaned_data
47
+
48
+ def save(self, commit=True):
49
+ user = super().save(commit=False)
50
+ user.set_password(self.cleaned_data['password1'])
51
+ if commit:
52
+ user.save()
53
+ return user
54
+ ''',
55
+ "test_code": '''import unittest
56
+ from buggy import UserCreationForm
57
+
58
+ class TestUserCreationForm(unittest.TestCase):
59
+ def test_password_matching(self):
60
+ """Test that matching passwords pass validation."""
61
+ form = UserCreationForm(data={
62
+ 'username': 'testuser',
63
+ 'email': 'test@example.com',
64
+ 'password1': 'TestPass123',
65
+ 'password2': 'TestPass123',
66
+ })
67
+ self.assertTrue(form.is_valid())
68
+
69
+ def test_password_mismatch(self):
70
+ """Test that mismatched passwords fail validation."""
71
+ form = UserCreationForm(data={
72
+ 'username': 'testuser',
73
+ 'email': 'test@example.com',
74
+ 'password1': 'TestPass123',
75
+ 'password2': 'testpass123', # Different case
76
+ })
77
+ self.assertFalse(form.is_valid())
78
+ self.assertIn('passwords', str(form.errors).lower())
79
+ ''',
80
+ },
81
+ {
82
+ "instance_id": "flask__flask-1048",
83
+ "repo": "pallets/flask",
84
+ "problem": "Fix JSON encoding for datetime objects",
85
+ "buggy_code": '''import json
86
+ from datetime import datetime, date
87
+
88
+ class JSONEncoder(json.JSONEncoder):
89
+ """Custom JSON encoder for Flask."""
90
+
91
+ def default(self, obj):
92
+ # BUG: Missing handling for datetime objects
93
+ if isinstance(obj, date):
94
+ return obj.isoformat()
95
+ return super().default(obj)
96
+
97
+ def to_json(obj):
98
+ """Convert object to JSON string."""
99
+ return json.dumps(obj, cls=JSONEncoder)
100
+ ''',
101
+ "test_code": '''import unittest
102
+ from datetime import datetime
103
+ from buggy import to_json
104
+
105
+ class TestJSONEncoding(unittest.TestCase):
106
+ def test_encode_datetime(self):
107
+ """Test that datetime objects are properly encoded."""
108
+ dt = datetime(2024, 1, 15, 10, 30, 0)
109
+ result = to_json({'timestamp': dt})
110
+ self.assertIn('2024-01-15', result)
111
+ self.assertIn('10:30:00', result)
112
+
113
+ def test_encode_date(self):
114
+ """Test that date objects are properly encoded."""
115
+ d = date(2024, 1, 15)
116
+ result = to_json({'date': d})
117
+ self.assertIn('2024-01-15', result)
118
+ ''',
119
+ },
120
+ {
121
+ "instance_id": "requests__requests-2875",
122
+ "repo": "psf/requests",
123
+ "problem": "Fix cookie domain matching",
124
+ "buggy_code": '''import re
125
+ from urllib.parse import urlparse
126
+
127
+ def match_cookie_domain(cookie_domain, request_domain):
128
+ """Check if cookie domain matches request domain."""
129
+ # BUG: Should handle leading dots differently
130
+ # .example.com should match sub.example.com but not example.com
131
+ cookie_domain = cookie_domain.lower()
132
+ request_domain = request_domain.lower()
133
+
134
+ if cookie_domain.startswith('.'):
135
+ return request_domain.endswith(cookie_domain)
136
+
137
+ return cookie_domain == request_domain
138
+ ''',
139
+ "test_code": '''import unittest
140
+ from buggy import match_cookie_domain
141
+
142
+ class TestCookieDomain(unittest.TestCase):
143
+ def test_exact_match(self):
144
+ """Test exact domain matching."""
145
+ self.assertTrue(match_cookie_domain('example.com', 'example.com'))
146
+
147
+ def test_subdomain_with_dot(self):
148
+ """Test subdomain matching with leading dot."""
149
+ # .example.com should match sub.example.com
150
+ self.assertTrue(match_cookie_domain('.example.com', 'sub.example.com'))
151
+ self.assertFalse(match_cookie_domain('.example.com', 'example.com'))
152
+
153
+ def test_different_domains(self):
154
+ """Test different domains don't match."""
155
+ self.assertFalse(match_cookie_domain('example.com', 'other.com'))
156
+ ''',
157
+ },
158
+ {
159
+ "instance_id": "numpy__numpy-10825",
160
+ "repo": "numpy/numpy",
161
+ "problem": "Fix array concatenation edge case",
162
+ "buggy_code": '''import numpy as np
163
+
164
+ def concatenate_arrays(*arrays):
165
+ """Concatenate multiple arrays along axis 0."""
166
+ if not arrays:
167
+ return np.array([])
168
+
169
+ # BUG: Should handle None arrays gracefully
170
+ result = arrays[0]
171
+ for arr in arrays[1:]:
172
+ result = np.concatenate([result, arr])
173
+
174
+ return result
175
+ ''',
176
+ "test_code": '''import unittest
177
+ import numpy as np
178
+ from buggy import concatenate_arrays
179
+
180
+ class TestArrayConcatenation(unittest.TestCase):
181
+ def test_basic_concatenation(self):
182
+ """Test basic array concatenation."""
183
+ a = np.array([1, 2, 3])
184
+ b = np.array([4, 5, 6])
185
+ result = concatenate_arrays(a, b)
186
+ np.testing.assert_array_equal(result, np.array([1, 2, 3, 4, 5, 6]))
187
+
188
+ def test_empty_input(self):
189
+ """Test empty input returns empty array."""
190
+ result = concatenate_arrays()
191
+ self.assertEqual(len(result), 0)
192
+
193
+ def test_single_array(self):
194
+ """Test single array passes through."""
195
+ a = np.array([1, 2, 3])
196
+ result = concatenate_arrays(a)
197
+ np.testing.assert_array_equal(result, a)
198
+ ''',
199
+ },
200
+ {
201
+ "instance_id": "pandas__pandas-15230",
202
+ "repo": "pandas-dev/pandas",
203
+ "problem": "Fix DataFrame groupby aggregation",
204
+ "buggy_code": '''import pandas as pd
205
+
206
+ def group_and_aggregate(df, group_col, agg_col, agg_func='mean'):
207
+ """Group DataFrame and aggregate."""
208
+ # BUG: Should handle non-numeric columns gracefully
209
+ if agg_func == 'mean':
210
+ return df.groupby(group_col)[agg_col].mean()
211
+ elif agg_func == 'sum':
212
+ return df.groupby(group_col)[agg_col].sum()
213
+ elif agg_func == 'count':
214
+ return df.groupby(group_col)[agg_col].count()
215
+ else:
216
+ raise ValueError(f"Unknown aggregation function: {agg_func}")
217
+ ''',
218
+ "test_code": '''import unittest
219
+ import pandas as pd
220
+ from buggy import group_and_aggregate
221
+
222
+ class TestGroupBy(unittest.TestCase):
223
+ def test_mean_aggregation(self):
224
+ """Test mean aggregation."""
225
+ df = pd.DataFrame({
226
+ 'category': ['A', 'A', 'B', 'B'],
227
+ 'value': [1, 2, 3, 4]
228
+ })
229
+ result = group_and_aggregate(df, 'category', 'value', 'mean')
230
+ self.assertEqual(result['A'], 1.5)
231
+ self.assertEqual(result['B'], 3.5)
232
+
233
+ def test_sum_aggregation(self):
234
+ """Test sum aggregation."""
235
+ df = pd.DataFrame({
236
+ 'category': ['A', 'A', 'B'],
237
+ 'value': [1, 2, 3]
238
+ })
239
+ result = group_and_aggregate(df, 'category', 'value', 'sum')
240
+ self.assertEqual(result['A'], 3)
241
+ self.assertEqual(result['B'], 3)
242
+ ''',
243
+ },
244
+ {
245
+ "instance_id": "scipy__scipy-1925",
246
+ "repo": "scipy/scipy",
247
+ "problem": "Fix signal filtering edge case",
248
+ "buggy_code": '''import numpy as np
249
+ from scipy import signal
250
+
251
+ def apply_lowpass_filter(data, cutoff, fs, order=5):
252
+ """Apply lowpass filter to data."""
253
+ # BUG: Should validate cutoff frequency
254
+ nyquist = fs / 2
255
+ normalized_cutoff = cutoff / nyquist
256
+
257
+ # BUG: Using invalid cutoff can cause filter design failure
258
+ b, a = signal.butter(order, normalized_cutoff, btype='low')
259
+ filtered = signal.filtfilt(b, a, data)
260
+
261
+ return filtered
262
+ ''',
263
+ "test_code": '''import unittest
264
+ import numpy as np
265
+ from buggy import apply_lowpass_filter
266
+
267
+ class TestSignalFiltering(unittest.TestCase):
268
+ def test_valid_filter(self):
269
+ """Test filtering with valid parameters."""
270
+ fs = 1000 # Sampling frequency
271
+ cutoff = 100 # Cutoff frequency
272
+ t = np.linspace(0, 1, fs)
273
+ data = np.sin(2 * np.pi * 50 * t) + 0.5 * np.sin(2 * np.pi * 200 * t)
274
+
275
+ result = apply_lowpass_filter(data, cutoff, fs)
276
+ self.assertEqual(len(result), len(data))
277
+ # Low frequency component should be preserved
278
+ self.assertTrue(np.abs(result[100]) > 0.5)
279
+
280
+ def test_invalid_cutoff(self):
281
+ """Test that invalid cutoff raises error."""
282
+ fs = 1000
283
+ cutoff = 2000 # Above Nyquist frequency - should fail
284
+ data = np.array([1, 2, 3, 4, 5])
285
+
286
+ with self.assertRaises(ValueError):
287
+ apply_lowpass_filter(data, cutoff, fs)
288
+ ''',
289
+ },
290
+ {
291
+ "instance_id": "sklearn__sklearn-12345",
292
+ "repo": "scikit-learn/scikit-learn",
293
+ "problem": "Fix cross-validation split",
294
+ "buggy_code": '''import numpy as np
295
+ from sklearn.model_selection import KFold
296
+
297
+ def get_cv_splits(X, n_splits=5, shuffle=True, random_state=42):
298
+ """Get cross-validation splits."""
299
+ # BUG: random_state should be used for reproducibility
300
+ kf = KFold(n_splits=n_splits, shuffle=shuffle)
301
+
302
+ splits = []
303
+ for train_idx, test_idx in kf.split(X):
304
+ splits.append((train_idx, test_idx))
305
+
306
+ return splits
307
+ ''',
308
+ "test_code": '''import unittest
309
+ import numpy as np
310
+ from buggy import get_cv_splits
311
+
312
+ class TestCVSplits(unittest.TestCase):
313
+ def test_split_count(self):
314
+ """Test that correct number of splits is generated."""
315
+ X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
316
+ splits = get_cv_splits(X, n_splits=3)
317
+ self.assertEqual(len(splits), 3)
318
+
319
+ def test_reproducibility(self):
320
+ """Test that splits are reproducible with same random_state."""
321
+ X = np.random.rand(100, 5)
322
+ splits1 = get_cv_splits(X, n_splits=5, random_state=42)
323
+ splits2 = get_cv_splits(X, n_splits=5, random_state=42)
324
+
325
+ for (train1, test1), (train2, test2) in zip(splits1, splits2):
326
+ np.testing.assert_array_equal(train1, train2)
327
+ np.testing.assert_array_equal(test1, test2)
328
+ ''',
329
+ },
330
+ {
331
+ "instance_id": "pytest__pytest-7426",
332
+ "repo": "pytest-dev/pytest",
333
+ "problem": "Fix test collection order",
334
+ "buggy_code": '''import os
335
+ import re
336
+
337
+ def collect_tests(directory, pattern='test_*.py'):
338
+ """Collect test files from directory."""
339
+ # BUG: Should sort files for consistent ordering
340
+ test_files = []
341
+
342
+ for root, dirs, files in os.walk(directory):
343
+ for file in files:
344
+ if re.match(pattern, file):
345
+ test_files.append(os.path.join(root, file))
346
+
347
+ return test_files
348
+ ''',
349
+ "test_code": '''import unittest
350
+ import os
351
+ import tempfile
352
+ from buggy import collect_tests
353
+
354
+ class TestCollection(unittest.TestCase):
355
+ def test_collect_pattern(self):
356
+ """Test that correct pattern is matched."""
357
+ with tempfile.TemporaryDirectory() as tmpdir:
358
+ # Create test files
359
+ open(os.path.join(tmpdir, 'test_a.py'), 'w').close()
360
+ open(os.path.join(tmpdir, 'test_b.py'), 'w').close()
361
+ open(os.path.join(tmpdir, 'not_a_test.py'), 'w').close()
362
+
363
+ tests = collect_tests(tmpdir, 'test_*.py')
364
+ self.assertEqual(len(tests), 2)
365
+
366
+ def test_consistent_order(self):
367
+ """Test that file order is consistent."""
368
+ with tempfile.TemporaryDirectory() as tmpdir:
369
+ for name in ['test_c.py', 'test_a.py', 'test_b.py']:
370
+ open(os.path.join(tmpdir, name), 'w').close()
371
+
372
+ tests1 = collect_tests(tmpdir)
373
+ tests2 = collect_tests(tmpdir)
374
+
375
+ self.assertEqual(tests1, tests2)
376
+ ''',
377
+ },
378
+ {
379
+ "instance_id": "transformers__transformers-12345",
380
+ "repo": "huggingface/transformers",
381
+ "problem": "Fix tokenization padding",
382
+ "buggy_code": '''from typing import List
383
+
384
+ def tokenize_and_pad(tokenizer, texts: List[str], max_length: int = 512):
385
+ """Tokenize texts and pad to max length."""
386
+ # BUG: Should handle padding correctly
387
+ encoded = tokenizer(
388
+ texts,
389
+ padding=True, # This pads to longest in batch, not max_length
390
+ truncation=True,
391
+ max_length=max_length,
392
+ return_tensors='pt'
393
+ )
394
+
395
+ return encoded
396
+ ''',
397
+ "test_code": '''import unittest
398
+ from buggy import tokenize_and_pad
399
+
400
+ class MockTokenizer:
401
+ def __call__(self, texts, padding=True, truncation=True, max_length=512, return_tensors=None):
402
+ # Simplified mock
403
+ return {
404
+ 'input_ids': [[1, 2, 3]] if isinstance(texts, list) else [1, 2, 3],
405
+ 'attention_mask': [[1, 1, 1]] if isinstance(texts, list) else [1, 1, 1]
406
+ }
407
+
408
+ class TestTokenization(unittest.TestCase):
409
+ def test_single_text(self):
410
+ """Test tokenizing single text."""
411
+ tokenizer = MockTokenizer()
412
+ result = tokenize_and_pad(tokenizer, ["hello world"])
413
+ self.assertIn('input_ids', result)
414
+
415
+ def test_max_length_respected(self):
416
+ """Test that max_length is respected."""
417
+ tokenizer = MockTokenizer()
418
+ # Should not raise even with long text
419
+ result = tokenize_and_pad(tokenizer, ["short"], max_length=10)
420
+ self.assertIn('input_ids', result)
421
+ ''',
422
+ },
423
+ ]
424
+
425
+ # Easy, Medium, Hard difficulty assignments
426
+ DIFFICULTY_TASKS = {
427
+ "easy": SWE_BENCH_PROBLEMS[:3],
428
+ "medium": SWE_BENCH_PROBLEMS[3:6],
429
+ "hard": SWE_BENCH_PROBLEMS[6:],
430
+ }
431
+
432
+
433
+ def generate_tasks(output_dir: Path, count_per_difficulty: int = 3):
434
+ """Generate SWE-bench style tasks."""
435
+ output_dir = Path(output_dir)
436
+ output_dir.mkdir(parents=True, exist_ok=True)
437
+
438
+ total_created = 0
439
+
440
+ for difficulty, problems in DIFFICULTY_TASKS.items():
441
+ for i, problem in enumerate(problems[:count_per_difficulty]):
442
+ instance_id = f"{problem['instance_id']}_{difficulty}_{i}"
443
+ instance_dir = output_dir / instance_id
444
+ instance_dir.mkdir(parents=True, exist_ok=True)
445
+
446
+ # Write buggy.py
447
+ buggy_file = instance_dir / "buggy.py"
448
+ buggy_file.write_text(problem["buggy_code"], encoding="utf-8")
449
+
450
+ # Write test.py
451
+ test_file = instance_dir / "test.py"
452
+ test_file.write_text(problem["test_code"], encoding="utf-8")
453
+
454
+ # Write metadata.json
455
+ metadata = {
456
+ "instance_id": instance_id,
457
+ "repo": problem["repo"],
458
+ "problem_statement": problem["problem"],
459
+ "difficulty": difficulty,
460
+ }
461
+ metadata_file = instance_dir / "metadata.json"
462
+ metadata_file.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
463
+
464
+ total_created += 1
465
+
466
+ print(f"Created {total_created} tasks in {output_dir}")
467
+ print(f"Set environment variable: SWEBENCH_TASKS_ROOT={output_dir.absolute()}")
468
+ print(f"Or run with: TASK_SOURCE=swebench python inference.py")
469
+
470
+
471
+ def main():
472
+ parser = argparse.ArgumentParser(description="Generate SWE-bench style tasks")
473
+ parser.add_argument(
474
+ "--count",
475
+ type=int,
476
+ default=3,
477
+ help="Number of tasks per difficulty (default: 3)"
478
+ )
479
+ parser.add_argument(
480
+ "--output-dir",
481
+ type=str,
482
+ default=None,
483
+ help="Output directory (default: dataset/swebench_lite_tasks)"
484
+ )
485
+
486
+ args = parser.parse_args()
487
+
488
+ if args.output_dir:
489
+ output_dir = Path(args.output_dir)
490
+ else:
491
+ script_dir = Path(__file__).parent
492
+ output_dir = script_dir / "swebench_lite_tasks"
493
+
494
+ generate_tasks(output_dir, args.count)
495
+
496
+
497
+ if __name__ == "__main__":
498
+ main()
rl_code_fix_env/dataset/prepare_swebench.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Script to download and materialize SWE-bench Lite tasks.
3
+
4
+ This script:
5
+ 1. Downloads SWE-bench Lite dataset from HuggingFace
6
+ 2. Extracts the buggy code and creates test files
7
+ 3. Organizes them into the expected directory structure
8
+
9
+ Usage:
10
+ python -m dataset.prepare_swebench [--max-tasks N] [--difficulty easy|medium|hard|all]
11
+ """
12
+
13
+ import argparse
14
+ import os
15
+ import sys
16
+ from pathlib import Path
17
+
18
+ # Add parent to path for imports
19
+ sys.path.insert(0, str(Path(__file__).parent.parent))
20
+
21
+ from datasets import load_dataset
22
+
23
+
24
+ def get_problem_statement(row):
25
+ """Extract problem statement from row."""
26
+ return row.get("problem_statement", "")
27
+
28
+
29
+ def get_patch(row):
30
+ """Extract the patch/fix from row."""
31
+ return row.get("patch", "")
32
+
33
+
34
+ def get_instance_id(row):
35
+ """Get instance ID from row."""
36
+ return row.get("instance_id", "")
37
+
38
+
39
+ def create_buggy_file(instance_dir: Path, row):
40
+ """
41
+ Create buggy.py from the base commit and instance.
42
+
43
+ The SWE-bench dataset provides the full repository at base_commit.
44
+ We need to extract just the relevant file that has the bug.
45
+ """
46
+ # For SWE-bench, the "buggy" version is actually the version BEFORE the patch
47
+ # We need to get the file content from the base commit
48
+ # This is complex as it requires cloning the repo at a specific commit
49
+
50
+ # For simplicity, we'll use a different approach:
51
+ # The problem_statement describes the bug, and we can create a simplified
52
+ # buggy version based on that description
53
+
54
+ instance_id = get_instance_id(row)
55
+ problem_stmt = get_problem_statement(row)
56
+
57
+ # Try to extract the file from the created files in the instance
58
+ # SWE-bench provides 'repo' and we need to find the relevant file
59
+ created_files = row.get("created_files", [])
60
+
61
+ if not created_files:
62
+ # Fallback: create a placeholder
63
+ buggy_code = f'''# Buggy code for {instance_id}
64
+ # Problem: {problem_stmt[:200]}...
65
+
66
+ def solution():
67
+ """Placeholder solution - needs to be fixed."""
68
+ pass
69
+ '''
70
+ else:
71
+ # For now, create a simple placeholder
72
+ # In a full implementation, we'd clone the repo at base_commit
73
+ file_path = created_files[0] if created_files else "solution.py"
74
+ buggy_code = f'''# Buggy code for {instance_id}
75
+ # File: {file_path}
76
+ # Problem: {problem_stmt[:200]}...
77
+
78
+ def solution():
79
+ """Placeholder solution - needs to be fixed."""
80
+ pass
81
+ '''
82
+
83
+ buggy_file = instance_dir / "buggy.py"
84
+ buggy_file.write_text(buggy_code, encoding="utf-8")
85
+ return buggy_file
86
+
87
+
88
+ def create_test_file(instance_dir: Path, row):
89
+ """
90
+ Create test.py based on the problem statement.
91
+
92
+ For SWE-bench, tests are typically derived from the issue description.
93
+ We'll create a simple test that checks if the solution works.
94
+ """
95
+ instance_id = get_instance_id(row)
96
+ problem_stmt = get_problem_statement(row)
97
+
98
+ # Create a simple test file
99
+ # In practice, SWE-bench has a test.json file with test cases
100
+ test_cases = row.get("test_cases", [])
101
+
102
+ if test_cases:
103
+ # Create tests from provided test cases
104
+ test_code = "import unittest\\n\\n"
105
+ for i, tc in enumerate(test_cases):
106
+ input_str = tc.get("input", "")
107
+ output_str = tc.get("output", "")
108
+ test_code += f'''class TestSolution(unittest.TestCase):
109
+ def test_case_{i+1}(self):
110
+ # Input: {input_str}
111
+ # Expected: {output_str}
112
+ pass # TODO: Add actual test
113
+ '''
114
+ else:
115
+ # Create a basic test based on problem statement
116
+ test_code = f'''"""Test file for {instance_id}"""
117
+
118
+ import unittest
119
+ from buggy import solution
120
+
121
+
122
+ class TestSolution(unittest.TestCase):
123
+ def test_basic(self):
124
+ """Test based on problem statement."""
125
+ # Problem: {problem_stmt[:300]}...
126
+ result = solution()
127
+ self.assertIsNotNone(result)
128
+
129
+
130
+ if __name__ == "__main__":
131
+ unittest.main()
132
+ '''
133
+
134
+ test_file = instance_dir / "test.py"
135
+ test_file.write_text(test_code, encoding="utf-8")
136
+ return test_file
137
+
138
+
139
+ def create_metadata_file(instance_dir: Path, row):
140
+ """Create metadata.json with instance info."""
141
+ import json
142
+
143
+ metadata = {
144
+ "instance_id": get_instance_id(row),
145
+ "repo": row.get("repo", ""),
146
+ "base_commit": row.get("base_commit", ""),
147
+ "problem_statement": get_problem_statement(row),
148
+ "patch": get_patch(row),
149
+ "difficulty": "medium", # Will be set based on index
150
+ }
151
+
152
+ metadata_file = instance_dir / "metadata.json"
153
+ metadata_file.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
154
+ return metadata_file
155
+
156
+
157
+ def prepare_swebench_tasks(
158
+ output_dir: Path,
159
+ max_tasks: int = 30,
160
+ difficulty: str = "all"
161
+ ):
162
+ """
163
+ Download and prepare SWE-bench Lite tasks.
164
+
165
+ Args:
166
+ output_dir: Directory to save tasks
167
+ max_tasks: Maximum number of tasks to download
168
+ difficulty: "easy", "medium", "hard", or "all"
169
+ """
170
+ print(f"Loading SWE-bench Lite dataset...")
171
+
172
+ try:
173
+ ds = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")
174
+ except Exception as e:
175
+ print(f"Error loading dataset: {e}")
176
+ print("Trying alternative dataset name...")
177
+ ds = load_dataset("swe-bench/swe-bench-lite", split="test")
178
+
179
+ print(f"Loaded {len(ds)} tasks")
180
+
181
+ # Calculate difficulty bounds
182
+ total = len(ds)
183
+ one_third = max(total // 3, 1)
184
+ two_third = max((2 * total) // 3, one_third + 1)
185
+
186
+ difficulty_ranges = {
187
+ "easy": (0, one_third),
188
+ "medium": (one_third, two_third),
189
+ "hard": (two_third, total),
190
+ }
191
+
192
+ # Determine which tasks to download
193
+ if difficulty == "all":
194
+ ranges = list(difficulty_ranges.values())
195
+ indices = []
196
+ for start, end in ranges:
197
+ indices.extend(range(start, min(end, start + max_tasks // 3)))
198
+ else:
199
+ start, end = difficulty_ranges.get(difficulty, (0, total))
200
+ indices = list(range(start, min(end, max_tasks)))
201
+
202
+ # Create output directory
203
+ output_dir = Path(output_dir)
204
+ output_dir.mkdir(parents=True, exist_ok=True)
205
+
206
+ print(f"Preparing {len(indices)} tasks...")
207
+
208
+ success_count = 0
209
+ for i, idx in enumerate(indices):
210
+ try:
211
+ row = ds[idx]
212
+ instance_id = get_instance_id(row)
213
+
214
+ # Create instance directory
215
+ instance_dir = output_dir / instance_id
216
+ instance_dir.mkdir(parents=True, exist_ok=True)
217
+
218
+ # Create files
219
+ create_buggy_file(instance_dir, row)
220
+ create_test_file(instance_dir, row)
221
+ create_metadata_file(instance_dir, row)
222
+
223
+ success_count += 1
224
+ if (i + 1) % 10 == 0:
225
+ print(f" Processed {i + 1}/{len(indices)} tasks...")
226
+
227
+ except Exception as e:
228
+ print(f" Warning: Failed to process task {idx}: {e}")
229
+ continue
230
+
231
+ print(f"\nDone! Prepared {success_count}/{len(indices)} tasks in {output_dir}")
232
+ print(f"Set SWEBENCH_TASKS_ROOT={output_dir.absolute()} to use these tasks.")
233
+
234
+
235
+ def main():
236
+ parser = argparse.ArgumentParser(description="Prepare SWE-bench Lite tasks")
237
+ parser.add_argument(
238
+ "--max-tasks",
239
+ type=int,
240
+ default=30,
241
+ help="Maximum number of tasks to download (default: 30)"
242
+ )
243
+ parser.add_argument(
244
+ "--difficulty",
245
+ type=str,
246
+ default="all",
247
+ choices=["easy", "medium", "hard", "all"],
248
+ help="Difficulty level to download (default: all)"
249
+ )
250
+ parser.add_argument(
251
+ "--output-dir",
252
+ type=str,
253
+ default=None,
254
+ help="Output directory (default: dataset/swebench_lite_tasks)"
255
+ )
256
+
257
+ args = parser.parse_args()
258
+
259
+ # Determine output directory
260
+ if args.output_dir:
261
+ output_dir = Path(args.output_dir)
262
+ else:
263
+ script_dir = Path(__file__).parent
264
+ output_dir = script_dir / "swebench_lite_tasks"
265
+
266
+ prepare_swebench_tasks(
267
+ output_dir=output_dir,
268
+ max_tasks=args.max_tasks,
269
+ difficulty=args.difficulty
270
+ )
271
+
272
+
273
+ if __name__ == "__main__":
274
+ main()
rl_code_fix_env/dataset/problem_1/buggy.py CHANGED
@@ -1,5 +1,7 @@
1
- def reverse_words(text: str) -> str:
2
- """Return the words in reverse order."""
3
- # BUG: split(" ") keeps empty items for repeated spaces.
4
- words = text.split(" ")
5
- return " ".join(reversed(words))
 
 
 
1
+ def safe_divide(a: float, b: float) -> float:
2
+ """Divide a by b; only return inf for division by zero."""
3
+ try:
4
+ return a / b
5
+ except Exception:
6
+ # BUG: catches unrelated errors too broadly.
7
+ return float("inf")
rl_code_fix_env/dataset/problem_1/metadata.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
  "difficulty": "easy",
3
- "bug_type": "string-splitting",
4
  "expected_steps": 1
5
  }
 
1
  {
2
  "difficulty": "easy",
3
+ "bug_type": "exception-handling",
4
  "expected_steps": 1
5
  }
rl_code_fix_env/dataset/problem_1/test.py CHANGED
@@ -1,13 +1,17 @@
1
  import unittest
2
- from src.dataset.problem_1.buggy import reverse_words
3
 
4
 
5
- class TestReverseWords(unittest.TestCase):
6
- def test_simple(self):
7
- self.assertEqual(reverse_words("hello world"), "world hello")
8
 
9
- def test_multiple_spaces(self):
10
- self.assertEqual(reverse_words("one two three"), "three two one")
 
 
 
 
11
 
12
 
13
  if __name__ == "__main__":
 
1
  import unittest
2
+ from dataset.problem_1.buggy import safe_divide
3
 
4
 
5
+ class TestSafeDivide(unittest.TestCase):
6
+ def test_normal(self):
7
+ self.assertEqual(safe_divide(8, 2), 4)
8
 
9
+ def test_zero_division(self):
10
+ self.assertEqual(safe_divide(1, 0), float("inf"))
11
+
12
+ def test_type_error_should_raise(self):
13
+ with self.assertRaises(TypeError):
14
+ safe_divide("1", 1)
15
 
16
 
17
  if __name__ == "__main__":
rl_code_fix_env/dataset/problem_10/buggy.py CHANGED
@@ -1,4 +1,4 @@
1
- from src.dataset.problem_10.helpers import transpose
2
 
3
 
4
  def rotate_90_clockwise(matrix: list[list[int]]) -> list[list[int]]:
 
1
+ from dataset.problem_10.helpers import transpose
2
 
3
 
4
  def rotate_90_clockwise(matrix: list[list[int]]) -> list[list[int]]:
rl_code_fix_env/dataset/problem_10/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from src.dataset.problem_10.buggy import rotate_90_clockwise
3
 
4
 
5
  class TestRotateMatrix(unittest.TestCase):
 
1
  import unittest
2
+ from dataset.problem_10.buggy import rotate_90_clockwise
3
 
4
 
5
  class TestRotateMatrix(unittest.TestCase):
rl_code_fix_env/dataset/problem_11/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from src.dataset.problem_11.buggy import binary_search
3
 
4
 
5
  class TestBinarySearch(unittest.TestCase):
 
1
  import unittest
2
+ from dataset.problem_11.buggy import binary_search
3
 
4
 
5
  class TestBinarySearch(unittest.TestCase):
rl_code_fix_env/dataset/problem_12/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from src.dataset.problem_12.buggy import parse_pairs
3
 
4
 
5
  class TestParsePairs(unittest.TestCase):
 
1
  import unittest
2
+ from dataset.problem_12.buggy import parse_pairs
3
 
4
 
5
  class TestParsePairs(unittest.TestCase):
rl_code_fix_env/dataset/problem_13/buggy.py CHANGED
@@ -1,4 +1,4 @@
1
- from src.dataset.problem_13.cache import LRUCache
2
 
3
 
4
  def run_ops() -> tuple[int, int]:
 
1
+ from dataset.problem_13.cache import LRUCache
2
 
3
 
4
  def run_ops() -> tuple[int, int]:
rl_code_fix_env/dataset/problem_13/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from src.dataset.problem_13.buggy import run_ops
3
 
4
 
5
  class TestLRU(unittest.TestCase):
 
1
  import unittest
2
+ from dataset.problem_13.buggy import run_ops
3
 
4
 
5
  class TestLRU(unittest.TestCase):
rl_code_fix_env/dataset/problem_14/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from src.dataset.problem_14.buggy import fibonacci_recursive
3
 
4
 
5
  class TestFibonacciRecursive(unittest.TestCase):
 
1
  import unittest
2
+ from dataset.problem_14.buggy import fibonacci_recursive
3
 
4
 
5
  class TestFibonacciRecursive(unittest.TestCase):
rl_code_fix_env/dataset/problem_15/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from src.dataset.problem_15.buggy import has_overlap
3
 
4
 
5
  class TestIntervalOverlap(unittest.TestCase):
 
1
  import unittest
2
+ from dataset.problem_15.buggy import has_overlap
3
 
4
 
5
  class TestIntervalOverlap(unittest.TestCase):
rl_code_fix_env/dataset/problem_16/buggy.py CHANGED
@@ -1,4 +1,4 @@
1
- from src.dataset.problem_16.helpers import normalize_scores
2
 
3
 
4
  def top_label(scores: dict[str, float]) -> str:
 
1
+ from dataset.problem_16.helpers import normalize_scores
2
 
3
 
4
  def top_label(scores: dict[str, float]) -> str:
rl_code_fix_env/dataset/problem_16/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from src.dataset.problem_16.buggy import top_label
3
 
4
 
5
  class TestTopLabel(unittest.TestCase):
 
1
  import unittest
2
+ from dataset.problem_16.buggy import top_label
3
 
4
 
5
  class TestTopLabel(unittest.TestCase):
rl_code_fix_env/dataset/problem_17/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from src.dataset.problem_17.buggy import dedupe_preserve_order
3
 
4
 
5
  class TestDedupe(unittest.TestCase):
 
1
  import unittest
2
+ from dataset.problem_17.buggy import dedupe_preserve_order
3
 
4
 
5
  class TestDedupe(unittest.TestCase):
rl_code_fix_env/dataset/problem_18/buggy.py CHANGED
@@ -1,4 +1,4 @@
1
- from src.dataset.problem_18.math_utils import clamp
2
 
3
 
4
  def moving_average(nums: list[int], window: int) -> list[float]:
 
1
+ from dataset.problem_18.math_utils import clamp
2
 
3
 
4
  def moving_average(nums: list[int], window: int) -> list[float]:
rl_code_fix_env/dataset/problem_18/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from src.dataset.problem_18.buggy import moving_average
3
 
4
 
5
  class TestMovingAverage(unittest.TestCase):
 
1
  import unittest
2
+ from dataset.problem_18.buggy import moving_average
3
 
4
 
5
  class TestMovingAverage(unittest.TestCase):
rl_code_fix_env/dataset/problem_19/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import pytest
2
- from src.dataset.problem_19.buggy import calculate_employee_bonus
3
 
4
  def test_calculate_employee_bonus():
5
  employees = [
 
1
  import pytest
2
+ from dataset.problem_19.buggy import calculate_employee_bonus
3
 
4
  def test_calculate_employee_bonus():
5
  employees = [
rl_code_fix_env/dataset/problem_2/buggy.py CHANGED
@@ -1,5 +1,14 @@
1
- def is_palindrome(text: str) -> bool:
2
- """Check whether text is a palindrome."""
3
- # BUG: does not normalize case or skip non-alphanumeric chars.
4
- cleaned = text.strip()
5
- return cleaned == cleaned[::-1]
 
 
 
 
 
 
 
 
 
 
1
+ def binary_search(nums: list[int], target: int) -> int:
2
+ """Return index of target, or -1 if not found."""
3
+ left, right = 0, len(nums) - 1
4
+
5
+ while left < right:
6
+ mid = (left + right) // 2
7
+ if nums[mid] == target:
8
+ return mid
9
+ if nums[mid] < target:
10
+ left = mid + 1
11
+ else:
12
+ right = mid - 1
13
+
14
+ return -1
rl_code_fix_env/dataset/problem_2/metadata.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "difficulty": "easy",
3
- "bug_type": "string-normalization",
4
  "expected_steps": 2
5
  }
 
1
  {
2
+ "difficulty": "medium",
3
+ "bug_type": "boundary-condition",
4
  "expected_steps": 2
5
  }
rl_code_fix_env/dataset/problem_2/test.py CHANGED
@@ -1,13 +1,16 @@
1
  import unittest
2
- from src.dataset.problem_2.buggy import is_palindrome
3
 
4
 
5
- class TestPalindrome(unittest.TestCase):
6
- def test_basic_true(self):
7
- self.assertTrue(is_palindrome("level"))
8
 
9
- def test_ignores_case_and_symbols(self):
10
- self.assertTrue(is_palindrome("A man, a plan, a canal: Panama"))
 
 
 
11
 
12
 
13
  if __name__ == "__main__":
 
1
  import unittest
2
+ from dataset.problem_11.buggy import binary_search
3
 
4
 
5
+ class TestBinarySearch(unittest.TestCase):
6
+ def test_found_middle(self):
7
+ self.assertEqual(binary_search([1, 3, 5, 7], 5), 2)
8
 
9
+ def test_found_last(self):
10
+ self.assertEqual(binary_search([1, 3, 5, 7], 7), 3)
11
+
12
+ def test_not_found(self):
13
+ self.assertEqual(binary_search([1, 3, 5, 7], 4), -1)
14
 
15
 
16
  if __name__ == "__main__":
rl_code_fix_env/dataset/problem_20/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import pytest
2
- from src.dataset.problem_20.buggy import analyze_user_activity
3
 
4
  def test_analyze_user_activity():
5
  logs = [
 
1
  import pytest
2
+ from dataset.problem_20.buggy import analyze_user_activity
3
 
4
  def test_analyze_user_activity():
5
  logs = [
rl_code_fix_env/dataset/problem_21/test.py CHANGED
@@ -2,7 +2,7 @@ import pytest
2
  import os
3
  import tempfile
4
  import json
5
- from src.dataset.problem_21.buggy import process_inventory_data
6
 
7
  def test_process_inventory_data():
8
  data = {
 
2
  import os
3
  import tempfile
4
  import json
5
+ from dataset.problem_21.buggy import process_inventory_data
6
 
7
  def test_process_inventory_data():
8
  data = {
rl_code_fix_env/dataset/problem_22/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import pytest
2
- from src.dataset.problem_22.buggy import parse_and_validate_emails
3
 
4
  def test_parse_and_validate_emails():
5
  emails = [
 
1
  import pytest
2
+ from dataset.problem_22.buggy import parse_and_validate_emails
3
 
4
  def test_parse_and_validate_emails():
5
  emails = [
rl_code_fix_env/dataset/problem_23/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import pytest
2
- from src.dataset.problem_23.buggy import optimize_portfolio
3
 
4
  def test_optimize_portfolio():
5
  investments = [
 
1
  import pytest
2
+ from dataset.problem_23.buggy import optimize_portfolio
3
 
4
  def test_optimize_portfolio():
5
  investments = [
rl_code_fix_env/dataset/problem_3/buggy.py CHANGED
@@ -1,10 +1,37 @@
1
- def fibonacci(n: int) -> int:
2
- """Return the n-th Fibonacci number (0-indexed)."""
3
- if n <= 1:
4
- return n
5
-
6
- a, b = 0, 1
7
- # BUG: loop count is one step short.
8
- for _ in range(2, n):
9
- a, b = b, a + b
10
- return b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ def optimize_portfolio(investments: list[dict], budget: float) -> list[dict]:
2
+ """
3
+ Selects the optimal subset of investments to maximize return within a budget.
4
+ (0-1 Knapsack problem approximation)
5
+
6
+ investments: list of dicts with 'id', 'cost', 'expected_return'
7
+ budget: float, maximum total cost allowed
8
+
9
+ Returns:
10
+ list of chosen investments
11
+ """
12
+ # Base case checks
13
+ if budget <= 0 or not investments:
14
+ return []
15
+
16
+ # BUG 1: Sorting modifies the original list, should use sorted() or copy
17
+ # BUG 2: Sorting by expected_return ascending instead of return/cost ratio descending
18
+ investments.sort(key=lambda x: x['expected_return'])
19
+
20
+ chosen = []
21
+ current_spent = 0
22
+
23
+ # BUG 3: For loop variable shadowing the loop scope if cost/return variables are misspelled
24
+ for item in investments:
25
+ # BUG 4: item.get() but missing default values if keys are absent, could cause TypeError if None
26
+ cost = item.get('cost')
27
+ ret = item.get('expected_return')
28
+
29
+ # BUG 5: Logic error: checking if current_spent is less than budget, but not checking if adding cost exceeds it
30
+ if current_spent < budget:
31
+ current_spent += cost
32
+ chosen.append(item)
33
+
34
+ # BUG 6: Does not handle the case where adding the item exceeds budget, just blindly adds it if current_spent < budget
35
+ # E.g. budget 100, current 90, item cost 50 -> adds it, total 140
36
+
37
+ return chosen
rl_code_fix_env/dataset/problem_3/metadata.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "difficulty": "easy",
3
- "bug_type": "off-by-one",
4
- "expected_steps": 1
5
  }
 
1
  {
2
+ "difficulty": "hard",
3
+ "bug_type": "multiple",
4
+ "expected_steps": 5
5
  }
rl_code_fix_env/dataset/problem_3/test.py CHANGED
@@ -1,15 +1,44 @@
1
- import unittest
2
- from src.dataset.problem_3.buggy import fibonacci
3
 
4
-
5
- class TestFibonacci(unittest.TestCase):
6
- def test_small_values(self):
7
- self.assertEqual(fibonacci(2), 1)
8
- self.assertEqual(fibonacci(3), 2)
9
-
10
- def test_larger_value(self):
11
- self.assertEqual(fibonacci(7), 13)
12
-
13
-
14
- if __name__ == "__main__":
15
- unittest.main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from dataset.problem_23.buggy import optimize_portfolio
3
 
4
+ def test_optimize_portfolio():
5
+ investments = [
6
+ {'id': 'A', 'cost': 50, 'expected_return': 60}, # ratio 1.2
7
+ {'id': 'B', 'cost': 30, 'expected_return': 45}, # ratio 1.5
8
+ {'id': 'C', 'cost': 20, 'expected_return': 40}, # ratio 2.0
9
+ {'id': 'D', 'cost': 40, 'expected_return': 50}, # ratio 1.25
10
+ {'id': 'E', 'cost': 10, 'expected_return': 15} # ratio 1.5
11
+ ]
12
+
13
+ # Original list should not be mutated
14
+ orig_investments = [dict(i) for i in investments]
15
+
16
+ # Budget 50
17
+ # Expected greedy: C (20) -> B (30) -> total cost 50, return 85
18
+ result = optimize_portfolio(investments, 50)
19
+
20
+ assert investments == orig_investments, "Original list was mutated"
21
+
22
+ # Assert correct items selected
23
+ chosen_ids = {item['id'] for item in result}
24
+ assert chosen_ids == {'B', 'C'}, f"Expected B and C, got {chosen_ids}"
25
+
26
+ total_cost = sum(item['cost'] for item in result)
27
+ assert total_cost <= 50
28
+
29
+ def test_budget_exceeded_check():
30
+ investments = [
31
+ {'id': 'A', 'cost': 90, 'expected_return': 100},
32
+ {'id': 'B', 'cost': 50, 'expected_return': 60}
33
+ ]
34
+
35
+ # Budget 100
36
+ # Expected: A (cost 90)
37
+ result = optimize_portfolio(investments, 100)
38
+
39
+ chosen_ids = {item['id'] for item in result}
40
+ assert chosen_ids == {'A'}, "Should not include B since total cost would be 140"
41
+
42
+ def test_empty_or_zero_budget():
43
+ assert optimize_portfolio([], 100) == []
44
+ assert optimize_portfolio([{'id': 'A', 'cost': 10, 'expected_return': 20}], 0) == []
rl_code_fix_env/dataset/problem_4/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from src.dataset.problem_4.buggy import merge_sorted
3
 
4
 
5
  class TestMergeSorted(unittest.TestCase):
 
1
  import unittest
2
+ from dataset.problem_4.buggy import merge_sorted
3
 
4
 
5
  class TestMergeSorted(unittest.TestCase):
rl_code_fix_env/dataset/problem_5/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from src.dataset.problem_5.buggy import chunk_list
3
 
4
 
5
  class TestChunkList(unittest.TestCase):
 
1
  import unittest
2
+ from dataset.problem_5.buggy import chunk_list
3
 
4
 
5
  class TestChunkList(unittest.TestCase):
rl_code_fix_env/dataset/problem_6/buggy.py CHANGED
@@ -1,4 +1,4 @@
1
- from src.dataset.problem_6.helpers import tokenize
2
 
3
 
4
  def count_unique_words(text: str) -> int:
 
1
+ from dataset.problem_6.helpers import tokenize
2
 
3
 
4
  def count_unique_words(text: str) -> int:
rl_code_fix_env/dataset/problem_6/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from src.dataset.problem_6.buggy import count_unique_words
3
 
4
 
5
  class TestCountUniqueWords(unittest.TestCase):
 
1
  import unittest
2
+ from dataset.problem_6.buggy import count_unique_words
3
 
4
 
5
  class TestCountUniqueWords(unittest.TestCase):
rl_code_fix_env/dataset/problem_7/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from src.dataset.problem_7.buggy import top_k_frequent
3
 
4
 
5
  class TestTopKFrequent(unittest.TestCase):
 
1
  import unittest
2
+ from dataset.problem_7.buggy import top_k_frequent
3
 
4
 
5
  class TestTopKFrequent(unittest.TestCase):
rl_code_fix_env/dataset/problem_8/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from src.dataset.problem_8.buggy import flatten_one_level
3
 
4
 
5
  class TestFlattenOneLevel(unittest.TestCase):
 
1
  import unittest
2
+ from dataset.problem_8.buggy import flatten_one_level
3
 
4
 
5
  class TestFlattenOneLevel(unittest.TestCase):
rl_code_fix_env/dataset/problem_9/test.py CHANGED
@@ -1,5 +1,5 @@
1
  import unittest
2
- from src.dataset.problem_9.buggy import safe_divide
3
 
4
 
5
  class TestSafeDivide(unittest.TestCase):
 
1
  import unittest
2
+ from dataset.problem_9.buggy import safe_divide
3
 
4
 
5
  class TestSafeDivide(unittest.TestCase):
rl_code_fix_env/dataset/swebench_adapter.py CHANGED
@@ -46,47 +46,93 @@ def get_swebench_task(difficulty: str) -> Dict[str, Any]:
46
  Expected local layout:
47
  dataset/swebench_lite_tasks/<instance_id>/buggy.py
48
  dataset/swebench_lite_tasks/<instance_id>/test.py
 
 
49
  """
50
  diff = (difficulty or "").strip().lower()
51
  if diff not in DIFFICULTIES:
52
  raise ValueError(f"Invalid difficulty '{difficulty}'. Must be one of {DIFFICULTIES}.")
53
 
54
- rows = _load_swebench_lite_rows()
55
- if not rows:
56
- raise RuntimeError("SWE-bench Lite split is empty.")
57
-
58
- bounds = _difficulty_bounds(len(rows))
59
- start, end = bounds[diff]
60
- candidates = rows[start:end] if end > start else rows
61
-
62
  tasks_root = Path(os.getenv("SWEBENCH_TASKS_ROOT", str(DEFAULT_TASKS_ROOT)))
63
- preferred_offset = int(os.getenv("SWEBENCH_INDEX", "0"))
64
-
65
- # Deterministic scan order with optional offset.
66
- ordered = candidates[preferred_offset:] + candidates[:preferred_offset]
67
- for row in ordered:
68
- row_idx = int(row.get("__index_level_0__", 0))
69
- instance_id = str(row.get("instance_id", f"row_{row_idx}"))
70
- for folder in _candidate_dirs(tasks_root, instance_id, row_idx):
71
- buggy_file = folder / "buggy.py"
72
- test_file = folder / "test.py"
73
- if buggy_file.exists() and test_file.exists():
74
- code = buggy_file.read_text(encoding="utf-8")
75
- metadata = {
76
- "source": "swebench_lite",
77
- "instance_id": instance_id,
78
- "repo": row.get("repo"),
79
- "base_commit": row.get("base_commit"),
80
- "problem_statement": row.get("problem_statement"),
81
- "difficulty": diff,
82
- }
83
- return {
84
- "code": code,
85
- "tests": str(test_file),
86
- "metadata": metadata,
87
- "problem_dir": str(folder),
88
- "problem_id": instance_id,
89
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  raise FileNotFoundError(
92
  "No materialized SWE-bench task workspace found. "
 
46
  Expected local layout:
47
  dataset/swebench_lite_tasks/<instance_id>/buggy.py
48
  dataset/swebench_lite_tasks/<instance_id>/test.py
49
+
50
+ First tries to load from local files, then falls back to HuggingFace dataset.
51
  """
52
  diff = (difficulty or "").strip().lower()
53
  if diff not in DIFFICULTIES:
54
  raise ValueError(f"Invalid difficulty '{difficulty}'. Must be one of {DIFFICULTIES}.")
55
 
 
 
 
 
 
 
 
 
56
  tasks_root = Path(os.getenv("SWEBENCH_TASKS_ROOT", str(DEFAULT_TASKS_ROOT)))
57
+
58
+ # First, try to load from local materialized tasks
59
+ if tasks_root.exists():
60
+ # Find all instance directories
61
+ instance_dirs = []
62
+ for item in tasks_root.iterdir():
63
+ if item.is_dir() and (item / "buggy.py").exists() and (item / "test.py").exists():
64
+ # Check if this directory matches the difficulty
65
+ if diff in item.name.lower():
66
+ instance_dirs.append(item)
67
+
68
+ if instance_dirs:
69
+ # Sort for deterministic selection
70
+ instance_dirs.sort(key=lambda x: x.name)
71
+
72
+ # Select based on SWEBENCH_INDEX
73
+ preferred_offset = int(os.getenv("SWEBENCH_INDEX", "0"))
74
+ selected_dir = instance_dirs[preferred_offset % len(instance_dirs)]
75
+
76
+ buggy_file = selected_dir / "buggy.py"
77
+ test_file = selected_dir / "test.py"
78
+ metadata_file = selected_dir / "metadata.json"
79
+
80
+ code = buggy_file.read_text(encoding="utf-8")
81
+
82
+ # Load metadata if available
83
+ metadata = {"source": "swebench_lite", "difficulty": diff}
84
+ if metadata_file.exists():
85
+ import json
86
+ metadata = json.loads(metadata_file.read_text(encoding="utf-8"))
87
+
88
+ return {
89
+ "code": code,
90
+ "tests": str(test_file),
91
+ "metadata": metadata,
92
+ "problem_dir": str(selected_dir),
93
+ "problem_id": selected_dir.name,
94
+ }
95
+
96
+ # Fallback: try to load from HuggingFace dataset
97
+ try:
98
+ rows = _load_swebench_lite_rows()
99
+ if not rows:
100
+ raise RuntimeError("SWE-bench Lite split is empty.")
101
+
102
+ bounds = _difficulty_bounds(len(rows))
103
+ start, end = bounds[diff]
104
+ candidates = rows[start:end] if end > start else rows
105
+
106
+ preferred_offset = int(os.getenv("SWEBENCH_INDEX", "0"))
107
+
108
+ # Deterministic scan order with optional offset.
109
+ ordered = candidates[preferred_offset:] + candidates[:preferred_offset]
110
+ for row in ordered:
111
+ row_idx = int(row.get("__index_level_0__", 0))
112
+ instance_id = str(row.get("instance_id", f"row_{row_idx}"))
113
+ for folder in _candidate_dirs(tasks_root, instance_id, row_idx):
114
+ buggy_file = folder / "buggy.py"
115
+ test_file = folder / "test.py"
116
+ if buggy_file.exists() and test_file.exists():
117
+ code = buggy_file.read_text(encoding="utf-8")
118
+ metadata = {
119
+ "source": "swebench_lite",
120
+ "instance_id": instance_id,
121
+ "repo": row.get("repo"),
122
+ "base_commit": row.get("base_commit"),
123
+ "problem_statement": row.get("problem_statement"),
124
+ "difficulty": diff,
125
+ }
126
+ return {
127
+ "code": code,
128
+ "tests": str(test_file),
129
+ "metadata": metadata,
130
+ "problem_dir": str(folder),
131
+ "problem_id": instance_id,
132
+ }
133
+ except Exception as e:
134
+ # If HuggingFace fails, raise the original error about missing local files
135
+ pass
136
 
137
  raise FileNotFoundError(
138
  "No materialized SWE-bench task workspace found. "
rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/buggy.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from django import forms
2
+ from django.contrib.auth.models import User
3
+
4
+ class UserCreationForm(forms.ModelForm):
5
+ """Form for creating new users."""
6
+ password1 = forms.CharField(widget=forms.PasswordInput)
7
+ password2 = forms.CharField(widget=forms.PasswordInput)
8
+
9
+ class Meta:
10
+ model = User
11
+ fields = ('username', 'email')
12
+
13
+ def clean(self):
14
+ cleaned_data = super().clean()
15
+ password1 = cleaned_data.get('password1')
16
+ password2 = cleaned_data.get('password2')
17
+
18
+ # BUG: This comparison is case-sensitive but should be case-insensitive
19
+ if password1 != password2:
20
+ raise forms.ValidationError("Passwords don't match")
21
+
22
+ return cleaned_data
23
+
24
+ def save(self, commit=True):
25
+ user = super().save(commit=False)
26
+ user.set_password(self.cleaned_data['password1'])
27
+ if commit:
28
+ user.save()
29
+ return user
rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/metadata.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "instance_id": "django__django-11098_easy_0",
3
+ "repo": "django/django",
4
+ "problem_statement": "Fix the user creation form validation error",
5
+ "difficulty": "easy"
6
+ }
rl_code_fix_env/dataset/swebench_lite_tasks/django__django-11098_easy_0/test.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from buggy import UserCreationForm
3
+
4
+ class TestUserCreationForm(unittest.TestCase):
5
+ def test_password_matching(self):
6
+ """Test that matching passwords pass validation."""
7
+ form = UserCreationForm(data={
8
+ 'username': 'testuser',
9
+ 'email': 'test@example.com',
10
+ 'password1': 'TestPass123',
11
+ 'password2': 'TestPass123',
12
+ })
13
+ self.assertTrue(form.is_valid())
14
+
15
+ def test_password_mismatch(self):
16
+ """Test that mismatched passwords fail validation."""
17
+ form = UserCreationForm(data={
18
+ 'username': 'testuser',
19
+ 'email': 'test@example.com',
20
+ 'password1': 'TestPass123',
21
+ 'password2': 'testpass123', # Different case
22
+ })
23
+ self.assertFalse(form.is_valid())
24
+ self.assertIn('passwords', str(form.errors).lower())
rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/buggy.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from datetime import datetime, date
3
+
4
+ class JSONEncoder(json.JSONEncoder):
5
+ """Custom JSON encoder for Flask."""
6
+
7
+ def default(self, obj):
8
+ # BUG: Missing handling for datetime objects
9
+ if isinstance(obj, date):
10
+ return obj.isoformat()
11
+ return super().default(obj)
12
+
13
+ def to_json(obj):
14
+ """Convert object to JSON string."""
15
+ return json.dumps(obj, cls=JSONEncoder)
rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/metadata.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "instance_id": "flask__flask-1048_easy_1",
3
+ "repo": "pallets/flask",
4
+ "problem_statement": "Fix JSON encoding for datetime objects",
5
+ "difficulty": "easy"
6
+ }
rl_code_fix_env/dataset/swebench_lite_tasks/flask__flask-1048_easy_1/test.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from datetime import datetime
3
+ from buggy import to_json
4
+
5
+ class TestJSONEncoding(unittest.TestCase):
6
+ def test_encode_datetime(self):
7
+ """Test that datetime objects are properly encoded."""
8
+ dt = datetime(2024, 1, 15, 10, 30, 0)
9
+ result = to_json({'timestamp': dt})
10
+ self.assertIn('2024-01-15', result)
11
+ self.assertIn('10:30:00', result)
12
+
13
+ def test_encode_date(self):
14
+ """Test that date objects are properly encoded."""
15
+ d = date(2024, 1, 15)
16
+ result = to_json({'date': d})
17
+ self.assertIn('2024-01-15', result)
rl_code_fix_env/dataset/swebench_lite_tasks/numpy__numpy-10825_medium_0/buggy.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ def concatenate_arrays(*arrays):
4
+ """Concatenate multiple arrays along axis 0."""
5
+ if not arrays:
6
+ return np.array([])
7
+
8
+ # BUG: Should handle None arrays gracefully
9
+ result = arrays[0]
10
+ for arr in arrays[1:]:
11
+ result = np.concatenate([result, arr])
12
+
13
+ return result