File size: 8,506 Bytes
03a907a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
"""

Script to download and materialize SWE-bench Lite tasks.



This script:

1. Downloads SWE-bench Lite dataset from HuggingFace

2. Extracts the buggy code and creates test files

3. Organizes them into the expected directory structure



Usage:

    python -m dataset.prepare_swebench [--max-tasks N] [--difficulty easy|medium|hard|all]

"""

import argparse
import os
import sys
from pathlib import Path

# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from datasets import load_dataset


def get_problem_statement(row):
    """Return the row's ``problem_statement`` field, or "" when missing."""
    statement = row.get("problem_statement", "")
    return statement


def get_patch(row):
    """Return the row's gold ``patch`` diff, or "" when missing."""
    diff = row.get("patch", "")
    return diff


def get_instance_id(row):
    """Return the row's unique ``instance_id``, or "" when missing."""
    identifier = row.get("instance_id", "")
    return identifier


def create_buggy_file(instance_dir: Path, row):
    """Write ``buggy.py`` for one SWE-bench instance and return its path.

    The real buggy source lives in the repository at ``base_commit``;
    materializing it would require cloning the repo at that commit, so this
    writes a placeholder module instead: the instance id, a truncated
    problem statement, and a stub ``solution()`` for the harness to import.

    Args:
        instance_dir: Directory the task's files are written into.
        row: Mapping for one dataset row.

    Returns:
        Path to the written ``buggy.py`` file.
    """
    instance_id = row.get("instance_id", "")
    problem_stmt = row.get("problem_statement", "")

    # NOTE(review): SWE-bench Lite rows do not obviously carry a
    # "created_files" column, so the fallback branch is what normally
    # runs -- confirm against the dataset schema.
    created_files = row.get("created_files", [])

    if not created_files:
        # Fallback: create a placeholder
        buggy_code = f'''# Buggy code for {instance_id}

# Problem: {problem_stmt[:200]}...



def solution():

    """Placeholder solution - needs to be fixed."""

    pass

'''
    else:
        # Same placeholder, but record which file the patch touches.
        # (A full implementation would clone the repo at base_commit.)
        # Fix: dropped the redundant `if created_files else "solution.py"`
        # conditional -- this branch already guarantees the list is non-empty.
        file_path = created_files[0]
        buggy_code = f'''# Buggy code for {instance_id}

# File: {file_path}

# Problem: {problem_stmt[:200]}...



def solution():

    """Placeholder solution - needs to be fixed."""

    pass

'''

    buggy_file = instance_dir / "buggy.py"
    buggy_file.write_text(buggy_code, encoding="utf-8")
    return buggy_file


def create_test_file(instance_dir: Path, row):
    """

    Create test.py based on the problem statement.

    

    For SWE-bench, tests are typically derived from the issue description.

    We'll create a simple test that checks if the solution works.

    """
    instance_id = get_instance_id(row)
    problem_stmt = get_problem_statement(row)
    
    # Create a simple test file
    # In practice, SWE-bench has a test.json file with test cases
    test_cases = row.get("test_cases", [])
    
    if test_cases:
        # Create tests from provided test cases
        test_code = "import unittest\\n\\n"
        for i, tc in enumerate(test_cases):
            input_str = tc.get("input", "")
            output_str = tc.get("output", "")
            test_code += f'''class TestSolution(unittest.TestCase):

    def test_case_{i+1}(self):

        # Input: {input_str}

        # Expected: {output_str}

        pass  # TODO: Add actual test

'''
    else:
        # Create a basic test based on problem statement
        test_code = f'''"""Test file for {instance_id}"""



import unittest

from buggy import solution





class TestSolution(unittest.TestCase):

    def test_basic(self):

        """Test based on problem statement."""

        # Problem: {problem_stmt[:300]}...

        result = solution()

        self.assertIsNotNone(result)





if __name__ == "__main__":

    unittest.main()

'''
    
    test_file = instance_dir / "test.py"
    test_file.write_text(test_code, encoding="utf-8")
    return test_file


def create_metadata_file(instance_dir: Path, row):
    """Write ``metadata.json`` describing this instance; return its path."""
    import json

    # Everything a downstream harness needs to identify the task.
    metadata = {
        "instance_id": get_instance_id(row),
        "repo": row.get("repo", ""),
        "base_commit": row.get("base_commit", ""),
        "problem_statement": get_problem_statement(row),
        "patch": get_patch(row),
        # Placeholder value; the real bucket is assigned based on index.
        "difficulty": "medium",
    }

    target = instance_dir / "metadata.json"
    target.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
    return target


def prepare_swebench_tasks(
    output_dir: Path,
    max_tasks: int = 30,
    difficulty: str = "all"
):
    """Download SWE-bench Lite and materialize task directories.

    Each selected instance gets its own sub-directory of ``output_dir``
    containing ``buggy.py``, ``test.py`` and ``metadata.json``.

    Args:
        output_dir: Directory to save tasks (created if missing).
        max_tasks: Maximum number of tasks to download.
        difficulty: "easy", "medium", "hard", or "all".  Difficulty is a
            heuristic: the dataset index space is split into thirds.
    """
    print("Loading SWE-bench Lite dataset...")

    try:
        ds = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")
    except Exception as e:
        # Best-effort fallback to the mirrored dataset name.
        print(f"Error loading dataset: {e}")
        print("Trying alternative dataset name...")
        ds = load_dataset("swe-bench/swe-bench-lite", split="test")

    print(f"Loaded {len(ds)} tasks")

    # Split the index space into thirds as a crude difficulty proxy.
    total = len(ds)
    one_third = max(total // 3, 1)
    two_third = max((2 * total) // 3, one_third + 1)

    difficulty_ranges = {
        "easy": (0, one_third),
        "medium": (one_third, two_third),
        "hard": (two_third, total),
    }

    # Determine which dataset indices to materialize.
    if difficulty == "all":
        # Take an even share of max_tasks from each bucket, at least one
        # per bucket (the old `max_tasks // 3` share was 0 for max_tasks < 3).
        per_bucket = max(max_tasks // 3, 1)
        indices = []
        for start, end in difficulty_ranges.values():
            indices.extend(range(start, min(end, start + per_bucket)))
    else:
        start, end = difficulty_ranges.get(difficulty, (0, total))
        # Bug fix: the cap used to be min(end, max_tasks), which compared an
        # absolute dataset index against a count -- any bucket starting at or
        # past max_tasks (e.g. "hard") selected zero tasks.  The cap must be
        # relative to the bucket's start.
        indices = list(range(start, min(end, start + max_tasks)))

    # Create output directory
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Preparing {len(indices)} tasks...")

    success_count = 0
    for i, idx in enumerate(indices):
        try:
            row = ds[idx]
            instance_id = get_instance_id(row)

            # One directory per instance, named by its id.
            instance_dir = output_dir / instance_id
            instance_dir.mkdir(parents=True, exist_ok=True)

            # Materialize the three task files.
            create_buggy_file(instance_dir, row)
            create_test_file(instance_dir, row)
            create_metadata_file(instance_dir, row)

            success_count += 1
            if (i + 1) % 10 == 0:
                print(f"  Processed {i + 1}/{len(indices)} tasks...")

        except Exception as e:
            # Best effort: skip rows that fail to materialize, keep going.
            print(f"  Warning: Failed to process task {idx}: {e}")
            continue

    print(f"\nDone! Prepared {success_count}/{len(indices)} tasks in {output_dir}")
    print(f"Set SWEBENCH_TASKS_ROOT={output_dir.absolute()} to use these tasks.")


def main():
    """CLI entry point: parse arguments and materialize the tasks."""
    parser = argparse.ArgumentParser(description="Prepare SWE-bench Lite tasks")
    parser.add_argument(
        "--max-tasks",
        type=int,
        default=30,
        help="Maximum number of tasks to download (default: 30)",
    )
    parser.add_argument(
        "--difficulty",
        type=str,
        default="all",
        choices=["easy", "medium", "hard", "all"],
        help="Difficulty level to download (default: all)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="Output directory (default: dataset/swebench_lite_tasks)",
    )
    args = parser.parse_args()

    # Default the destination to <this script's folder>/swebench_lite_tasks.
    if args.output_dir:
        destination = Path(args.output_dir)
    else:
        destination = Path(__file__).parent / "swebench_lite_tasks"

    prepare_swebench_tasks(
        output_dir=destination,
        max_tasks=args.max_tasks,
        difficulty=args.difficulty,
    )


# Allow running directly as a script (or via `python -m dataset.prepare_swebench`).
if __name__ == "__main__":
    main()