Viraj0112 commited on
Commit
03a907a
·
verified ·
1 Parent(s): 2259499

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Dockerfile +78 -0
  2. README.md +182 -215
  3. __init__.py +14 -0
  4. _aliases.py +21 -0
  5. client.py +185 -0
  6. conftest.py +21 -0
  7. dataset/README.md +20 -0
  8. dataset/__init__.py +1 -0
  9. dataset/generate_swebench_tasks.py +498 -0
  10. dataset/loader.py +111 -0
  11. dataset/prepare_swebench.py +274 -0
  12. dataset/problem_1/buggy.py +7 -0
  13. dataset/problem_1/metadata.json +5 -0
  14. dataset/problem_1/test.py +18 -0
  15. dataset/problem_10/buggy.py +8 -0
  16. dataset/problem_10/helpers.py +2 -0
  17. dataset/problem_10/metadata.json +5 -0
  18. dataset/problem_10/test.py +12 -0
  19. dataset/problem_11/buggy.py +14 -0
  20. dataset/problem_11/metadata.json +5 -0
  21. dataset/problem_11/test.py +17 -0
  22. dataset/problem_12/buggy.py +11 -0
  23. dataset/problem_12/metadata.json +5 -0
  24. dataset/problem_12/test.py +14 -0
  25. dataset/problem_13/buggy.py +10 -0
  26. dataset/problem_13/cache.py +20 -0
  27. dataset/problem_13/metadata.json +5 -0
  28. dataset/problem_13/test.py +13 -0
  29. dataset/problem_14/buggy.py +6 -0
  30. dataset/problem_14/metadata.json +5 -0
  31. dataset/problem_14/test.py +15 -0
  32. dataset/problem_15/buggy.py +4 -0
  33. dataset/problem_15/metadata.json +5 -0
  34. dataset/problem_15/test.py +14 -0
  35. dataset/problem_16/buggy.py +10 -0
  36. dataset/problem_16/helpers.py +3 -0
  37. dataset/problem_16/metadata.json +5 -0
  38. dataset/problem_16/test.py +12 -0
  39. dataset/problem_17/buggy.py +11 -0
  40. dataset/problem_17/metadata.json +5 -0
  41. dataset/problem_17/test.py +11 -0
  42. dataset/problem_18/buggy.py +14 -0
  43. dataset/problem_18/math_utils.py +6 -0
  44. dataset/problem_18/metadata.json +5 -0
  45. dataset/problem_18/test.py +14 -0
  46. dataset/problem_19/buggy.py +36 -0
  47. dataset/problem_19/metadata.json +5 -0
  48. dataset/problem_19/test.py +48 -0
  49. dataset/problem_2/buggy.py +14 -0
  50. dataset/problem_2/metadata.json +5 -0
Dockerfile ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# Multi-stage build: resolve dependencies with uv in a builder stage,
# then copy the finished /app/env tree (including its .venv) into a
# slim runtime image based on the same base.
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

WORKDIR /app

# git and curl are needed to fetch uv and any VCS dependencies.
RUN apt-get update && \
    apt-get install -y --no-install-recommends git curl ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# Build argument to control whether we're building standalone or in-repo.
ARG BUILD_MODE=in-repo
ARG ENV_NAME=rl_code_fix_env

# Copy environment code (always at root of build context).
COPY . /app/env

# For in-repo builds, openenv is already vendored in the build context.
# For standalone builds, openenv will be installed via pyproject.toml.
WORKDIR /app/env

# Install uv only if the base image does not already provide it.
RUN if ! command -v uv >/dev/null 2>&1; then \
        curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/usr/local/bin sh; \
    fi

# Resolve and install dependencies with uv sync, using the lockfile when
# present. First pass skips the project itself so the dependency layer
# caches independently of source changes.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-install-project --no-editable; \
    else \
        uv sync --no-install-project --no-editable; \
    fi

# Second pass installs the project itself on top of the cached deps.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-editable; \
    else \
        uv sync --no-editable; \
    fi

# ---------------------------------------------------------------------------
# Final runtime stage
# ---------------------------------------------------------------------------
FROM ${BASE_IMAGE}

# curl is required by the HEALTHCHECK below.
RUN apt-get update && \
    apt-get install -y --no-install-recommends curl && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy environment code + its in-place virtualenv from the builder.
# Keep the venv at the same path it was created with (/app/env/.venv)
# to avoid relocation issues and dual-venv path conflicts.
COPY --from=builder /app/env /app/env

# Activate the single in-repo venv for all subsequent commands.
ENV VIRTUAL_ENV="/app/env/.venv"
ENV PATH="/app/env/.venv/bin:$PATH"

# Hermetic runtime: keep imports pinned to repo code + active venv.
ENV PYTHONPATH="/app/env"
ENV PYTHONNOUSERSITE="1"
ENV PYTHONDONTWRITEBYTECODE="1"

# Container health is the FastAPI /health endpoint.
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Expose the application port.
EXPOSE 8000

# Run the FastAPI server from /app/env so the server.app module resolves.
ENV ENABLE_WEB_INTERFACE=true
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
README.md CHANGED
@@ -1,288 +1,255 @@
1
- # TraceRL Mini Environment for Autonomous Code Fixing
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- This repository packages an OpenEnv-compatible reinforcement learning environment for autonomous Python bug fixing. An agent receives buggy code, can apply unified-diff patches, run the task's tests, inspect logs, and is rewarded for functional progress, reasonable debugging traces, and solving the problem within a step budget.
4
 
5
- ## Environment Overview and Motivation
6
 
7
- The core environment lives in `rl_code_fix_env/` and wraps a code-repair loop around three pieces of functionality:
8
 
9
- 1. Load a bug-fixing task from either a local curated dataset or a materialized SWE-bench Lite workspace.
10
- 2. Let the agent iteratively edit the current `buggy.py` contents with `apply_patch`, then execute the task test file.
11
- 3. Return observations and rewards that make the environment suitable for RL-style training and evaluation.
12
 
13
- The motivation is to benchmark whether an autonomous agent can do more than generate one-shot code. It must:
14
-
15
- - read failing code,
16
- - produce minimal patches,
17
- - use test feedback to refine its fix,
18
- - manage a limited interaction budget,
19
- - and recover from bad intermediate edits.
20
-
21
- This repo also includes a baseline `inference.py` script, containerization for OpenEnv/Hugging Face Spaces deployment, and run logs for a reference baseline.
22
 
23
- ## Repository Layout
 
 
24
 
25
- - `rl_code_fix_env/`: main OpenEnv package.
26
- - `rl_code_fix_env/src/environment/environment.py`: core RL environment logic.
27
- - `rl_code_fix_env/src/reward/`: reward shaping and trace scoring.
28
- - `rl_code_fix_env/src/sandbox/`: unified-diff patching and test execution sandbox.
29
- - `rl_code_fix_env/dataset/`: local bug-fixing tasks and metadata.
30
- - `rl_code_fix_env/server/`: FastAPI/OpenEnv server and Dockerfile.
31
- - `rl_code_fix_env/inference.py`: baseline inference agent.
32
- - `logs.md`: recorded baseline run output.
33
 
34
- ## Action Space
 
35
 
36
- The action model is defined in `rl_code_fix_env/models.py` as:
 
 
 
 
 
37
 
38
- ```python
39
- CodeFixerAction(
40
- type: str,
41
- payload: Optional[str] = None,
42
- )
43
  ```
44
 
45
- Supported action types:
46
-
47
- - `apply_patch`: `payload` is a unified diff patch. The environment fuzzily applies hunks to the current code string.
48
- - `run_tests`: executes the task's `test.py` and updates pass/fail state and logs.
49
- - `get_logs`: returns the most recent logs without changing code.
50
-
51
- Practical meaning:
52
 
53
- - `apply_patch` is the editing action.
54
- - `run_tests` is the feedback action.
55
- - `get_logs` is a cheap inspection action when the agent wants the last failure output again.
56
 
57
- ## Observation Space
58
 
59
- The observation model is also defined in `rl_code_fix_env/models.py`:
60
-
61
- ```python
62
- CodeFixerObservation(
63
- code: str = "",
64
- logs: Optional[str] = None,
65
- test_score: float = 0.0,
66
- total_tests: int = 1,
67
- steps: int = 0,
68
- done: bool = False,
69
- reward: Optional[float] = None,
70
- )
71
  ```
72
 
73
- Field meanings:
74
-
75
- - `code`: the current patched source code under repair.
76
- - `logs`: latest pytest output or startup/fallback messages.
77
- - `test_score`: normalized functional score. In the current local tasks it is `1.0` for pass and `0.0` for fail.
78
- - `total_tests`: number of task test files tracked by the environment. Current local tasks use a single target test file.
79
- - `steps`: number of patch actions consumed so far.
80
- - `done`: episode termination flag.
81
- - `reward`: latest reward returned by the environment wrapper.
82
 
83
- ## Reward Design
84
 
85
- The reward is computed in `rl_code_fix_env/src/reward/reward.py`:
 
 
86
 
87
- ```text
88
- reward =
89
- 0.7 * functional_reward
90
- + 0.2 * trace_reward
91
- + 0.1 * quality_reward
92
- - efficiency_penalty
93
  ```
94
 
95
- Where:
 
 
 
96
 
97
- - `functional_reward = test_score`
98
- - `trace_reward = score_trace(trace_obj)`
99
- - `quality_reward = 1.0` when non-empty code exists, else `0.0`
100
- - `efficiency_penalty = 0.05 * (steps_taken / max_steps)`
101
 
102
- If all tests pass, the environment overrides the reward to `1.0`.
103
 
104
- ## Task Descriptions and Expected Difficulty Levels
105
 
106
- ### Official competition-facing task mapping
 
 
 
107
 
108
- The current local fallback dataset exposes one canonical task per difficulty through `get_hardcoded_task(...)`:
109
 
110
- | Difficulty | Problem ID | Description | Bug type | Expected steps |
111
- | --- | --- | --- | --- | --- |
112
- | Easy | `problem_1` | Reverse words while normalizing repeated spaces | `string-splitting` | 1 |
113
- | Medium | `problem_10` | Rotate a matrix 90 degrees clockwise | `matrix-transformation` | 1 |
114
- | Hard | `problem_13` | Preserve recency correctly in an LRU cache | `state-logic` | 2 |
115
 
116
- Canonical task details:
 
117
 
118
- - `easy`:
119
- The buggy code uses `text.split(" ")`, which preserves empty tokens for repeated spaces. The fix is a small normalization change.
120
- - `medium`:
121
- The code transposes the matrix and then reverses rows in the wrong direction, producing a counter-clockwise rotation.
122
- - `hard`:
123
- The visible task calls into `cache.py`, where `LRUCache.get()` fails to refresh recency. This is stateful and effectively multi-file reasoning.
124
 
125
- ### Full local dataset coverage
 
126
 
127
- The local dataset currently contains 23 problems:
 
 
128
 
129
- - `easy`: 8 tasks
130
- - `medium`: 9 tasks
131
- - `hard`: 6 tasks
132
 
133
- Bug patterns represented across the dataset include:
 
 
 
 
134
 
135
- - whitespace and string normalization
136
- - off-by-one and boundary-condition mistakes
137
- - incorrect matrix and sorting transformations
138
- - recursion and exception-handling bugs
139
- - stateful cache logic and multi-bug hard tasks
140
 
141
- ### Difficulty interpretation
 
 
142
 
143
- - `easy`: usually a single-line or single-concept bug with direct test feedback.
144
- - `medium`: often requires understanding data transformation logic or helper-module behavior.
145
- - `hard`: commonly involves state, multi-step reasoning, or fixes that span more than one conceptual location.
 
 
 
 
146
 
147
- ## Episode Flow
 
 
 
 
148
 
149
- 1. `reset()` selects a difficulty.
150
- 2. The environment loads the buggy code, test path, workspace path, and zeroed metrics.
151
- 3. The agent alternates between `apply_patch`, `run_tests`, and optional `get_logs`.
152
- 4. The episode ends when all tests pass or the step budget is exhausted.
153
 
154
- By default, the server cycles through `easy`, `medium`, and `hard` on reset. You can force a specific difficulty with `TRACERL_TASK=easy`, `TRACERL_TASK=medium`, or `TRACERL_TASK=hard`.
155
 
156
- ## Data Sources
157
 
158
- `CodeEnv` defaults to `TASK_SOURCE=swebench`. If SWE-bench Lite task materialization is unavailable, it falls back to the local curated dataset when `SWEBENCH_FALLBACK_LOCAL=1` is enabled, which is the current default behavior.
 
159
 
160
- Expected SWE-bench Lite workspace layout:
 
161
 
162
- ```text
163
- rl_code_fix_env/dataset/swebench_lite_tasks/<instance_id>/
164
- buggy.py
165
- test.py
166
  ```
167
 
168
- ## Setup Instructions
169
 
170
- ### Local Python setup
171
 
172
- From the repository root:
173
 
174
- ```bash
175
- cd rl_code_fix_env
176
- uv sync
177
- ```
178
-
179
- If you are not using `uv`, install the shared dependencies from the repository root:
180
-
181
- ```bash
182
- pip install -r requirements.txt
 
 
183
  ```
184
 
185
- ### Required environment variables for inference
 
 
 
186
 
187
- The baseline agent expects:
188
 
189
- ```bash
190
- API_BASE_URL=<openai-compatible-endpoint>
191
- MODEL_NAME=<model-id>
192
- HF_TOKEN=<api-key>
193
- ```
194
 
195
- Useful optional variables:
196
-
197
- ```bash
198
- ENV_URL=http://localhost:8000
199
- TRACERL_TASK=easy
200
- TASK_SOURCE=swebench
201
- SWEBENCH_FALLBACK_LOCAL=1
202
- MAX_STEPS=10
203
- TEMPERATURE=0.2
204
- MAX_TOKENS=2048
205
- SUCCESS_THRESHOLD=1.0
206
- MAX_RETRIES=3
207
  ```
208
 
209
- ## Usage Instructions
210
 
211
- ### Run the environment server locally
212
-
213
- ```bash
214
- cd rl_code_fix_env
215
- uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
 
 
 
 
 
 
 
 
 
216
  ```
217
 
218
- Alternative entry point:
219
 
220
- ```bash
221
- cd rl_code_fix_env
222
- uv run --project . server
223
- ```
224
-
225
- ### Run the baseline inference agent
226
 
227
- Open a second terminal:
228
 
229
  ```bash
230
- cd rl_code_fix_env
231
- python inference.py
232
  ```
233
 
234
- The script emits machine-parseable lines in this format:
 
 
 
 
235
 
236
- ```text
237
- [START] task=<task_name> env=<benchmark> model=<model_name>
238
- [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
239
- [END] success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
240
- ```
241
-
242
- ### Build and run with Docker
243
 
244
- From `rl_code_fix_env/`:
245
 
246
  ```bash
247
- docker build -t rl_code_fix_env-env:latest -f server/Dockerfile .
248
- docker run -p 8000:8000 rl_code_fix_env-env:latest
249
  ```
250
 
251
- ### OpenEnv / Hugging Face Spaces deployment
252
 
253
- From `rl_code_fix_env/`:
254
-
255
- ```bash
256
- openenv push
257
  ```
258
-
259
- The package is configured as a FastAPI OpenEnv space via `openenv.yaml`.
260
-
261
- ## Baseline Performance Scores
262
-
263
- The current recorded baseline in `logs.md` ran one episode each for `easy`, `medium`, and `hard` using model `qwen/qwen3-coder-480b-a35b-instruct`.
264
-
265
- | Task | Success | Steps | Final score | Reward trace | Cumulative reward |
266
- | --- | --- | --- | --- | --- | --- |
267
- | Easy | `false` | 10 | 0.00 | `0.14,0.13,0.12,0.11,0.10,0.09,0.08,0.07,0.06,0.05` | 0.95 |
268
- | Medium | `false` | 10 | 0.00 | `0.14,0.13,0.12,0.11,0.10,0.09,0.08,0.07,0.06,0.05` | 0.95 |
269
- | Hard | `false` | 10 | 0.00 | `0.14,0.13,0.12,0.11,0.10,0.09,0.08,0.07,0.06,0.05` | 0.95 |
270
-
271
- Aggregate baseline summary:
272
-
273
- - episodes evaluated: 3
274
- - success rate: `0/3`
275
- - mean final score: `0.00`
276
- - mean cumulative reward: `0.95`
277
-
278
- Interpretation:
279
-
280
- - The baseline agent produced syntactically plausible patches and collected small shaped rewards.
281
- - It did not achieve a passing test score on any recorded task.
282
- - The current baseline should be treated as a starting point rather than a competitive upper bound.
283
-
284
- ## Notes and Caveats
285
-
286
- - The local fallback tasks currently use one target test file per problem, so `test_score` is binary.
287
- - Patch application uses `unidiff` plus fuzzy matching from `diff-match-patch`, which makes the environment more tolerant to slightly stale context.
288
- - Test execution prefers Docker sandboxing, but falls back to direct `pytest` execution when Docker is unavailable.
 
1
+ ---
2
+ title: Rl Code Fix Env Environment Server
3
+ emoji: "🚀"
4
+ colorFrom: green
5
+ colorTo: purple
6
+ sdk: docker
7
+ pinned: false
8
+ app_port: 8000
9
+ base_path: /web
10
+ tags:
11
+ - openenv
12
+ ---
13
 
14
+ # Rl Code Fix Env Environment
15
 
16
+ A simple test environment that echoes back messages. Perfect for testing the env APIs as well as demonstrating environment usage patterns.
17
 
18
+ ## Quick Start
19
 
20
+ The simplest way to use the Rl Code Fix Env environment is through the `RlCodeFixEnv` class:
 
 
21
 
22
+ ```python
23
+ from rl_code_fix_env import RlCodeFixAction, RlCodeFixEnv
 
 
 
 
 
 
 
24
 
25
+ try:
26
+ # Create environment from Docker image
27
+ rl_code_fix_envenv = RlCodeFixEnv.from_docker_image("rl_code_fix_env-env:latest")
28
 
29
+ # Reset
30
+ result = rl_code_fix_envenv.reset()
31
+ print(f"Reset: {result.observation.echoed_message}")
 
 
 
 
 
32
 
33
+ # Send multiple messages
34
+ messages = ["Hello, World!", "Testing echo", "Final message"]
35
 
36
+ for msg in messages:
37
+ result = rl_code_fix_envenv.step(RlCodeFixAction(message=msg))
38
+ print(f"Sent: '{msg}'")
39
+ print(f" Echoed: '{result.observation.echoed_message}'")
40
+ print(f" Length: {result.observation.message_length}")
41
+ print(f" Reward: {result.reward}")
42
 
43
+ finally:
44
+ # Always clean up
45
+ rl_code_fix_envenv.close()
 
 
46
  ```
47
 
48
+ That's it! The `RlCodeFixEnv.from_docker_image()` method handles:
49
+ - Starting the Docker container
50
+ - Waiting for the server to be ready
51
+ - Connecting to the environment
52
+ - Container cleanup when you call `close()`
 
 
53
 
54
+ ## Building the Docker Image
 
 
55
 
56
+ Before using the environment, you need to build the Docker image:
57
 
58
+ ```bash
59
+ # From project root
60
+ docker build -t rl_code_fix_env-env:latest -f server/Dockerfile .
 
 
 
 
 
 
 
 
 
61
  ```
62
 
63
+ ## Deploying to Hugging Face Spaces
 
 
 
 
 
 
 
 
64
 
65
+ You can easily deploy your OpenEnv environment to Hugging Face Spaces using the `openenv push` command:
66
 
67
+ ```bash
68
+ # From the environment directory (where openenv.yaml is located)
69
+ openenv push
70
 
71
+ # Or specify options
72
+ openenv push --namespace my-org --private
 
 
 
 
73
  ```
74
 
75
+ The `openenv push` command will:
76
+ 1. Validate that the directory is an OpenEnv environment (checks for `openenv.yaml`)
77
+ 2. Prepare a custom build for Hugging Face Docker space (enables web interface)
78
+ 3. Upload to Hugging Face (ensuring you're logged in)
79
 
80
+ ### Prerequisites
 
 
 
81
 
82
+ - Authenticate with Hugging Face: The command will prompt for login if not already authenticated
83
 
84
+ ### Options
85
 
86
+ - `--directory`, `-d`: Directory containing the OpenEnv environment (defaults to current directory)
87
+ - `--repo-id`, `-r`: Repository ID in format 'username/repo-name' (defaults to 'username/env-name' from openenv.yaml)
88
+ - `--base-image`, `-b`: Base Docker image to use (overrides Dockerfile FROM)
89
+ - `--private`: Deploy the space as private (default: public)
90
 
91
+ ### Examples
92
 
93
+ ```bash
94
+ # Push to your personal namespace (defaults to username/env-name from openenv.yaml)
95
+ openenv push
 
 
96
 
97
+ # Push to a specific repository
98
+ openenv push --repo-id my-org/my-env
99
 
100
+ # Push with a custom base image
101
+ openenv push --base-image ghcr.io/meta-pytorch/openenv-base:latest
 
 
 
 
102
 
103
+ # Push as a private space
104
+ openenv push --private
105
 
106
+ # Combine options
107
+ openenv push --repo-id my-org/my-env --base-image custom-base:latest --private
108
+ ```
109
 
110
+ After deployment, your space will be available at:
111
+ `https://huggingface.co/spaces/<repo-id>`
 
112
 
113
+ The deployed space includes:
114
+ - **Web Interface** at `/web` - Interactive UI for exploring the environment
115
+ - **API Documentation** at `/docs` - Full OpenAPI/Swagger interface
116
+ - **Health Check** at `/health` - Container health monitoring
117
+ - **WebSocket** at `/ws` - Persistent session endpoint for low-latency interactions
118
 
119
+ ## Environment Details
 
 
 
 
120
 
121
+ ### Action
122
+ **RlCodeFixAction**: Contains a single field
123
+ - `message` (str) - The message to echo back
124
 
125
+ ### Observation
126
+ **RlCodeFixObservation**: Contains the echo response and metadata
127
+ - `echoed_message` (str) - The message echoed back
128
+ - `message_length` (int) - Length of the message
129
+ - `reward` (float) - Reward based on message length (message_length × 0.1)
130
+ - `done` (bool) - Always False for echo environment
131
+ - `metadata` (dict) - Additional info like step count
132
 
133
+ ### Reward
134
+ The reward is calculated as: `message_length × 0.1`
135
+ - "Hi" reward: 0.2
136
+ - "Hello, World!" reward: 1.3
137
+ - Empty message reward: 0.0
138
 
139
+ ## Advanced Usage
 
 
 
140
 
141
+ ### Connecting to an Existing Server
142
 
143
+ If you already have a Rl Code Fix Env environment server running, you can connect directly:
144
 
145
+ ```python
146
+ from rl_code_fix_env import RlCodeFixEnv
147
 
148
+ # Connect to existing server
149
+ rl_code_fix_envenv = RlCodeFixEnv(base_url="<ENV_HTTP_URL_HERE>")
150
 
151
+ # Use as normal
152
+ result = rl_code_fix_envenv.reset()
153
+ result = rl_code_fix_envenv.step(RlCodeFixAction(message="Hello!"))
 
154
  ```
155
 
156
+ Note: When connecting to an existing server, `rl_code_fix_envenv.close()` will NOT stop the server.
157
 
158
+ ### Using the Context Manager
159
 
160
+ The client supports context manager usage for automatic connection management:
161
 
162
+ ```python
163
+ from rl_code_fix_env import RlCodeFixAction, RlCodeFixEnv
164
+
165
+ # Connect with context manager (auto-connects and closes)
166
+ with RlCodeFixEnv(base_url="http://localhost:8000") as env:
167
+ result = env.reset()
168
+ print(f"Reset: {result.observation.echoed_message}")
169
+ # Multiple steps with low latency
170
+ for msg in ["Hello", "World", "!"]:
171
+ result = env.step(RlCodeFixAction(message=msg))
172
+ print(f"Echoed: {result.observation.echoed_message}")
173
  ```
174
 
175
+ The client uses WebSocket connections for:
176
+ - **Lower latency**: No HTTP connection overhead per request
177
+ - **Persistent session**: Server maintains your environment state
178
+ - **Efficient for episodes**: Better for many sequential steps
179
 
180
+ ### Concurrent WebSocket Sessions
181
 
182
+ The server supports multiple concurrent WebSocket connections. To enable this,
183
+ modify `server/app.py` to use factory mode:
 
 
 
184
 
185
+ ```python
186
+ # In server/app.py - use factory mode for concurrent sessions
187
+ app = create_app(
188
+ RlCodeFixEnvironment, # Pass class, not instance
189
+ RlCodeFixAction,
190
+ RlCodeFixObservation,
191
+ max_concurrent_envs=4, # Allow 4 concurrent sessions
192
+ )
 
 
 
 
193
  ```
194
 
195
+ Then multiple clients can connect simultaneously:
196
 
197
+ ```python
198
+ from rl_code_fix_env import RlCodeFixAction, RlCodeFixEnv
199
+ from concurrent.futures import ThreadPoolExecutor
200
+
201
+ def run_episode(client_id: int):
202
+ with RlCodeFixEnv(base_url="http://localhost:8000") as env:
203
+ result = env.reset()
204
+ for i in range(10):
205
+ result = env.step(RlCodeFixAction(message=f"Client {client_id}, step {i}"))
206
+ return client_id, result.observation.message_length
207
+
208
+ # Run 4 episodes concurrently
209
+ with ThreadPoolExecutor(max_workers=4) as executor:
210
+ results = list(executor.map(run_episode, range(4)))
211
  ```
212
 
213
+ ## Development & Testing
214
 
215
+ ### Direct Environment Testing
 
 
 
 
 
216
 
217
+ Test the environment logic directly without starting the HTTP server:
218
 
219
  ```bash
220
+ # From the server directory
221
+ python3 server/rl_code_fix_env_environment.py
222
  ```
223
 
224
+ This verifies that:
225
+ - Environment resets correctly
226
+ - Step executes actions properly
227
+ - State tracking works
228
+ - Rewards are calculated correctly
229
 
230
+ ### Running Locally
 
 
 
 
 
 
231
 
232
+ Run the server locally for development:
233
 
234
  ```bash
235
+ uvicorn server.app:app --reload
 
236
  ```
237
 
238
+ ## Project Structure
239
 
 
 
 
 
240
  ```
241
+ rl_code_fix_env/
242
+ .dockerignore # Docker build exclusions
243
+ __init__.py # Module exports
244
+ README.md # This file
245
+ openenv.yaml # OpenEnv manifest
246
+ pyproject.toml # Project metadata and dependencies
247
+ uv.lock # Locked dependencies (generated)
248
+ client.py # RlCodeFixEnv client
249
+ models.py # Action and Observation models
250
+ server/
251
+ __init__.py # Server module exports
252
+ rl_code_fix_env_environment.py # Core environment logic
253
+ app.py # FastAPI application (HTTP + WebSocket endpoints)
254
+ Dockerfile # Container image definition
255
+ ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Rl Code Fix Env Environment.

Package entry point: re-exports the action/observation models so callers
can write ``from rl_code_fix_env import CodeFixerAction, CodeFixerObservation``.
"""

from .models import CodeFixerAction, CodeFixerObservation

# Explicit public API for ``from rl_code_fix_env import *``.
__all__ = ["CodeFixerAction", "CodeFixerObservation"]
_aliases.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Alias the local ``dataset`` package under the legacy ``src.dataset`` name.

Importing this module registers ``dataset`` (and each of its immediate
submodules) in ``sys.modules`` under ``src.dataset[.<name>]`` as well, so
code written against the old ``src.dataset`` layout keeps importing.
"""

import importlib
import pkgutil
import sys
from pathlib import Path

# Make the repository root importable so that ``import dataset`` resolves
# to the local package regardless of the current working directory.
_REPO_ROOT = str(Path(__file__).parent)
if _REPO_ROOT not in sys.path:
    sys.path.insert(0, _REPO_ROOT)

import dataset as _real_dataset  # noqa: E402  (requires the sys.path tweak above)

# Alias the package itself; setdefault preserves any alias already registered.
sys.modules.setdefault("src.dataset", _real_dataset)

# Alias each immediate submodule/subpackage of ``dataset`` too.
for _pkg in pkgutil.iter_modules(_real_dataset.__path__):
    _full = f"dataset.{_pkg.name}"
    _alias = f"src.dataset.{_pkg.name}"
    try:
        _mod = importlib.import_module(_full)
        sys.modules.setdefault(_alias, _mod)
    except Exception:
        # Best-effort: a submodule that fails to import is simply not
        # aliased; importing it directly will surface the real error.
        pass
client.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Code Fixer Environment Client."""
8
+
9
+ import asyncio
10
+ import inspect
11
+ import logging
12
+ from typing import Dict
13
+
14
+ from openenv.core import EnvClient
15
+ from openenv.core.client_types import StepResult
16
+ from openenv.core.env_server.types import State
17
+
18
+ from rl_code_fix_env.models import CodeFixerAction, CodeFixerObservation
19
+
20
+ log = logging.getLogger(__name__)
21
+
22
class CodeFixerEnv(
    EnvClient[CodeFixerAction, CodeFixerObservation, State]
):
    """
    Client for the Code Fixer Environment.

    This client maintains a persistent WebSocket connection to the environment
    server, enabling efficient multi-step interactions with lower latency.
    Each client instance has its own dedicated environment session on the
    server, driven by a private asyncio event loop so synchronous callers can
    use the async ``EnvClient`` API.

    Example:
        >>> # Connect to a running server
        >>> with CodeFixerEnv(base_url="http://localhost:8000") as client:
        ...     result = client.reset()
        ...     print(result.observation.code)
        ...
        ...     result = client.step(CodeFixerAction(type="run_tests"))
        ...     print(result.observation.test_score)

    Example with Docker:
        >>> # Automatically start container and connect
        >>> client = CodeFixerEnv.from_docker_image("code_fixer-env:latest")
        >>> try:
        ...     result = client.reset()
        ...     result = client.step(CodeFixerAction(type="run_tests"))
        ... finally:
        ...     client.close()
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Dedicated event loop: lets synchronous callers drive the async
        # EnvClient API without depending on any ambient running loop.
        self._loop = asyncio.new_event_loop()
        # Store init args so _reconnect() can rebuild the client from scratch.
        self._init_args = args
        self._init_kwargs = kwargs

    def _run_sync(self, result):
        """Run coroutine results on this client's dedicated event loop.

        Non-coroutine values are returned unchanged, so this works whether
        the base-class method is sync or async.
        """
        if inspect.iscoroutine(result):
            if self._loop.is_closed():
                # Close the coroutine to avoid a "never awaited" warning,
                # then fail with a clearer message than asyncio's default
                # RuntimeError("Event loop is closed").
                result.close()
                raise RuntimeError("CodeFixerEnv event loop is closed")
            return self._loop.run_until_complete(result)
        return result

    def _reconnect(self) -> None:
        """
        Tear down the dead event loop and WebSocket connection, then
        re-initialise so the next call works cleanly.

        Called automatically by reset() and step() when a 1011 / timeout
        error is detected after an idle period.
        """
        log.warning("[CodeFixerEnv] WebSocket timed out; reconnecting...")
        # Best-effort close of the old connection; it may already be dead.
        try:
            self._run_sync(super().close())
        except Exception:
            pass
        if not self._loop.is_closed():
            self._loop.close()

        # Re-initialise: fresh loop + fresh base-class state.
        self._loop = asyncio.new_event_loop()
        super().__init__(*self._init_args, **self._init_kwargs)
        log.warning("[CodeFixerEnv] Reconnected successfully.")

    @staticmethod
    def _is_reconnectable_ws_error(exc: Exception) -> bool:
        """Heuristically decide whether *exc* looks like a dropped WebSocket.

        Matches on substrings of the error message: close codes 1011/1006,
        keepalive timeouts, and generic "connection closed" wording.
        NOTE(review): the bare "closed" marker is very broad and may match
        unrelated errors; a false positive only costs one reconnect attempt.
        """
        err = str(exc).lower()
        reconnect_markers = (
            "1011",
            "1006",
            "keepalive",
            "timed out",
            "closed",
            "close frame",
            "connection closed",
            "connectionclosed",
            "websocket",
        )
        return any(marker in err for marker in reconnect_markers)

    def reset(self):
        """Reset the environment; auto-reconnects once if the WebSocket died."""
        try:
            return self._run_sync(super().reset())
        except Exception as exc:
            if self._is_reconnectable_ws_error(exc):
                self._reconnect()
                return self._run_sync(super().reset())  # one retry
            raise

    def step(self, action: CodeFixerAction):
        """Execute a step; auto-reconnects once if the WebSocket died."""
        try:
            return self._run_sync(super().step(action))
        except Exception as exc:
            if self._is_reconnectable_ws_error(exc):
                self._reconnect()
                return self._run_sync(super().step(action))  # one retry
            raise

    def close(self):
        """Close client resources and the dedicated event loop safely.

        Idempotent: a second close() is a no-op instead of driving a
        coroutine onto the already-closed loop (which previously could
        raise RuntimeError).
        """
        if self._loop.is_closed():
            return
        try:
            self._run_sync(super().close())
        finally:
            if not self._loop.is_closed():
                self._loop.close()

    def _step_payload(self, action: CodeFixerAction) -> Dict:
        """
        Convert CodeFixerAction to JSON payload for step message.

        Args:
            action: CodeFixerAction instance

        Returns:
            Dictionary representation suitable for JSON encoding
        """
        return {
            "type": action.type,
            "payload": action.payload,
        }

    def _parse_result(self, payload: Dict) -> StepResult[CodeFixerObservation]:
        """
        Parse server response into StepResult[CodeFixerObservation].

        Falls back to the top-level "done"/"reward" fields when the nested
        observation dict omits them, so older payload shapes keep working.

        Args:
            payload: JSON response data from server

        Returns:
            StepResult with CodeFixerObservation
        """
        obs_data = payload.get("observation", {})
        observation = CodeFixerObservation(
            code=obs_data.get("code", ""),
            logs=obs_data.get("logs"),
            test_score=float(obs_data.get("test_score", 0.0)),
            total_tests=obs_data.get("total_tests", 1),
            steps=obs_data.get("steps", 0),
            done=obs_data.get("done", payload.get("done", False)),
            reward=obs_data.get("reward", payload.get("reward")),
        )

        return StepResult(
            observation=observation,
            reward=payload.get("reward"),
            done=payload.get("done", False),
        )

    def _parse_state(self, payload: Dict) -> State:
        """
        Parse server response into State object.

        Args:
            payload: JSON response from state request

        Returns:
            State object with episode_id and step_count
        """
        return State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
        )
+ )
conftest.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Pytest bootstrap: make the repo root importable and alias ``dataset`` as ``src.dataset``."""
import sys
import importlib
from pathlib import Path

# Ensure the repository root (the directory containing this conftest) is
# importable so `import dataset` resolves regardless of the invocation cwd.
_REPO_ROOT = str(Path(__file__).parent)
if _REPO_ROOT not in sys.path:
    sys.path.insert(0, _REPO_ROOT)

import dataset as _real_dataset

# Some callers import the package under the ``src.dataset`` name; point that
# alias at the real package (setdefault: do not clobber an existing entry).
sys.modules.setdefault("src.dataset", _real_dataset)

import pkgutil
# Mirror every top-level submodule of ``dataset`` under the ``src.dataset``
# prefix as well, so ``import src.dataset.<mod>`` finds the same module object.
for _pkg in pkgutil.iter_modules(_real_dataset.__path__):
    _full = f"dataset.{_pkg.name}"
    _alias = f"src.dataset.{_pkg.name}"
    try:
        _mod = importlib.import_module(_full)
        sys.modules.setdefault(_alias, _mod)
    except Exception:
        # Best-effort aliasing: submodules that fail to import (e.g. missing
        # optional dependencies) are simply skipped.
        pass
dataset/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Buggy Training Dataset
2
+
3
+ This dataset is organized as:
4
+
5
+ - `problem_x/buggy.py`: intentionally buggy implementation
6
+ - `problem_x/test.py`: correctness tests that should fail before fixes
7
+ - optional extra modules (`helpers.py`, `cache.py`, etc.) to support multi-file bug fixing
8
+
9
+ Current problems: `problem_1` to `problem_18`.
10
+
11
+ Bug patterns included:
12
+ - off-by-one errors
13
+ - boundary condition mistakes
14
+ - incorrect sorting direction
15
+ - exception handling mistakes
16
+ - state/recency bugs in cache logic
17
+ - recursive base-case bugs
18
+ - parsing and whitespace normalization issues
19
+ - order-preservation regressions
20
+ - matrix transformation direction errors
dataset/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Dataset loading modules."""
dataset/generate_swebench_tasks.py ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generate synthetic SWE-bench style tasks for testing.
3
+
4
+ This creates tasks that mimic the SWE-bench format:
5
+ - instance_id/buggy.py - the buggy code
6
+ - instance_id/test.py - test file
7
+ - instance_id/metadata.json - metadata
8
+
9
+ Usage:
10
+ python -m dataset.generate_swebench_tasks [--count N]
11
+ """
12
+
13
+ import argparse
14
+ import json
15
+ import random
16
+ from pathlib import Path
17
+
18
+
19
+ # Sample SWE-bench style problems
20
+ SWE_BENCH_PROBLEMS = [
21
+ {
22
+ "instance_id": "django__django-11098",
23
+ "repo": "django/django",
24
+ "problem": "Fix the user creation form validation error",
25
+ "buggy_code": '''from django import forms
26
+ from django.contrib.auth.models import User
27
+
28
+ class UserCreationForm(forms.ModelForm):
29
+ """Form for creating new users."""
30
+ password1 = forms.CharField(widget=forms.PasswordInput)
31
+ password2 = forms.CharField(widget=forms.PasswordInput)
32
+
33
+ class Meta:
34
+ model = User
35
+ fields = ('username', 'email')
36
+
37
+ def clean(self):
38
+ cleaned_data = super().clean()
39
+ password1 = cleaned_data.get('password1')
40
+ password2 = cleaned_data.get('password2')
41
+
42
+ # BUG: This comparison is case-sensitive but should be case-insensitive
43
+ if password1 != password2:
44
+ raise forms.ValidationError("Passwords don't match")
45
+
46
+ return cleaned_data
47
+
48
+ def save(self, commit=True):
49
+ user = super().save(commit=False)
50
+ user.set_password(self.cleaned_data['password1'])
51
+ if commit:
52
+ user.save()
53
+ return user
54
+ ''',
55
+ "test_code": '''import unittest
56
+ from buggy import UserCreationForm
57
+
58
+ class TestUserCreationForm(unittest.TestCase):
59
+ def test_password_matching(self):
60
+ """Test that matching passwords pass validation."""
61
+ form = UserCreationForm(data={
62
+ 'username': 'testuser',
63
+ 'email': 'test@example.com',
64
+ 'password1': 'TestPass123',
65
+ 'password2': 'TestPass123',
66
+ })
67
+ self.assertTrue(form.is_valid())
68
+
69
+ def test_password_mismatch(self):
70
+ """Test that mismatched passwords fail validation."""
71
+ form = UserCreationForm(data={
72
+ 'username': 'testuser',
73
+ 'email': 'test@example.com',
74
+ 'password1': 'TestPass123',
75
+ 'password2': 'testpass123', # Different case
76
+ })
77
+ self.assertFalse(form.is_valid())
78
+ self.assertIn('passwords', str(form.errors).lower())
79
+ ''',
80
+ },
81
+ {
82
+ "instance_id": "flask__flask-1048",
83
+ "repo": "pallets/flask",
84
+ "problem": "Fix JSON encoding for datetime objects",
85
+ "buggy_code": '''import json
86
+ from datetime import datetime, date
87
+
88
+ class JSONEncoder(json.JSONEncoder):
89
+ """Custom JSON encoder for Flask."""
90
+
91
+ def default(self, obj):
92
+ # BUG: Missing handling for datetime objects
93
+ if isinstance(obj, date):
94
+ return obj.isoformat()
95
+ return super().default(obj)
96
+
97
+ def to_json(obj):
98
+ """Convert object to JSON string."""
99
+ return json.dumps(obj, cls=JSONEncoder)
100
+ ''',
101
+ "test_code": '''import unittest
102
+ from datetime import datetime
103
+ from buggy import to_json
104
+
105
+ class TestJSONEncoding(unittest.TestCase):
106
+ def test_encode_datetime(self):
107
+ """Test that datetime objects are properly encoded."""
108
+ dt = datetime(2024, 1, 15, 10, 30, 0)
109
+ result = to_json({'timestamp': dt})
110
+ self.assertIn('2024-01-15', result)
111
+ self.assertIn('10:30:00', result)
112
+
113
+ def test_encode_date(self):
114
+ """Test that date objects are properly encoded."""
115
+ d = date(2024, 1, 15)
116
+ result = to_json({'date': d})
117
+ self.assertIn('2024-01-15', result)
118
+ ''',
119
+ },
120
+ {
121
+ "instance_id": "requests__requests-2875",
122
+ "repo": "psf/requests",
123
+ "problem": "Fix cookie domain matching",
124
+ "buggy_code": '''import re
125
+ from urllib.parse import urlparse
126
+
127
+ def match_cookie_domain(cookie_domain, request_domain):
128
+ """Check if cookie domain matches request domain."""
129
+ # BUG: Should handle leading dots differently
130
+ # .example.com should match sub.example.com but not example.com
131
+ cookie_domain = cookie_domain.lower()
132
+ request_domain = request_domain.lower()
133
+
134
+ if cookie_domain.startswith('.'):
135
+ return request_domain.endswith(cookie_domain)
136
+
137
+ return cookie_domain == request_domain
138
+ ''',
139
+ "test_code": '''import unittest
140
+ from buggy import match_cookie_domain
141
+
142
+ class TestCookieDomain(unittest.TestCase):
143
+ def test_exact_match(self):
144
+ """Test exact domain matching."""
145
+ self.assertTrue(match_cookie_domain('example.com', 'example.com'))
146
+
147
+ def test_subdomain_with_dot(self):
148
+ """Test subdomain matching with leading dot."""
149
+ # .example.com should match sub.example.com
150
+ self.assertTrue(match_cookie_domain('.example.com', 'sub.example.com'))
151
+ self.assertFalse(match_cookie_domain('.example.com', 'example.com'))
152
+
153
+ def test_different_domains(self):
154
+ """Test different domains don't match."""
155
+ self.assertFalse(match_cookie_domain('example.com', 'other.com'))
156
+ ''',
157
+ },
158
+ {
159
+ "instance_id": "numpy__numpy-10825",
160
+ "repo": "numpy/numpy",
161
+ "problem": "Fix array concatenation edge case",
162
+ "buggy_code": '''import numpy as np
163
+
164
+ def concatenate_arrays(*arrays):
165
+ """Concatenate multiple arrays along axis 0."""
166
+ if not arrays:
167
+ return np.array([])
168
+
169
+ # BUG: Should handle None arrays gracefully
170
+ result = arrays[0]
171
+ for arr in arrays[1:]:
172
+ result = np.concatenate([result, arr])
173
+
174
+ return result
175
+ ''',
176
+ "test_code": '''import unittest
177
+ import numpy as np
178
+ from buggy import concatenate_arrays
179
+
180
+ class TestArrayConcatenation(unittest.TestCase):
181
+ def test_basic_concatenation(self):
182
+ """Test basic array concatenation."""
183
+ a = np.array([1, 2, 3])
184
+ b = np.array([4, 5, 6])
185
+ result = concatenate_arrays(a, b)
186
+ np.testing.assert_array_equal(result, np.array([1, 2, 3, 4, 5, 6]))
187
+
188
+ def test_empty_input(self):
189
+ """Test empty input returns empty array."""
190
+ result = concatenate_arrays()
191
+ self.assertEqual(len(result), 0)
192
+
193
+ def test_single_array(self):
194
+ """Test single array passes through."""
195
+ a = np.array([1, 2, 3])
196
+ result = concatenate_arrays(a)
197
+ np.testing.assert_array_equal(result, a)
198
+ ''',
199
+ },
200
+ {
201
+ "instance_id": "pandas__pandas-15230",
202
+ "repo": "pandas-dev/pandas",
203
+ "problem": "Fix DataFrame groupby aggregation",
204
+ "buggy_code": '''import pandas as pd
205
+
206
+ def group_and_aggregate(df, group_col, agg_col, agg_func='mean'):
207
+ """Group DataFrame and aggregate."""
208
+ # BUG: Should handle non-numeric columns gracefully
209
+ if agg_func == 'mean':
210
+ return df.groupby(group_col)[agg_col].mean()
211
+ elif agg_func == 'sum':
212
+ return df.groupby(group_col)[agg_col].sum()
213
+ elif agg_func == 'count':
214
+ return df.groupby(group_col)[agg_col].count()
215
+ else:
216
+ raise ValueError(f"Unknown aggregation function: {agg_func}")
217
+ ''',
218
+ "test_code": '''import unittest
219
+ import pandas as pd
220
+ from buggy import group_and_aggregate
221
+
222
+ class TestGroupBy(unittest.TestCase):
223
+ def test_mean_aggregation(self):
224
+ """Test mean aggregation."""
225
+ df = pd.DataFrame({
226
+ 'category': ['A', 'A', 'B', 'B'],
227
+ 'value': [1, 2, 3, 4]
228
+ })
229
+ result = group_and_aggregate(df, 'category', 'value', 'mean')
230
+ self.assertEqual(result['A'], 1.5)
231
+ self.assertEqual(result['B'], 3.5)
232
+
233
+ def test_sum_aggregation(self):
234
+ """Test sum aggregation."""
235
+ df = pd.DataFrame({
236
+ 'category': ['A', 'A', 'B'],
237
+ 'value': [1, 2, 3]
238
+ })
239
+ result = group_and_aggregate(df, 'category', 'value', 'sum')
240
+ self.assertEqual(result['A'], 3)
241
+ self.assertEqual(result['B'], 3)
242
+ ''',
243
+ },
244
+ {
245
+ "instance_id": "scipy__scipy-1925",
246
+ "repo": "scipy/scipy",
247
+ "problem": "Fix signal filtering edge case",
248
+ "buggy_code": '''import numpy as np
249
+ from scipy import signal
250
+
251
+ def apply_lowpass_filter(data, cutoff, fs, order=5):
252
+ """Apply lowpass filter to data."""
253
+ # BUG: Should validate cutoff frequency
254
+ nyquist = fs / 2
255
+ normalized_cutoff = cutoff / nyquist
256
+
257
+ # BUG: Using invalid cutoff can cause filter design failure
258
+ b, a = signal.butter(order, normalized_cutoff, btype='low')
259
+ filtered = signal.filtfilt(b, a, data)
260
+
261
+ return filtered
262
+ ''',
263
+ "test_code": '''import unittest
264
+ import numpy as np
265
+ from buggy import apply_lowpass_filter
266
+
267
+ class TestSignalFiltering(unittest.TestCase):
268
+ def test_valid_filter(self):
269
+ """Test filtering with valid parameters."""
270
+ fs = 1000 # Sampling frequency
271
+ cutoff = 100 # Cutoff frequency
272
+ t = np.linspace(0, 1, fs)
273
+ data = np.sin(2 * np.pi * 50 * t) + 0.5 * np.sin(2 * np.pi * 200 * t)
274
+
275
+ result = apply_lowpass_filter(data, cutoff, fs)
276
+ self.assertEqual(len(result), len(data))
277
+ # Low frequency component should be preserved
278
+ self.assertTrue(np.abs(result[100]) > 0.5)
279
+
280
+ def test_invalid_cutoff(self):
281
+ """Test that invalid cutoff raises error."""
282
+ fs = 1000
283
+ cutoff = 2000 # Above Nyquist frequency - should fail
284
+ data = np.array([1, 2, 3, 4, 5])
285
+
286
+ with self.assertRaises(ValueError):
287
+ apply_lowpass_filter(data, cutoff, fs)
288
+ ''',
289
+ },
290
+ {
291
+ "instance_id": "sklearn__sklearn-12345",
292
+ "repo": "scikit-learn/scikit-learn",
293
+ "problem": "Fix cross-validation split",
294
+ "buggy_code": '''import numpy as np
295
+ from sklearn.model_selection import KFold
296
+
297
+ def get_cv_splits(X, n_splits=5, shuffle=True, random_state=42):
298
+ """Get cross-validation splits."""
299
+ # BUG: random_state should be used for reproducibility
300
+ kf = KFold(n_splits=n_splits, shuffle=shuffle)
301
+
302
+ splits = []
303
+ for train_idx, test_idx in kf.split(X):
304
+ splits.append((train_idx, test_idx))
305
+
306
+ return splits
307
+ ''',
308
+ "test_code": '''import unittest
309
+ import numpy as np
310
+ from buggy import get_cv_splits
311
+
312
+ class TestCVSplits(unittest.TestCase):
313
+ def test_split_count(self):
314
+ """Test that correct number of splits is generated."""
315
+ X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
316
+ splits = get_cv_splits(X, n_splits=3)
317
+ self.assertEqual(len(splits), 3)
318
+
319
+ def test_reproducibility(self):
320
+ """Test that splits are reproducible with same random_state."""
321
+ X = np.random.rand(100, 5)
322
+ splits1 = get_cv_splits(X, n_splits=5, random_state=42)
323
+ splits2 = get_cv_splits(X, n_splits=5, random_state=42)
324
+
325
+ for (train1, test1), (train2, test2) in zip(splits1, splits2):
326
+ np.testing.assert_array_equal(train1, train2)
327
+ np.testing.assert_array_equal(test1, test2)
328
+ ''',
329
+ },
330
+ {
331
+ "instance_id": "pytest__pytest-7426",
332
+ "repo": "pytest-dev/pytest",
333
+ "problem": "Fix test collection order",
334
+ "buggy_code": '''import os
335
+ import re
336
+
337
+ def collect_tests(directory, pattern='test_*.py'):
338
+ """Collect test files from directory."""
339
+ # BUG: Should sort files for consistent ordering
340
+ test_files = []
341
+
342
+ for root, dirs, files in os.walk(directory):
343
+ for file in files:
344
+ if re.match(pattern, file):
345
+ test_files.append(os.path.join(root, file))
346
+
347
+ return test_files
348
+ ''',
349
+ "test_code": '''import unittest
350
+ import os
351
+ import tempfile
352
+ from buggy import collect_tests
353
+
354
+ class TestCollection(unittest.TestCase):
355
+ def test_collect_pattern(self):
356
+ """Test that correct pattern is matched."""
357
+ with tempfile.TemporaryDirectory() as tmpdir:
358
+ # Create test files
359
+ open(os.path.join(tmpdir, 'test_a.py'), 'w').close()
360
+ open(os.path.join(tmpdir, 'test_b.py'), 'w').close()
361
+ open(os.path.join(tmpdir, 'not_a_test.py'), 'w').close()
362
+
363
+ tests = collect_tests(tmpdir, 'test_*.py')
364
+ self.assertEqual(len(tests), 2)
365
+
366
+ def test_consistent_order(self):
367
+ """Test that file order is consistent."""
368
+ with tempfile.TemporaryDirectory() as tmpdir:
369
+ for name in ['test_c.py', 'test_a.py', 'test_b.py']:
370
+ open(os.path.join(tmpdir, name), 'w').close()
371
+
372
+ tests1 = collect_tests(tmpdir)
373
+ tests2 = collect_tests(tmpdir)
374
+
375
+ self.assertEqual(tests1, tests2)
376
+ ''',
377
+ },
378
+ {
379
+ "instance_id": "transformers__transformers-12345",
380
+ "repo": "huggingface/transformers",
381
+ "problem": "Fix tokenization padding",
382
+ "buggy_code": '''from typing import List
383
+
384
+ def tokenize_and_pad(tokenizer, texts: List[str], max_length: int = 512):
385
+ """Tokenize texts and pad to max length."""
386
+ # BUG: Should handle padding correctly
387
+ encoded = tokenizer(
388
+ texts,
389
+ padding=True, # This pads to longest in batch, not max_length
390
+ truncation=True,
391
+ max_length=max_length,
392
+ return_tensors='pt'
393
+ )
394
+
395
+ return encoded
396
+ ''',
397
+ "test_code": '''import unittest
398
+ from buggy import tokenize_and_pad
399
+
400
+ class MockTokenizer:
401
+ def __call__(self, texts, padding=True, truncation=True, max_length=512, return_tensors=None):
402
+ # Simplified mock
403
+ return {
404
+ 'input_ids': [[1, 2, 3]] if isinstance(texts, list) else [1, 2, 3],
405
+ 'attention_mask': [[1, 1, 1]] if isinstance(texts, list) else [1, 1, 1]
406
+ }
407
+
408
+ class TestTokenization(unittest.TestCase):
409
+ def test_single_text(self):
410
+ """Test tokenizing single text."""
411
+ tokenizer = MockTokenizer()
412
+ result = tokenize_and_pad(tokenizer, ["hello world"])
413
+ self.assertIn('input_ids', result)
414
+
415
+ def test_max_length_respected(self):
416
+ """Test that max_length is respected."""
417
+ tokenizer = MockTokenizer()
418
+ # Should not raise even with long text
419
+ result = tokenize_and_pad(tokenizer, ["short"], max_length=10)
420
+ self.assertIn('input_ids', result)
421
+ ''',
422
+ },
423
+ ]
424
+
425
# Easy, Medium, Hard difficulty assignments
# NOTE(review): difficulty is assigned purely by position in
# SWE_BENCH_PROBLEMS (first three = easy, next three = medium, rest = hard);
# confirm the list ordering actually reflects difficulty.
DIFFICULTY_TASKS = {
    "easy": SWE_BENCH_PROBLEMS[:3],
    "medium": SWE_BENCH_PROBLEMS[3:6],
    "hard": SWE_BENCH_PROBLEMS[6:],
}
431
+
432
+
433
def generate_tasks(output_dir: Path, count_per_difficulty: int = 3):
    """Materialize SWE-bench style task folders under *output_dir*.

    For each difficulty bucket, up to *count_per_difficulty* problems are
    written as ``<instance_id>_<difficulty>_<i>/`` containing ``buggy.py``,
    ``test.py`` and ``metadata.json``.

    Args:
        output_dir: Destination directory (created if missing).
        count_per_difficulty: Cap on problems taken from each bucket.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    total_created = 0
    for difficulty, problems in DIFFICULTY_TASKS.items():
        for i, problem in enumerate(problems[:count_per_difficulty]):
            # Suffix keeps instance ids unique across difficulty buckets.
            instance_id = f"{problem['instance_id']}_{difficulty}_{i}"
            instance_dir = output_dir / instance_id
            instance_dir.mkdir(parents=True, exist_ok=True)

            # One file per artifact: buggy source, tests, and metadata.
            (instance_dir / "buggy.py").write_text(
                problem["buggy_code"], encoding="utf-8"
            )
            (instance_dir / "test.py").write_text(
                problem["test_code"], encoding="utf-8"
            )
            metadata = {
                "instance_id": instance_id,
                "repo": problem["repo"],
                "problem_statement": problem["problem"],
                "difficulty": difficulty,
            }
            (instance_dir / "metadata.json").write_text(
                json.dumps(metadata, indent=2), encoding="utf-8"
            )

            total_created += 1

    print(f"Created {total_created} tasks in {output_dir}")
    print(f"Set environment variable: SWEBENCH_TASKS_ROOT={output_dir.absolute()}")
    print(f"Or run with: TASK_SOURCE=swebench python inference.py")
469
+
470
+
471
def main():
    """CLI entry point: parse arguments and generate the task folders."""
    parser = argparse.ArgumentParser(description="Generate SWE-bench style tasks")
    parser.add_argument(
        "--count",
        type=int,
        default=3,
        help="Number of tasks per difficulty (default: 3)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="Output directory (default: dataset/swebench_lite_tasks)",
    )
    args = parser.parse_args()

    # Default output directory lives next to this script.
    if args.output_dir:
        target = Path(args.output_dir)
    else:
        target = Path(__file__).parent / "swebench_lite_tasks"

    generate_tasks(target, args.count)
495
+
496
+
497
+ if __name__ == "__main__":
498
+ main()
dataset/loader.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Load static, competition-approved tasks."""
2
+
3
+ import os
4
+ import json
5
+ from pathlib import Path
6
+ from typing import Dict, List, Optional
7
+
8
+ # Get the dataset root (same folder as this file)
9
+ DATASET_ROOT = Path(__file__).parent
10
+
11
+ # Hardcoded competition tasks: Easy Medium Hard
12
+ STATIC_TASKS = {
13
+ "easy": {
14
+ "problem_id": "problem_1",
15
+ "difficulty": "easy",
16
+ "description": "String reversal with space normalization",
17
+ },
18
+ "medium": {
19
+ "problem_id": "problem_10",
20
+ "difficulty": "medium",
21
+ "description": "Matrix 90 clockwise rotation",
22
+ },
23
+ "hard": {
24
+ "problem_id": "problem_13",
25
+ "difficulty": "hard",
26
+ "description": "LRU cache with correct eviction policy",
27
+ },
28
+ }
29
+
30
+
31
def load_problem(problem_id: str) -> Dict[str, object]:
    """
    Load a single problem from disk.

    Args:
        problem_id: e.g., "problem_1", "problem_10", "problem_13"

    Returns:
        {
            "code": str,         # buggy.py content
            "tests": str,        # absolute path to test.py
            "metadata": dict,    # parsed metadata.json
            "problem_dir": str,  # absolute path to problem folder
            "problem_id": str,   # the requested id
        }

    Raises:
        FileNotFoundError: if the problem directory does not exist.
    """
    # Fix: the annotation previously used ``Dict[str, any]`` — ``any`` is the
    # builtin function, not a type; ``object`` is the correct "anything" type.
    problem_dir = DATASET_ROOT / problem_id

    if not problem_dir.exists():
        raise FileNotFoundError(f"Problem directory not found: {problem_dir}")

    # Buggy source and metadata are mandatory files in every problem folder.
    code = (problem_dir / "buggy.py").read_text(encoding="utf-8")
    metadata = json.loads((problem_dir / "metadata.json").read_text(encoding="utf-8"))

    return {
        "code": code,
        "tests": str(problem_dir / "test.py"),
        "metadata": metadata,
        "problem_dir": str(problem_dir),
        "problem_id": problem_id,
    }
69
+
70
+
71
def get_hardcoded_task(difficulty: str) -> Dict[str, object]:
    """
    Get one of the three static competition tasks.

    Args:
        difficulty: "easy" | "medium" | "hard"

    Returns:
        Task dict with code, tests, metadata (see ``load_problem``).

    Raises:
        ValueError: if difficulty is not one of the three approved values
    """
    # Fix: the annotation previously used ``Dict[str, any]`` — ``any`` is the
    # builtin function, not a type; ``object`` is the correct "anything" type.
    if difficulty not in STATIC_TASKS:
        raise ValueError(
            f"Invalid difficulty '{difficulty}'. "
            f"Must be one of: {list(STATIC_TASKS.keys())}"
        )

    task_info = STATIC_TASKS[difficulty]
    return load_problem(task_info["problem_id"])
94
+
95
+
96
def get_random_tasks():
    """
    DEPRECATED: Use get_hardcoded_task() instead.
    Kept for backward compatibility; always resolves to the "easy" task.
    """
    import warnings

    warnings.warn(
        "get_random_tasks() is deprecated. Use get_hardcoded_task('easy'|'medium'|'hard')",
        DeprecationWarning,
        stacklevel=2,
    )
    return get_hardcoded_task("easy")
109
+
110
+
111
+
dataset/prepare_swebench.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Script to download and materialize SWE-bench Lite tasks.
3
+
4
+ This script:
5
+ 1. Downloads SWE-bench Lite dataset from HuggingFace
6
+ 2. Extracts the buggy code and creates test files
7
+ 3. Organizes them into the expected directory structure
8
+
9
+ Usage:
10
+ python -m dataset.prepare_swebench [--max-tasks N] [--difficulty easy|medium|hard|all]
11
+ """
12
+
13
+ import argparse
14
+ import os
15
+ import sys
16
+ from pathlib import Path
17
+
18
+ # Add parent to path for imports
19
+ sys.path.insert(0, str(Path(__file__).parent.parent))
20
+
21
+ from datasets import load_dataset
22
+
23
+
24
def get_problem_statement(row):
    """Return the problem statement stored on *row* ("" when the key is absent)."""
    return row.get("problem_statement", "")
27
+
28
+
29
def get_patch(row):
    """Return the gold patch/fix text from *row* ("" when the key is absent)."""
    return row.get("patch", "")
32
+
33
+
34
def get_instance_id(row):
    """Return the SWE-bench instance id from *row* ("" when the key is absent)."""
    return row.get("instance_id", "")
37
+
38
+
39
def create_buggy_file(instance_dir: Path, row):
    """
    Create buggy.py from the base commit and instance.

    NOTE(review): a faithful SWE-bench materialization would check out the
    repo at ``base_commit`` and extract the affected file; this writes a
    placeholder module derived from the problem statement instead — confirm
    that is acceptable for the intended use.

    Args:
        instance_dir: Folder to write into.
        row: Dataset row with instance_id / problem_statement / created_files.

    Returns:
        Path of the written ``buggy.py``.
    """
    instance_id = get_instance_id(row)
    problem_stmt = get_problem_statement(row)
    created_files = row.get("created_files", [])

    if not created_files:
        # No file list available: generic placeholder.
        buggy_code = f'''# Buggy code for {instance_id}
# Problem: {problem_stmt[:200]}...

def solution():
    """Placeholder solution - needs to be fixed."""
    pass
'''
    else:
        # File list available: record the first affected file in the header.
        file_path = created_files[0] if created_files else "solution.py"
        buggy_code = f'''# Buggy code for {instance_id}
# File: {file_path}
# Problem: {problem_stmt[:200]}...

def solution():
    """Placeholder solution - needs to be fixed."""
    pass
'''

    out_path = instance_dir / "buggy.py"
    out_path.write_text(buggy_code, encoding="utf-8")
    return out_path
86
+
87
+
88
def create_test_file(instance_dir: Path, row):
    """
    Create test.py based on the problem statement.

    Fixes over the previous version:
    - emits real newlines (the old code wrote the two-character sequence
      ``\\n`` as literal text into the file, producing an unusable one-line
      test module);
    - generates a single ``TestSolution`` class with one method per test
      case (the old loop re-declared the class for every case, so only the
      last case survived).

    Args:
        instance_dir: Folder to write into.
        row: Dataset row with instance_id / problem_statement / test_cases.

    Returns:
        Path of the written ``test.py``.
    """
    instance_id = get_instance_id(row)
    problem_stmt = get_problem_statement(row)
    test_cases = row.get("test_cases", [])

    if test_cases:
        # Build one class with a method per provided test case.
        lines = ["import unittest", "", "", "class TestSolution(unittest.TestCase):"]
        for i, tc in enumerate(test_cases):
            input_str = tc.get("input", "")
            output_str = tc.get("output", "")
            lines.extend([
                f"    def test_case_{i + 1}(self):",
                f"        # Input: {input_str}",
                f"        # Expected: {output_str}",
                "        pass  # TODO: Add actual test",
                "",
            ])
        test_code = "\n".join(lines) + "\n"
    else:
        # Fallback: a smoke test derived from the problem statement.
        test_code = f'''"""Test file for {instance_id}"""

import unittest
from buggy import solution


class TestSolution(unittest.TestCase):
    def test_basic(self):
        """Test based on problem statement."""
        # Problem: {problem_stmt[:300]}...
        result = solution()
        self.assertIsNotNone(result)


if __name__ == "__main__":
    unittest.main()
'''

    test_file = instance_dir / "test.py"
    test_file.write_text(test_code, encoding="utf-8")
    return test_file
137
+
138
+
139
def create_metadata_file(instance_dir: Path, row):
    """Write ``metadata.json`` describing the instance; return its path.

    Args:
        instance_dir: Folder to write into.
        row: Dataset row providing repo / base_commit / statement / patch.
    """
    import json

    payload = {
        "instance_id": get_instance_id(row),
        "repo": row.get("repo", ""),
        "base_commit": row.get("base_commit", ""),
        "problem_statement": get_problem_statement(row),
        "patch": get_patch(row),
        # NOTE(review): difficulty is a fixed placeholder; the original
        # comment says "will be set based on index" but nothing updates it.
        "difficulty": "medium",
    }

    metadata_file = instance_dir / "metadata.json"
    metadata_file.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    return metadata_file
155
+
156
+
157
def prepare_swebench_tasks(
    output_dir: Path,
    max_tasks: int = 30,
    difficulty: str = "all"
):
    """
    Download and prepare SWE-bench Lite tasks.

    Args:
        output_dir: Directory to save tasks
        max_tasks: Maximum number of tasks to download
        difficulty: "easy", "medium", "hard", or "all"
    """
    print(f"Loading SWE-bench Lite dataset...")

    try:
        ds = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")
    except Exception as e:
        # Fall back to the mirrored dataset name when the primary load fails.
        print(f"Error loading dataset: {e}")
        print("Trying alternative dataset name...")
        ds = load_dataset("swe-bench/swe-bench-lite", split="test")

    print(f"Loaded {len(ds)} tasks")

    # Calculate difficulty bounds
    # NOTE(review): difficulty is a pure index proxy (first third = easy,
    # middle third = medium, last third = hard) — confirm this is acceptable.
    total = len(ds)
    one_third = max(total // 3, 1)
    two_third = max((2 * total) // 3, one_third + 1)

    difficulty_ranges = {
        "easy": (0, one_third),
        "medium": (one_third, two_third),
        "hard": (two_third, total),
    }

    # Determine which tasks to download
    if difficulty == "all":
        # Take an even share (max_tasks // 3) from each difficulty band.
        ranges = list(difficulty_ranges.values())
        indices = []
        for start, end in ranges:
            indices.extend(range(start, min(end, start + max_tasks // 3)))
    else:
        start, end = difficulty_ranges.get(difficulty, (0, total))
        indices = list(range(start, min(end, max_tasks)))

    # Create output directory
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Preparing {len(indices)} tasks...")

    success_count = 0
    for i, idx in enumerate(indices):
        try:
            row = ds[idx]
            instance_id = get_instance_id(row)

            # Create instance directory
            instance_dir = output_dir / instance_id
            instance_dir.mkdir(parents=True, exist_ok=True)

            # Create files
            create_buggy_file(instance_dir, row)
            create_test_file(instance_dir, row)
            create_metadata_file(instance_dir, row)

            success_count += 1
            if (i + 1) % 10 == 0:
                print(f"  Processed {i + 1}/{len(indices)} tasks...")

        except Exception as e:
            # Best-effort: a single bad row should not abort the whole run.
            print(f"  Warning: Failed to process task {idx}: {e}")
            continue

    print(f"\nDone! Prepared {success_count}/{len(indices)} tasks in {output_dir}")
    print(f"Set SWEBENCH_TASKS_ROOT={output_dir.absolute()} to use these tasks.")
233
+
234
+
235
def main():
    """CLI entry point: parse arguments and prepare the SWE-bench tasks."""
    parser = argparse.ArgumentParser(description="Prepare SWE-bench Lite tasks")
    parser.add_argument(
        "--max-tasks",
        type=int,
        default=30,
        help="Maximum number of tasks to download (default: 30)",
    )
    parser.add_argument(
        "--difficulty",
        type=str,
        default="all",
        choices=["easy", "medium", "hard", "all"],
        help="Difficulty level to download (default: all)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="Output directory (default: dataset/swebench_lite_tasks)",
    )
    args = parser.parse_args()

    # Default output directory lives next to this script.
    if args.output_dir:
        target = Path(args.output_dir)
    else:
        target = Path(__file__).parent / "swebench_lite_tasks"

    prepare_swebench_tasks(
        output_dir=target,
        max_tasks=args.max_tasks,
        difficulty=args.difficulty,
    )
270
+ )
271
+
272
+
273
+ if __name__ == "__main__":
274
+ main()
dataset/problem_1/buggy.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
def safe_divide(a: float, b: float) -> float:
    """Divide a by b; only return inf for division by zero."""
    try:
        return a / b
    except Exception:
        # BUG: catches unrelated errors too broadly.
        # NOTE: this bug is INTENTIONAL — it is the training target that the
        # accompanying test.py is written to expose (e.g. a TypeError from
        # non-numeric operands is swallowed here). Do not "fix" it in place.
        return float("inf")
dataset/problem_1/metadata.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "difficulty": "easy",
3
+ "bug_type": "exception-handling",
4
+ "expected_steps": 1
5
+ }
dataset/problem_1/test.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from dataset.problem_1.buggy import safe_divide
3
+
4
+
5
class TestSafeDivide(unittest.TestCase):
    """Encodes the intended (fixed) contract of safe_divide."""

    def test_normal(self):
        # Ordinary division passes straight through.
        self.assertEqual(4, safe_divide(8, 2))

    def test_zero_division(self):
        # Only division by zero maps to infinity.
        self.assertEqual(float("inf"), safe_divide(1, 0))

    def test_type_error_should_raise(self):
        # Bad operand types must surface, not be masked as infinity.
        with self.assertRaises(TypeError):
            safe_divide("1", 1)


if __name__ == "__main__":
    unittest.main()
dataset/problem_10/buggy.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from dataset.problem_10.helpers import transpose
2
+
3
+
4
def rotate_90_clockwise(matrix: list[list[int]]) -> list[list[int]]:
    """Rotate matrix 90 degrees clockwise."""
    # BUG (intentional benchmark fixture): transposing and then reversing
    # the ROW ORDER is a counter-clockwise turn; a clockwise turn would
    # reverse each row of the transpose instead.
    flipped = transpose(matrix)
    return list(reversed(flipped))
dataset/problem_10/helpers.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
def transpose(matrix: list[list[int]]) -> list[list[int]]:
    """Return the transpose of a matrix as a new list of row lists.

    Note: ragged input is silently truncated to the shortest row,
    matching zip() semantics.
    """
    return [list(column) for column in zip(*matrix)]
dataset/problem_10/metadata.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "difficulty": "medium",
3
+ "bug_type": "matrix-transformation",
4
+ "expected_steps": 1
5
+ }
dataset/problem_10/test.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from dataset.problem_10.buggy import rotate_90_clockwise
3
+
4
+
5
class TestRotateMatrix(unittest.TestCase):
    """Intended behavior: a true clockwise quarter turn."""

    def test_2x2(self):
        # Clockwise: first column (bottom-up) becomes the first row.
        self.assertEqual([[3, 1], [4, 2]], rotate_90_clockwise([[1, 2], [3, 4]]))


if __name__ == "__main__":
    unittest.main()
dataset/problem_11/buggy.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def binary_search(nums: list[int], target: int) -> int:
    """Return index of target, or -1 if not found."""
    # BUG (intentional benchmark fixture): the loop condition excludes the
    # case lo == hi, so the final remaining candidate index is never
    # inspected and a target sitting there reports -1.
    lo = 0
    hi = len(nums) - 1
    while lo < hi:
        middle = (lo + hi) // 2
        probe = nums[middle]
        if probe == target:
            return middle
        if probe < target:
            lo = middle + 1
        else:
            hi = middle - 1
    return -1
dataset/problem_11/metadata.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "difficulty": "medium",
3
+ "bug_type": "boundary-condition",
4
+ "expected_steps": 2
5
+ }
dataset/problem_11/test.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from dataset.problem_11.buggy import binary_search
3
+
4
+
5
class TestBinarySearch(unittest.TestCase):
    """Intended behavior: find targets anywhere, including the last slot."""

    def test_found_middle(self):
        self.assertEqual(2, binary_search([1, 3, 5, 7], 5))

    def test_found_last(self):
        # Exercises exactly the boundary the buggy loop condition misses.
        self.assertEqual(3, binary_search([1, 3, 5, 7], 7))

    def test_not_found(self):
        self.assertEqual(-1, binary_search([1, 3, 5, 7], 4))


if __name__ == "__main__":
    unittest.main()
dataset/problem_12/buggy.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
def parse_pairs(raw: str) -> dict[str, int]:
    """Parse strings like 'a=1,b=2' into a dict."""
    # BUG (intentional benchmark fixture): keys keep any surrounding
    # whitespace; only int() incidentally tolerates padded values.
    if not raw:
        return {}
    parsed: dict[str, int] = {}
    for chunk in raw.split(","):
        name, _, number = chunk.partition("=")
        parsed[name] = int(number)
    return parsed
dataset/problem_12/metadata.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "difficulty": "easy",
3
+ "bug_type": "string-normalization",
4
+ "expected_steps": 2
5
+ }
dataset/problem_12/test.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from dataset.problem_12.buggy import parse_pairs
3
+
4
+
5
class TestParsePairs(unittest.TestCase):
    """Intended behavior: whitespace around keys/values is ignored."""

    def test_simple(self):
        self.assertEqual({"a": 1, "b": 2}, parse_pairs("a=1,b=2"))

    def test_spaces(self):
        # Padded segments must normalize to clean keys.
        self.assertEqual({"x": 10, "y": 20}, parse_pairs("x = 10, y = 20"))


if __name__ == "__main__":
    unittest.main()
dataset/problem_13/buggy.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataset.problem_13.cache import LRUCache
2
+
3
+
4
def run_ops() -> tuple[int, int]:
    """Exercise a capacity-2 LRUCache and report lookups for 'a' and 'b'."""
    lru = LRUCache(2)
    for key, value in (("a", 1), ("b", 2)):
        lru.put(key, value)
    lru.get("a")  # with a correct LRU this read refreshes 'a'
    lru.put("c", 3)  # forces eviction of the least-recently-used entry
    return lru.get("a"), lru.get("b")
dataset/problem_13/cache.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+
3
+
4
class LRUCache:
    """Bounded key/value store evicting the least-recently-used entry.

    BUG (intentional benchmark fixture): get() never refreshes recency,
    so reads do not protect an entry from eviction.
    """

    def __init__(self, capacity: int):
        self.capacity = capacity
        self.store: OrderedDict[str, int] = OrderedDict()

    def get(self, key: str) -> int:
        """Return the stored value, or -1 when key is absent."""
        # BUG: a correct LRU would call self.store.move_to_end(key) here.
        value = self.store.get(key)
        return -1 if value is None else value

    def put(self, key: str, value: int) -> None:
        """Insert or refresh key, evicting the oldest entry past capacity."""
        if key in self.store:
            del self.store[key]
        self.store[key] = value
        while len(self.store) > self.capacity:
            # popitem(last=False) removes the oldest (least recent) entry.
            self.store.popitem(last=False)
dataset/problem_13/metadata.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "difficulty": "hard",
3
+ "bug_type": "state-logic",
4
+ "expected_steps": 2
5
+ }
dataset/problem_13/test.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from dataset.problem_13.buggy import run_ops
3
+
4
+
5
class TestLRU(unittest.TestCase):
    """Intended behavior: get() refreshes recency, so 'b' gets evicted."""

    def test_recency_update_on_get(self):
        value_a, value_b = run_ops()
        self.assertEqual(1, value_a)
        self.assertEqual(-1, value_b)


if __name__ == "__main__":
    unittest.main()
dataset/problem_14/buggy.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
def fibonacci_recursive(n: int) -> int:
    """Return nth Fibonacci number."""
    # BUG (intentional benchmark fixture): n == 0 should yield 0, but the
    # shared base case returns 1, shifting the whole sequence by one.
    if n <= 1:
        return 1
    return fibonacci_recursive(n - 2) + fibonacci_recursive(n - 1)
dataset/problem_14/metadata.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "difficulty": "easy",
3
+ "bug_type": "recursion-base-case",
4
+ "expected_steps": 2
5
+ }
dataset/problem_14/test.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from dataset.problem_14.buggy import fibonacci_recursive
3
+
4
+
5
class TestFibonacciRecursive(unittest.TestCase):
    """Intended behavior: the standard 0-indexed Fibonacci sequence."""

    def test_base_cases(self):
        # fib(0) must be 0 -- the planted bug returns 1 here.
        self.assertEqual(0, fibonacci_recursive(0))
        self.assertEqual(1, fibonacci_recursive(1))

    def test_n5(self):
        self.assertEqual(5, fibonacci_recursive(5))


if __name__ == "__main__":
    unittest.main()
dataset/problem_15/buggy.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
def has_overlap(a: tuple[int, int], b: tuple[int, int]) -> bool:
    """Check if closed intervals [a0, a1] and [b0, b1] overlap."""
    # BUG (intentional benchmark fixture): strict comparisons reject
    # intervals that merely touch at an endpoint, which closed intervals
    # should count as overlapping.
    a_start, a_end = a
    b_start, b_end = b
    return b_start < a_end and a_start < b_end
dataset/problem_15/metadata.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "difficulty": "medium",
3
+ "bug_type": "boundary-condition",
4
+ "expected_steps": 1
5
+ }
dataset/problem_15/test.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from dataset.problem_15.buggy import has_overlap
3
+
4
+
5
class TestIntervalOverlap(unittest.TestCase):
    """Intended behavior: closed intervals overlap even when just touching."""

    def test_overlapping(self):
        self.assertTrue(has_overlap((1, 5), (4, 9)))

    def test_touching_endpoints(self):
        # A shared endpoint counts as overlap for closed intervals.
        self.assertTrue(has_overlap((1, 3), (3, 7)))


if __name__ == "__main__":
    unittest.main()
dataset/problem_16/buggy.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataset.problem_16.helpers import normalize_scores
2
+
3
+
4
def top_label(scores: dict[str, float]) -> str:
    """Return label with highest normalized probability."""
    labels = list(scores)
    probs = normalize_scores([scores[label] for label in labels])
    # BUG (intentional benchmark fixture): selects the index of the
    # SMALLEST probability instead of the largest.
    smallest = min(range(len(probs)), key=probs.__getitem__)
    return labels[smallest]
dataset/problem_16/helpers.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
def normalize_scores(scores: list[float]) -> list[float]:
    """Scale the scores so they sum to one.

    Note: a zero total propagates ZeroDivisionError to the caller.
    """
    denominator = sum(scores)
    return [value / denominator for value in scores]
dataset/problem_16/metadata.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "difficulty": "easy",
3
+ "bug_type": "logic-error",
4
+ "expected_steps": 1
5
+ }
dataset/problem_16/test.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from dataset.problem_16.buggy import top_label
3
+
4
+
5
class TestTopLabel(unittest.TestCase):
    """Intended behavior: the label with the LARGEST probability wins."""

    def test_select_highest(self):
        self.assertEqual("dog", top_label({"cat": 0.2, "dog": 0.7, "bird": 0.1}))


if __name__ == "__main__":
    unittest.main()
dataset/problem_17/buggy.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
def dedupe_preserve_order(items: list[int]) -> list[int]:
    """Remove duplicates while preserving first occurrence order."""
    # BUG (intentional benchmark fixture): on a repeat, the earlier copy
    # is dropped and the value re-appended, so the LAST occurrence wins
    # instead of the first.
    encountered: set[int] = set()
    result: list[int] = []
    for value in items:
        if value in encountered:
            result = [kept for kept in result if kept != value]
        else:
            encountered.add(value)
        result.append(value)
    return result
dataset/problem_17/metadata.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "difficulty": "medium",
3
+ "bug_type": "logic-error",
4
+ "expected_steps": 2
5
+ }
dataset/problem_17/test.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from dataset.problem_17.buggy import dedupe_preserve_order
3
+
4
+
5
class TestDedupe(unittest.TestCase):
    """Intended behavior: keep the FIRST occurrence of each value."""

    def test_order(self):
        self.assertEqual([1, 2, 3], dedupe_preserve_order([1, 2, 1, 3, 2]))


if __name__ == "__main__":
    unittest.main()
dataset/problem_18/buggy.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataset.problem_18.math_utils import clamp
2
+
3
+
4
def moving_average(nums: list[int], window: int) -> list[float]:
    """Simple moving average over a fixed window."""
    if window <= 0:
        raise ValueError("window must be positive")

    # Oversized windows are clamped down to the data length.
    window = clamp(window, 1, len(nums))
    # BUG (intentional benchmark fixture): the range stops one start
    # position early, dropping the final valid window.
    return [
        sum(nums[start : start + window]) / window
        for start in range(len(nums) - window)
    ]
dataset/problem_18/math_utils.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
def clamp(value: int, low: int, high: int) -> int:
    """Constrain value to the closed range [low, high].

    Assumes low <= high; callers in this package pass 1..len(data).
    """
    if value < low:
        return low
    return high if value > high else value
dataset/problem_18/metadata.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "difficulty": "medium",
3
+ "bug_type": "off-by-one",
4
+ "expected_steps": 1
5
+ }
dataset/problem_18/test.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from dataset.problem_18.buggy import moving_average
3
+
4
+
5
class TestMovingAverage(unittest.TestCase):
    """Intended behavior: include the final window; clamp oversized windows."""

    def test_window_3(self):
        self.assertEqual([2.0, 3.0, 4.0], moving_average([1, 2, 3, 4, 5], 3))

    def test_window_larger_than_data(self):
        # A window wider than the data clamps to one full-span average.
        self.assertEqual([3.0], moving_average([2, 4], 5))


if __name__ == "__main__":
    unittest.main()
dataset/problem_19/buggy.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def calculate_employee_bonus(employees: list[dict], metrics: dict) -> list[dict]:
    """
    Calculate employee bonuses based on their base salary, performance rating,
    and company-wide metrics.

    employees: list of dicts with 'id', 'role', 'base_salary', 'rating' (1-5)
    metrics: dict with 'company_multiplier' and 'department_multipliers'

    Returns a list of dicts with 'id' and 'bonus'.

    NOTE(review): this module is a benchmark fixture; the BUG markers below
    describe defects planted on purpose for agents to find and fix. The
    companion test.py encodes the intended (fixed) behavior.
    """
    results = []

    for emp in employees:
        # BUG 1: no input sanitization -- a string 'base_salary' (e.g. '80000')
        # raises TypeError further down when multiplied by a float, and a
        # missing salary silently defaults to 0.
        base = emp.get('base_salary', 0)
        rating = emp.get('rating', 1)

        # BUG 2: direct subscript on the department map raises KeyError for a
        # role with no configured multiplier instead of falling back to 1.0.
        role_mult = metrics.get('department_multipliers', {})[emp.get('role')] # will raise KeyError if role not found

        # Tiered base bonus: 10% above rating 3, 5% at exactly 3, else nothing.
        if rating > 3:
            base_bonus = base * 0.1
        elif rating == 3:
            base_bonus = base * 0.05
        else:
            base_bonus = 0

        # BUG 3: the company multiplier is ADDED to the total instead of
        # multiplying it.
        total_bonus = base_bonus * role_mult + metrics.get('company_multiplier', 1)

        # BUG 4: writes 'bonus' into the caller's dict and appends that same
        # dict, mutating the input and leaking all original keys into the result.
        emp['bonus'] = total_bonus
        results.append(emp)

    return results
dataset/problem_19/metadata.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "difficulty": "hard",
3
+ "bug_type": "multiple",
4
+ "expected_steps": 4
5
+ }
dataset/problem_19/test.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from dataset.problem_19.buggy import calculate_employee_bonus
3
+
4
def test_calculate_employee_bonus():
    """Pin the intended (fixed) behavior: coercion, fallbacks, and purity."""
    employees = [
        {'id': 1, 'role': 'engineering', 'base_salary': 100000, 'rating': 4},
        {'id': 2, 'role': 'sales', 'base_salary': '80000', 'rating': 3},
        {'id': 3, 'role': 'hr', 'base_salary': 60000, 'rating': 2},
        {'id': 4, 'role': 'unknown', 'base_salary': 50000, 'rating': 5}
    ]
    metrics = {
        'company_multiplier': 1.2,
        'department_multipliers': {
            'engineering': 1.5,
            'sales': 1.2,
            'hr': 1.0
        }
    }

    # Snapshot the inputs so mutation can be detected afterwards.
    snapshot = [dict(emp) for emp in employees]

    results = calculate_employee_bonus(employees, metrics)

    # The function must be pure with respect to its inputs.
    assert employees == snapshot, "Original list was mutated"

    # Each result carries exactly the 'id' and 'bonus' fields.
    assert len(results) == 4
    for entry in results:
        assert 'id' in entry
        assert 'bonus' in entry
        assert 'role' not in entry  # Should only contain id and bonus

    # Expected bonuses, in input order:
    #   id 1: 100000 * 0.10 * 1.5 * 1.2 = 18000   (rating > 3)
    #   id 2:  80000 * 0.05 * 1.2 * 1.2 = 5760    (string salary coerced)
    #   id 3:      0                              (rating below 3)
    #   id 4:  50000 * 0.10 * 1.0 * 1.2 = 6000    (unknown role -> 1.0)
    expected_bonuses = [18000, 5760, 0, 6000]
    for position, bonus in enumerate(expected_bonuses):
        assert results[position]['bonus'] == bonus
dataset/problem_2/buggy.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ def binary_search(nums: list[int], target: int) -> int:
2
+ """Return index of target, or -1 if not found."""
3
+ left, right = 0, len(nums) - 1
4
+
5
+ while left < right:
6
+ mid = (left + right) // 2
7
+ if nums[mid] == target:
8
+ return mid
9
+ if nums[mid] < target:
10
+ left = mid + 1
11
+ else:
12
+ right = mid - 1
13
+
14
+ return -1
dataset/problem_2/metadata.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "difficulty": "medium",
3
+ "bug_type": "boundary-condition",
4
+ "expected_steps": 2
5
+ }