Spaces:
Running
Running
Commit Β·
2ce1061
0
Parent(s):
Initial commit
Browse files- .dockerignore +9 -0
- .vscode/settings.json +4 -0
- README.md +222 -0
- STRUCTURE.md +29 -0
- __init__.py +5 -0
- client.py +62 -0
- inference.py +285 -0
- models.py +73 -0
- openenv.yaml +48 -0
- pyproject.toml +26 -0
- server/Dockerfile +30 -0
- server/__init__.py +1 -0
- server/__pycache__/__init__.cpython-39.pyc +0 -0
- server/app.py +123 -0
- server/environment.py +147 -0
- server/graders/__init__.py +6 -0
- server/graders/grader_easy.py +90 -0
- server/graders/grader_hard.py +70 -0
- server/graders/grader_medium.py +22 -0
- server/requirements.txt +6 -0
- server/tasks/__init__.py +10 -0
- server/tasks/__pycache__/__init__.cpython-39.pyc +0 -0
- server/tasks/__pycache__/task_easy.cpython-39.pyc +0 -0
- server/tasks/__pycache__/task_hard.cpython-39.pyc +0 -0
- server/tasks/__pycache__/task_medium.cpython-39.pyc +0 -0
- server/tasks/task_easy.py +415 -0
- server/tasks/task_hard.py +628 -0
- server/tasks/task_medium.py +507 -0
- validator/__pycache__/pre_submit_check.cpython-39.pyc +0 -0
- validator/pre_submit_check.py +192 -0
.dockerignore
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
.env
|
| 5 |
+
.git
|
| 6 |
+
.gitignore
|
| 7 |
+
outputs/
|
| 8 |
+
*.log
|
| 9 |
+
.pytest_cache
|
.vscode/settings.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"python-envs.defaultEnvManager": "ms-python.python:conda",
|
| 3 |
+
"python-envs.defaultPackageManager": "ms-python.python:conda"
|
| 4 |
+
}
|
README.md
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Code Debug Environment
|
| 2 |
+
|
| 3 |
+
An [OpenEnv](https://github.com/meta-pytorch/OpenEnv)-compatible RL environment where an LLM agent diagnoses and fixes buggy Python code across three difficulty levels.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Overview
|
| 8 |
+
|
| 9 |
+
| Property | Value |
|
| 10 |
+
|---|---|
|
| 11 |
+
| Domain | Real-world Python code debugging |
|
| 12 |
+
| Tasks | 45 total (15 easy + 15 medium + 15 hard) |
|
| 13 |
+
| Difficulties | easy → medium → hard |
|
| 14 |
+
| Reward Range | 0.0 – 1.0 (partial, proportional) |
|
| 15 |
+
| Max Steps/Episode | 3 |
|
| 16 |
+
| API | OpenEnv standard: `/reset`, `/step`, `/state` |
|
| 17 |
+
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
## Environment Description
|
| 21 |
+
|
| 22 |
+
The agent receives a buggy Python function and must fix it. Tasks come from real-world domains: data processing, string algorithms, API validation, sorting, dynamic programming, and graph algorithms.
|
| 23 |
+
|
| 24 |
+
- **Easy**: One bug (wrong operator, off-by-one, incorrect return). Reward proportional to test pass rate.
|
| 25 |
+
- **Medium**: Two bugs (logic bug + edge case). Reward proportional to test pass rate.
|
| 26 |
+
- **Hard**: One algorithmic bug + agent must explain what was wrong. Reward = 0.7 × test score + 0.3 × explanation quality.
|
| 27 |
+
|
| 28 |
+
---
|
| 29 |
+
|
| 30 |
+
## Action Space
|
| 31 |
+
|
| 32 |
+
```json
|
| 33 |
+
{
  "fixed_code": "string — the corrected Python function (required)",
  "explanation": "string — explanation of what was wrong (required for hard tasks)"
}
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
| Field | Type | Required | Description |
|
| 40 |
+
|---|---|---|---|
|
| 41 |
+
| `fixed_code` | `str` | Always | Complete corrected Python function as a string |
|
| 42 |
+
| `explanation` | `str` | Hard tasks | Describe the bug and why your fix is correct |
|
| 43 |
+
|
| 44 |
+
---
|
| 45 |
+
|
| 46 |
+
## Observation Space
|
| 47 |
+
|
| 48 |
+
Returned by `/reset` and `/step`:
|
| 49 |
+
|
| 50 |
+
```json
|
| 51 |
+
{
|
| 52 |
+
"task_id": "easy_003",
|
| 53 |
+
"difficulty": "easy",
|
| 54 |
+
"buggy_code": "def find_max(nums):\n ...",
|
| 55 |
+
"instructions": "The function has exactly one bug. Fix it.",
|
| 56 |
+
"test_cases_description": "Finds max value in a list without IndexError",
|
| 57 |
+
"reward": 0.67,
|
| 58 |
+
"passed_tests": 2,
|
| 59 |
+
"total_tests": 3,
|
| 60 |
+
"feedback": "Test 1: ✅ ...\nTest 2: ✅ ...\nTest 3: ❌ ...",
|
| 61 |
+
"done": false
|
| 62 |
+
}
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
| Field | Type | Description |
|
| 66 |
+
|---|---|---|
|
| 67 |
+
| `task_id` | `str` | Unique task identifier |
|
| 68 |
+
| `difficulty` | `str` | `easy` / `medium` / `hard` |
|
| 69 |
+
| `buggy_code` | `str` | Buggy Python function to fix |
|
| 70 |
+
| `instructions` | `str` | Task instructions |
|
| 71 |
+
| `test_cases_description` | `str` | What the test cases check |
|
| 72 |
+
| `reward` | `float\|null` | Score from last step (null on reset) |
|
| 73 |
+
| `passed_tests` | `int\|null` | Tests passed (null on reset) |
|
| 74 |
+
| `total_tests` | `int` | Total number of test cases |
|
| 75 |
+
| `feedback` | `str\|null` | Detailed per-test feedback |
|
| 76 |
+
| `done` | `bool` | True when episode is complete |
|
| 77 |
+
|
| 78 |
+
---
|
| 79 |
+
|
| 80 |
+
## Reward Function
|
| 81 |
+
|
| 82 |
+
### Easy & Medium
|
| 83 |
+
```
|
| 84 |
+
reward = passed_tests / total_tests
|
| 85 |
+
```
|
| 86 |
+
- 3/3 tests → 1.0
|
| 87 |
+
- 2/3 tests → 0.67
|
| 88 |
+
- 1/3 tests → 0.33
|
| 89 |
+
- 0/3 tests → 0.0
|
| 90 |
+
|
| 91 |
+
### Hard
|
| 92 |
+
```
|
| 93 |
+
reward = 0.7 × test_score + 0.3 × explanation_score
|
| 94 |
+
```
|
| 95 |
+
Explanation is scored by matching key algorithmic concepts. Partial credit is given.
|
| 96 |
+
|
| 97 |
+
---
|
| 98 |
+
|
| 99 |
+
## Setup & Local Run
|
| 100 |
+
|
| 101 |
+
### Prerequisites
|
| 102 |
+
- Python 3.10+
|
| 103 |
+
- Docker
|
| 104 |
+
- Hugging Face CLI
|
| 105 |
+
|
| 106 |
+
### Install
|
| 107 |
+
```bash
|
| 108 |
+
git clone https://github.com/YOUR_USERNAME/code-debug-env
|
| 109 |
+
cd code-debug-env
|
| 110 |
+
pip install -e .
|
| 111 |
+
# Also clone OpenEnv for PYTHONPATH
|
| 112 |
+
git clone https://github.com/meta-pytorch/OpenEnv.git
|
| 113 |
+
export PYTHONPATH=$PYTHONPATH:OpenEnv:OpenEnv/src:.
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
### Run locally
|
| 117 |
+
```bash
|
| 118 |
+
uvicorn server.app:app --host 0.0.0.0 --port 7860 --reload
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
### Run with Docker
|
| 122 |
+
```bash
|
| 123 |
+
docker build -f server/Dockerfile -t code-debug-env .
|
| 124 |
+
docker run -p 7860:7860 code-debug-env
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
### Test the API
|
| 128 |
+
```bash
|
| 129 |
+
# Health check
|
| 130 |
+
curl http://localhost:7860/health
|
| 131 |
+
|
| 132 |
+
# Reset (easy task)
|
| 133 |
+
curl -X POST http://localhost:7860/reset \
|
| 134 |
+
-H "Content-Type: application/json" \
|
| 135 |
+
-d '{"difficulty": "easy"}'
|
| 136 |
+
|
| 137 |
+
# Submit a fix
|
| 138 |
+
curl -X POST http://localhost:7860/step \
|
| 139 |
+
-H "Content-Type: application/json" \
|
| 140 |
+
-d '{"fixed_code": "def find_max(nums):\n return max(nums)"}'
|
| 141 |
+
|
| 142 |
+
# Check state
|
| 143 |
+
curl http://localhost:7860/state
|
| 144 |
+
```
|
| 145 |
+
|
| 146 |
+
---
|
| 147 |
+
|
| 148 |
+
## Run Baseline Inference
|
| 149 |
+
|
| 150 |
+
```bash
|
| 151 |
+
export API_BASE_URL="https://api.openai.com/v1"
|
| 152 |
+
export MODEL_NAME="gpt-4o-mini"
|
| 153 |
+
export HF_TOKEN="your-api-key"
|
| 154 |
+
|
| 155 |
+
# Run all 3 difficulties
|
| 156 |
+
python inference.py --url http://localhost:7860
|
| 157 |
+
|
| 158 |
+
# Run specific difficulty
|
| 159 |
+
python inference.py --url http://localhost:7860 --difficulty hard
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
---
|
| 163 |
+
|
| 164 |
+
## Pre-Submission Validation
|
| 165 |
+
|
| 166 |
+
Run before submitting to catch any disqualifying issues:
|
| 167 |
+
|
| 168 |
+
```bash
|
| 169 |
+
# Start the environment first, then:
|
| 170 |
+
python validator/pre_submit_check.py --url http://localhost:7860
|
| 171 |
+
|
| 172 |
+
# Or against your HF Space:
|
| 173 |
+
python validator/pre_submit_check.py --url https://YOUR_SPACE.hf.space
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
---
|
| 177 |
+
|
| 178 |
+
## Deploy to Hugging Face Spaces
|
| 179 |
+
|
| 180 |
+
```bash
|
| 181 |
+
# Login
|
| 182 |
+
huggingface-cli login
|
| 183 |
+
|
| 184 |
+
# Create space and push
|
| 185 |
+
huggingface-cli repo create code-debug-env --type space --space_sdk docker
|
| 186 |
+
cd code-debug-env
|
| 187 |
+
git init
|
| 188 |
+
git remote add origin https://huggingface.co/spaces/YOUR_USERNAME/code-debug-env
|
| 189 |
+
git add .
|
| 190 |
+
git commit -m "Initial commit"
|
| 191 |
+
git push origin main
|
| 192 |
+
```
|
| 193 |
+
|
| 194 |
+
---
|
| 195 |
+
|
| 196 |
+
## Project Structure
|
| 197 |
+
|
| 198 |
+
```
|
| 199 |
+
code-debug-env/
├── openenv.yaml          ← OpenEnv manifest
├── inference.py          ← Baseline agent (root, required)
├── pyproject.toml        ← Dependencies
├── README.md
├── models.py             ← Pydantic Action/Observation/State
├── client.py             ← EnvClient for training loops
├── __init__.py
├── server/
│   ├── app.py            ← FastAPI: /reset /step /state /health
│   ├── environment.py    ← Core episode logic
│   ├── tasks/
│   │   ├── task_easy.py   ← 15 single-bug tasks
│   │   ├── task_medium.py ← 15 two-bug tasks
│   │   └── task_hard.py   ← 15 algorithmic tasks
│   ├── graders/
│   │   ├── grader_easy.py
│   │   ├── grader_medium.py
│   │   └── grader_hard.py
│   ├── requirements.txt
│   └── Dockerfile
└── validator/
    └── pre_submit_check.py
|
| 222 |
+
```
|
STRUCTURE.md
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Code Debug Environment — Full File Structure
|
| 2 |
+
|
| 3 |
+
```
|
| 4 |
+
code-debug-env/
├── openenv.yaml          ← OpenEnv manifest (required)
├── inference.py          ← Baseline agent script (must be in root)
├── pyproject.toml        ← Dependencies
├── README.md             ← Docs with action/obs spaces
├── .dockerignore
├── models.py             ← Pydantic Action/Observation/State
├── client.py             ← EnvClient (for training code)
├── __init__.py           ← Exports
└── server/
    ├── __init__.py
    ├── app.py            ← FastAPI server
    ├── environment.py    ← Core logic: reset/step/state
    ├── tasks/
    │   ├── __init__.py
    │   ├── task_easy.py   ← 15 buggy code samples
    │   ├── task_medium.py ← 15 buggy code samples
    │   └── task_hard.py   ← 15 buggy code samples
    ├── graders/
    │   ├── __init__.py
    │   ├── grader_easy.py
    │   ├── grader_medium.py
    │   └── grader_hard.py
    ├── requirements.txt
    └── Dockerfile
|
| 29 |
+
```
|
__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# __init__.py
|
| 2 |
+
from models import DebugAction, DebugObservation, DebugState
|
| 3 |
+
from client import CodeDebugEnv
|
| 4 |
+
|
| 5 |
+
# Names re-exported as the package's public API.
__all__ = [
    "DebugAction",
    "DebugObservation",
    "DebugState",
    "CodeDebugEnv",
]
|
client.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# client.py
|
| 2 |
+
# Python client for connecting to the Code Debug Environment.
|
| 3 |
+
# Use this in training loops / evaluation scripts.
|
| 4 |
+
#
|
| 5 |
+
# Usage (sync):
|
| 6 |
+
# with CodeDebugEnv(base_url="https://your-space.hf.space").sync() as env:
|
| 7 |
+
# result = env.reset(difficulty="easy")
|
| 8 |
+
# result = env.step(DebugAction(fixed_code="..."))
|
| 9 |
+
#
|
| 10 |
+
# Usage (async):
|
| 11 |
+
# async with CodeDebugEnv(base_url="https://your-space.hf.space") as env:
|
| 12 |
+
# result = await env.reset(difficulty="medium")
|
| 13 |
+
# result = await env.step(DebugAction(fixed_code="..."))
|
| 14 |
+
|
| 15 |
+
from openenv.core.env_client import EnvClient
|
| 16 |
+
from openenv.core.client_types import StepResult
|
| 17 |
+
from models import DebugAction, DebugObservation, DebugState
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class CodeDebugEnv(EnvClient[DebugAction, DebugObservation, DebugState]):
    """Typed OpenEnv client for the Code Debug Environment.

    Converts ``DebugAction`` objects into request payloads and turns the
    server's JSON replies into ``DebugObservation`` / ``DebugState`` models.
    """

    def _step_payload(self, action: DebugAction) -> dict:
        """Build the step request body; ``explanation`` is included only when truthy."""
        if action.explanation:
            return {
                "fixed_code": action.fixed_code,
                "explanation": action.explanation,
            }
        return {"fixed_code": action.fixed_code}

    def _parse_result(self, payload: dict) -> StepResult[DebugObservation]:
        """Convert a reset/step reply into a ``StepResult`` wrapping a ``DebugObservation``."""
        raw_obs = payload.get("observation", {})
        finished = payload.get("done", False)

        # Text fields fall back to a fixed default when the server omits them.
        text_defaults = {
            "task_id": "",
            "difficulty": "easy",
            "buggy_code": "",
            "instructions": "",
            "test_cases_description": "",
        }
        obs_kwargs = {
            name: raw_obs.get(name, default)
            for name, default in text_defaults.items()
        }
        # Feedback fields are passed through and become None when absent.
        for name in ("reward", "passed_tests", "total_tests", "feedback"):
            obs_kwargs[name] = raw_obs.get(name)

        observation = DebugObservation(done=finished, **obs_kwargs)
        return StepResult(
            observation=observation,
            reward=payload.get("reward", 0.0),
            done=finished,
        )

    def _parse_state(self, payload: dict) -> DebugState:
        """Convert a ``/state`` reply into a ``DebugState``, defaulting missing keys."""
        defaults = {
            "episode_id": "",
            "step_count": 0,
            "task_id": "",
            "difficulty": "easy",
            "max_steps": 3,
            "current_reward": 0.0,
            "best_reward": 0.0,
            "done": False,
        }
        return DebugState(
            **{key: payload.get(key, default) for key, default in defaults.items()}
        )
|
inference.py
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# inference.py
|
| 3 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 4 |
+
# Baseline inference script for the Code Debug Environment.
|
| 5 |
+
# Must be run from the project root.
|
| 6 |
+
#
|
| 7 |
+
# Required environment variables:
|
| 8 |
+
# API_BASE_URL — LLM API endpoint (OpenAI-compatible)
|
| 9 |
+
# MODEL_NAME — Model identifier
|
| 10 |
+
# HF_TOKEN — Hugging Face / API key
|
| 11 |
+
#
|
| 12 |
+
# Usage:
|
| 13 |
+
# python inference.py
|
| 14 |
+
# python inference.py --url https://your-hf-space.hf.space
|
| 15 |
+
# python inference.py --difficulty easy
|
| 16 |
+
#
|
| 17 |
+
# Log format: [START], [STEP], [END] — strictly followed for evaluation scoring.
|
| 18 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 19 |
+
|
| 20 |
+
import os
|
| 21 |
+
import sys
|
| 22 |
+
import json
|
| 23 |
+
import time
|
| 24 |
+
import argparse
|
| 25 |
+
import requests
|
| 26 |
+
from openai import OpenAI
|
| 27 |
+
|
| 28 |
+
# ─── Configuration ────────────────────────────────────────────────────────────
|
| 29 |
+
|
| 30 |
+
# Connection settings are read from the environment, each with a default.
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
HF_TOKEN = os.getenv("HF_TOKEN", "")
ENV_URL = os.getenv("ENV_URL", "http://localhost:7860")

# Episode settings used by the baseline loop.
MAX_STEPS = 3
DIFFICULTIES = ["easy", "medium", "hard"]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# ─── OpenAI Client ───────────────────────────────────────────────────────────
|
| 40 |
+
|
| 41 |
+
# Module-level OpenAI-compatible client; falls back to the placeholder key
# "dummy" when HF_TOKEN is empty.
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN if HF_TOKEN else "dummy")
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# ─── Logging (strict format required by evaluator) ───────────────────────────
|
| 48 |
+
|
| 49 |
+
def log_start(task_id: str, difficulty: str, episode: int):
    """Print the START record for an episode as one JSON line on stdout."""
    record = {
        "type": "START",
        "episode": episode,
        "task_id": task_id,
        "difficulty": difficulty,
        "timestamp": time.time(),
    }
    print(json.dumps(record), flush=True)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def log_step(task_id: str, step: int, action_summary: str, reward: float, done: bool):
    """Print the STEP record for one submitted attempt as one JSON line on stdout."""
    record = {
        "type": "STEP",
        "task_id": task_id,
        "step": step,
        "action": action_summary,
        "reward": reward,
        "done": done,
        "timestamp": time.time(),
    }
    print(json.dumps(record), flush=True)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def log_end(task_id: str, difficulty: str, final_reward: float, steps_taken: int, episode: int):
    """Print the END record for an episode as one JSON line on stdout."""
    record = {
        "type": "END",
        "episode": episode,
        "task_id": task_id,
        "difficulty": difficulty,
        "final_reward": final_reward,
        "steps_taken": steps_taken,
        "timestamp": time.time(),
    }
    print(json.dumps(record), flush=True)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# ─── Environment Client ───────────────────────────────────────────────────────
|
| 84 |
+
|
| 85 |
+
def env_reset(env_url: str, difficulty: str) -> dict:
    """POST to ``{env_url}/reset`` for *difficulty* and return the decoded JSON reply.

    ``raise_for_status`` is called on the response before it is decoded.
    """
    endpoint = f"{env_url}/reset"
    body = {"difficulty": difficulty}
    response = requests.post(endpoint, json=body, timeout=30)
    response.raise_for_status()
    return response.json()
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def env_step(env_url: str, fixed_code: str, explanation: str = None) -> dict:
    """POST a fix to ``{env_url}/step`` and return the decoded JSON reply.

    ``explanation`` is added to the request body only when it is truthy.
    ``raise_for_status`` is called on the response before it is decoded.
    """
    body = {"fixed_code": fixed_code}
    if explanation:
        body.update(explanation=explanation)
    endpoint = f"{env_url}/step"
    response = requests.post(endpoint, json=body, timeout=30)
    response.raise_for_status()
    return response.json()
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def env_state(env_url: str) -> dict:
    """GET ``{env_url}/state`` and return the decoded JSON reply.

    ``raise_for_status`` is called on the response before it is decoded.
    """
    endpoint = f"{env_url}/state"
    response = requests.get(endpoint, timeout=10)
    response.raise_for_status()
    return response.json()
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
# ─── LLM Agent ───────────────────────────────────────────────────────────────
|
| 115 |
+
|
| 116 |
+
# System message sent with every LLM request; it pins the JSON reply format
# that call_llm parses.
SYSTEM_PROMPT = """You are an expert Python debugging agent.
You will be given buggy Python code and must fix it.

For easy tasks: fix the single bug.
For medium tasks: fix both bugs.
For hard tasks: fix the algorithmic bug AND explain your reasoning in the 'explanation' field.

You MUST respond ONLY with valid JSON in this exact format:
{
"fixed_code": "<complete fixed Python function as a string>",
"explanation": "<required for hard tasks; describe what was wrong and why your fix is correct>"
}

Rules:
- Return the COMPLETE function, not just the changed line.
- The fixed_code must be valid Python that can be exec'd.
- For hard tasks, explanation must discuss the algorithm, root cause, and fix.
- Do NOT include markdown fences or any text outside the JSON object.
"""
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def _strip_markdown_fence(content: str) -> str:
    """Return *content* without a surrounding ``` fence, if it starts with one."""
    if not content.startswith("```"):
        return content
    lines = content.split("\n")
    # Always drop the opening fence line (it may carry a language tag); drop
    # the closing line too when it is a fence, tolerating trailing whitespace.
    if lines[-1].strip() == "```":
        return "\n".join(lines[1:-1])
    return "\n".join(lines[1:])


def call_llm(buggy_code: str, instructions: str, difficulty: str,
             feedback: str = None, attempt: int = 1) -> dict:
    """Ask the LLM for a fix and return ``{"fixed_code", "explanation"}``.

    Args:
        buggy_code: Source of the buggy function shown to the model.
        instructions: Task instructions shown to the model.
        difficulty: Task difficulty label included in the prompt.
        feedback: Feedback from the previous attempt; appended to the prompt
            only when it is truthy and ``attempt > 1``.
        attempt: 1-based attempt number within the episode.

    Returns:
        A dict with ``fixed_code`` (``""`` if the model omitted it) and
        ``explanation`` (``None`` if omitted). If the request fails or the
        reply is empty, not valid JSON, or not a JSON object, the original
        ``buggy_code`` is returned with ``explanation=None`` and the reason is
        written to stderr, keeping stdout reserved for the evaluator log lines.
    """
    fallback = {"fixed_code": buggy_code, "explanation": None}

    user_content = f"""Task difficulty: {difficulty}
Instructions: {instructions}

Buggy code:
```python
{buggy_code}
```
"""
    if feedback and attempt > 1:
        user_content += f"\nPrevious attempt feedback:\n{feedback}\n\nPlease fix the remaining issues."

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]

    # Top-level boundary around the remote call: any client/transport error
    # degrades to the fallback instead of aborting the episode.
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=1000,
            temperature=0.1,
        )
        raw = response.choices[0].message.content
    except Exception as e:
        print(f"LLM call failed: {e}", file=sys.stderr)
        return fallback

    # The message content may be None or empty; report that for what it is.
    if not raw or not raw.strip():
        print("LLM returned an empty response", file=sys.stderr)
        return fallback

    try:
        parsed = json.loads(_strip_markdown_fence(raw.strip()))
    except json.JSONDecodeError as e:
        # Previously swallowed silently; surface it so bad replies are visible.
        print(f"LLM response was not valid JSON: {e}", file=sys.stderr)
        return fallback

    if not isinstance(parsed, dict):
        print("LLM response was valid JSON but not an object", file=sys.stderr)
        return fallback

    return {
        "fixed_code": parsed.get("fixed_code", ""),
        "explanation": parsed.get("explanation", None),
    }
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
# ─── Main Episode Loop ────────────────────────────────────────────────────────
|
| 185 |
+
|
| 186 |
+
def run_episode(env_url: str, difficulty: str, episode_num: int) -> float:
    """Run one episode against the environment and return its final reward.

    Resets the environment, then makes up to ``MAX_STEPS`` attempts: each
    attempt asks the LLM for a fix (passing the previous attempt's feedback)
    and submits it via ``/step``. START, STEP and END records are logged.

    Args:
        env_url: Base URL of the environment server (no trailing slash).
        difficulty: Difficulty passed to ``/reset``.
        episode_num: Episode number recorded in the START/END log records.

    Returns:
        The reward of the last executed step, or 0.0 if no step ran.
    """
    obs = env_reset(env_url, difficulty)["observation"]
    task_id = obs["task_id"]
    buggy_code = obs["buggy_code"]
    instructions = obs["instructions"]

    log_start(task_id, difficulty, episode_num)

    last_feedback = None
    final_reward = 0.0
    steps_taken = 0

    for attempt in range(1, MAX_STEPS + 1):
        steps_taken = attempt

        agent_action = call_llm(
            buggy_code=buggy_code,
            instructions=instructions,
            difficulty=difficulty,
            feedback=last_feedback,
            attempt=attempt,
        )

        result = env_step(
            env_url,
            fixed_code=agent_action["fixed_code"],
            explanation=agent_action.get("explanation"),
        )

        # A JSON null reward would come back as None and later break the
        # numeric summary in main(); treat it as 0.0.
        reward = result.get("reward")
        if reward is None:
            reward = 0.0
        done = result.get("done", False)
        # Likewise tolerate a null observation instead of failing on .get().
        obs_result = result.get("observation") or {}
        last_feedback = obs_result.get("feedback", "")

        log_step(
            task_id=task_id,
            step=attempt,
            action_summary=f"Submitted fix attempt {attempt} ({len(agent_action['fixed_code'])} chars)",
            reward=reward,
            done=done,
        )

        final_reward = reward

        if done:
            break

    log_end(task_id, difficulty, final_reward, steps_taken, episode_num)
    return final_reward
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def main():
    """Command-line entry point: health-check the environment, run episodes, print a summary.

    With no ``--difficulty`` (or ``all``) one episode is run per entry in
    ``DIFFICULTIES``; otherwise a single episode of the chosen difficulty.
    Exits with status 1 when the ``/health`` request fails.
    """
    parser = argparse.ArgumentParser(description="Code Debug Environment Baseline Agent")
    parser.add_argument("--url", default=ENV_URL, help="Environment base URL")
    parser.add_argument("--difficulty", default=None, choices=[*DIFFICULTIES, "all"],
                        help="Difficulty to run. 'all' runs one episode per difficulty.")
    args = parser.parse_args()

    env_url = args.url.rstrip("/")

    # Health check; this is the script's top-level boundary, so any failure is
    # reported as an ERROR record and ends the run.
    try:
        health = requests.get(f"{env_url}/health", timeout=10)
        health.raise_for_status()
        print(json.dumps({"type": "INFO", "message": f"Environment healthy at {env_url}"}), flush=True)
    except Exception as e:
        print(json.dumps({"type": "ERROR", "message": f"Health check failed: {e}"}), flush=True)
        sys.exit(1)

    # Episode numbers come from enumerate(), so only the difficulty is needed.
    if args.difficulty is None or args.difficulty == "all":
        difficulties = list(DIFFICULTIES)
    else:
        difficulties = [args.difficulty]

    all_rewards = []
    for episode_num, difficulty in enumerate(difficulties, start=1):
        reward = run_episode(env_url, difficulty, episode_num)
        all_rewards.append({"difficulty": difficulty, "reward": reward})
        time.sleep(0.5)  # Small pause between episodes

    # Summary
    print(json.dumps({
        "type": "SUMMARY",
        "total_episodes": len(all_rewards),
        "results": all_rewards,
        "average_reward": round(sum(r["reward"] for r in all_rewards) / len(all_rewards), 3),
        "timestamp": time.time(),
    }), flush=True)
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
# Run the baseline agent only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
models.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# models.py
|
| 2 |
+
# Typed Pydantic models for Action, Observation, and State
|
| 3 |
+
# These are the contracts between the agent and the environment.
|
| 4 |
+
|
| 5 |
+
from typing import Optional, List
|
| 6 |
+
from pydantic import Field
|
| 7 |
+
from openenv.core.env_server.types import Action, Observation, State
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class DebugAction(Action):
    """Agent action: the proposed fixed code and, optionally, an explanation."""

    fixed_code: str = Field(
        ..., description="The corrected Python function as a string. Must be valid Python."
    )
    explanation: Optional[str] = Field(
        default=None,
        description=(
            "Required for 'hard' difficulty tasks. "
            "Explain what was wrong and why your fix is correct. "
            "Affects reward on hard tasks."
        ),
    )
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class TestResult(Action):
    """Sub-model: result of a single test case."""
    # NOTE(review): this derives from Action although it describes a test
    # outcome rather than an agent action — confirm the base class is intended.
    test_id: int   # identifier of the test case
    passed: bool   # whether the test case passed
    expected: str  # expected value, as text
    got: str       # value actually produced, as text
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class DebugObservation(Observation):
    """Observation payload: task description fields plus optional grading feedback."""

    # --- Task description (all required) ---
    task_id: str = Field(..., description="Unique ID of the current task instance")
    difficulty: str = Field(..., description="Task difficulty: easy | medium | hard")
    buggy_code: str = Field(..., description="The buggy Python code the agent must fix")
    instructions: str = Field(..., description="Natural language instructions for the task")
    test_cases_description: str = Field(..., description="Description of what the test cases check")

    # --- Grading feedback (each defaults to None) ---
    reward: Optional[float] = Field(default=None, description="Score from 0.0 to 1.0 for this step")
    passed_tests: Optional[int] = Field(default=None, description="Number of test cases passed")
    total_tests: Optional[int] = Field(default=None, description="Total number of test cases")
    feedback: Optional[str] = Field(default=None, description="Detailed feedback: which tests failed and why")

    # --- Episode status ---
    done: bool = Field(default=False, description="True when episode is complete")
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class DebugState(State):
    """Internal environment state, returned by GET /state."""

    episode_id: str = ""  # <- required by validator: GET /state must return episode_id
    task_id: str                 # ID of the current task (required, no default)
    difficulty: str              # difficulty of the current task (required, no default)
    step_count: int = 0          # steps taken so far; starts at 0
    max_steps: int = 3           # step limit per episode; defaults to 3
    current_reward: float = 0.0  # presumably the most recent step's reward — confirm against the server
    best_reward: float = 0.0     # presumably the highest reward seen this episode — confirm against the server
    done: bool = False           # whether the episode is finished
|
openenv.yaml
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: code-debug-env
|
| 3 |
+
type: typed
|
| 4 |
+
description: >
|
| 5 |
+
A real-world RL environment where an LLM agent diagnoses and fixes
|
| 6 |
+
buggy Python code across three difficulty levels (easy, medium, hard).
|
| 7 |
+
Tasks are drawn from real-world domains: data processing, API handlers,
|
| 8 |
+
and algorithmic functions. Rewards are partial and proportional to how
|
| 9 |
+
many test cases pass, with bonuses for correct explanations on hard tasks.
|
| 10 |
+
|
| 11 |
+
version: 1.0.0
|
| 12 |
+
author: your-hf-username  # ← REPLACE with your actual HF username before submitting
|
| 13 |
+
|
| 14 |
+
runtime:
|
| 15 |
+
type: docker
|
| 16 |
+
port: 7860
|
| 17 |
+
|
| 18 |
+
app:
|
| 19 |
+
entry: server/app.py
|
| 20 |
+
host: 0.0.0.0
|
| 21 |
+
port: 7860
|
| 22 |
+
|
| 23 |
+
tasks:
|
| 24 |
+
- id: easy
|
| 25 |
+
description: "Fix a single off-by-one or operator bug in a Python function"
|
| 26 |
+
difficulty: easy
|
| 27 |
+
max_steps: 3
|
| 28 |
+
reward_range: [0.0, 1.0]
|
| 29 |
+
|
| 30 |
+
- id: medium
|
| 31 |
+
description: "Fix two bugs (logic + edge case) so all test cases pass"
|
| 32 |
+
difficulty: medium
|
| 33 |
+
max_steps: 3
|
| 34 |
+
reward_range: [0.0, 1.0]
|
| 35 |
+
|
| 36 |
+
- id: hard
|
| 37 |
+
description: "Fix an algorithmic bug AND provide a correct explanation"
|
| 38 |
+
difficulty: hard
|
| 39 |
+
max_steps: 3
|
| 40 |
+
reward_range: [0.0, 1.0]
|
| 41 |
+
|
| 42 |
+
reward_range: [0.0, 1.0]
|
| 43 |
+
|
| 44 |
+
api:
|
| 45 |
+
reset: /reset
|
| 46 |
+
step: /step
|
| 47 |
+
state: /state
|
| 48 |
+
health: /health
|
pyproject.toml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
requires = ["setuptools>=68", "wheel"]
# "setuptools.build_meta" is the PEP 517 backend provided by setuptools;
# "setuptools.backends.legacy:build" does not exist and makes every build fail.
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "code-debug-env"
|
| 7 |
+
version = "1.0.0"
|
| 8 |
+
description = "OpenEnv environment for LLM-based code debugging"
|
| 9 |
+
requires-python = ">=3.10"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"fastapi>=0.110.0",
|
| 12 |
+
"uvicorn[standard]>=0.29.0",
|
| 13 |
+
"pydantic>=2.0.0",
|
| 14 |
+
"openai>=1.0.0",
|
| 15 |
+
"requests>=2.31.0",
|
| 16 |
+
"openenv-core>=0.2.0",
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
[project.optional-dependencies]
|
| 20 |
+
dev = [
|
| 21 |
+
"pytest>=8.0.0",
|
| 22 |
+
"httpx>=0.27.0",
|
| 23 |
+
]
|
| 24 |
+
|
| 25 |
+
[tool.setuptools.packages.find]
|
| 26 |
+
where = ["."]
|
server/Dockerfile
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

WORKDIR /app

# git is only needed to fetch the OpenEnv sources below.
RUN apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/*

RUN git clone https://github.com/meta-pytorch/OpenEnv.git /app/OpenEnv

# Runtime dependencies. This step must fail the build when it fails, so it is
# kept separate from the best-effort editable install below: in
# `A && B || true` the `|| true` also swallows a failure of A.
RUN pip install --no-cache-dir \
    fastapi \
    "uvicorn[standard]" \
    pydantic \
    openai \
    requests \
    openenv-core

# Best effort only: when the editable install fails, the sources remain
# importable through PYTHONPATH (set below).
RUN pip install --no-cache-dir -e /app/OpenEnv \
    || echo "WARNING: editable install of /app/OpenEnv failed; relying on PYTHONPATH"

COPY . .

ENV PYTHONPATH="/app:/app/OpenEnv:/app/OpenEnv/src"

# Run the server as an unprivileged user.
RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
USER appuser

HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health')"

EXPOSE 7860

CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# server/__init__.py
|
server/__pycache__/__init__.cpython-39.pyc
ADDED
|
Binary file (161 Bytes). View file
|
|
|
server/app.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/app.py
|
| 2 |
+
# FastAPI server exposing the OpenEnv standard endpoints.
|
| 3 |
+
# Port 7860 required for Hugging Face Spaces.
|
| 4 |
+
|
| 5 |
+
from fastapi import FastAPI, HTTPException
|
| 6 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 7 |
+
from typing import Optional
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
|
| 10 |
+
from server.environment import CodeDebugEnvironment
|
| 11 |
+
from models import DebugAction, DebugObservation, DebugState
|
| 12 |
+
|
| 13 |
+
# ASGI application object; the endpoints below are registered on it.
app = FastAPI(
    title="Code Debug Environment",
    description=(
        "An OpenEnv environment where LLM agents fix buggy Python code. "
        "3 difficulty levels: easy (1 bug), medium (2 bugs), hard (algorithmic + explanation)."
    ),
    version="1.0.0",
)

# CORS is fully open: any origin, method and header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# One global environment instance (single session): every request shares the
# same episode state.
# For concurrent sessions, instantiate per-request with a session dict
env = CodeDebugEnvironment()
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# ─── Request Models ─────────────────────────────────────────────────────────
|
| 35 |
+
|
| 36 |
+
class ResetRequest(BaseModel):
    """Request body for POST /reset."""

    difficulty: Optional[str] = None  # "easy" | "medium" | "hard" | None (random)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class StepRequest(BaseModel):
    """Request body for POST /step."""

    fixed_code: str  # the agent's corrected code; rejected by /step when blank
    explanation: Optional[str] = None  # forwarded to the grader for hard tasks
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# ─── Response wrapper matching OpenEnv StepResult shape ──────────────────────
|
| 46 |
+
|
| 47 |
+
class StepResponse(BaseModel):
    """Response body of POST /step."""

    observation: dict  # the observation, dumped to a plain dict
    reward: float  # reward of this step (0.0 when the observation has none)
    done: bool  # True when the episode is over
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# ─── Endpoints ──────────────────────────────────────────────────────────────
|
| 54 |
+
|
| 55 |
+
@app.get("/health")
async def health():
    """Liveness probe — must answer with HTTP 200 for submission validation."""
    payload = {
        "status": "ok",
        "environment": "code-debug-env",
        "version": "1.0.0",
    }
    return payload
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@app.post("/reset")
async def reset(request: ResetRequest = ResetRequest()) -> dict:
    """
    Begin a fresh episode on the shared environment.

    The body may carry difficulty: 'easy' | 'medium' | 'hard'.
    Any failure is reported as HTTP 500 with the error text in ``detail``.
    """
    try:
        first_observation = env.reset(difficulty=request.difficulty)
        payload = dict(
            observation=first_observation.model_dump(),
            reward=0.0,
            done=False,
        )
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Reset failed: {exc!s}")
    return payload
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
@app.post("/step")
async def step(request: StepRequest) -> StepResponse:
    """
    Grade a submitted fix (plus an optional explanation for hard tasks).

    Responds with the observation, its reward (0.0–1.0) and the done flag.
    Blank code is rejected with HTTP 400; any other failure becomes HTTP 500.
    """
    submitted = request.fixed_code
    if not (submitted and submitted.strip()):
        raise HTTPException(status_code=400, detail="fixed_code must not be empty.")

    try:
        outcome = env.step(
            DebugAction(fixed_code=submitted, explanation=request.explanation)
        )
        response = StepResponse(
            observation=outcome.model_dump(),
            reward=outcome.reward or 0.0,
            done=outcome.done,
        )
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Step failed: {exc!s}")
    return response
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
@app.get("/state")
async def state() -> dict:
    """Expose the current episode state as a plain dict."""
    try:
        return env.state.model_dump()
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"State failed: {exc!s}")
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
@app.get("/tasks")
async def list_tasks() -> dict:
    """Report the task IDs of every difficulty bank plus the overall count."""
    from server.tasks.task_easy import EASY_TASKS
    from server.tasks.task_medium import MEDIUM_TASKS
    from server.tasks.task_hard import HARD_TASKS

    banks = {"easy": EASY_TASKS, "medium": MEDIUM_TASKS, "hard": HARD_TASKS}
    listing = {
        level: [task["task_id"] for task in bank] for level, bank in banks.items()
    }
    listing["total"] = sum(len(bank) for bank in banks.values())
    return listing
|
server/environment.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/environment.py
|
| 2 |
+
# Core environment: manages episode state, dispatches to task banks and graders.
|
| 3 |
+
|
| 4 |
+
import random
|
| 5 |
+
from uuid import uuid4
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
from openenv.core.env_server.interfaces import Environment
|
| 9 |
+
from openenv.core.env_server.types import State
|
| 10 |
+
|
| 11 |
+
from models import DebugAction, DebugObservation, DebugState
|
| 12 |
+
from server.tasks.task_easy import get_random_easy_task
|
| 13 |
+
from server.tasks.task_medium import get_random_medium_task
|
| 14 |
+
from server.tasks.task_hard import get_random_hard_task
|
| 15 |
+
from server.graders.grader_easy import grade_easy
|
| 16 |
+
from server.graders.grader_medium import grade_medium
|
| 17 |
+
from server.graders.grader_hard import grade_hard
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Difficulty name -> function returning a random task of that difficulty.
TASK_GETTERS = {
    "easy": get_random_easy_task,
    "medium": get_random_medium_task,
    "hard": get_random_hard_task,
}

# Difficulty name -> grading function for submissions of that difficulty.
GRADERS = {
    "easy": grade_easy,
    "medium": grade_medium,
    "hard": grade_hard,
}

# Number of submissions allowed per episode before it is forced to end.
MAX_STEPS = 3
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class CodeDebugEnvironment(Environment):
    """
    OpenEnv environment for LLM-based code debugging.
    Supports 3 difficulty levels with partial rewards.

    An episode starts with reset(), which loads one task, and ends when a
    submission scores 1.0 or after MAX_STEPS submissions.
    """

    def __init__(self):
        super().__init__()
        self._episode_id: str = str(uuid4())
        self._difficulty: str = "easy"
        self._current_task: Optional[dict] = None  # None until reset() is called
        self._step_count: int = 0
        self._best_reward: float = 0.0
        self._current_reward: float = 0.0
        self._done: bool = False

    def _task_observation(
        self,
        *,
        reward: Optional[float],
        passed_tests: Optional[int],
        total_tests: Optional[int],
        feedback: Optional[str],
        done: bool,
    ) -> DebugObservation:
        """Build an observation describing the current task plus the given feedback."""
        task = self._current_task
        return DebugObservation(
            task_id=task["task_id"],
            difficulty=self._difficulty,
            buggy_code=task["buggy_code"],
            instructions=task["instructions"],
            test_cases_description=task["test_cases_description"],
            reward=reward,
            passed_tests=passed_tests,
            total_tests=total_tests,
            feedback=feedback,
            done=done,
        )

    def reset(self, difficulty: Optional[str] = None) -> DebugObservation:
        """
        Start a new episode. Optionally specify difficulty: easy | medium | hard.
        A missing or unrecognised difficulty selects one at random.

        Returns:
            The initial observation: task description and test count, no feedback.
        """
        self._episode_id = str(uuid4())
        self._step_count = 0
        self._best_reward = 0.0
        self._current_reward = 0.0
        self._done = False

        # Validate difficulty
        if difficulty and difficulty in TASK_GETTERS:
            self._difficulty = difficulty
        else:
            self._difficulty = random.choice(["easy", "medium", "hard"])

        # Load a task
        self._current_task = TASK_GETTERS[self._difficulty]()

        return self._task_observation(
            reward=None,
            passed_tests=None,
            total_tests=len(self._current_task["test_cases"]),
            feedback=None,
            done=False,
        )

    def step(self, action: DebugAction) -> DebugObservation:
        """
        Agent submits fixed_code (and optionally explanation for hard tasks).
        Returns observation with reward, feedback, and done flag.

        Raises:
            RuntimeError: if called before reset() has loaded a task.
        """
        if self._done:
            return DebugObservation(
                task_id=self._current_task["task_id"] if self._current_task else "none",
                difficulty=self._difficulty,
                buggy_code=self._current_task["buggy_code"] if self._current_task else "",
                instructions="Episode is already done. Call reset() to start a new episode.",
                test_cases_description="",
                reward=self._best_reward,
                passed_tests=None,
                total_tests=0,
                feedback="Episode ended. Please call reset() to start a new task.",
                done=True,
            )

        # Without a task the graders would fail with an opaque TypeError on None;
        # report the real problem instead.
        if self._current_task is None:
            raise RuntimeError("No active episode: call reset() before step().")

        self._step_count += 1

        # Grade the submission; only the hard grader takes the explanation.
        grader = GRADERS[self._difficulty]
        if self._difficulty == "hard":
            reward, passed, total, feedback, _ = grader(
                action.fixed_code, self._current_task, action.explanation
            )
        else:
            reward, passed, total, feedback, _ = grader(
                action.fixed_code, self._current_task
            )

        self._current_reward = reward
        self._best_reward = max(self._best_reward, reward)

        # Episode ends if: perfect score OR max steps reached
        done = (reward == 1.0) or (self._step_count >= MAX_STEPS)
        self._done = done

        return self._task_observation(
            reward=reward,
            passed_tests=passed,
            total_tests=total,
            feedback=feedback,
            done=done,
        )

    @property
    def state(self) -> DebugState:
        """Return current episode metadata."""
        return DebugState(
            episode_id=self._episode_id,
            step_count=self._step_count,
            task_id=self._current_task["task_id"] if self._current_task else "none",
            difficulty=self._difficulty,
            max_steps=MAX_STEPS,
            current_reward=self._current_reward,
            best_reward=self._best_reward,
            done=self._done,
        )
|
server/graders/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/graders/__init__.py
|
| 2 |
+
from .grader_easy import grade_easy
|
| 3 |
+
from .grader_medium import grade_medium
|
| 4 |
+
from .grader_hard import grade_hard
|
| 5 |
+
|
| 6 |
+
__all__ = ["grade_easy", "grade_medium", "grade_hard"]
|
server/graders/grader_easy.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/graders/grader_easy.py
|
| 2 |
+
# Grades easy tasks: 1 bug, 3 test cases.
|
| 3 |
+
# Reward is proportional to tests passed (0.33, 0.67, 1.0).
|
| 4 |
+
|
| 5 |
+
import traceback
|
| 6 |
+
from typing import Tuple, List
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _run_code_safely(code: str, func_name: str, test_input):
|
| 10 |
+
"""
|
| 11 |
+
Executes the submitted code in an isolated namespace and calls the function.
|
| 12 |
+
Returns (output, error_message).
|
| 13 |
+
"""
|
| 14 |
+
namespace = {}
|
| 15 |
+
try:
|
| 16 |
+
exec(compile(code, "<submitted>", "exec"), namespace)
|
| 17 |
+
except SyntaxError as e:
|
| 18 |
+
return None, f"SyntaxError: {e}"
|
| 19 |
+
except Exception as e:
|
| 20 |
+
return None, f"Compile error: {e}"
|
| 21 |
+
|
| 22 |
+
func = namespace.get(func_name)
|
| 23 |
+
if func is None:
|
| 24 |
+
# Try to find any callable
|
| 25 |
+
funcs = [v for v in namespace.values() if callable(v) and not v.__name__.startswith("_")]
|
| 26 |
+
if not funcs:
|
| 27 |
+
return None, "No callable function found in submitted code."
|
| 28 |
+
func = funcs[0]
|
| 29 |
+
|
| 30 |
+
try:
|
| 31 |
+
if isinstance(test_input, list):
|
| 32 |
+
result = func(*test_input)
|
| 33 |
+
else:
|
| 34 |
+
result = func(test_input)
|
| 35 |
+
return result, None
|
| 36 |
+
except Exception as e:
|
| 37 |
+
return None, f"RuntimeError: {traceback.format_exc(limit=2)}"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _extract_func_name(code: str) -> str:
|
| 41 |
+
"""Extract the first function name defined in the code."""
|
| 42 |
+
for line in code.splitlines():
|
| 43 |
+
line = line.strip()
|
| 44 |
+
if line.startswith("def "):
|
| 45 |
+
return line.split("(")[0].replace("def ", "").strip()
|
| 46 |
+
return "unknown"
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def grade_easy(fixed_code: str, task: dict) -> Tuple[float, int, int, str, List[dict]]:
    """
    Grade an easy task submission.

    The function under test is identified from the task's buggy code when
    available, so a helper defined first in the submission is not graded by
    mistake; the submission itself is the fallback. Float results are compared
    with a small tolerance.

    Returns:
        reward (float): 0.0 to 1.0
        passed (int): number of tests passed
        total (int): total test cases
        feedback (str): detailed feedback message
        results (list): per-test results
    """
    import math

    test_cases = task["test_cases"]
    total = len(test_cases)
    if total == 0:
        # Nothing to run; avoid dividing by zero below.
        return 0.0, 0, 0, "No test cases defined for this task.", []

    func_name = _extract_func_name(task.get("buggy_code") or "")
    if func_name == "unknown":
        func_name = _extract_func_name(fixed_code)

    passed = 0
    results = []
    feedback_lines = []

    for i, tc in enumerate(test_cases):
        inp = tc["input"]
        expected = tc["expected"]
        got, error = _run_code_safely(fixed_code, func_name, inp)

        matches = error is None and got == expected
        if (
            error is None
            and not matches
            and isinstance(expected, float)
            and isinstance(got, (int, float))
            and not isinstance(got, bool)
        ):
            # Tolerate floating-point rounding noise in numeric answers.
            matches = math.isclose(got, expected, rel_tol=1e-9, abs_tol=1e-12)

        if error:
            results.append({"test_id": i + 1, "passed": False, "expected": str(expected), "got": f"ERROR: {error}"})
            feedback_lines.append(f"Test {i+1}: ❌ Error — {error}")
        elif matches:
            passed += 1
            results.append({"test_id": i + 1, "passed": True, "expected": str(expected), "got": str(got)})
            feedback_lines.append(f"Test {i+1}: ✅ Passed — got {got!r}")
        else:
            results.append({"test_id": i + 1, "passed": False, "expected": str(expected), "got": str(got)})
            feedback_lines.append(f"Test {i+1}: ❌ Failed — expected {expected!r}, got {got!r}")

    reward = round(passed / total, 2)
    feedback = "\n".join(feedback_lines)
    if passed == total:
        feedback += "\n🎉 All tests passed! Full reward."
    else:
        feedback += f"\n{passed}/{total} tests passed. Review the failing cases."

    return reward, passed, total, feedback, results
|
server/graders/grader_hard.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/graders/grader_hard.py
|
| 2 |
+
# Grades hard tasks: algorithmic bug + explanation required.
|
| 3 |
+
# Reward = 0.7 * test_score + 0.3 * explanation_score
|
| 4 |
+
|
| 5 |
+
from typing import Tuple, List, Optional
|
| 6 |
+
from .grader_easy import grade_easy
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _score_explanation(explanation: Optional[str], keywords: List[str]) -> Tuple[float, str]:
|
| 10 |
+
"""
|
| 11 |
+
Scores the explanation by checking for required conceptual keywords.
|
| 12 |
+
Returns (score 0.0-1.0, feedback string).
|
| 13 |
+
"""
|
| 14 |
+
if not explanation or len(explanation.strip()) < 10:
|
| 15 |
+
return 0.0, "β No explanation provided. Hard tasks require an explanation field."
|
| 16 |
+
|
| 17 |
+
explanation_lower = explanation.lower()
|
| 18 |
+
hits = [kw for kw in keywords if kw.lower() in explanation_lower]
|
| 19 |
+
score = min(1.0, len(hits) / max(1, len(keywords) // 2)) # need at least half the keywords
|
| 20 |
+
|
| 21 |
+
if score == 1.0:
|
| 22 |
+
feedback = f"β
Explanation excellent! Mentioned key concepts: {', '.join(hits)}"
|
| 23 |
+
elif score > 0:
|
| 24 |
+
feedback = (
|
| 25 |
+
f"β οΈ Partial explanation. Mentioned: {', '.join(hits) if hits else 'none'}. "
|
| 26 |
+
f"Consider discussing: {', '.join(kw for kw in keywords if kw.lower() not in explanation_lower)[:3]}"
|
| 27 |
+
)
|
| 28 |
+
else:
|
| 29 |
+
feedback = (
|
| 30 |
+
f"β Explanation missing key concepts. "
|
| 31 |
+
f"Try to explain: {', '.join(keywords[:3])} in your analysis."
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
return round(score, 2), feedback
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def grade_hard(fixed_code: str, task: dict, explanation: Optional[str] = None) -> Tuple[float, int, int, str, List[dict]]:
    """
    Score a hard-task submission on both its code and its explanation.

    The code part (tests passed) counts for 70% and the explanation part
    (keyword coverage) for 30% of the final reward.

    Returns:
        reward (float): 0.0 to 1.0
        passed (int)
        total (int)
        feedback (str)
        results (list)
    """
    # Code part: same test-based grading as the easy tasks.
    test_reward, passed, total, code_feedback, results = grade_easy(fixed_code, task)

    # Explanation part: keyword coverage.
    exp_score, exp_feedback = _score_explanation(
        explanation, task.get("explanation_keywords", [])
    )

    final_reward = round(0.7 * test_reward + 0.3 * exp_score, 2)

    sections = [
        f"--- Code Score (70% weight): {test_reward:.2f} ---",
        f"{code_feedback}",
        "",
        f"--- Explanation Score (30% weight): {exp_score:.2f} ---",
        f"{exp_feedback}",
        "",
        f"=== Final Reward: {final_reward:.2f} ===",
    ]
    feedback = "\n".join(sections)

    needs_tip = not explanation and passed < total
    if needs_tip:
        feedback += "\n💡 Tip: Fix the code bugs AND provide a clear explanation for max reward."

    return final_reward, passed, total, feedback, results
|
server/graders/grader_medium.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/graders/grader_medium.py
|
| 2 |
+
# Grades medium tasks: 2 bugs, 3 test cases.
|
| 3 |
+
# Same proportional reward as easy but stricter — both bugs must be fixed for full score.
|
| 4 |
+
|
| 5 |
+
from .grader_easy import grade_easy # reuse the same logic
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def grade_medium(fixed_code: str, task: dict):
    """
    Score a medium-task submission.

    Grading is delegated to the easy grader (reward proportional to tests
    passed); a reminder about the two bugs is appended when tests still fail.
    Returns the same tuple: reward, passed, total, feedback, results
    """
    reward, passed, total, feedback, results = grade_easy(fixed_code, task)

    if passed >= total:
        return reward, passed, total, feedback, results

    # Medium-specific hint for incomplete fixes.
    hint = (
        "\n💡 Hint: Medium tasks have TWO bugs. "
        "Make sure you fixed both the primary logic bug AND the edge case."
    )
    return reward, passed, total, feedback + hint, results
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.110.0
|
| 2 |
+
uvicorn[standard]>=0.29.0
|
| 3 |
+
pydantic>=2.0.0
|
| 4 |
+
openai>=1.0.0
|
| 5 |
+
requests>=2.31.0
|
| 6 |
+
openenv-core>=0.2.0
|
server/tasks/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/tasks/__init__.py
|
| 2 |
+
from .task_easy import get_random_easy_task, EASY_TASKS
|
| 3 |
+
from .task_medium import get_random_medium_task, MEDIUM_TASKS
|
| 4 |
+
from .task_hard import get_random_hard_task, HARD_TASKS
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
"get_random_easy_task", "EASY_TASKS",
|
| 8 |
+
"get_random_medium_task", "MEDIUM_TASKS",
|
| 9 |
+
"get_random_hard_task", "HARD_TASKS",
|
| 10 |
+
]
|
server/tasks/__pycache__/__init__.cpython-39.pyc
ADDED
|
Binary file (449 Bytes). View file
|
|
|
server/tasks/__pycache__/task_easy.cpython-39.pyc
ADDED
|
Binary file (7.37 kB). View file
|
|
|
server/tasks/__pycache__/task_hard.cpython-39.pyc
ADDED
|
Binary file (16.5 kB). View file
|
|
|
server/tasks/__pycache__/task_medium.cpython-39.pyc
ADDED
|
Binary file (10.5 kB). View file
|
|
|
server/tasks/task_easy.py
ADDED
|
@@ -0,0 +1,415 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/tasks/task_easy.py
|
| 2 |
+
# 15 single-bug tasks from real-world domains.
|
| 3 |
+
# Each bug is exactly ONE mistake: off-by-one, wrong operator, wrong return, etc.
|
| 4 |
+
|
| 5 |
+
import random
|
| 6 |
+
|
| 7 |
+
EASY_TASKS = [
|
| 8 |
+
{
|
| 9 |
+
"task_id": "easy_001",
|
| 10 |
+
"domain": "data processing",
|
| 11 |
+
"instructions": (
|
| 12 |
+
"The function below is supposed to return the average of a list of numbers. "
|
| 13 |
+
"It has exactly one bug. Fix it."
|
| 14 |
+
),
|
| 15 |
+
"buggy_code": """\
|
| 16 |
+
def average(nums):
|
| 17 |
+
total = 0
|
| 18 |
+
for n in nums:
|
| 19 |
+
total += n
|
| 20 |
+
return total / len(nums) + 1
|
| 21 |
+
""",
|
| 22 |
+
"fixed_code": """\
|
| 23 |
+
def average(nums):
|
| 24 |
+
total = 0
|
| 25 |
+
for n in nums:
|
| 26 |
+
total += n
|
| 27 |
+
return total / len(nums)
|
| 28 |
+
""",
|
| 29 |
+
"test_cases": [
|
| 30 |
+
{"input": [2, 4, 6], "expected": 4.0},
|
| 31 |
+
{"input": [1, 1, 1, 1], "expected": 1.0},
|
| 32 |
+
{"input": [10, 20], "expected": 15.0},
|
| 33 |
+
],
|
| 34 |
+
"test_cases_description": "Checks that average([2,4,6])==4.0, average([1,1,1,1])==1.0, average([10,20])==15.0",
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"task_id": "easy_002",
|
| 38 |
+
"domain": "string processing",
|
| 39 |
+
"instructions": (
|
| 40 |
+
"The function should count how many words are in a sentence. "
|
| 41 |
+
"It has exactly one bug. Fix it."
|
| 42 |
+
),
|
| 43 |
+
"buggy_code": """\
|
| 44 |
+
def count_words(sentence):
|
| 45 |
+
words = sentence.split(' ')
|
| 46 |
+
return len(words) - 1
|
| 47 |
+
""",
|
| 48 |
+
"fixed_code": """\
|
| 49 |
+
def count_words(sentence):
|
| 50 |
+
words = sentence.split()
|
| 51 |
+
return len(words)
|
| 52 |
+
""",
|
| 53 |
+
"test_cases": [
|
| 54 |
+
{"input": "hello world", "expected": 2},
|
| 55 |
+
{"input": "one two three four", "expected": 4},
|
| 56 |
+
{"input": "single", "expected": 1},
|
| 57 |
+
],
|
| 58 |
+
"test_cases_description": "Counts words in a sentence correctly",
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"task_id": "easy_003",
|
| 62 |
+
"domain": "data processing",
|
| 63 |
+
"instructions": (
|
| 64 |
+
"The function should return the maximum value in a list. "
|
| 65 |
+
"It has exactly one bug. Fix it."
|
| 66 |
+
),
|
| 67 |
+
"buggy_code": """\
|
| 68 |
+
def find_max(nums):
|
| 69 |
+
max_val = nums[0]
|
| 70 |
+
for i in range(1, len(nums) + 1):
|
| 71 |
+
if nums[i] > max_val:
|
| 72 |
+
max_val = nums[i]
|
| 73 |
+
return max_val
|
| 74 |
+
""",
|
| 75 |
+
"fixed_code": """\
|
| 76 |
+
def find_max(nums):
|
| 77 |
+
max_val = nums[0]
|
| 78 |
+
for i in range(1, len(nums)):
|
| 79 |
+
if nums[i] > max_val:
|
| 80 |
+
max_val = nums[i]
|
| 81 |
+
return max_val
|
| 82 |
+
""",
|
| 83 |
+
"test_cases": [
|
| 84 |
+
{"input": [3, 1, 4, 1, 5, 9], "expected": 9},
|
| 85 |
+
{"input": [10, 2, 8], "expected": 10},
|
| 86 |
+
{"input": [7], "expected": 7},
|
| 87 |
+
],
|
| 88 |
+
"test_cases_description": "Finds max value in a list without IndexError",
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"task_id": "easy_004",
|
| 92 |
+
"domain": "boolean logic",
|
| 93 |
+
"instructions": (
|
| 94 |
+
"The function checks if a number is even. "
|
| 95 |
+
"It has exactly one bug. Fix it."
|
| 96 |
+
),
|
| 97 |
+
"buggy_code": """\
|
| 98 |
+
def is_even(n):
|
| 99 |
+
return n % 2 == 1
|
| 100 |
+
""",
|
| 101 |
+
"fixed_code": """\
|
| 102 |
+
def is_even(n):
|
| 103 |
+
return n % 2 == 0
|
| 104 |
+
""",
|
| 105 |
+
"test_cases": [
|
| 106 |
+
{"input": 4, "expected": True},
|
| 107 |
+
{"input": 7, "expected": False},
|
| 108 |
+
{"input": 0, "expected": True},
|
| 109 |
+
],
|
| 110 |
+
"test_cases_description": "Correctly identifies even numbers",
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"task_id": "easy_005",
|
| 114 |
+
"domain": "list operations",
|
| 115 |
+
"instructions": (
|
| 116 |
+
"The function should return the second element of a list. "
|
| 117 |
+
"It has exactly one bug. Fix it."
|
| 118 |
+
),
|
| 119 |
+
"buggy_code": """\
|
| 120 |
+
def second_element(lst):
|
| 121 |
+
return lst[2]
|
| 122 |
+
""",
|
| 123 |
+
"fixed_code": """\
|
| 124 |
+
def second_element(lst):
|
| 125 |
+
return lst[1]
|
| 126 |
+
""",
|
| 127 |
+
"test_cases": [
|
| 128 |
+
{"input": [10, 20, 30], "expected": 20},
|
| 129 |
+
{"input": ["a", "b", "c"], "expected": "b"},
|
| 130 |
+
{"input": [99, 100], "expected": 100},
|
| 131 |
+
],
|
| 132 |
+
"test_cases_description": "Returns correct second element (index 1)",
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"task_id": "easy_006",
|
| 136 |
+
"domain": "math",
|
| 137 |
+
"instructions": (
|
| 138 |
+
"The function should compute the factorial of n. "
|
| 139 |
+
"It has exactly one bug. Fix it."
|
| 140 |
+
),
|
| 141 |
+
"buggy_code": """\
|
| 142 |
+
def factorial(n):
|
| 143 |
+
if n == 0:
|
| 144 |
+
return 0
|
| 145 |
+
result = 1
|
| 146 |
+
for i in range(1, n + 1):
|
| 147 |
+
result *= i
|
| 148 |
+
return result
|
| 149 |
+
""",
|
| 150 |
+
"fixed_code": """\
|
| 151 |
+
def factorial(n):
|
| 152 |
+
if n == 0:
|
| 153 |
+
return 1
|
| 154 |
+
result = 1
|
| 155 |
+
for i in range(1, n + 1):
|
| 156 |
+
result *= i
|
| 157 |
+
return result
|
| 158 |
+
""",
|
| 159 |
+
"test_cases": [
|
| 160 |
+
{"input": 0, "expected": 1},
|
| 161 |
+
{"input": 5, "expected": 120},
|
| 162 |
+
{"input": 3, "expected": 6},
|
| 163 |
+
],
|
| 164 |
+
"test_cases_description": "Correct factorial including base case factorial(0)==1",
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"task_id": "easy_007",
|
| 168 |
+
"domain": "string processing",
|
| 169 |
+
"instructions": (
|
| 170 |
+
"The function should check if a string is a palindrome. "
|
| 171 |
+
"It has exactly one bug. Fix it."
|
| 172 |
+
),
|
| 173 |
+
"buggy_code": """\
|
| 174 |
+
def is_palindrome(s):
|
| 175 |
+
return s == s[1:][::-1]
|
| 176 |
+
""",
|
| 177 |
+
"fixed_code": """\
|
| 178 |
+
def is_palindrome(s):
|
| 179 |
+
return s == s[::-1]
|
| 180 |
+
""",
|
| 181 |
+
"test_cases": [
|
| 182 |
+
{"input": "racecar", "expected": True},
|
| 183 |
+
{"input": "hello", "expected": False},
|
| 184 |
+
{"input": "madam", "expected": True},
|
| 185 |
+
],
|
| 186 |
+
"test_cases_description": "Correctly identifies palindromes",
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"task_id": "easy_008",
|
| 190 |
+
"domain": "data processing",
|
| 191 |
+
"instructions": (
|
| 192 |
+
"The function should sum all even numbers in a list. "
|
| 193 |
+
"It has exactly one bug. Fix it."
|
| 194 |
+
),
|
| 195 |
+
"buggy_code": """\
|
| 196 |
+
def sum_evens(nums):
|
| 197 |
+
total = 0
|
| 198 |
+
for n in nums:
|
| 199 |
+
if n % 2 == 1:
|
| 200 |
+
total += n
|
| 201 |
+
return total
|
| 202 |
+
""",
|
| 203 |
+
"fixed_code": """\
|
| 204 |
+
def sum_evens(nums):
|
| 205 |
+
total = 0
|
| 206 |
+
for n in nums:
|
| 207 |
+
if n % 2 == 0:
|
| 208 |
+
total += n
|
| 209 |
+
return total
|
| 210 |
+
""",
|
| 211 |
+
"test_cases": [
|
| 212 |
+
{"input": [1, 2, 3, 4, 5, 6], "expected": 12},
|
| 213 |
+
{"input": [1, 3, 5], "expected": 0},
|
| 214 |
+
{"input": [2, 4], "expected": 6},
|
| 215 |
+
],
|
| 216 |
+
"test_cases_description": "Sums only even numbers",
|
| 217 |
+
},
|
| 218 |
+
{
|
| 219 |
+
"task_id": "easy_009",
|
| 220 |
+
"domain": "list operations",
|
| 221 |
+
"instructions": (
|
| 222 |
+
"The function should reverse a string. "
|
| 223 |
+
"It has exactly one bug. Fix it."
|
| 224 |
+
),
|
| 225 |
+
"buggy_code": """\
|
| 226 |
+
def reverse_string(s):
|
| 227 |
+
return s[1:][::-1]
|
| 228 |
+
""",
|
| 229 |
+
"fixed_code": """\
|
| 230 |
+
def reverse_string(s):
|
| 231 |
+
return s[::-1]
|
| 232 |
+
""",
|
| 233 |
+
"test_cases": [
|
| 234 |
+
{"input": "hello", "expected": "olleh"},
|
| 235 |
+
{"input": "abc", "expected": "cba"},
|
| 236 |
+
{"input": "x", "expected": "x"},
|
| 237 |
+
],
|
| 238 |
+
"test_cases_description": "Reverses a string correctly",
|
| 239 |
+
},
|
| 240 |
+
{
|
| 241 |
+
"task_id": "easy_010",
|
| 242 |
+
"domain": "data processing",
|
| 243 |
+
"instructions": (
|
| 244 |
+
"The function should return the minimum value from a list. "
|
| 245 |
+
"It has exactly one bug. Fix it."
|
| 246 |
+
),
|
| 247 |
+
"buggy_code": """\
|
| 248 |
+
def find_min(nums):
|
| 249 |
+
min_val = nums[0]
|
| 250 |
+
for n in nums:
|
| 251 |
+
if n > min_val:
|
| 252 |
+
min_val = n
|
| 253 |
+
return min_val
|
| 254 |
+
""",
|
| 255 |
+
"fixed_code": """\
|
| 256 |
+
def find_min(nums):
|
| 257 |
+
min_val = nums[0]
|
| 258 |
+
for n in nums:
|
| 259 |
+
if n < min_val:
|
| 260 |
+
min_val = n
|
| 261 |
+
return min_val
|
| 262 |
+
""",
|
| 263 |
+
"test_cases": [
|
| 264 |
+
{"input": [3, 1, 4, 1, 5], "expected": 1},
|
| 265 |
+
{"input": [10, 2, 8], "expected": 2},
|
| 266 |
+
{"input": [-5, 0, 5], "expected": -5},
|
| 267 |
+
],
|
| 268 |
+
"test_cases_description": "Finds minimum value in a list",
|
| 269 |
+
},
|
| 270 |
+
{
|
| 271 |
+
"task_id": "easy_011",
|
| 272 |
+
"domain": "math",
|
| 273 |
+
"instructions": (
|
| 274 |
+
"The function should check if a number is prime. "
|
| 275 |
+
"It has exactly one bug. Fix it."
|
| 276 |
+
),
|
| 277 |
+
"buggy_code": """\
|
| 278 |
+
def is_prime(n):
|
| 279 |
+
if n < 2:
|
| 280 |
+
return False
|
| 281 |
+
for i in range(2, n):
|
| 282 |
+
if n % i == 0:
|
| 283 |
+
return True
|
| 284 |
+
return False
|
| 285 |
+
""",
|
| 286 |
+
"fixed_code": """\
|
| 287 |
+
def is_prime(n):
|
| 288 |
+
if n < 2:
|
| 289 |
+
return False
|
| 290 |
+
for i in range(2, n):
|
| 291 |
+
if n % i == 0:
|
| 292 |
+
return False
|
| 293 |
+
return True
|
| 294 |
+
""",
|
| 295 |
+
"test_cases": [
|
| 296 |
+
{"input": 7, "expected": True},
|
| 297 |
+
{"input": 4, "expected": False},
|
| 298 |
+
{"input": 13, "expected": True},
|
| 299 |
+
],
|
| 300 |
+
"test_cases_description": "Correctly identifies prime numbers",
|
| 301 |
+
},
|
| 302 |
+
{
|
| 303 |
+
"task_id": "easy_012",
|
| 304 |
+
"domain": "list operations",
|
| 305 |
+
"instructions": (
|
| 306 |
+
"The function should remove duplicates from a list while preserving order. "
|
| 307 |
+
"It has exactly one bug. Fix it."
|
| 308 |
+
),
|
| 309 |
+
"buggy_code": """\
|
| 310 |
+
def remove_duplicates(lst):
|
| 311 |
+
seen = set()
|
| 312 |
+
result = []
|
| 313 |
+
for item in lst:
|
| 314 |
+
if item in seen:
|
| 315 |
+
result.append(item)
|
| 316 |
+
seen.add(item)
|
| 317 |
+
return result
|
| 318 |
+
""",
|
| 319 |
+
"fixed_code": """\
|
| 320 |
+
def remove_duplicates(lst):
|
| 321 |
+
seen = set()
|
| 322 |
+
result = []
|
| 323 |
+
for item in lst:
|
| 324 |
+
if item not in seen:
|
| 325 |
+
result.append(item)
|
| 326 |
+
seen.add(item)
|
| 327 |
+
return result
|
| 328 |
+
""",
|
| 329 |
+
"test_cases": [
|
| 330 |
+
{"input": [1, 2, 2, 3, 3, 3], "expected": [1, 2, 3]},
|
| 331 |
+
{"input": ["a", "b", "a"], "expected": ["a", "b"]},
|
| 332 |
+
{"input": [1], "expected": [1]},
|
| 333 |
+
],
|
| 334 |
+
"test_cases_description": "Removes duplicates while preserving order",
|
| 335 |
+
},
|
| 336 |
+
{
|
| 337 |
+
"task_id": "easy_013",
|
| 338 |
+
"domain": "string processing",
|
| 339 |
+
"instructions": (
|
| 340 |
+
"The function should capitalize the first letter of every word. "
|
| 341 |
+
"It has exactly one bug. Fix it."
|
| 342 |
+
),
|
| 343 |
+
"buggy_code": """\
|
| 344 |
+
def title_case(sentence):
|
| 345 |
+
return sentence.lower()
|
| 346 |
+
""",
|
| 347 |
+
"fixed_code": """\
|
| 348 |
+
def title_case(sentence):
|
| 349 |
+
return sentence.title()
|
| 350 |
+
""",
|
| 351 |
+
"test_cases": [
|
| 352 |
+
{"input": "hello world", "expected": "Hello World"},
|
| 353 |
+
{"input": "the quick brown fox", "expected": "The Quick Brown Fox"},
|
| 354 |
+
{"input": "python", "expected": "Python"},
|
| 355 |
+
],
|
| 356 |
+
"test_cases_description": "Converts sentence to title case",
|
| 357 |
+
},
|
| 358 |
+
{
|
| 359 |
+
"task_id": "easy_014",
|
| 360 |
+
"domain": "data processing",
|
| 361 |
+
"instructions": (
|
| 362 |
+
"The function should return the length of the longest word in a sentence. "
|
| 363 |
+
"It has exactly one bug. Fix it."
|
| 364 |
+
),
|
| 365 |
+
"buggy_code": """\
|
| 366 |
+
def longest_word_length(sentence):
|
| 367 |
+
words = sentence.split()
|
| 368 |
+
return min(len(w) for w in words)
|
| 369 |
+
""",
|
| 370 |
+
"fixed_code": """\
|
| 371 |
+
def longest_word_length(sentence):
|
| 372 |
+
words = sentence.split()
|
| 373 |
+
return max(len(w) for w in words)
|
| 374 |
+
""",
|
| 375 |
+
"test_cases": [
|
| 376 |
+
{"input": "hello world", "expected": 5},
|
| 377 |
+
{"input": "I am learning Python programming", "expected": 11},
|
| 378 |
+
{"input": "cat", "expected": 3},
|
| 379 |
+
],
|
| 380 |
+
"test_cases_description": "Returns length of the longest word",
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"task_id": "easy_015",
|
| 384 |
+
"domain": "math",
|
| 385 |
+
"instructions": (
|
| 386 |
+
"The function should return n raised to the power of 2. "
|
| 387 |
+
"It has exactly one bug. Fix it."
|
| 388 |
+
),
|
| 389 |
+
"buggy_code": """\
|
| 390 |
+
def square(n):
|
| 391 |
+
return n * 3
|
| 392 |
+
""",
|
| 393 |
+
"fixed_code": """\
|
| 394 |
+
def square(n):
|
| 395 |
+
return n * n
|
| 396 |
+
""",
|
| 397 |
+
"test_cases": [
|
| 398 |
+
{"input": 4, "expected": 16},
|
| 399 |
+
{"input": 0, "expected": 0},
|
| 400 |
+
{"input": -3, "expected": 9},
|
| 401 |
+
],
|
| 402 |
+
"test_cases_description": "Returns n squared",
|
| 403 |
+
},
|
| 404 |
+
]
|
| 405 |
+
|
| 406 |
+
|
| 407 |
+
def get_random_easy_task() -> dict:
    """Return an independent copy of a randomly chosen easy task.

    A deep copy is returned so that a caller mutating nested values (for
    example the ``test_cases`` list) cannot alter the shared ``EASY_TASKS``
    definitions that later calls draw from.

    Returns:
        A new dict with the same keys and values as the selected task.
    """
    # Function-scope import keeps this change self-contained.
    import copy

    return copy.deepcopy(random.choice(EASY_TASKS))
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
def get_task_by_id(task_id: str) -> dict:
    """Return an independent copy of the easy task whose ID is ``task_id``.

    If no task in ``EASY_TASKS`` has the requested ID, a randomly chosen
    easy task is returned instead of raising; this fallback is preserved
    so existing callers keep working for unknown IDs.

    A deep copy is returned so that a caller mutating nested values (for
    example the ``test_cases`` list) cannot alter the shared ``EASY_TASKS``
    definitions.

    Args:
        task_id: Identifier to look up, compared against each task's
            ``"task_id"`` value.

    Returns:
        A new dict describing the matching task, or a random task when
        there is no match.
    """
    # Function-scope import keeps this change self-contained.
    import copy

    for task in EASY_TASKS:
        if task["task_id"] == task_id:
            return copy.deepcopy(task)
    # Unknown ID: fall back to a random task rather than raising.
    return copy.deepcopy(random.choice(EASY_TASKS))
|
server/tasks/task_hard.py
ADDED
|
@@ -0,0 +1,628 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/tasks/task_hard.py
|
| 2 |
+
# 15 hard tasks: algorithmic bugs + agent must explain what was wrong.
|
| 3 |
+
# Reward is based on test pass rate PLUS explanation quality.
|
| 4 |
+
|
| 5 |
+
import random
|
| 6 |
+
|
| 7 |
+
HARD_TASKS = [
|
| 8 |
+
{
|
| 9 |
+
"task_id": "hard_001",
|
| 10 |
+
"domain": "sorting algorithm",
|
| 11 |
+
"instructions": (
|
| 12 |
+
"The function implements bubble sort but is broken. "
|
| 13 |
+
"Fix the algorithm AND explain what was wrong in your 'explanation' field. "
|
| 14 |
+
"Explanation must mention: loop range, boundary, or swap logic."
|
| 15 |
+
),
|
| 16 |
+
"buggy_code": """\
|
| 17 |
+
def bubble_sort(arr):
|
| 18 |
+
n = len(arr)
|
| 19 |
+
for i in range(n):
|
| 20 |
+
for j in range(n - i):
|
| 21 |
+
if arr[j] > arr[j + 1]:
|
| 22 |
+
arr[j], arr[j + 1] = arr[j + 1], arr[j]
|
| 23 |
+
return arr
|
| 24 |
+
""",
|
| 25 |
+
"fixed_code": """\
|
| 26 |
+
def bubble_sort(arr):
|
| 27 |
+
n = len(arr)
|
| 28 |
+
for i in range(n):
|
| 29 |
+
for j in range(n - i - 1):
|
| 30 |
+
if arr[j] > arr[j + 1]:
|
| 31 |
+
arr[j], arr[j + 1] = arr[j + 1], arr[j]
|
| 32 |
+
return arr
|
| 33 |
+
""",
|
| 34 |
+
"explanation_keywords": ["boundary", "index", "range", "n - i - 1", "out of bounds", "last element"],
|
| 35 |
+
"test_cases": [
|
| 36 |
+
{"input": [64, 34, 25, 12, 22, 11, 90], "expected": [11, 12, 22, 25, 34, 64, 90]},
|
| 37 |
+
{"input": [5, 1, 4, 2, 8], "expected": [1, 2, 4, 5, 8]},
|
| 38 |
+
{"input": [1], "expected": [1]},
|
| 39 |
+
],
|
| 40 |
+
"test_cases_description": "Bubble sort with correct inner loop boundary (n - i - 1)",
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"task_id": "hard_002",
|
| 44 |
+
"domain": "dynamic programming",
|
| 45 |
+
"instructions": (
|
| 46 |
+
"The function computes the longest increasing subsequence (LIS) length. "
|
| 47 |
+
"Fix the algorithm AND explain what was wrong. "
|
| 48 |
+
"Explanation must mention: initialization, dp transition, or base case."
|
| 49 |
+
),
|
| 50 |
+
"buggy_code": """\
|
| 51 |
+
def lis_length(nums):
|
| 52 |
+
if not nums:
|
| 53 |
+
return 0
|
| 54 |
+
dp = [0] * len(nums)
|
| 55 |
+
for i in range(len(nums)):
|
| 56 |
+
for j in range(i):
|
| 57 |
+
if nums[j] < nums[i]:
|
| 58 |
+
dp[i] = max(dp[i], dp[j] + 1)
|
| 59 |
+
return max(dp)
|
| 60 |
+
""",
|
| 61 |
+
"fixed_code": """\
|
| 62 |
+
def lis_length(nums):
|
| 63 |
+
if not nums:
|
| 64 |
+
return 0
|
| 65 |
+
dp = [1] * len(nums)
|
| 66 |
+
for i in range(len(nums)):
|
| 67 |
+
for j in range(i):
|
| 68 |
+
if nums[j] < nums[i]:
|
| 69 |
+
dp[i] = max(dp[i], dp[j] + 1)
|
| 70 |
+
return max(dp)
|
| 71 |
+
""",
|
| 72 |
+
"explanation_keywords": ["initialization", "base case", "dp[i]", "1", "zero", "initial value"],
|
| 73 |
+
"test_cases": [
|
| 74 |
+
{"input": [10, 9, 2, 5, 3, 7, 101, 18], "expected": 4},
|
| 75 |
+
{"input": [0, 1, 0, 3, 2, 3], "expected": 4},
|
| 76 |
+
{"input": [7, 7, 7, 7], "expected": 1},
|
| 77 |
+
],
|
| 78 |
+
"test_cases_description": "LIS with dp initialized to 1 (not 0)",
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"task_id": "hard_003",
|
| 82 |
+
"domain": "binary search",
|
| 83 |
+
"instructions": (
|
| 84 |
+
"The function does binary search on a sorted list. "
|
| 85 |
+
"Fix the algorithm AND explain what was wrong. "
|
| 86 |
+
"Explanation must mention: mid calculation, overflow, boundary, or infinite loop."
|
| 87 |
+
),
|
| 88 |
+
"buggy_code": """\
|
| 89 |
+
def binary_search(arr, target):
|
| 90 |
+
low, high = 0, len(arr)
|
| 91 |
+
while low < high:
|
| 92 |
+
mid = (low + high) // 2
|
| 93 |
+
if arr[mid] == target:
|
| 94 |
+
return mid
|
| 95 |
+
elif arr[mid] < target:
|
| 96 |
+
low = mid
|
| 97 |
+
else:
|
| 98 |
+
high = mid - 1
|
| 99 |
+
return -1
|
| 100 |
+
""",
|
| 101 |
+
"fixed_code": """\
|
| 102 |
+
def binary_search(arr, target):
|
| 103 |
+
low, high = 0, len(arr) - 1
|
| 104 |
+
while low <= high:
|
| 105 |
+
mid = (low + high) // 2
|
| 106 |
+
if arr[mid] == target:
|
| 107 |
+
return mid
|
| 108 |
+
elif arr[mid] < target:
|
| 109 |
+
low = mid + 1
|
| 110 |
+
else:
|
| 111 |
+
high = mid - 1
|
| 112 |
+
return -1
|
| 113 |
+
""",
|
| 114 |
+
"explanation_keywords": ["high", "len - 1", "low = mid", "infinite loop", "boundary", "off-by-one"],
|
| 115 |
+
"test_cases": [
|
| 116 |
+
{"input": [[1, 3, 5, 7, 9], 7], "expected": 3},
|
| 117 |
+
{"input": [[1, 3, 5, 7, 9], 1], "expected": 0},
|
| 118 |
+
{"input": [[1, 3, 5, 7, 9], 6], "expected": -1},
|
| 119 |
+
],
|
| 120 |
+
"test_cases_description": "Binary search: high = len-1, low = mid+1, while low <= high",
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"task_id": "hard_004",
|
| 124 |
+
"domain": "dynamic programming",
|
| 125 |
+
"instructions": (
|
| 126 |
+
"The function computes the minimum number of coins to make 'amount'. "
|
| 127 |
+
"Fix the algorithm AND explain what was wrong. "
|
| 128 |
+
"Explanation must mention: initialization, infinity, dp table, or base case."
|
| 129 |
+
),
|
| 130 |
+
"buggy_code": """\
|
| 131 |
+
def coin_change(coins, amount):
|
| 132 |
+
dp = [0] * (amount + 1)
|
| 133 |
+
dp[0] = 0
|
| 134 |
+
for i in range(1, amount + 1):
|
| 135 |
+
for coin in coins:
|
| 136 |
+
if coin <= i:
|
| 137 |
+
dp[i] = min(dp[i], dp[i - coin] + 1)
|
| 138 |
+
return dp[amount] if dp[amount] != 0 else -1
|
| 139 |
+
""",
|
| 140 |
+
"fixed_code": """\
|
| 141 |
+
def coin_change(coins, amount):
|
| 142 |
+
dp = [float('inf')] * (amount + 1)
|
| 143 |
+
dp[0] = 0
|
| 144 |
+
for i in range(1, amount + 1):
|
| 145 |
+
for coin in coins:
|
| 146 |
+
if coin <= i:
|
| 147 |
+
dp[i] = min(dp[i], dp[i - coin] + 1)
|
| 148 |
+
return dp[amount] if dp[amount] != float('inf') else -1
|
| 149 |
+
""",
|
| 150 |
+
"explanation_keywords": ["infinity", "inf", "initialization", "0 instead of inf", "unreachable", "base"],
|
| 151 |
+
"test_cases": [
|
| 152 |
+
{"input": [[1, 5, 6, 9], 11], "expected": 2},
|
| 153 |
+
{"input": [[2], 3], "expected": -1},
|
| 154 |
+
{"input": [[1, 2, 5], 11], "expected": 3},
|
| 155 |
+
],
|
| 156 |
+
"test_cases_description": "Coin change DP: initialized to inf, not 0",
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"task_id": "hard_005",
|
| 160 |
+
"domain": "graph algorithm",
|
| 161 |
+
"instructions": (
|
| 162 |
+
"The function checks if a directed graph has a cycle using DFS. "
|
| 163 |
+
"Fix it AND explain what was wrong. "
|
| 164 |
+
"Explanation must mention: visited, recursion stack, back edge, or state."
|
| 165 |
+
),
|
| 166 |
+
"buggy_code": """\
|
| 167 |
+
def has_cycle(graph):
|
| 168 |
+
visited = set()
|
| 169 |
+
|
| 170 |
+
def dfs(node):
|
| 171 |
+
visited.add(node)
|
| 172 |
+
for neighbor in graph.get(node, []):
|
| 173 |
+
if neighbor in visited:
|
| 174 |
+
return True
|
| 175 |
+
if dfs(neighbor):
|
| 176 |
+
return True
|
| 177 |
+
return False
|
| 178 |
+
|
| 179 |
+
for node in graph:
|
| 180 |
+
if node not in visited:
|
| 181 |
+
if dfs(node):
|
| 182 |
+
return True
|
| 183 |
+
return False
|
| 184 |
+
""",
|
| 185 |
+
"fixed_code": """\
|
| 186 |
+
def has_cycle(graph):
|
| 187 |
+
visited = set()
|
| 188 |
+
rec_stack = set()
|
| 189 |
+
|
| 190 |
+
def dfs(node):
|
| 191 |
+
visited.add(node)
|
| 192 |
+
rec_stack.add(node)
|
| 193 |
+
for neighbor in graph.get(node, []):
|
| 194 |
+
if neighbor not in visited:
|
| 195 |
+
if dfs(neighbor):
|
| 196 |
+
return True
|
| 197 |
+
elif neighbor in rec_stack:
|
| 198 |
+
return True
|
| 199 |
+
rec_stack.remove(node)
|
| 200 |
+
return False
|
| 201 |
+
|
| 202 |
+
for node in graph:
|
| 203 |
+
if node not in visited:
|
| 204 |
+
if dfs(node):
|
| 205 |
+
return True
|
| 206 |
+
return False
|
| 207 |
+
""",
|
| 208 |
+
"explanation_keywords": ["recursion stack", "rec_stack", "back edge", "visited", "false positive", "path"],
|
| 209 |
+
"test_cases": [
|
| 210 |
+
{"input": {"A": ["B"], "B": ["C"], "C": ["A"]}, "expected": True},
|
| 211 |
+
{"input": {"A": ["B"], "B": ["C"], "C": []}, "expected": False},
|
| 212 |
+
{"input": {"A": ["B", "C"], "B": ["D"], "C": ["D"], "D": []}, "expected": False},
|
| 213 |
+
],
|
| 214 |
+
"test_cases_description": "Cycle detection in directed graph using recursion stack",
|
| 215 |
+
},
|
| 216 |
+
{
|
| 217 |
+
"task_id": "hard_006",
|
| 218 |
+
"domain": "dynamic programming",
|
| 219 |
+
"instructions": (
|
| 220 |
+
"The function computes the maximum subarray sum (Kadane's algorithm). "
|
| 221 |
+
"Fix it AND explain what was wrong. "
|
| 222 |
+
"Explanation must mention: initialization, negative numbers, current_sum, or reset."
|
| 223 |
+
),
|
| 224 |
+
"buggy_code": """\
|
| 225 |
+
def max_subarray(nums):
|
| 226 |
+
max_sum = 0
|
| 227 |
+
current_sum = 0
|
| 228 |
+
for n in nums:
|
| 229 |
+
current_sum = max(n, current_sum + n)
|
| 230 |
+
max_sum = max(max_sum, current_sum)
|
| 231 |
+
return max_sum
|
| 232 |
+
""",
|
| 233 |
+
"fixed_code": """\
|
| 234 |
+
def max_subarray(nums):
|
| 235 |
+
max_sum = nums[0]
|
| 236 |
+
current_sum = nums[0]
|
| 237 |
+
for n in nums[1:]:
|
| 238 |
+
current_sum = max(n, current_sum + n)
|
| 239 |
+
max_sum = max(max_sum, current_sum)
|
| 240 |
+
return max_sum
|
| 241 |
+
""",
|
| 242 |
+
"explanation_keywords": ["initialization", "negative", "nums[0]", "all negative", "zero", "initial"],
|
| 243 |
+
"test_cases": [
|
| 244 |
+
{"input": [-2, 1, -3, 4, -1, 2, 1, -5, 4], "expected": 6},
|
| 245 |
+
{"input": [-1, -2, -3, -4], "expected": -1},
|
| 246 |
+
{"input": [1], "expected": 1},
|
| 247 |
+
],
|
| 248 |
+
"test_cases_description": "Kadane's algorithm handles all-negative arrays",
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"task_id": "hard_007",
|
| 252 |
+
"domain": "string algorithm",
|
| 253 |
+
"instructions": (
|
| 254 |
+
"The function checks if a string has balanced brackets. "
|
| 255 |
+
"Fix it AND explain what was wrong. "
|
| 256 |
+
"Explanation must mention: stack, matching, empty stack, or closing bracket."
|
| 257 |
+
),
|
| 258 |
+
"buggy_code": """\
|
| 259 |
+
def is_balanced(s):
|
| 260 |
+
stack = []
|
| 261 |
+
matching = {')': '(', ']': '[', '}': '{'}
|
| 262 |
+
for ch in s:
|
| 263 |
+
if ch in '([{':
|
| 264 |
+
stack.append(ch)
|
| 265 |
+
elif ch in ')]}':
|
| 266 |
+
if stack and stack[-1] == matching[ch]:
|
| 267 |
+
stack.pop()
|
| 268 |
+
return len(stack) == 0
|
| 269 |
+
""",
|
| 270 |
+
"fixed_code": """\
|
| 271 |
+
def is_balanced(s):
|
| 272 |
+
stack = []
|
| 273 |
+
matching = {')': '(', ']': '[', '}': '{'}
|
| 274 |
+
for ch in s:
|
| 275 |
+
if ch in '([{':
|
| 276 |
+
stack.append(ch)
|
| 277 |
+
elif ch in ')]}':
|
| 278 |
+
if not stack or stack[-1] != matching[ch]:
|
| 279 |
+
return False
|
| 280 |
+
stack.pop()
|
| 281 |
+
return len(stack) == 0
|
| 282 |
+
""",
|
| 283 |
+
"explanation_keywords": ["stack", "empty stack", "mismatch", "not stack", "early return", "closing"],
|
| 284 |
+
"test_cases": [
|
| 285 |
+
{"input": "([{}])", "expected": True},
|
| 286 |
+
{"input": "([)]", "expected": False},
|
| 287 |
+
{"input": "]", "expected": False},
|
| 288 |
+
],
|
| 289 |
+
"test_cases_description": "Balanced brackets: early return False on mismatch or empty stack",
|
| 290 |
+
},
|
| 291 |
+
{
|
| 292 |
+
"task_id": "hard_008",
|
| 293 |
+
"domain": "dynamic programming",
|
| 294 |
+
"instructions": (
|
| 295 |
+
"The function computes the number of ways to climb n stairs (1 or 2 steps at a time). "
|
| 296 |
+
"Fix it AND explain what was wrong. "
|
| 297 |
+
"Explanation must mention: base case, dp, index, or off-by-one."
|
| 298 |
+
),
|
| 299 |
+
"buggy_code": """\
|
| 300 |
+
def climb_stairs(n):
|
| 301 |
+
if n <= 0:
|
| 302 |
+
return 0
|
| 303 |
+
dp = [0] * (n + 1)
|
| 304 |
+
dp[0] = 1
|
| 305 |
+
dp[1] = 1
|
| 306 |
+
for i in range(3, n + 1):
|
| 307 |
+
dp[i] = dp[i - 1] + dp[i - 2]
|
| 308 |
+
return dp[n]
|
| 309 |
+
""",
|
| 310 |
+
"fixed_code": """\
|
| 311 |
+
def climb_stairs(n):
|
| 312 |
+
if n <= 0:
|
| 313 |
+
return 0
|
| 314 |
+
dp = [0] * (n + 1)
|
| 315 |
+
dp[0] = 1
|
| 316 |
+
dp[1] = 1
|
| 317 |
+
for i in range(2, n + 1):
|
| 318 |
+
dp[i] = dp[i - 1] + dp[i - 2]
|
| 319 |
+
return dp[n]
|
| 320 |
+
""",
|
| 321 |
+
"explanation_keywords": ["range", "starts at 3", "range(2", "off-by-one", "dp[2]", "skipped"],
|
| 322 |
+
"test_cases": [
|
| 323 |
+
{"input": 2, "expected": 2},
|
| 324 |
+
{"input": 3, "expected": 3},
|
| 325 |
+
{"input": 5, "expected": 8},
|
| 326 |
+
],
|
| 327 |
+
"test_cases_description": "Climb stairs DP: loop starts at range(2, ...) not range(3, ...)",
|
| 328 |
+
},
|
| 329 |
+
{
|
| 330 |
+
"task_id": "hard_009",
|
| 331 |
+
"domain": "data processing",
|
| 332 |
+
"instructions": (
|
| 333 |
+
"The function implements quicksort. "
|
| 334 |
+
"Fix it AND explain what was wrong. "
|
| 335 |
+
"Explanation must mention: pivot, partition, recursion, or base case."
|
| 336 |
+
),
|
| 337 |
+
"buggy_code": """\
|
| 338 |
+
def quicksort(arr):
|
| 339 |
+
if len(arr) <= 1:
|
| 340 |
+
return arr
|
| 341 |
+
pivot = arr[0]
|
| 342 |
+
left = [x for x in arr if x < pivot]
|
| 343 |
+
right = [x for x in arr if x > pivot]
|
| 344 |
+
return quicksort(left) + [pivot] + quicksort(right)
|
| 345 |
+
""",
|
| 346 |
+
"fixed_code": """\
|
| 347 |
+
def quicksort(arr):
|
| 348 |
+
if len(arr) <= 1:
|
| 349 |
+
return arr
|
| 350 |
+
pivot = arr[0]
|
| 351 |
+
left = [x for x in arr[1:] if x <= pivot]
|
| 352 |
+
right = [x for x in arr[1:] if x > pivot]
|
| 353 |
+
return quicksort(left) + [pivot] + quicksort(right)
|
| 354 |
+
""",
|
| 355 |
+
"explanation_keywords": ["duplicate", "arr[1:]", "pivot included", "equal", "lost", "missing"],
|
| 356 |
+
"test_cases": [
|
| 357 |
+
{"input": [3, 6, 8, 10, 1, 2, 1], "expected": [1, 1, 2, 3, 6, 8, 10]},
|
| 358 |
+
{"input": [5, 5, 5], "expected": [5, 5, 5]},
|
| 359 |
+
{"input": [1], "expected": [1]},
|
| 360 |
+
],
|
| 361 |
+
"test_cases_description": "Quicksort handles duplicates: arr[1:] and x <= pivot",
|
| 362 |
+
},
|
| 363 |
+
{
|
| 364 |
+
"task_id": "hard_010",
|
| 365 |
+
"domain": "graph algorithm",
|
| 366 |
+
"instructions": (
|
| 367 |
+
"The function finds the shortest path length in an unweighted graph using BFS. "
|
| 368 |
+
"Fix it AND explain what was wrong. "
|
| 369 |
+
"Explanation must mention: visited, queue, infinite loop, or distance tracking."
|
| 370 |
+
),
|
| 371 |
+
"buggy_code": """\
|
| 372 |
+
from collections import deque
|
| 373 |
+
|
| 374 |
+
def bfs_shortest_path(graph, start, end):
|
| 375 |
+
queue = deque([(start, 0)])
|
| 376 |
+
while queue:
|
| 377 |
+
node, dist = queue.popleft()
|
| 378 |
+
if node == end:
|
| 379 |
+
return dist
|
| 380 |
+
for neighbor in graph.get(node, []):
|
| 381 |
+
queue.append((neighbor, dist + 1))
|
| 382 |
+
return -1
|
| 383 |
+
""",
|
| 384 |
+
"fixed_code": """\
|
| 385 |
+
from collections import deque
|
| 386 |
+
|
| 387 |
+
def bfs_shortest_path(graph, start, end):
|
| 388 |
+
visited = set([start])
|
| 389 |
+
queue = deque([(start, 0)])
|
| 390 |
+
while queue:
|
| 391 |
+
node, dist = queue.popleft()
|
| 392 |
+
if node == end:
|
| 393 |
+
return dist
|
| 394 |
+
for neighbor in graph.get(node, []):
|
| 395 |
+
if neighbor not in visited:
|
| 396 |
+
visited.add(neighbor)
|
| 397 |
+
queue.append((neighbor, dist + 1))
|
| 398 |
+
return -1
|
| 399 |
+
""",
|
| 400 |
+
"explanation_keywords": ["visited", "infinite loop", "revisit", "cycle", "set", "already visited"],
|
| 401 |
+
"test_cases": [
|
| 402 |
+
{"input": [{"A": ["B", "C"], "B": ["D"], "C": ["D"], "D": []}, "A", "D"], "expected": 2},
|
| 403 |
+
{"input": [{"A": ["B"], "B": ["A"]}, "A", "B"], "expected": 1},
|
| 404 |
+
{"input": [{"A": ["B"]}, "A", "C"], "expected": -1},
|
| 405 |
+
],
|
| 406 |
+
"test_cases_description": "BFS shortest path with visited set to prevent revisiting",
|
| 407 |
+
},
|
| 408 |
+
{
|
| 409 |
+
"task_id": "hard_011",
|
| 410 |
+
"domain": "dynamic programming",
|
| 411 |
+
"instructions": (
|
| 412 |
+
"The function computes the 0/1 knapsack maximum value. "
|
| 413 |
+
"Fix it AND explain what was wrong. "
|
| 414 |
+
"Explanation must mention: capacity, dp table, iteration order, or overwrite."
|
| 415 |
+
),
|
| 416 |
+
"buggy_code": """\
|
| 417 |
+
def knapsack(weights, values, capacity):
|
| 418 |
+
n = len(weights)
|
| 419 |
+
dp = [0] * (capacity + 1)
|
| 420 |
+
for i in range(n):
|
| 421 |
+
for w in range(weights[i], capacity + 1):
|
| 422 |
+
dp[w] = max(dp[w], dp[w - weights[i]] + values[i])
|
| 423 |
+
return dp[capacity]
|
| 424 |
+
""",
|
| 425 |
+
"fixed_code": """\
|
| 426 |
+
def knapsack(weights, values, capacity):
|
| 427 |
+
n = len(weights)
|
| 428 |
+
dp = [0] * (capacity + 1)
|
| 429 |
+
for i in range(n):
|
| 430 |
+
for w in range(capacity, weights[i] - 1, -1):
|
| 431 |
+
dp[w] = max(dp[w], dp[w - weights[i]] + values[i])
|
| 432 |
+
return dp[capacity]
|
| 433 |
+
""",
|
| 434 |
+
"explanation_keywords": ["reverse", "backward", "overwrite", "0/1", "unbounded", "iteration order", "right to left"],
|
| 435 |
+
"test_cases": [
|
| 436 |
+
{"input": [[2, 3, 4, 5], [3, 4, 5, 6], 5], "expected": 7},
|
| 437 |
+
{"input": [[1, 2, 3], [6, 10, 12], 5], "expected": 22},
|
| 438 |
+
{"input": [[5], [10], 3], "expected": 0},
|
| 439 |
+
],
|
| 440 |
+
"test_cases_description": "0/1 Knapsack: inner loop must go backward to avoid using item twice",
|
| 441 |
+
},
|
| 442 |
+
{
|
| 443 |
+
"task_id": "hard_012",
|
| 444 |
+
"domain": "string algorithm",
|
| 445 |
+
"instructions": (
|
| 446 |
+
"The function finds the length of the longest substring without repeating characters. "
|
| 447 |
+
"Fix it AND explain what was wrong. "
|
| 448 |
+
"Explanation must mention: window, pointer, index, or update."
|
| 449 |
+
),
|
| 450 |
+
"buggy_code": """\
|
| 451 |
+
def length_of_longest_substring(s):
|
| 452 |
+
char_index = {}
|
| 453 |
+
left = 0
|
| 454 |
+
max_len = 0
|
| 455 |
+
for right, ch in enumerate(s):
|
| 456 |
+
if ch in char_index:
|
| 457 |
+
left = char_index[ch] + 1
|
| 458 |
+
char_index[ch] = right
|
| 459 |
+
max_len = max(max_len, right - left + 1)
|
| 460 |
+
return max_len
|
| 461 |
+
""",
|
| 462 |
+
"fixed_code": """\
|
| 463 |
+
def length_of_longest_substring(s):
|
| 464 |
+
char_index = {}
|
| 465 |
+
left = 0
|
| 466 |
+
max_len = 0
|
| 467 |
+
for right, ch in enumerate(s):
|
| 468 |
+
if ch in char_index and char_index[ch] >= left:
|
| 469 |
+
left = char_index[ch] + 1
|
| 470 |
+
char_index[ch] = right
|
| 471 |
+
max_len = max(max_len, right - left + 1)
|
| 472 |
+
return max_len
|
| 473 |
+
""",
|
| 474 |
+
"explanation_keywords": ["left pointer", "stale", "char_index[ch] >= left", "window", "shrink", "old index"],
|
| 475 |
+
"test_cases": [
|
| 476 |
+
{"input": "abcabcbb", "expected": 3},
|
| 477 |
+
{"input": "bbbbb", "expected": 1},
|
| 478 |
+
{"input": "pwwkew", "expected": 3},
|
| 479 |
+
],
|
| 480 |
+
"test_cases_description": "Longest substring without repeating: only update left if char is within current window",
|
| 481 |
+
},
|
| 482 |
+
{
|
| 483 |
+
"task_id": "hard_013",
|
| 484 |
+
"domain": "data processing",
|
| 485 |
+
"instructions": (
|
| 486 |
+
"The function merges overlapping intervals. "
|
| 487 |
+
"Fix it AND explain what was wrong. "
|
| 488 |
+
"Explanation must mention: sort, overlap, merge condition, or end index."
|
| 489 |
+
),
|
| 490 |
+
"buggy_code": """\
|
| 491 |
+
def merge_intervals(intervals):
|
| 492 |
+
if not intervals:
|
| 493 |
+
return []
|
| 494 |
+
intervals.sort(key=lambda x: x[0])
|
| 495 |
+
merged = [intervals[0]]
|
| 496 |
+
for start, end in intervals[1:]:
|
| 497 |
+
if start <= merged[-1][0]:
|
| 498 |
+
merged[-1][1] = max(merged[-1][1], end)
|
| 499 |
+
else:
|
| 500 |
+
merged.append([start, end])
|
| 501 |
+
return merged
|
| 502 |
+
""",
|
| 503 |
+
"fixed_code": """\
|
| 504 |
+
def merge_intervals(intervals):
|
| 505 |
+
if not intervals:
|
| 506 |
+
return []
|
| 507 |
+
intervals.sort(key=lambda x: x[0])
|
| 508 |
+
merged = [intervals[0]]
|
| 509 |
+
for start, end in intervals[1:]:
|
| 510 |
+
if start <= merged[-1][1]:
|
| 511 |
+
merged[-1][1] = max(merged[-1][1], end)
|
| 512 |
+
else:
|
| 513 |
+
merged.append([start, end])
|
| 514 |
+
return merged
|
| 515 |
+
""",
|
| 516 |
+
"explanation_keywords": ["merged[-1][1]", "end", "start", "overlap", "last interval", "index 1 vs 0"],
|
| 517 |
+
"test_cases": [
|
| 518 |
+
{"input": [[1, 3], [2, 6], [8, 10]], "expected": [[1, 6], [8, 10]]},
|
| 519 |
+
{"input": [[1, 4], [4, 5]], "expected": [[1, 5]]},
|
| 520 |
+
{"input": [[1, 2]], "expected": [[1, 2]]},
|
| 521 |
+
],
|
| 522 |
+
"test_cases_description": "Merge intervals: compare start with merged[-1][1] (end), not [0] (start)",
|
| 523 |
+
},
|
| 524 |
+
{
|
| 525 |
+
"task_id": "hard_014",
|
| 526 |
+
"domain": "math",
|
| 527 |
+
"instructions": (
|
| 528 |
+
"The function does integer square root (floor) without using sqrt(). "
|
| 529 |
+
"Fix it AND explain what was wrong. "
|
| 530 |
+
"Explanation must mention: binary search, convergence, mid, or boundary."
|
| 531 |
+
),
|
| 532 |
+
"buggy_code": """\
|
| 533 |
+
def integer_sqrt(n):
|
| 534 |
+
if n < 2:
|
| 535 |
+
return n
|
| 536 |
+
low, high = 1, n
|
| 537 |
+
while low <= high:
|
| 538 |
+
mid = (low + high) // 2
|
| 539 |
+
if mid * mid == n:
|
| 540 |
+
return mid
|
| 541 |
+
elif mid * mid < n:
|
| 542 |
+
low = mid + 1
|
| 543 |
+
else:
|
| 544 |
+
high = mid - 1
|
| 545 |
+
return low
|
| 546 |
+
""",
|
| 547 |
+
"fixed_code": """\
|
| 548 |
+
def integer_sqrt(n):
|
| 549 |
+
if n < 2:
|
| 550 |
+
return n
|
| 551 |
+
low, high = 1, n // 2
|
| 552 |
+
while low <= high:
|
| 553 |
+
mid = (low + high) // 2
|
| 554 |
+
if mid * mid == n:
|
| 555 |
+
return mid
|
| 556 |
+
elif mid * mid < n:
|
| 557 |
+
low = mid + 1
|
| 558 |
+
else:
|
| 559 |
+
high = mid - 1
|
| 560 |
+
return high
|
| 561 |
+
""",
|
| 562 |
+
"explanation_keywords": ["high", "n // 2", "return high", "return low", "floor", "boundary", "last valid"],
|
| 563 |
+
"test_cases": [
|
| 564 |
+
{"input": 16, "expected": 4},
|
| 565 |
+
{"input": 8, "expected": 2},
|
| 566 |
+
{"input": 1, "expected": 1},
|
| 567 |
+
],
|
| 568 |
+
"test_cases_description": "Integer square root: high=n//2, return high (floor result)",
|
| 569 |
+
},
|
| 570 |
+
{
|
| 571 |
+
"task_id": "hard_015",
|
| 572 |
+
"domain": "string algorithm",
|
| 573 |
+
"instructions": (
|
| 574 |
+
"The function implements the Z-algorithm to count pattern occurrences in text. "
|
| 575 |
+
"Fix it AND explain what was wrong. "
|
| 576 |
+
"Explanation must mention: concatenation, Z-array, separator, or index offset."
|
| 577 |
+
),
|
| 578 |
+
"buggy_code": """\
|
| 579 |
+
def count_occurrences(text, pattern):
|
| 580 |
+
concat = pattern + text
|
| 581 |
+
n = len(concat)
|
| 582 |
+
z = [0] * n
|
| 583 |
+
l, r = 0, 0
|
| 584 |
+
for i in range(1, n):
|
| 585 |
+
if i < r:
|
| 586 |
+
z[i] = min(r - i, z[i - l])
|
| 587 |
+
while i + z[i] < n and concat[z[i]] == concat[i + z[i]]:
|
| 588 |
+
z[i] += 1
|
| 589 |
+
if i + z[i] > r:
|
| 590 |
+
l, r = i, i + z[i]
|
| 591 |
+
return sum(1 for i in range(len(pattern), n) if z[i] == len(pattern))
|
| 592 |
+
""",
|
| 593 |
+
"fixed_code": """\
|
| 594 |
+
def count_occurrences(text, pattern):
|
| 595 |
+
concat = pattern + '#' + text
|
| 596 |
+
n = len(concat)
|
| 597 |
+
z = [0] * n
|
| 598 |
+
l, r = 0, 0
|
| 599 |
+
for i in range(1, n):
|
| 600 |
+
if i < r:
|
| 601 |
+
z[i] = min(r - i, z[i - l])
|
| 602 |
+
while i + z[i] < n and concat[z[i]] == concat[i + z[i]]:
|
| 603 |
+
z[i] += 1
|
| 604 |
+
if i + z[i] > r:
|
| 605 |
+
l, r = i, i + z[i]
|
| 606 |
+
p_len = len(pattern)
|
| 607 |
+
return sum(1 for i in range(p_len + 1, n) if z[i] == p_len)
|
| 608 |
+
""",
|
| 609 |
+
"explanation_keywords": ["separator", "#", "without separator", "bleed", "p_len + 1", "offset", "boundary"],
|
| 610 |
+
"test_cases": [
|
| 611 |
+
{"input": ["aabxaabaab", "aab"], "expected": 3},
|
| 612 |
+
{"input": ["hello world", "world"], "expected": 1},
|
| 613 |
+
{"input": ["aaaa", "aa"], "expected": 3},
|
| 614 |
+
],
|
| 615 |
+
"test_cases_description": "Z-algorithm with '#' separator and corrected offset p_len+1",
|
| 616 |
+
},
|
| 617 |
+
]
|
| 618 |
+
|
| 619 |
+
|
| 620 |
+
def get_random_hard_task() -> dict:
    """Return an independent copy of a randomly chosen hard task.

    Returns:
        A deep copy of one entry of ``HARD_TASKS``.

    A deep copy is used instead of ``dict.copy()`` because the task dicts
    hold nested lists (``test_cases`` inputs, ``explanation_keywords``).
    With a shallow copy those lists stay shared with ``HARD_TASKS``, and
    some task solutions in this module (e.g. ``merge_intervals``) sort and
    modify their input in place, which would alter the shared test data.
    """
    import copy  # function-scope import keeps this change self-contained

    return copy.deepcopy(random.choice(HARD_TASKS))
|
| 622 |
+
|
| 623 |
+
|
| 624 |
+
def get_task_by_id(task_id: str) -> dict:
    """Return an independent copy of the hard task whose id is *task_id*.

    Args:
        task_id: Value to match against each task's ``"task_id"`` key.

    Returns:
        A deep copy of the matching entry of ``HARD_TASKS``. If no entry
        matches, a deep copy of a randomly chosen task is returned instead
        (no error is raised for an unknown id).

    A deep copy is used so that nested ``test_cases`` data is never shared
    with ``HARD_TASKS``; a shallow ``dict.copy()`` would let callers that
    modify test inputs in place alter the module-level task list.
    """
    import copy  # function-scope import keeps this change self-contained

    for task in HARD_TASKS:
        if task["task_id"] == task_id:
            return copy.deepcopy(task)
    # Unknown id: fall back to a random task, as the original code did.
    return copy.deepcopy(random.choice(HARD_TASKS))
|
server/tasks/task_medium.py
ADDED
|
@@ -0,0 +1,507 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/tasks/task_medium.py
|
| 2 |
+
# 15 medium tasks: each function has TWO bugs (logic + edge case).
|
| 3 |
+
# Agent must fix both to get full reward.
|
| 4 |
+
|
| 5 |
+
import random
|
| 6 |
+
|
| 7 |
+
# Catalogue of medium-difficulty debugging tasks.
#
# Each entry is a dict with these keys:
#   task_id                 unique id, "medium_001" .. "medium_015"
#   domain                  short topic label
#   instructions            prompt text shown with the buggy code
#   buggy_code              source of the function to be repaired
#   fixed_code              reference solution
#   test_cases              list of {"input": ..., "expected": ...}
#   test_cases_description  one-line summary of what the tests cover
#
# NOTE(review): "input" is a single argument for one-parameter functions and
# a list of positional arguments for multi-parameter ones; presumably the
# grader resolves this from the function's arity - confirm against the grader.
#
# NOTE(review): several tasks say "TWO bugs" although buggy_code differs from
# fixed_code in a single place (medium_003, 008, 010, 011, 012, 014). The
# instruction strings are left unchanged here; decide whether to reword them
# or to add a second bug.
MEDIUM_TASKS = [
    {
        "task_id": "medium_001",
        "domain": "data processing",
        "instructions": (
            "The function should return the average of a list, returning 0.0 for an empty list. "
            "It has TWO bugs. Fix both."
        ),
        "buggy_code": """\
def safe_average(nums):
    if len(nums) == 0:
        return -1
    total = 0
    for n in nums:
        total += n
    return total / len(nums) + 1
""",
        "fixed_code": """\
def safe_average(nums):
    if len(nums) == 0:
        return 0.0
    total = 0
    for n in nums:
        total += n
    return total / len(nums)
""",
        "test_cases": [
            {"input": [2, 4, 6], "expected": 4.0},
            {"input": [], "expected": 0.0},
            {"input": [10], "expected": 10.0},
        ],
        "test_cases_description": "Average of list; empty list returns 0.0, not -1; no +1 added to result",
    },
    {
        "task_id": "medium_002",
        "domain": "string processing",
        "instructions": (
            "The function should count vowels in a string (case-insensitive). "
            "It has TWO bugs. Fix both."
        ),
        "buggy_code": """\
def count_vowels(s):
    vowels = 'aeiou'
    count = 0
    for ch in s:
        if ch in vowels:
            count += 1
    return count + 1
""",
        "fixed_code": """\
def count_vowels(s):
    vowels = 'aeiouAEIOU'
    count = 0
    for ch in s:
        if ch in vowels:
            count += 1
    return count
""",
        "test_cases": [
            {"input": "hello", "expected": 2},
            {"input": "HELLO", "expected": 2},
            {"input": "rhythm", "expected": 0},
        ],
        "test_cases_description": "Counts vowels case-insensitively without off-by-one",
    },
    {
        "task_id": "medium_003",
        "domain": "list operations",
        "instructions": (
            "The function should flatten a list of lists into one list. "
            "It has TWO bugs. Fix both."
        ),
        "buggy_code": """\
def flatten(lists):
    result = []
    for sublist in lists:
        for item in sublist:
            result.append(item)
    return result[1:]
""",
        "fixed_code": """\
def flatten(lists):
    result = []
    for sublist in lists:
        for item in sublist:
            result.append(item)
    return result
""",
        "test_cases": [
            {"input": [[1, 2], [3, 4]], "expected": [1, 2, 3, 4]},
            {"input": [[1]], "expected": [1]},
            {"input": [[], [5, 6]], "expected": [5, 6]},
        ],
        "test_cases_description": "Flattens nested lists correctly without slicing off first element",
    },
    {
        "task_id": "medium_004",
        "domain": "math",
        "instructions": (
            "The function should return the GCD of two numbers. "
            "It has TWO bugs. Fix both."
        ),
        "buggy_code": """\
def gcd(a, b):
    while b != 0:
        a = b
        b = a % b
    return b
""",
        "fixed_code": """\
def gcd(a, b):
    while b != 0:
        a, b = b, a % b
    return a
""",
        "test_cases": [
            {"input": [12, 8], "expected": 4},
            {"input": [100, 75], "expected": 25},
            {"input": [7, 3], "expected": 1},
        ],
        "test_cases_description": "Correct GCD using Euclidean algorithm",
    },
    {
        "task_id": "medium_005",
        "domain": "data processing",
        "instructions": (
            "The function should count frequency of each element in a list and return a dict. "
            "It has TWO bugs. Fix both."
        ),
        "buggy_code": """\
def count_frequency(lst):
    freq = {}
    for item in lst:
        if item in freq:
            freq[item] = 1
        else:
            freq[item] = freq[item] + 1
    return freq
""",
        "fixed_code": """\
def count_frequency(lst):
    freq = {}
    for item in lst:
        if item in freq:
            freq[item] += 1
        else:
            freq[item] = 1
    return freq
""",
        "test_cases": [
            {"input": [1, 2, 2, 3, 3, 3], "expected": {1: 1, 2: 2, 3: 3}},
            {"input": ["a", "b", "a"], "expected": {"a": 2, "b": 1}},
            {"input": [5], "expected": {5: 1}},
        ],
        "test_cases_description": "Correctly counts frequency; swapped if/else logic fixed",
    },
    {
        "task_id": "medium_006",
        "domain": "string processing",
        "instructions": (
            "The function should check if two strings are anagrams (case-insensitive). "
            "It has TWO bugs. Fix both."
        ),
        "buggy_code": """\
def are_anagrams(s1, s2):
    if len(s1) != len(s2):
        return True
    return sorted(s1) == sorted(s2)
""",
        "fixed_code": """\
def are_anagrams(s1, s2):
    if len(s1) != len(s2):
        return False
    return sorted(s1.lower()) == sorted(s2.lower())
""",
        "test_cases": [
            {"input": ["listen", "silent"], "expected": True},
            {"input": ["hello", "world"], "expected": False},
            {"input": ["Listen", "Silent"], "expected": True},
            # Different lengths: exercises the early-return bug, which the
            # three equal-length cases above never reach.
            {"input": ["abc", "ab"], "expected": False},
        ],
        "test_cases_description": "Anagram check with case-insensitivity and correct early-return logic",
    },
    {
        "task_id": "medium_007",
        "domain": "data processing",
        "instructions": (
            "The function should merge two sorted lists into one sorted list. "
            "It has TWO bugs. Fix both."
        ),
        "buggy_code": """\
def merge_sorted(a, b):
    result = []
    i, j = 0, 0
    while i < len(a) and j < len(b):
        if a[i] < b[j]:
            result.append(b[j])
            i += 1
        else:
            result.append(a[i])
            j += 1
    result.extend(a[i:])
    result.extend(b[j:])
    return result
""",
        "fixed_code": """\
def merge_sorted(a, b):
    result = []
    i, j = 0, 0
    while i < len(a) and j < len(b):
        if a[i] < b[j]:
            result.append(a[i])
            i += 1
        else:
            result.append(b[j])
            j += 1
    result.extend(a[i:])
    result.extend(b[j:])
    return result
""",
        "test_cases": [
            {"input": [[1, 3, 5], [2, 4, 6]], "expected": [1, 2, 3, 4, 5, 6]},
            {"input": [[1, 2], [3, 4]], "expected": [1, 2, 3, 4]},
            {"input": [[], [1, 2]], "expected": [1, 2]},
        ],
        "test_cases_description": "Merges two sorted lists correctly",
    },
    {
        "task_id": "medium_008",
        "domain": "API handler",
        "instructions": (
            "The function validates a user registration dict. "
            "It should return True only if 'email' and 'password' are present and password >= 8 chars. "
            "It has TWO bugs. Fix both."
        ),
        "buggy_code": """\
def validate_registration(data):
    if 'email' not in data:
        return False
    if len(data.get('password', '')) > 8:
        return False
    return True
""",
        "fixed_code": """\
def validate_registration(data):
    if 'email' not in data:
        return False
    if len(data.get('password', '')) < 8:
        return False
    return True
""",
        "test_cases": [
            {"input": {"email": "a@b.com", "password": "strongpass"}, "expected": True},
            {"input": {"email": "a@b.com", "password": "short"}, "expected": False},
            {"input": {"password": "strongpass"}, "expected": False},
        ],
        "test_cases_description": "Validates registration with correct password length check",
    },
    {
        "task_id": "medium_009",
        "domain": "math",
        "instructions": (
            "The function should return True if a number is a perfect square. "
            "It has TWO bugs. Fix both."
        ),
        "buggy_code": """\
def is_perfect_square(n):
    if n < 0:
        return True
    root = int(n ** 0.5)
    return root * root != n
""",
        "fixed_code": """\
def is_perfect_square(n):
    if n < 0:
        return False
    root = int(n ** 0.5)
    return root * root == n
""",
        "test_cases": [
            {"input": 16, "expected": True},
            {"input": 15, "expected": False},
            {"input": -4, "expected": False},
        ],
        "test_cases_description": "Correctly identifies perfect squares including negative number check",
    },
    {
        "task_id": "medium_010",
        "domain": "data processing",
        "instructions": (
            "The function should return the top-k most frequent elements in a list. "
            "It has TWO bugs. Fix both."
        ),
        "buggy_code": """\
def top_k_frequent(nums, k):
    freq = {}
    for n in nums:
        freq[n] = freq.get(n, 0) + 1
    sorted_items = sorted(freq.items(), key=lambda x: x[1])
    return [item[0] for item in sorted_items[:k]]
""",
        "fixed_code": """\
def top_k_frequent(nums, k):
    freq = {}
    for n in nums:
        freq[n] = freq.get(n, 0) + 1
    sorted_items = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    return [item[0] for item in sorted_items[:k]]
""",
        "test_cases": [
            {"input": [[1, 1, 1, 2, 2, 3], 2], "expected": [1, 2]},
            {"input": [[4, 4, 5, 5, 5], 1], "expected": [5]},
            {"input": [[1, 2, 3], 3], "expected": [1, 2, 3]},
        ],
        "test_cases_description": "Returns top-k frequent elements in descending frequency order",
    },
    {
        "task_id": "medium_011",
        "domain": "string processing",
        "instructions": (
            "The function should return the longest common prefix of a list of strings. "
            "It has TWO bugs. Fix both."
        ),
        "buggy_code": """\
def longest_common_prefix(strs):
    if not strs:
        return ''
    prefix = strs[1]
    for s in strs:
        while not s.startswith(prefix):
            prefix = prefix[:-1]
            if not prefix:
                return ''
    return prefix
""",
        "fixed_code": """\
def longest_common_prefix(strs):
    if not strs:
        return ''
    prefix = strs[0]
    for s in strs:
        while not s.startswith(prefix):
            prefix = prefix[:-1]
            if not prefix:
                return ''
    return prefix
""",
        # NOTE(review): buggy_code passes all three cases below, because the
        # loop visits every string and so starting from strs[1] gives the same
        # result for lists of two or more items. Only a single-element list
        # (IndexError) exposes the bug; add one once the grader's handling of
        # one-item "input" lists is confirmed.
        "test_cases": [
            {"input": ["flower", "flow", "flight"], "expected": "fl"},
            {"input": ["dog", "racecar", "car"], "expected": ""},
            {"input": ["interview", "interact", "interface"], "expected": "inter"},
        ],
        "test_cases_description": "Correct longest common prefix starting from index 0",
    },
    {
        "task_id": "medium_012",
        "domain": "list operations",
        "instructions": (
            "The function should rotate a list to the right by k positions. "
            "It has TWO bugs. Fix both."
        ),
        "buggy_code": """\
def rotate_right(lst, k):
    if not lst:
        return lst
    k = k % len(lst)
    return lst[k:] + lst[:k]
""",
        "fixed_code": """\
def rotate_right(lst, k):
    if not lst:
        return lst
    k = k % len(lst)
    return lst[-k:] + lst[:-k]
""",
        "test_cases": [
            {"input": [[1, 2, 3, 4, 5], 2], "expected": [4, 5, 1, 2, 3]},
            {"input": [[1, 2, 3], 1], "expected": [3, 1, 2]},
            {"input": [[], 3], "expected": []},
        ],
        "test_cases_description": "Rotates list to the right correctly",
    },
    {
        "task_id": "medium_013",
        "domain": "API handler",
        "instructions": (
            "The function parses a query string into a dict. "
            "It has TWO bugs. Fix both."
        ),
        "buggy_code": """\
def parse_query_string(query):
    if not query:
        return None
    result = {}
    for pair in query.split('&'):
        if '=' in pair:
            key, value = pair.split('=')
            result[value] = key
    return result
""",
        "fixed_code": """\
def parse_query_string(query):
    if not query:
        return {}
    result = {}
    for pair in query.split('&'):
        if '=' in pair:
            key, value = pair.split('=', 1)
            result[key] = value
    return result
""",
        "test_cases": [
            {"input": "name=Alice&age=30", "expected": {"name": "Alice", "age": "30"}},
            {"input": "", "expected": {}},
            {"input": "key=value=extra", "expected": {"key": "value=extra"}},
        ],
        "test_cases_description": "Parses query string; empty returns {}; key=value order correct; split on first = only",
    },
    {
        "task_id": "medium_014",
        "domain": "data processing",
        "instructions": (
            "The function should return all pairs of numbers in a list that sum to target. "
            "It has TWO bugs. Fix both."
        ),
        "buggy_code": """\
def find_pairs(nums, target):
    pairs = []
    seen = set()
    for n in nums:
        complement = target + n
        if complement in seen:
            pairs.append((complement, n))
        seen.add(n)
    return pairs
""",
        "fixed_code": """\
def find_pairs(nums, target):
    pairs = []
    seen = set()
    for n in nums:
        complement = target - n
        if complement in seen:
            pairs.append((complement, n))
        seen.add(n)
    return pairs
""",
        "test_cases": [
            {"input": [[2, 7, 11, 15], 9], "expected": [(2, 7)]},
            # Pairs are emitted when their second element is reached, so the
            # reference solution yields (2, 3) before (1, 4).
            {"input": [[1, 2, 3, 4], 5], "expected": [(2, 3), (1, 4)]},
            {"input": [[1, 2], 10], "expected": []},
        ],
        "test_cases_description": "Finds all pairs summing to target using complement = target - n",
    },
    {
        "task_id": "medium_015",
        "domain": "math",
        "instructions": (
            "The function should return the nth Fibonacci number (0-indexed). "
            "It has TWO bugs. Fix both."
        ),
        "buggy_code": """\
def fibonacci(n):
    if n == 0:
        return 1
    if n == 1:
        return 1
    a, b = 0, 1
    for _ in range(2, n):
        a, b = b, a + b
    return b
""",
        "fixed_code": """\
def fibonacci(n):
    if n == 0:
        return 0
    if n == 1:
        return 1
    a, b = 0, 1
    for _ in range(2, n + 1):
        a, b = b, a + b
    return b
""",
        "test_cases": [
            {"input": 0, "expected": 0},
            {"input": 1, "expected": 1},
            {"input": 6, "expected": 8},
        ],
        "test_cases_description": "Correct Fibonacci: fib(0)=0, fib(1)=1, fib(6)=8",
    },
]
|
| 497 |
+
|
| 498 |
+
|
| 499 |
+
def get_random_medium_task() -> dict:
    """Return an independent copy of a randomly chosen medium task.

    Returns:
        A deep copy of one entry of ``MEDIUM_TASKS``.

    A deep copy is used instead of ``dict.copy()`` because each task dict
    holds a nested ``test_cases`` list whose inputs are themselves lists or
    dicts. A shallow copy would leave those objects shared with
    ``MEDIUM_TASKS``, so any in-place change by a caller would alter the
    module-level task list.
    """
    import copy  # function-scope import keeps this change self-contained

    return copy.deepcopy(random.choice(MEDIUM_TASKS))
|
| 501 |
+
|
| 502 |
+
|
| 503 |
+
def get_task_by_id(task_id: str) -> dict:
    """Return an independent copy of the medium task whose id is *task_id*.

    Args:
        task_id: Value to match against each task's ``"task_id"`` key.

    Returns:
        A deep copy of the matching entry of ``MEDIUM_TASKS``. If no entry
        matches, a deep copy of a randomly chosen task is returned instead
        (no error is raised for an unknown id).

    A deep copy is used so that nested ``test_cases`` data is never shared
    with ``MEDIUM_TASKS``; a shallow ``dict.copy()`` would let in-place
    changes by a caller alter the module-level task list.
    """
    import copy  # function-scope import keeps this change self-contained

    for task in MEDIUM_TASKS:
        if task["task_id"] == task_id:
            return copy.deepcopy(task)
    # Unknown id: fall back to a random task, as the original code did.
    return copy.deepcopy(random.choice(MEDIUM_TASKS))
|
validator/__pycache__/pre_submit_check.cpython-39.pyc
ADDED
|
Binary file (5.48 kB). View file
|
|
|
validator/pre_submit_check.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# validator/pre_submit_check.py
|
| 3 |
+
# Run this BEFORE submitting to catch any disqualifying issues.
|
| 4 |
+
#
|
| 5 |
+
# Usage:
|
| 6 |
+
# python validator/pre_submit_check.py
|
| 7 |
+
# python validator/pre_submit_check.py --url https://your-space.hf.space
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
import json
|
| 12 |
+
import argparse
|
| 13 |
+
import requests
|
| 14 |
+
|
| 15 |
+
# Status markers printed in front of each check line. Written as escapes so
# the glyphs survive re-encoding of this file.
PASS = "\u2705"  # white heavy check mark
FAIL = "\u274c"  # cross mark
WARN = "\u26a0\ufe0f"  # warning sign, emoji presentation

# One {"check", "passed", "detail"} record is appended here per check() call.
results = []
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def check(name: str, passed: bool, detail: str = ""):
    """Record the outcome of one validation check and print a status line.

    Args:
        name: Human-readable label of the check.
        passed: Whether the check succeeded.
        detail: Optional extra text appended after the label.

    Returns:
        ``passed``, unchanged, so the call can be used inline.
    """
    results.append({"check": name, "passed": passed, "detail": detail})
    marker = FAIL if not passed else PASS
    # The ": detail" suffix is only shown when detail is non-empty.
    suffix = f": {detail}" if detail else ""
    print(f" {marker} {name}" + suffix)
    return passed
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _project_root() -> str:
    """Return the directory assumed to contain the project files.

    The header of this script places it at ``validator/pre_submit_check.py``,
    so the project root is taken to be the parent of this file's directory.
    """
    return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


def _locate(rel_path: str) -> str:
    """Resolve *rel_path* against the CWD first, then against the project root.

    Checking the CWD first keeps the original behaviour for anyone who runs
    the script from the project root; the fallback makes it work from any
    other directory as well.
    """
    if os.path.exists(rel_path):
        return rel_path
    return os.path.join(_project_root(), rel_path)


def _reward_in_unit_interval(value) -> bool:
    """Return True when *value* is an int/float inside [0.0, 1.0].

    Non-numeric values (``None``, strings, ...) yield False instead of
    raising ``TypeError`` from the range comparison.
    """
    return isinstance(value, (int, float)) and 0.0 <= value <= 1.0


def run_checks(base_url: str):
    """Run all pre-submission checks against a running environment.

    Exercises the ``/health``, ``/reset``, ``/step`` and ``/state`` HTTP
    endpoints under *base_url*, verifies that the expected project files are
    present, and scans ``inference.py`` for the required log markers. Every
    outcome is reported through ``check()`` and a summary is printed at the
    end.

    Args:
        base_url: Base URL of the environment, without a trailing slash
            (endpoint paths are appended as ``f"{base_url}/..."``).

    Returns:
        True when every check executed by this call passed, False otherwise.
    """
    # ``results`` is module-global; remember where this run starts so the
    # summary is not polluted by checks recorded during an earlier call.
    first_result = len(results)

    banner = "=" * 60
    print(f"\n{banner}")
    print(" Code Debug Environment — Pre-Submission Validator")
    print(f" Target: {base_url}")
    print(f"{banner}\n")

    # Each section below catches ``Exception`` on purpose: any failure
    # (connection error, invalid JSON, missing key, ...) must be reported as
    # a failed check rather than abort the remaining checks.

    # ── 1. Health check ───────────────────────────────────────────
    print("[ CHECK 1 ] Health endpoint")
    try:
        r = requests.get(f"{base_url}/health", timeout=10)
        passed = r.status_code == 200 and r.json().get("status") == "ok"
        check("GET /health returns 200 with status=ok", passed, f"HTTP {r.status_code}")
    except Exception as e:
        check("GET /health", False, str(e))

    # ── 2. Reset responds ─────────────────────────────────────────
    print("\n[ CHECK 2 ] POST /reset")
    for difficulty in ["easy", "medium", "hard"]:
        try:
            r = requests.post(f"{base_url}/reset", json={"difficulty": difficulty}, timeout=15)
            data = r.json()
            obs = data.get("observation", {})
            has_fields = all(k in obs for k in ["task_id", "difficulty", "buggy_code", "instructions"])
            passed = r.status_code == 200 and has_fields
            check(f"reset(difficulty='{difficulty}') returns valid observation", passed,
                  f"task_id={obs.get('task_id', 'MISSING')}")
        except Exception as e:
            check(f"reset(difficulty='{difficulty}')", False, str(e))

    # ── 3. Step responds ──────────────────────────────────────────
    print("\n[ CHECK 3 ] POST /step")
    try:
        # Reset first to get a fresh task
        r = requests.post(f"{base_url}/reset", json={"difficulty": "easy"}, timeout=15)
        buggy_code = r.json()["observation"]["buggy_code"]

        # Submit the buggy code as-is (reward may be 0, that's fine)
        r = requests.post(f"{base_url}/step", json={"fixed_code": buggy_code}, timeout=15)
        data = r.json()
        # A missing or non-numeric reward simply fails the check; it must not
        # raise from the range comparison and hide the real problem.
        reward_ok = _reward_in_unit_interval(data.get("reward"))
        has_done = isinstance(data.get("done"), bool)
        passed = r.status_code == 200 and reward_ok and has_done
        check("step() returns reward in [0.0, 1.0] and done flag", passed,
              f"reward={data.get('reward')}, done={data.get('done')}")
    except Exception as e:
        check("POST /step", False, str(e))

    # ── 4. State responds ─────────────────────────────────────────
    print("\n[ CHECK 4 ] GET /state")
    try:
        r = requests.get(f"{base_url}/state", timeout=10)
        data = r.json()
        has_fields = all(k in data for k in ["episode_id", "step_count", "difficulty"])
        passed = r.status_code == 200 and has_fields
        check("GET /state returns episode_id, step_count, difficulty", passed)
    except Exception as e:
        check("GET /state", False, str(e))

    # ── 5. 3 difficulties all work ────────────────────────────────
    print("\n[ CHECK 5 ] All 3 task difficulties functional")
    for difficulty in ["easy", "medium", "hard"]:
        try:
            r = requests.post(f"{base_url}/reset", json={"difficulty": difficulty}, timeout=15)
            obs = r.json()["observation"]
            passed = obs.get("difficulty") == difficulty
            check(f"difficulty='{difficulty}' task loads correctly",
                  passed, f"got difficulty={obs.get('difficulty')}")
        except Exception as e:
            check(f"difficulty='{difficulty}'", False, str(e))

    # ── 6. Reward range on perfect answer ─────────────────────────
    print("\n[ CHECK 6 ] Reward range validation (correct fix)")
    try:
        # When this file is run as a script, sys.path[0] is the validator/
        # directory, so the top-level ``server`` package is not importable
        # unless the project root is added explicitly.
        root = _project_root()
        if root not in sys.path:
            sys.path.insert(0, root)
        from server.tasks.task_easy import EASY_TASKS

        task = EASY_TASKS[0]
        # NOTE(review): the reset request only names a difficulty, so the
        # server may serve a task other than EASY_TASKS[0] -- confirm. The
        # check below only validates the reward *range*, which holds either way.
        r = requests.post(f"{base_url}/reset", json={"difficulty": "easy"}, timeout=15)
        # Submit the known correct fix
        r = requests.post(f"{base_url}/step",
                          json={"fixed_code": task["fixed_code"]}, timeout=15)
        data = r.json()
        reward = data.get("reward", -1)
        passed = _reward_in_unit_interval(reward)
        check("Submitting correct fix yields reward in [0.0, 1.0]", passed,
              f"reward={reward}")
    except Exception as e:
        check("Reward range check", False, str(e))

    # ── 7. openenv.yaml exists ────────────────────────────────────
    print("\n[ CHECK 7 ] Project structure")
    required_files = [
        "openenv.yaml",
        "inference.py",
        "models.py",
        "server/app.py",
        "server/environment.py",
        "server/Dockerfile",
        "server/requirements.txt",
        "pyproject.toml",
        "README.md",
    ]
    for fname in required_files:
        check(f"File exists: {fname}", os.path.exists(_locate(fname)))

    # ── 8. inference.py has required log format ───────────────────
    print("\n[ CHECK 8 ] inference.py log format")
    try:
        # Explicit encoding: the platform default is not guaranteed to be
        # UTF-8 and could fail on non-ASCII source text.
        with open(_locate("inference.py"), encoding="utf-8") as src:
            content = src.read()
        check("inference.py emits [START] logs", '"type": "START"' in content)
        check("inference.py emits [STEP] logs", '"type": "STEP"' in content)
        check("inference.py emits [END] logs", '"type": "END"' in content)
    except Exception as e:
        check("inference.py log format", False, str(e))

    # ── Final summary ─────────────────────────────────────────────
    run_results = results[first_result:]
    failed = [entry["check"] for entry in run_results if not entry["passed"]]
    total = len(run_results)
    passed_count = total - len(failed)
    # Every failure path above records a failed check, so "no failed
    # entries" is exactly "all checks passed".
    all_passed = not failed

    print(f"\n{banner}")
    print(f" Results: {passed_count}/{total} checks passed")
    if all_passed:
        print(f" {PASS} ALL CHECKS PASSED — you are safe to submit!")
    else:
        print(f" {FAIL} FAILED CHECKS — fix these before submitting:")
        for label in failed:
            print(f" • {label}")
    print(f"{banner}\n")

    return all_passed
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
if __name__ == "__main__":
    # Command-line entry point: validate the environment at --url and turn
    # the outcome into the process exit status (0 = all checks passed).
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--url",
        default="http://localhost:7860",
        help="Base URL of the running environment",
    )
    options = cli.parse_args()

    # Drop trailing slashes so endpoint paths can be appended as "/name".
    target = options.url.rstrip("/")
    exit_code = 0 if run_checks(target) else 1
    sys.exit(exit_code)
|