Spaces:
Running
Running
Commit ·
d510c1d
1
Parent(s): 8ac3859
error fixing
Browse files- .gitignore +1 -0
- README.md +115 -86
- inference.py +94 -77
- models.py +21 -43
- openenv.yaml +20 -21
- py +0 -0
- pyproject.toml +26 -0
- server/environment.py +26 -40
- server/graders/__pycache__/grader_easy.cpython-310.pyc +0 -0
- server/graders/__pycache__/grader_hard.cpython-310.pyc +0 -0
- server/graders/__pycache__/grader_medium.cpython-310.pyc +0 -0
- server/graders/grader_hard.py +51 -34
.gitignore
CHANGED
|
@@ -2,3 +2,4 @@ __pycache__/
|
|
| 2 |
.vscode/
|
| 3 |
__pycache__/
|
| 4 |
.vscode/
|
|
|
|
|
|
| 2 |
.vscode/
|
| 3 |
__pycache__/
|
| 4 |
.vscode/
|
| 5 |
+
.env
|
README.md
CHANGED
|
@@ -1,31 +1,19 @@
|
|
| 1 |
-
---
|
| 2 |
-
|
| 3 |
-
title: Code Debug Environment
|
| 4 |
-
emoji: 🐍
|
| 5 |
-
colorFrom: blue
|
| 6 |
-
colorTo: green
|
| 7 |
-
sdk: docker
|
| 8 |
-
sdk_version: "1.0"
|
| 9 |
-
app_file: server/app.py
|
| 10 |
-
pinned: false
|
| 11 |
-
---
|
| 12 |
-
|
| 13 |
# Code Debug Environment
|
| 14 |
|
| 15 |
-
An OpenEnv-compatible RL environment where an LLM agent diagnoses and fixes buggy Python code across three difficulty levels.
|
| 16 |
|
| 17 |
---
|
| 18 |
|
| 19 |
## Overview
|
| 20 |
|
| 21 |
-
| Property
|
| 22 |
-
|
|
| 23 |
-
| Domain
|
| 24 |
-
| Tasks
|
| 25 |
-
| Difficulties
|
| 26 |
-
| Reward Range
|
| 27 |
-
| Max Steps/Episode | 3
|
| 28 |
-
| API
|
| 29 |
|
| 30 |
---
|
| 31 |
|
|
@@ -33,156 +21,196 @@ An OpenEnv-compatible RL environment where an LLM agent diagnoses and fixes bugg
|
|
| 33 |
|
| 34 |
The agent receives a buggy Python function and must fix it. Tasks come from real-world domains: data processing, string algorithms, API validation, sorting, dynamic programming, and graph algorithms.
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
|
| 40 |
---
|
| 41 |
|
| 42 |
## Action Space
|
| 43 |
|
|
|
|
| 44 |
{
|
| 45 |
-
"fixed_code": "string — the corrected Python function (required)",
|
| 46 |
-
"explanation": "string — explanation of what was wrong (required for hard tasks)"
|
| 47 |
}
|
|
|
|
| 48 |
|
| 49 |
-
| Field
|
| 50 |
-
|
|
| 51 |
-
| fixed_code
|
| 52 |
-
| explanation | str
|
| 53 |
|
| 54 |
---
|
| 55 |
|
| 56 |
## Observation Space
|
| 57 |
|
| 58 |
-
Returned by /reset and /step:
|
| 59 |
|
|
|
|
| 60 |
{
|
| 61 |
-
"task_id": "easy_003",
|
| 62 |
-
"difficulty": "easy",
|
| 63 |
-
"buggy_code": "def find_max(nums):\n ...",
|
| 64 |
-
"instructions": "The function has exactly one bug. Fix it.",
|
| 65 |
-
"test_cases_description": "Finds max value in a list without IndexError",
|
| 66 |
-
"reward": 0.67,
|
| 67 |
-
"passed_tests": 2,
|
| 68 |
-
"total_tests": 3,
|
| 69 |
-
"feedback": "Test 1: ✅ ...\nTest 2: ✅ ...\nTest 3: ❌ ...",
|
| 70 |
-
"done": false
|
| 71 |
}
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
| 75 |
-
|
|
| 76 |
-
|
|
| 77 |
-
|
|
| 78 |
-
|
|
| 79 |
-
|
|
| 80 |
-
|
|
| 81 |
-
|
|
| 82 |
-
|
|
| 83 |
-
|
|
| 84 |
-
|
|
|
|
|
| 85 |
|
| 86 |
---
|
| 87 |
|
| 88 |
## Reward Function
|
| 89 |
|
| 90 |
-
Easy & Medium
|
|
|
|
| 91 |
reward = passed_tests / total_tests
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
reward = 0.7 × test_score + 0.3 × explanation_score
|
|
|
|
|
|
|
| 95 |
|
| 96 |
---
|
| 97 |
|
| 98 |
## Setup & Local Run
|
| 99 |
|
| 100 |
-
Prerequisites
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
* Hugging Face CLI
|
| 105 |
-
|
| 106 |
-
Install
|
| 107 |
git clone https://github.com/YOUR_USERNAME/code-debug-env
|
| 108 |
cd code-debug-env
|
| 109 |
pip install -e .
|
| 110 |
-
|
| 111 |
git clone https://github.com/meta-pytorch/OpenEnv.git
|
| 112 |
export PYTHONPATH=$PYTHONPATH:OpenEnv:OpenEnv/src:.
|
|
|
|
| 113 |
|
| 114 |
-
Run locally
|
|
|
|
| 115 |
uvicorn server.app:app --host 0.0.0.0 --port 7860 --reload
|
|
|
|
| 116 |
|
| 117 |
-
Run with Docker
|
|
|
|
| 118 |
docker build -f server/Dockerfile -t code-debug-env .
|
| 119 |
docker run -p 7860:7860 code-debug-env
|
|
|
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
#
|
| 124 |
-
|
| 125 |
curl http://localhost:7860/health
|
| 126 |
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
| 130 |
|
|
|
|
| 131 |
curl http://localhost:7860/state
|
|
|
|
| 132 |
|
| 133 |
---
|
| 134 |
|
| 135 |
## Run Baseline Inference
|
| 136 |
|
|
|
|
| 137 |
export API_BASE_URL="https://api.openai.com/v1"
|
| 138 |
export MODEL_NAME="gpt-4o-mini"
|
| 139 |
export HF_TOKEN="your-api-key"
|
| 140 |
|
|
|
|
| 141 |
python inference.py --url http://localhost:7860
|
|
|
|
|
|
|
| 142 |
python inference.py --url http://localhost:7860 --difficulty hard
|
|
|
|
| 143 |
|
| 144 |
---
|
| 145 |
|
| 146 |
## Pre-Submission Validation
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
python validator/pre_submit_check.py --url http://localhost:7860
|
|
|
|
|
|
|
| 149 |
python validator/pre_submit_check.py --url https://YOUR_SPACE.hf.space
|
|
|
|
| 150 |
|
| 151 |
---
|
| 152 |
|
| 153 |
## Deploy to Hugging Face Spaces
|
| 154 |
|
|
|
|
|
|
|
| 155 |
huggingface-cli login
|
| 156 |
|
|
|
|
| 157 |
huggingface-cli repo create code-debug-env --type space --space_sdk docker
|
| 158 |
-
|
| 159 |
cd code-debug-env
|
| 160 |
git init
|
| 161 |
git remote add origin https://huggingface.co/spaces/YOUR_USERNAME/code-debug-env
|
| 162 |
-
|
| 163 |
git add .
|
| 164 |
git commit -m "Initial commit"
|
| 165 |
git push origin main
|
|
|
|
| 166 |
|
| 167 |
---
|
| 168 |
|
| 169 |
## Project Structure
|
| 170 |
|
|
|
|
| 171 |
code-debug-env/
|
| 172 |
-
├── openenv.yaml
|
| 173 |
-
├── inference.py
|
| 174 |
-
├── pyproject.toml
|
| 175 |
├── README.md
|
| 176 |
-
├── models.py
|
| 177 |
-
├── client.py
|
| 178 |
-
├──
|
| 179 |
├── server/
|
| 180 |
-
│ ├── app.py
|
| 181 |
-
│ ├── environment.py
|
| 182 |
│ ├── tasks/
|
| 183 |
-
│ │ ├── task_easy.py
|
| 184 |
-
│ │ ├── task_medium.py
|
| 185 |
-
│ │ └── task_hard.py
|
| 186 |
│ ├── graders/
|
| 187 |
│ │ ├── grader_easy.py
|
| 188 |
│ │ ├── grader_medium.py
|
|
@@ -190,4 +218,5 @@ code-debug-env/
|
|
| 190 |
│ ├── requirements.txt
|
| 191 |
│ └── Dockerfile
|
| 192 |
└── validator/
|
| 193 |
-
└── pre_submit_check.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Code Debug Environment
|
| 2 |
|
| 3 |
+
An [OpenEnv](https://github.com/meta-pytorch/OpenEnv)-compatible RL environment where an LLM agent diagnoses and fixes buggy Python code across three difficulty levels.
|
| 4 |
|
| 5 |
---
|
| 6 |
|
| 7 |
## Overview
|
| 8 |
|
| 9 |
+
| Property | Value |
|
| 10 |
+
|---|---|
|
| 11 |
+
| Domain | Real-world Python code debugging |
|
| 12 |
+
| Tasks | 45 total (15 easy + 15 medium + 15 hard) |
|
| 13 |
+
| Difficulties | easy → medium → hard |
|
| 14 |
+
| Reward Range | 0.0 – 1.0 (partial, proportional) |
|
| 15 |
+
| Max Steps/Episode | 3 |
|
| 16 |
+
| API | OpenEnv standard: `/reset`, `/step`, `/state` |
|
| 17 |
|
| 18 |
---
|
| 19 |
|
|
|
|
| 21 |
|
| 22 |
The agent receives a buggy Python function and must fix it. Tasks come from real-world domains: data processing, string algorithms, API validation, sorting, dynamic programming, and graph algorithms.
|
| 23 |
|
| 24 |
+
- **Easy**: One bug (wrong operator, off-by-one, incorrect return). Reward proportional to test pass rate.
|
| 25 |
+
- **Medium**: Two bugs (logic bug + edge case). Reward proportional to test pass rate.
|
| 26 |
+
- **Hard**: One algorithmic bug + agent must explain what was wrong. Reward = 0.7 × test score + 0.3 × explanation quality.
|
| 27 |
|
| 28 |
---
|
| 29 |
|
| 30 |
## Action Space
|
| 31 |
|
| 32 |
+
```json
|
| 33 |
{
|
| 34 |
+
"fixed_code": "string — the corrected Python function (required)",
|
| 35 |
+
"explanation": "string — explanation of what was wrong (required for hard tasks)"
|
| 36 |
}
|
| 37 |
+
```
|
| 38 |
|
| 39 |
+
| Field | Type | Required | Description |
|
| 40 |
+
|---|---|---|---|
|
| 41 |
+
| `fixed_code` | `str` | Always | Complete corrected Python function as a string |
|
| 42 |
+
| `explanation` | `str` | Hard tasks | Describe the bug and why your fix is correct |
|
| 43 |
|
| 44 |
---
|
| 45 |
|
| 46 |
## Observation Space
|
| 47 |
|
| 48 |
+
Returned by `/reset` and `/step`:
|
| 49 |
|
| 50 |
+
```json
|
| 51 |
{
|
| 52 |
+
"task_id": "easy_003",
|
| 53 |
+
"difficulty": "easy",
|
| 54 |
+
"buggy_code": "def find_max(nums):\n ...",
|
| 55 |
+
"instructions": "The function has exactly one bug. Fix it.",
|
| 56 |
+
"test_cases_description": "Finds max value in a list without IndexError",
|
| 57 |
+
"reward": 0.67,
|
| 58 |
+
"passed_tests": 2,
|
| 59 |
+
"total_tests": 3,
|
| 60 |
+
"feedback": "Test 1: ✅ ...\nTest 2: ✅ ...\nTest 3: ❌ ...",
|
| 61 |
+
"done": false
|
| 62 |
}
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
| Field | Type | Description |
|
| 66 |
+
|---|---|---|
|
| 67 |
+
| `task_id` | `str` | Unique task identifier |
|
| 68 |
+
| `difficulty` | `str` | `easy` / `medium` / `hard` |
|
| 69 |
+
| `buggy_code` | `str` | Buggy Python function to fix |
|
| 70 |
+
| `instructions` | `str` | Task instructions |
|
| 71 |
+
| `test_cases_description` | `str` | What the test cases check |
|
| 72 |
+
| `reward` | `float\|null` | Score from last step (null on reset) |
|
| 73 |
+
| `passed_tests` | `int\|null` | Tests passed (null on reset) |
|
| 74 |
+
| `total_tests` | `int` | Total number of test cases |
|
| 75 |
+
| `feedback` | `str\|null` | Detailed per-test feedback |
|
| 76 |
+
| `done` | `bool` | True when episode is complete |
|
| 77 |
|
| 78 |
---
|
| 79 |
|
| 80 |
## Reward Function
|
| 81 |
|
| 82 |
+
### Easy & Medium
|
| 83 |
+
```
|
| 84 |
reward = passed_tests / total_tests
|
| 85 |
+
```
|
| 86 |
+
- 3/3 tests → 1.0
|
| 87 |
+
- 2/3 tests → 0.67
|
| 88 |
+
- 1/3 tests → 0.33
|
| 89 |
+
- 0/3 tests → 0.0
|
| 90 |
+
|
| 91 |
+
### Hard
|
| 92 |
+
```
|
| 93 |
reward = 0.7 × test_score + 0.3 × explanation_score
|
| 94 |
+
```
|
| 95 |
+
Explanation is scored by matching key algorithmic concepts. Partial credit is given.
|
| 96 |
|
| 97 |
---
|
| 98 |
|
| 99 |
## Setup & Local Run
|
| 100 |
|
| 101 |
+
### Prerequisites
|
| 102 |
+
- Python 3.10+
|
| 103 |
+
- Docker
|
| 104 |
+
- Hugging Face CLI
|
| 105 |
|
| 106 |
+
### Install
|
| 107 |
+
```bash
|
|
|
|
|
|
|
|
|
|
| 108 |
git clone https://github.com/YOUR_USERNAME/code-debug-env
|
| 109 |
cd code-debug-env
|
| 110 |
pip install -e .
|
| 111 |
+
# Also clone OpenEnv for PYTHONPATH
|
| 112 |
git clone https://github.com/meta-pytorch/OpenEnv.git
|
| 113 |
export PYTHONPATH=$PYTHONPATH:OpenEnv:OpenEnv/src:.
|
| 114 |
+
```
|
| 115 |
|
| 116 |
+
### Run locally
|
| 117 |
+
```bash
|
| 118 |
uvicorn server.app:app --host 0.0.0.0 --port 7860 --reload
|
| 119 |
+
```
|
| 120 |
|
| 121 |
+
### Run with Docker
|
| 122 |
+
```bash
|
| 123 |
docker build -f server/Dockerfile -t code-debug-env .
|
| 124 |
docker run -p 7860:7860 code-debug-env
|
| 125 |
+
```
|
| 126 |
|
| 127 |
+
### Test the API
|
| 128 |
+
```bash
|
| 129 |
+
# Health check
|
|
|
|
| 130 |
curl http://localhost:7860/health
|
| 131 |
|
| 132 |
+
# Reset (easy task)
|
| 133 |
+
curl -X POST http://localhost:7860/reset \
|
| 134 |
+
-H "Content-Type: application/json" \
|
| 135 |
+
-d '{"difficulty": "easy"}'
|
| 136 |
|
| 137 |
+
# Submit a fix
|
| 138 |
+
curl -X POST http://localhost:7860/step \
|
| 139 |
+
-H "Content-Type: application/json" \
|
| 140 |
+
-d '{"fixed_code": "def find_max(nums):\n return max(nums)"}'
|
| 141 |
|
| 142 |
+
# Check state
|
| 143 |
curl http://localhost:7860/state
|
| 144 |
+
```
|
| 145 |
|
| 146 |
---
|
| 147 |
|
| 148 |
## Run Baseline Inference
|
| 149 |
|
| 150 |
+
```bash
|
| 151 |
export API_BASE_URL="https://api.openai.com/v1"
|
| 152 |
export MODEL_NAME="gpt-4o-mini"
|
| 153 |
export HF_TOKEN="your-api-key"
|
| 154 |
|
| 155 |
+
# Run all 3 difficulties
|
| 156 |
python inference.py --url http://localhost:7860
|
| 157 |
+
|
| 158 |
+
# Run specific difficulty
|
| 159 |
python inference.py --url http://localhost:7860 --difficulty hard
|
| 160 |
+
```
|
| 161 |
|
| 162 |
---
|
| 163 |
|
| 164 |
## Pre-Submission Validation
|
| 165 |
|
| 166 |
+
Run before submitting to catch any disqualifying issues:
|
| 167 |
+
|
| 168 |
+
```bash
|
| 169 |
+
# Start the environment first, then:
|
| 170 |
python validator/pre_submit_check.py --url http://localhost:7860
|
| 171 |
+
|
| 172 |
+
# Or against your HF Space:
|
| 173 |
python validator/pre_submit_check.py --url https://YOUR_SPACE.hf.space
|
| 174 |
+
```
|
| 175 |
|
| 176 |
---
|
| 177 |
|
| 178 |
## Deploy to Hugging Face Spaces
|
| 179 |
|
| 180 |
+
```bash
|
| 181 |
+
# Login
|
| 182 |
huggingface-cli login
|
| 183 |
|
| 184 |
+
# Create space and push
|
| 185 |
huggingface-cli repo create code-debug-env --type space --space_sdk docker
|
|
|
|
| 186 |
cd code-debug-env
|
| 187 |
git init
|
| 188 |
git remote add origin https://huggingface.co/spaces/YOUR_USERNAME/code-debug-env
|
|
|
|
| 189 |
git add .
|
| 190 |
git commit -m "Initial commit"
|
| 191 |
git push origin main
|
| 192 |
+
```
|
| 193 |
|
| 194 |
---
|
| 195 |
|
| 196 |
## Project Structure
|
| 197 |
|
| 198 |
+
```
|
| 199 |
code-debug-env/
|
| 200 |
+
├── openenv.yaml ← OpenEnv manifest
|
| 201 |
+
├── inference.py ← Baseline agent (root, required)
|
| 202 |
+
├── pyproject.toml ← Dependencies
|
| 203 |
├── README.md
|
| 204 |
+
├── models.py ← Pydantic Action/Observation/State
|
| 205 |
+
├── client.py ← EnvClient for training loops
|
| 206 |
+
├── __init__.py
|
| 207 |
├── server/
|
| 208 |
+
│ ├── app.py ← FastAPI: /reset /step /state /health
|
| 209 |
+
│ ├── environment.py ← Core episode logic
|
| 210 |
│ ├── tasks/
|
| 211 |
+
│ │ ├── task_easy.py ← 15 single-bug tasks
|
| 212 |
+
│ │ ├── task_medium.py← 15 two-bug tasks
|
| 213 |
+
│ │ └── task_hard.py ← 15 algorithmic tasks
|
| 214 |
│ ├── graders/
|
| 215 |
│ │ ├── grader_easy.py
|
| 216 |
│ │ ├── grader_medium.py
|
|
|
|
| 218 |
│ ├── requirements.txt
|
| 219 |
│ └── Dockerfile
|
| 220 |
└── validator/
|
| 221 |
+
└── pre_submit_check.py
|
| 222 |
+
```
|
inference.py
CHANGED
|
@@ -8,18 +8,18 @@ Usage:
|
|
| 8 |
python inference.py --url https://Souravdanyal-code-debug-env.hf.space
|
| 9 |
python inference.py --difficulty easy
|
| 10 |
|
| 11 |
-
STDOUT FORMAT (required by evaluator):
|
| 12 |
[START] task=<id> env=<benchmark> model=<model>
|
| 13 |
[STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 14 |
-
[END] success=<true|false> steps=<n> rewards=<r1,r2,...>
|
| 15 |
"""
|
| 16 |
|
| 17 |
-
import os, sys, json, time, argparse, requests
|
| 18 |
from openai import OpenAI
|
| 19 |
from typing import List, Optional
|
| 20 |
|
| 21 |
# ── Config ────────────────────────────────────────────────────────────────────
|
| 22 |
-
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.
|
| 23 |
MODEL_NAME = os.environ.get("MODEL_NAME", "llama-3.1-8b-instant")
|
| 24 |
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 25 |
ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
|
|
@@ -28,7 +28,7 @@ MAX_STEPS = 5
|
|
| 28 |
|
| 29 |
client = OpenAI(api_key=HF_TOKEN or "dummy", base_url=API_BASE_URL)
|
| 30 |
|
| 31 |
-
# ── Logging ───────────────────────────────────────────────────
|
| 32 |
def log_start(task_id, env, model):
|
| 33 |
print(f"[START] task={task_id} env={env} model={model}", flush=True)
|
| 34 |
|
|
@@ -53,101 +53,116 @@ def env_step(url, fixed_code, explanation=None):
|
|
| 53 |
return r.json()
|
| 54 |
|
| 55 |
# ── LLM ──────────────────────────────────────────────────────────────────────
|
| 56 |
-
SYSTEM_PROMPT = """You are an expert Python debugging agent.
|
| 57 |
|
| 58 |
-
RESPONSE FORMAT —
|
| 59 |
-
{
|
| 60 |
-
"fixed_code": "<complete corrected Python function including imports>",
|
| 61 |
-
"explanation": "<for hard tasks: explain the bug, root cause, and fix>"
|
| 62 |
-
}
|
| 63 |
|
| 64 |
RULES:
|
| 65 |
-
- Return COMPLETE function
|
| 66 |
-
- fixed_code must be valid Python
|
| 67 |
-
- For hard tasks explanation MUST mention the algorithmic
|
| 68 |
-
|
| 69 |
-
COMMON BUG PATTERNS:
|
| 70 |
-
-
|
| 71 |
-
-
|
| 72 |
-
-
|
| 73 |
-
- 0/1 Knapsack:
|
| 74 |
-
- Binary search
|
| 75 |
-
-
|
| 76 |
-
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
Example: if instructions say "mention: visited" then write about visited set.
|
| 81 |
"""
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
def call_llm(buggy_code, instructions, difficulty, feedback=None, attempt=1, prev_code=None):
|
| 84 |
content = f"Difficulty: {difficulty}\nInstructions: {instructions}\n\nBuggy code:\n```python\n{buggy_code}\n```\n"
|
| 85 |
|
| 86 |
if feedback and attempt > 1:
|
| 87 |
content += f"\nPREVIOUS FIX FAILED. Feedback:\n{feedback}\n\nYour previous code:\n```python\n{prev_code or ''}\n```\n"
|
| 88 |
-
content += "
|
| 89 |
-
content += "-
|
| 90 |
-
content += "- If
|
| 91 |
-
content += "-
|
|
|
|
| 92 |
|
| 93 |
if difficulty == "hard":
|
| 94 |
-
|
| 95 |
-
import re
|
| 96 |
-
hint_match = re.search(r'[Mm]ention[:\s]+([^.]+)', instructions)
|
| 97 |
if hint_match:
|
| 98 |
hints = hint_match.group(1).strip()
|
| 99 |
-
content += f"\nFor
|
| 100 |
-
content += "
|
| 101 |
|
| 102 |
try:
|
| 103 |
resp = client.chat.completions.create(
|
| 104 |
model=MODEL_NAME,
|
| 105 |
-
messages=[
|
|
|
|
|
|
|
|
|
|
| 106 |
max_tokens=1500,
|
| 107 |
temperature=0.1 if attempt == 1 else 0.4,
|
| 108 |
)
|
| 109 |
raw = resp.choices[0].message.content.strip()
|
| 110 |
-
|
| 111 |
-
# Remove markdown fences
|
| 112 |
-
if "```json" in raw:
|
| 113 |
-
raw = raw.split("```json")[1].split("```")[0].strip()
|
| 114 |
-
elif "```" in raw:
|
| 115 |
-
raw = raw.split("```")[1].split("```")[0].strip()
|
| 116 |
-
if raw.startswith("json"):
|
| 117 |
-
raw = raw[4:].strip()
|
| 118 |
-
|
| 119 |
-
# Find JSON object boundaries
|
| 120 |
-
start = raw.find("{")
|
| 121 |
-
end = raw.rfind("}") + 1
|
| 122 |
-
if start >= 0 and end > start:
|
| 123 |
-
raw = raw[start:end]
|
| 124 |
-
|
| 125 |
-
# Try direct parse first
|
| 126 |
-
try:
|
| 127 |
-
parsed = json.loads(raw)
|
| 128 |
-
except json.JSONDecodeError:
|
| 129 |
-
# Fix control characters by replacing literal newlines inside strings
|
| 130 |
-
import re
|
| 131 |
-
# Replace actual newlines within JSON string values with \n escape
|
| 132 |
-
raw = re.sub(r'(?<!\\)\n', r'\\n', raw)
|
| 133 |
-
raw = re.sub(r'(?<!\\)\t', r'\\t', raw)
|
| 134 |
-
raw = re.sub(r'(?<!\\)\r', r'\\r', raw)
|
| 135 |
-
try:
|
| 136 |
-
parsed = json.loads(raw)
|
| 137 |
-
except json.JSONDecodeError:
|
| 138 |
-
# Last resort: extract fixed_code manually using regex
|
| 139 |
-
code_match = re.search(r'"fixed_code"\s*:\s*"(.*?)"(?=\s*[,}])', raw, re.DOTALL)
|
| 140 |
-
exp_match = re.search(r'"explanation"\s*:\s*"(.*?)"(?=\s*[,}])', raw, re.DOTALL)
|
| 141 |
-
if code_match:
|
| 142 |
-
code = code_match.group(1).encode().decode('unicode_escape') if '\\n' in code_match.group(1) else code_match.group(1)
|
| 143 |
-
return {"fixed_code": code, "explanation": exp_match.group(1) if exp_match else None}
|
| 144 |
-
raise
|
| 145 |
-
|
| 146 |
-
return {"fixed_code": parsed.get("fixed_code", ""), "explanation": parsed.get("explanation")}
|
| 147 |
except Exception as e:
|
| 148 |
print(f"# LLM error: {e}", file=sys.stderr)
|
| 149 |
return {"fixed_code": buggy_code, "explanation": None}
|
| 150 |
|
|
|
|
| 151 |
# ── Episode ───────────────────────────────────────────────────────────────────
|
| 152 |
def run_episode(env_url, difficulty):
|
| 153 |
data = env_reset(env_url, difficulty)
|
|
@@ -181,7 +196,8 @@ def run_episode(env_url, difficulty):
|
|
| 181 |
|
| 182 |
reward = result.get("reward", 0.0)
|
| 183 |
done = result.get("done", False)
|
| 184 |
-
|
|
|
|
| 185 |
|
| 186 |
log_step(attempt, f"fix_{difficulty}_attempt{attempt}", reward, done, None)
|
| 187 |
rewards.append(reward)
|
|
@@ -194,9 +210,10 @@ def run_episode(env_url, difficulty):
|
|
| 194 |
log_end(success, steps_taken, rewards)
|
| 195 |
return success, steps_taken, rewards
|
| 196 |
|
|
|
|
| 197 |
# ── Main ──────────────────────────────────────────────────────────────────────
|
| 198 |
def main():
|
| 199 |
-
parser = argparse.ArgumentParser()
|
| 200 |
parser.add_argument("--url", default=ENV_URL)
|
| 201 |
parser.add_argument("--difficulty", default=None, choices=["easy","medium","hard","all"])
|
| 202 |
args = parser.parse_args()
|
|
@@ -222,4 +239,4 @@ def main():
|
|
| 222 |
print(f"# SUMMARY: {sum(successes)}/{len(diffs)} tasks solved | avg_reward={avg}", flush=True)
|
| 223 |
|
| 224 |
if __name__ == "__main__":
|
| 225 |
-
main()
|
|
|
|
| 8 |
python inference.py --url https://Souravdanyal-code-debug-env.hf.space
|
| 9 |
python inference.py --difficulty easy
|
| 10 |
|
| 11 |
+
STDOUT FORMAT (strictly required by evaluator):
|
| 12 |
[START] task=<id> env=<benchmark> model=<model>
|
| 13 |
[STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 14 |
+
[END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
|
| 15 |
"""
|
| 16 |
|
| 17 |
+
import os, sys, json, time, argparse, requests, re
|
| 18 |
from openai import OpenAI
|
| 19 |
from typing import List, Optional
|
| 20 |
|
| 21 |
# ── Config ────────────────────────────────────────────────────────────────────
|
| 22 |
+
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.groq.com/openai/v1")
|
| 23 |
MODEL_NAME = os.environ.get("MODEL_NAME", "llama-3.1-8b-instant")
|
| 24 |
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 25 |
ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
|
|
|
|
| 28 |
|
| 29 |
client = OpenAI(api_key=HF_TOKEN or "dummy", base_url=API_BASE_URL)
|
| 30 |
|
| 31 |
+
# ── Logging — STRICT FORMAT ───────────────────────────────────────────────────
|
| 32 |
def log_start(task_id, env, model):
    """Emit the evaluator's ``[START]`` line for one episode on stdout.

    Args:
        task_id: Identifier of the task being attempted.
        env: Benchmark/environment name.
        model: Name of the model driving the agent.
    """
    # The line format is parsed by the evaluator; keep it exactly as is.
    start_line = f"[START] task={task_id} env={env} model={model}"
    print(start_line, flush=True)
|
| 34 |
|
|
|
|
| 53 |
return r.json()
|
| 54 |
|
| 55 |
# ── LLM ──────────────────────────────────────────────────────────────────────
|
| 56 |
+
SYSTEM_PROMPT = """You are an expert Python debugging agent.
|
| 57 |
|
| 58 |
+
RESPONSE FORMAT — JSON only, no markdown fences, no extra text:
|
| 59 |
+
{"fixed_code": "<complete Python function with all imports>", "explanation": "<for hard tasks only>"}
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
RULES:
|
| 62 |
+
- Return the COMPLETE function including all imports (e.g. from collections import deque)
|
| 63 |
+
- fixed_code must be valid, executable Python
|
| 64 |
+
- For hard tasks: explanation MUST mention the algorithmic concepts from the instructions
|
| 65 |
+
|
| 66 |
+
COMMON BUG PATTERNS — memorize these:
|
| 67 |
+
- RIGHT rotate list by k: lst[-k:] + lst[:-k] (NOT lst[k:] + lst[:k] which is LEFT rotate)
|
| 68 |
+
- LEFT rotate list by k: lst[k:] + lst[:k]
|
| 69 |
+
- BFS/graph missing visited: add visited=set([start]) before queue, check before appending
|
| 70 |
+
- 0/1 Knapsack: iterate BACKWARD range(capacity, weight-1, -1) NOT forward
|
| 71 |
+
- Binary search boundary: often return high not low, or initial high=n//2 not n
|
| 72 |
+
- Wrong operator: target-n not target+n for complement
|
| 73 |
+
- Off-by-one: lst[1] for second element not lst[2]
|
| 74 |
+
|
| 75 |
+
IMPORTANT: If feedback shows TimeoutError → you have infinite loop → add visited set.
|
| 76 |
+
IMPORTANT: If Expected shows right-rotated list → use lst[-k:] + lst[:-k].
|
|
|
|
| 77 |
"""
|
| 78 |
|
| 79 |
+
def _parse_llm_response(raw: str, buggy_code: str) -> dict:
|
| 80 |
+
"""Robustly parse LLM response handling control chars and malformed JSON."""
|
| 81 |
+
# Remove markdown fences
|
| 82 |
+
if "```json" in raw:
|
| 83 |
+
raw = raw.split("```json")[1].split("```")[0].strip()
|
| 84 |
+
elif "```" in raw:
|
| 85 |
+
parts = raw.split("```")
|
| 86 |
+
if len(parts) >= 2:
|
| 87 |
+
raw = parts[1].strip()
|
| 88 |
+
if raw.startswith("json"):
|
| 89 |
+
raw = raw[4:].strip()
|
| 90 |
+
|
| 91 |
+
# Find JSON boundaries
|
| 92 |
+
start = raw.find("{")
|
| 93 |
+
end = raw.rfind("}") + 1
|
| 94 |
+
if start >= 0 and end > start:
|
| 95 |
+
raw = raw[start:end]
|
| 96 |
+
|
| 97 |
+
# Try direct parse
|
| 98 |
+
try:
|
| 99 |
+
parsed = json.loads(raw)
|
| 100 |
+
return {"fixed_code": parsed.get("fixed_code", ""), "explanation": parsed.get("explanation")}
|
| 101 |
+
except json.JSONDecodeError:
|
| 102 |
+
pass
|
| 103 |
+
|
| 104 |
+
# Fix control characters (literal newlines inside JSON strings)
|
| 105 |
+
try:
|
| 106 |
+
fixed = re.sub(r'(?<!\\)\n', r'\\n', raw)
|
| 107 |
+
fixed = re.sub(r'(?<!\\)\t', r'\\t', raw)
|
| 108 |
+
fixed = re.sub(r'(?<!\\)\r', r'\\r', raw)
|
| 109 |
+
parsed = json.loads(fixed)
|
| 110 |
+
# Unescape the fixed_code back to real newlines
|
| 111 |
+
code = parsed.get("fixed_code", "")
|
| 112 |
+
if "\\n" in code:
|
| 113 |
+
code = code.replace("\\n", "\n").replace("\\t", "\t")
|
| 114 |
+
return {"fixed_code": code, "explanation": parsed.get("explanation")}
|
| 115 |
+
except json.JSONDecodeError:
|
| 116 |
+
pass
|
| 117 |
+
|
| 118 |
+
# Last resort: regex extraction
|
| 119 |
+
code_match = re.search(r'"fixed_code"\s*:\s*"((?:[^"\\]|\\.)*)"\s*[,}]', raw, re.DOTALL)
|
| 120 |
+
exp_match = re.search(r'"explanation"\s*:\s*"((?:[^"\\]|\\.)*)"\s*[,}]', raw, re.DOTALL)
|
| 121 |
+
|
| 122 |
+
if code_match:
|
| 123 |
+
code = code_match.group(1).replace("\\n", "\n").replace("\\t", "\t")
|
| 124 |
+
exp = exp_match.group(1).replace("\\n", "\n") if exp_match else None
|
| 125 |
+
return {"fixed_code": code, "explanation": exp}
|
| 126 |
+
|
| 127 |
+
# Complete fallback
|
| 128 |
+
return {"fixed_code": buggy_code, "explanation": None}
|
| 129 |
+
|
| 130 |
+
|
| 131 |
def call_llm(buggy_code, instructions, difficulty, feedback=None, attempt=1, prev_code=None):
    """Ask the model for a fix and return the parsed ``fixed_code``/``explanation``.

    The user prompt contains the task description and buggy code. On a retry
    with feedback it also includes the previous attempt plus retry guidance,
    and for hard tasks it includes any concepts the instructions ask the
    explanation to mention.

    NOTE(review): block nesting was reconstructed from a flattened diff view;
    confirm the retry and hard-task lines sit under the conditions shown here.

    Args:
        buggy_code: Source of the function to repair.
        instructions: Natural-language task instructions.
        difficulty: Task difficulty label; ``"hard"`` enables explanation hints.
        feedback: Feedback text from the previous step, if any.
        attempt: 1-based attempt number; retries use a higher temperature.
        prev_code: Code submitted on the previous attempt, if any.

    Returns:
        Dict with ``fixed_code`` and ``explanation``. On any error the original
        ``buggy_code`` is returned with ``explanation`` set to ``None``.
    """
    prompt_parts = [
        f"Difficulty: {difficulty}\nInstructions: {instructions}\n\nBuggy code:\n```python\n{buggy_code}\n```\n"
    ]

    # Retry guidance is only added when there is feedback to react to.
    if feedback and attempt > 1:
        prompt_parts.append(
            f"\nPREVIOUS FIX FAILED. Feedback:\n{feedback}\n\nYour previous code:\n```python\n{prev_code or ''}\n```\n"
        )
        prompt_parts.extend([
            "ANALYZE THE FEEDBACK CAREFULLY:\n",
            "- Look at Input/Expected/Got for each failing test\n",
            "- If Got shows wrong rotation direction: use lst[-k:] + lst[:-k] for RIGHT rotate\n",
            "- If TimeoutError: add visited=set([start]) before queue in graph code\n",
            "- Try a COMPLETELY DIFFERENT fix.\n",
        ])

    if difficulty == "hard":
        # Hard-task instructions may list concepts after the word "mention".
        hint_match = re.search(r'[Mm]ention[:\s]+([^.]+?)(?:\.|$)', instructions)
        if hint_match is not None:
            hints = hint_match.group(1).strip()
            prompt_parts.append(f"\nFor explanation, you MUST mention these concepts: {hints}\n")
            prompt_parts.append("Explanation counts for 30% of reward — make it detailed and specific.\n")

    content = "".join(prompt_parts)

    try:
        chat_messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": content},
        ]
        resp = client.chat.completions.create(
            model=MODEL_NAME,
            messages=chat_messages,
            max_tokens=1500,
            # Low temperature first; a little more variety on retries.
            temperature=0.1 if attempt == 1 else 0.4,
        )
        reply_text = resp.choices[0].message.content.strip()
        return _parse_llm_response(reply_text, buggy_code)
    except Exception as e:
        # Best effort: report the failure and resubmit the unmodified code.
        print(f"# LLM error: {e}", file=sys.stderr)
        return {"fixed_code": buggy_code, "explanation": None}
|
| 164 |
|
| 165 |
+
|
| 166 |
# ── Episode ───────────────────────────────────────────────────────────────────
|
| 167 |
def run_episode(env_url, difficulty):
|
| 168 |
data = env_reset(env_url, difficulty)
|
|
|
|
| 196 |
|
| 197 |
reward = result.get("reward", 0.0)
|
| 198 |
done = result.get("done", False)
|
| 199 |
+
obs_r = result.get("observation", {})
|
| 200 |
+
last_feedback = obs_r.get("feedback", "")
|
| 201 |
|
| 202 |
log_step(attempt, f"fix_{difficulty}_attempt{attempt}", reward, done, None)
|
| 203 |
rewards.append(reward)
|
|
|
|
| 210 |
log_end(success, steps_taken, rewards)
|
| 211 |
return success, steps_taken, rewards
|
| 212 |
|
| 213 |
+
|
| 214 |
# ── Main ──────────────────────────────────────────────────────────────────────
|
| 215 |
def main():
|
| 216 |
+
parser = argparse.ArgumentParser(description="Code Debug Environment Baseline Agent")
|
| 217 |
parser.add_argument("--url", default=ENV_URL)
|
| 218 |
parser.add_argument("--difficulty", default=None, choices=["easy","medium","hard","all"])
|
| 219 |
args = parser.parse_args()
|
|
|
|
| 239 |
print(f"# SUMMARY: {sum(successes)}/{len(diffs)} tasks solved | avg_reward={avg}", flush=True)
|
| 240 |
|
| 241 |
if __name__ == "__main__":
|
| 242 |
+
main()
|
models.py
CHANGED
|
@@ -1,8 +1,6 @@
|
|
| 1 |
-
# models.py
|
| 2 |
-
# Typed Pydantic models for Action, Observation, and State
|
| 3 |
-
# These are the contracts between the agent and the environment.
|
| 4 |
|
| 5 |
-
from typing import Optional
|
| 6 |
from pydantic import Field
|
| 7 |
from openenv.core.env_server.types import Action, Observation, State
|
| 8 |
|
|
@@ -12,62 +10,42 @@ class DebugAction(Action):
|
|
| 12 |
|
| 13 |
fixed_code: str = Field(
|
| 14 |
...,
|
| 15 |
-
description="
|
| 16 |
)
|
| 17 |
explanation: Optional[str] = Field(
|
| 18 |
default=None,
|
| 19 |
-
description=
|
| 20 |
-
"Required for 'hard' difficulty tasks. Explain what was wrong "
|
| 21 |
-
"and why your fix is correct. Affects reward on hard tasks."
|
| 22 |
-
)
|
| 23 |
)
|
| 24 |
|
| 25 |
|
| 26 |
-
class TestResult(Action):
|
| 27 |
-
"""Sub-model: result of a single test case."""
|
| 28 |
-
test_id: int
|
| 29 |
-
passed: bool
|
| 30 |
-
expected: str
|
| 31 |
-
got: str
|
| 32 |
-
|
| 33 |
-
|
| 34 |
class DebugObservation(Observation):
|
| 35 |
-
"""Observation returned after
|
| 36 |
|
| 37 |
-
|
| 38 |
-
task_id: str = Field(..., description="Unique ID of the current task instance")
|
| 39 |
difficulty: str = Field(..., description="Task difficulty: easy | medium | hard")
|
| 40 |
buggy_code: str = Field(..., description="The buggy Python code the agent must fix")
|
| 41 |
instructions: str = Field(..., description="Natural language instructions for the task")
|
| 42 |
-
test_cases_description: str = Field(
|
| 43 |
-
..., description="Description of what the test cases check"
|
| 44 |
-
)
|
| 45 |
|
| 46 |
-
#
|
| 47 |
-
reward: Optional[float] = Field(
|
| 48 |
-
|
| 49 |
-
)
|
| 50 |
-
passed_tests: Optional[int] = Field(
|
| 51 |
-
|
| 52 |
-
)
|
| 53 |
-
|
| 54 |
-
default=None, description="Total number of test cases"
|
| 55 |
-
)
|
| 56 |
-
feedback: Optional[str] = Field(
|
| 57 |
-
default=None,
|
| 58 |
-
description="Detailed feedback: which tests failed and why"
|
| 59 |
-
)
|
| 60 |
-
done: bool = Field(default=False, description="True when episode is complete")
|
| 61 |
|
| 62 |
|
| 63 |
class DebugState(State):
|
| 64 |
-
"""Internal environment state
|
| 65 |
|
| 66 |
-
episode_id: str = ""
|
| 67 |
-
task_id: str
|
| 68 |
-
difficulty: str
|
| 69 |
step_count: int = 0
|
| 70 |
-
max_steps: int =
|
| 71 |
current_reward: float = 0.0
|
|
|
|
| 72 |
best_reward: float = 0.0
|
| 73 |
done: bool = False
|
|
|
|
| 1 |
+
# models.py — Typed Pydantic models for Action, Observation, and State
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
from typing import Optional
|
| 4 |
from pydantic import Field
|
| 5 |
from openenv.core.env_server.types import Action, Observation, State
|
| 6 |
|
|
|
|
| 10 |
|
| 11 |
fixed_code: str = Field(
|
| 12 |
...,
|
| 13 |
+
description="Complete corrected Python function. Must be valid Python including imports."
|
| 14 |
)
|
| 15 |
explanation: Optional[str] = Field(
|
| 16 |
default=None,
|
| 17 |
+
description="Required for hard tasks. Explain what was wrong and why your fix is correct."
|
|
|
|
|
|
|
|
|
|
| 18 |
)
|
| 19 |
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
class DebugObservation(Observation):
|
| 22 |
+
"""Observation returned after reset() and step()."""
|
| 23 |
|
| 24 |
+
task_id: str = Field(..., description="Unique task identifier e.g. easy_003")
|
|
|
|
| 25 |
difficulty: str = Field(..., description="Task difficulty: easy | medium | hard")
|
| 26 |
buggy_code: str = Field(..., description="The buggy Python code the agent must fix")
|
| 27 |
instructions: str = Field(..., description="Natural language instructions for the task")
|
| 28 |
+
test_cases_description: str = Field(..., description="What the test cases check")
|
|
|
|
|
|
|
| 29 |
|
| 30 |
+
# Step feedback fields
|
| 31 |
+
reward: Optional[float] = Field(default=None, description="Immediate reward 0.0-1.0 (null on reset)")
|
| 32 |
+
cumulative_reward: float = Field(default=0.0, description="Total reward accumulated this episode")
|
| 33 |
+
best_reward: float = Field(default=0.0, description="Best reward achieved this episode")
|
| 34 |
+
passed_tests: Optional[int] = Field(default=None, description="Tests passed (null on reset)")
|
| 35 |
+
total_tests: Optional[int] = Field(default=None, description="Total test cases (always 3)")
|
| 36 |
+
feedback: Optional[str] = Field(default=None, description="Per-test feedback: Input, Expected, Got")
|
| 37 |
+
done: bool = Field(default=False, description="True when episode complete")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
class DebugState(State):
|
| 41 |
+
"""Internal environment state returned by GET /state."""
|
| 42 |
|
| 43 |
+
episode_id: str = ""
|
| 44 |
+
task_id: str = "none"
|
| 45 |
+
difficulty: str = "easy"
|
| 46 |
step_count: int = 0
|
| 47 |
+
max_steps: int = 5
|
| 48 |
current_reward: float = 0.0
|
| 49 |
+
cumulative_reward: float = 0.0
|
| 50 |
best_reward: float = 0.0
|
| 51 |
done: bool = False
|
openenv.yaml
CHANGED
|
@@ -2,12 +2,11 @@ spec_version: 1
|
|
| 2 |
name: code-debug-env
|
| 3 |
type: typed
|
| 4 |
description: >
|
| 5 |
-
A real-world RL environment where an LLM agent diagnoses and fixes
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
for correct explanations on hard tasks.
|
| 11 |
|
| 12 |
version: 1.0.0
|
| 13 |
author: Souravdanyal
|
|
@@ -19,6 +18,7 @@ tags:
|
|
| 19 |
- openenv
|
| 20 |
- llm-agent
|
| 21 |
- software-engineering
|
|
|
|
| 22 |
|
| 23 |
runtime:
|
| 24 |
type: docker
|
|
@@ -51,7 +51,7 @@ tasks:
|
|
| 51 |
num_tasks: 15
|
| 52 |
|
| 53 |
- id: hard
|
| 54 |
-
description: "Fix an algorithmic bug AND provide a correct explanation of
|
| 55 |
difficulty: hard
|
| 56 |
max_steps: 5
|
| 57 |
reward_range: [0.0, 1.0]
|
|
@@ -67,51 +67,50 @@ action_space:
|
|
| 67 |
fixed_code:
|
| 68 |
type: string
|
| 69 |
required: true
|
| 70 |
-
description: "Complete corrected Python function
|
| 71 |
explanation:
|
| 72 |
type: string
|
| 73 |
required: false
|
| 74 |
-
description: "Required for hard tasks. Explain the bug, root cause, and
|
| 75 |
|
| 76 |
observation_space:
|
| 77 |
type: dict
|
| 78 |
-
description: "
|
| 79 |
fields:
|
| 80 |
task_id:
|
| 81 |
type: string
|
| 82 |
-
description: "Unique
|
| 83 |
difficulty:
|
| 84 |
type: enum
|
| 85 |
values: [easy, medium, hard]
|
| 86 |
-
description: "Task difficulty level"
|
| 87 |
buggy_code:
|
| 88 |
type: string
|
| 89 |
-
description: "The buggy Python function
|
| 90 |
instructions:
|
| 91 |
type: string
|
| 92 |
-
description: "Natural language description of what is wrong
|
| 93 |
test_cases_description:
|
| 94 |
type: string
|
| 95 |
-
description: "
|
| 96 |
reward:
|
| 97 |
type: float
|
| 98 |
-
description: "Score
|
| 99 |
passed_tests:
|
| 100 |
type: integer
|
| 101 |
-
description: "
|
| 102 |
total_tests:
|
| 103 |
type: integer
|
| 104 |
-
description: "Total
|
| 105 |
feedback:
|
| 106 |
type: string
|
| 107 |
-
description: "
|
| 108 |
done:
|
| 109 |
type: boolean
|
| 110 |
-
description: "True when episode
|
| 111 |
|
| 112 |
api:
|
| 113 |
reset: /reset
|
| 114 |
step: /step
|
| 115 |
state: /state
|
| 116 |
health: /health
|
| 117 |
-
tasks: /tasks
|
|
|
|
| 2 |
name: code-debug-env
|
| 3 |
type: typed
|
| 4 |
description: >
|
| 5 |
+
A real-world RL environment where an LLM agent diagnoses and fixes buggy Python
|
| 6 |
+
code across three difficulty levels (easy, medium, hard). Tasks cover real-world
|
| 7 |
+
domains: data processing, string algorithms, API validation, sorting, dynamic
|
| 8 |
+
programming, and graph algorithms. Rewards are partial and proportional to test
|
| 9 |
+
cases passed, with bonuses for correct explanations on hard tasks.
|
|
|
|
| 10 |
|
| 11 |
version: 1.0.0
|
| 12 |
author: Souravdanyal
|
|
|
|
| 18 |
- openenv
|
| 19 |
- llm-agent
|
| 20 |
- software-engineering
|
| 21 |
+
- real-world
|
| 22 |
|
| 23 |
runtime:
|
| 24 |
type: docker
|
|
|
|
| 51 |
num_tasks: 15
|
| 52 |
|
| 53 |
- id: hard
|
| 54 |
+
description: "Fix an algorithmic bug AND provide a correct explanation of root cause"
|
| 55 |
difficulty: hard
|
| 56 |
max_steps: 5
|
| 57 |
reward_range: [0.0, 1.0]
|
|
|
|
| 67 |
fixed_code:
|
| 68 |
type: string
|
| 69 |
required: true
|
| 70 |
+
description: "Complete corrected Python function. Must be valid Python including imports."
|
| 71 |
explanation:
|
| 72 |
type: string
|
| 73 |
required: false
|
| 74 |
+
description: "Required for hard tasks. Explain the bug, root cause, and fix."
|
| 75 |
|
| 76 |
observation_space:
|
| 77 |
type: dict
|
| 78 |
+
description: "Returned after reset() and step()"
|
| 79 |
fields:
|
| 80 |
task_id:
|
| 81 |
type: string
|
| 82 |
+
description: "Unique task identifier e.g. easy_003"
|
| 83 |
difficulty:
|
| 84 |
type: enum
|
| 85 |
values: [easy, medium, hard]
|
|
|
|
| 86 |
buggy_code:
|
| 87 |
type: string
|
| 88 |
+
description: "The buggy Python function to fix"
|
| 89 |
instructions:
|
| 90 |
type: string
|
| 91 |
+
description: "Natural language description of what is wrong"
|
| 92 |
test_cases_description:
|
| 93 |
type: string
|
| 94 |
+
description: "What the test cases check"
|
| 95 |
reward:
|
| 96 |
type: float
|
| 97 |
+
description: "Score 0.0-1.0 (null on reset)"
|
| 98 |
passed_tests:
|
| 99 |
type: integer
|
| 100 |
+
description: "Test cases passed (null on reset)"
|
| 101 |
total_tests:
|
| 102 |
type: integer
|
| 103 |
+
description: "Total test cases (always 3)"
|
| 104 |
feedback:
|
| 105 |
type: string
|
| 106 |
+
description: "Per-test feedback showing Input, Expected, Got"
|
| 107 |
done:
|
| 108 |
type: boolean
|
| 109 |
+
description: "True when episode complete"
|
| 110 |
|
| 111 |
api:
|
| 112 |
reset: /reset
|
| 113 |
step: /step
|
| 114 |
state: /state
|
| 115 |
health: /health
|
| 116 |
+
tasks: /tasks
|
py
ADDED
|
File without changes
|
pyproject.toml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=68", "wheel"]
|
| 3 |
+
# PEP 517 backend shipped with setuptools; "setuptools.backends.legacy:build" is not an importable backend.
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "code-debug-env"
|
| 7 |
+
version = "1.0.0"
|
| 8 |
+
description = "OpenEnv environment for LLM-based code debugging"
|
| 9 |
+
requires-python = ">=3.10"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"fastapi>=0.110.0",
|
| 12 |
+
"uvicorn[standard]>=0.29.0",
|
| 13 |
+
"pydantic>=2.0.0",
|
| 14 |
+
"openai>=1.0.0",
|
| 15 |
+
"requests>=2.31.0",
|
| 16 |
+
"openenv-core>=0.2.0",
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
[project.optional-dependencies]
|
| 20 |
+
dev = [
|
| 21 |
+
"pytest>=8.0.0",
|
| 22 |
+
"httpx>=0.27.0",
|
| 23 |
+
]
|
| 24 |
+
|
| 25 |
+
[tool.setuptools.packages.find]
|
| 26 |
+
where = ["."]
|
server/environment.py
CHANGED
|
@@ -16,17 +16,16 @@ from server.graders.grader_easy import grade_easy
|
|
| 16 |
from server.graders.grader_medium import grade_medium
|
| 17 |
from server.graders.grader_hard import grade_hard
|
| 18 |
|
| 19 |
-
|
| 20 |
TASK_GETTERS = {
|
| 21 |
-
"easy":
|
| 22 |
"medium": get_random_medium_task,
|
| 23 |
-
"hard":
|
| 24 |
}
|
| 25 |
|
| 26 |
GRADERS = {
|
| 27 |
-
"easy":
|
| 28 |
"medium": grade_medium,
|
| 29 |
-
"hard":
|
| 30 |
}
|
| 31 |
|
| 32 |
MAX_STEPS = 5
|
|
@@ -35,7 +34,7 @@ MAX_STEPS = 5
|
|
| 35 |
class CodeDebugEnvironment(Environment):
|
| 36 |
"""
|
| 37 |
OpenEnv environment for LLM-based code debugging.
|
| 38 |
-
Supports 3 difficulty levels with partial rewards.
|
| 39 |
"""
|
| 40 |
|
| 41 |
def __init__(self):
|
|
@@ -43,28 +42,25 @@ class CodeDebugEnvironment(Environment):
|
|
| 43 |
self._difficulty: str = "easy"
|
| 44 |
self._current_task: Optional[dict] = None
|
| 45 |
self._step_count: int = 0
|
|
|
|
| 46 |
self._best_reward: float = 0.0
|
| 47 |
self._current_reward: float = 0.0
|
| 48 |
self._done: bool = False
|
| 49 |
|
| 50 |
def reset(self, difficulty: Optional[str] = None) -> DebugObservation:
|
| 51 |
-
"""
|
| 52 |
-
Start a new episode. Optionally specify difficulty: easy | medium | hard.
|
| 53 |
-
If not specified, cycles randomly.
|
| 54 |
-
"""
|
| 55 |
self._episode_id = str(uuid4())
|
| 56 |
self._step_count = 0
|
|
|
|
| 57 |
self._best_reward = 0.0
|
| 58 |
self._current_reward = 0.0
|
| 59 |
self._done = False
|
| 60 |
|
| 61 |
-
# Validate difficulty
|
| 62 |
if difficulty and difficulty in TASK_GETTERS:
|
| 63 |
self._difficulty = difficulty
|
| 64 |
else:
|
| 65 |
self._difficulty = random.choice(["easy", "medium", "hard"])
|
| 66 |
|
| 67 |
-
# Load a task
|
| 68 |
self._current_task = TASK_GETTERS[self._difficulty]()
|
| 69 |
|
| 70 |
return DebugObservation(
|
|
@@ -74,6 +70,8 @@ class CodeDebugEnvironment(Environment):
|
|
| 74 |
instructions=self._current_task["instructions"],
|
| 75 |
test_cases_description=self._current_task["test_cases_description"],
|
| 76 |
reward=None,
|
|
|
|
|
|
|
| 77 |
passed_tests=None,
|
| 78 |
total_tests=len(self._current_task["test_cases"]),
|
| 79 |
feedback=None,
|
|
@@ -81,31 +79,31 @@ class CodeDebugEnvironment(Environment):
|
|
| 81 |
)
|
| 82 |
|
| 83 |
def step(self, action: DebugAction) -> DebugObservation:
|
| 84 |
-
"""
|
| 85 |
-
Agent submits fixed_code (and optionally explanation for hard tasks).
|
| 86 |
-
Returns observation with reward, feedback, and done flag.
|
| 87 |
-
"""
|
| 88 |
if self._done:
|
| 89 |
return DebugObservation(
|
| 90 |
task_id=self._current_task["task_id"] if self._current_task else "none",
|
| 91 |
difficulty=self._difficulty,
|
| 92 |
buggy_code=self._current_task["buggy_code"] if self._current_task else "",
|
| 93 |
-
instructions="Episode
|
| 94 |
test_cases_description="",
|
| 95 |
reward=self._best_reward,
|
|
|
|
|
|
|
| 96 |
passed_tests=None,
|
| 97 |
total_tests=0,
|
| 98 |
-
feedback="Episode ended.
|
| 99 |
done=True,
|
| 100 |
)
|
| 101 |
|
| 102 |
self._step_count += 1
|
| 103 |
|
| 104 |
-
# ── Invalid action penalty ─────────────────────────────────────────
|
| 105 |
code = action.fixed_code.strip() if action.fixed_code else ""
|
| 106 |
if not code:
|
| 107 |
done = self._step_count >= MAX_STEPS
|
| 108 |
self._done = done
|
|
|
|
| 109 |
return DebugObservation(
|
| 110 |
task_id=self._current_task["task_id"],
|
| 111 |
difficulty=self._difficulty,
|
|
@@ -113,30 +111,15 @@ class CodeDebugEnvironment(Environment):
|
|
| 113 |
instructions=self._current_task["instructions"],
|
| 114 |
test_cases_description=self._current_task["test_cases_description"],
|
| 115 |
reward=0.0,
|
|
|
|
|
|
|
| 116 |
passed_tests=0,
|
| 117 |
total_tests=len(self._current_task["test_cases"]),
|
| 118 |
-
feedback="❌ Invalid action: fixed_code is empty.
|
| 119 |
done=done,
|
| 120 |
)
|
| 121 |
|
| 122 |
-
#
|
| 123 |
-
if len(code) < 5 or ("def " not in code and "lambda" not in code and "=" not in code):
|
| 124 |
-
done = self._step_count >= MAX_STEPS
|
| 125 |
-
self._done = done
|
| 126 |
-
return DebugObservation(
|
| 127 |
-
task_id=self._current_task["task_id"],
|
| 128 |
-
difficulty=self._difficulty,
|
| 129 |
-
buggy_code=self._current_task["buggy_code"],
|
| 130 |
-
instructions=self._current_task["instructions"],
|
| 131 |
-
test_cases_description=self._current_task["test_cases_description"],
|
| 132 |
-
reward=0.0,
|
| 133 |
-
passed_tests=0,
|
| 134 |
-
total_tests=len(self._current_task["test_cases"]),
|
| 135 |
-
feedback="❌ Invalid action: submission does not appear to be valid Python. Penalty applied.",
|
| 136 |
-
done=done,
|
| 137 |
-
)
|
| 138 |
-
|
| 139 |
-
# Grade the submission
|
| 140 |
grader = GRADERS[self._difficulty]
|
| 141 |
if self._difficulty == "hard":
|
| 142 |
reward, passed, total, feedback, _ = grader(
|
|
@@ -148,9 +131,9 @@ class CodeDebugEnvironment(Environment):
|
|
| 148 |
)
|
| 149 |
|
| 150 |
self._current_reward = reward
|
|
|
|
| 151 |
self._best_reward = max(self._best_reward, reward)
|
| 152 |
|
| 153 |
-
# Episode ends if: perfect score OR max steps reached
|
| 154 |
done = (reward == 1.0) or (self._step_count >= MAX_STEPS)
|
| 155 |
self._done = done
|
| 156 |
|
|
@@ -161,6 +144,8 @@ class CodeDebugEnvironment(Environment):
|
|
| 161 |
instructions=self._current_task["instructions"],
|
| 162 |
test_cases_description=self._current_task["test_cases_description"],
|
| 163 |
reward=reward,
|
|
|
|
|
|
|
| 164 |
passed_tests=passed,
|
| 165 |
total_tests=total,
|
| 166 |
feedback=feedback,
|
|
@@ -177,6 +162,7 @@ class CodeDebugEnvironment(Environment):
|
|
| 177 |
difficulty=self._difficulty,
|
| 178 |
max_steps=MAX_STEPS,
|
| 179 |
current_reward=self._current_reward,
|
|
|
|
| 180 |
best_reward=self._best_reward,
|
| 181 |
done=self._done,
|
| 182 |
-
)
|
|
|
|
| 16 |
from server.graders.grader_medium import grade_medium
|
| 17 |
from server.graders.grader_hard import grade_hard
|
| 18 |
|
|
|
|
| 19 |
TASK_GETTERS = {
|
| 20 |
+
"easy": get_random_easy_task,
|
| 21 |
"medium": get_random_medium_task,
|
| 22 |
+
"hard": get_random_hard_task,
|
| 23 |
}
|
| 24 |
|
| 25 |
GRADERS = {
|
| 26 |
+
"easy": grade_easy,
|
| 27 |
"medium": grade_medium,
|
| 28 |
+
"hard": grade_hard,
|
| 29 |
}
|
| 30 |
|
| 31 |
MAX_STEPS = 5
|
|
|
|
| 34 |
class CodeDebugEnvironment(Environment):
|
| 35 |
"""
|
| 36 |
OpenEnv environment for LLM-based code debugging.
|
| 37 |
+
Supports 3 difficulty levels with partial rewards and cumulative tracking.
|
| 38 |
"""
|
| 39 |
|
| 40 |
def __init__(self):
|
|
|
|
| 42 |
self._difficulty: str = "easy"
|
| 43 |
self._current_task: Optional[dict] = None
|
| 44 |
self._step_count: int = 0
|
| 45 |
+
self._cumulative_reward: float = 0.0
|
| 46 |
self._best_reward: float = 0.0
|
| 47 |
self._current_reward: float = 0.0
|
| 48 |
self._done: bool = False
|
| 49 |
|
| 50 |
def reset(self, difficulty: Optional[str] = None) -> DebugObservation:
|
| 51 |
+
"""Start a new episode. Optionally specify difficulty: easy | medium | hard."""
|
|
|
|
|
|
|
|
|
|
| 52 |
self._episode_id = str(uuid4())
|
| 53 |
self._step_count = 0
|
| 54 |
+
self._cumulative_reward = 0.0
|
| 55 |
self._best_reward = 0.0
|
| 56 |
self._current_reward = 0.0
|
| 57 |
self._done = False
|
| 58 |
|
|
|
|
| 59 |
if difficulty and difficulty in TASK_GETTERS:
|
| 60 |
self._difficulty = difficulty
|
| 61 |
else:
|
| 62 |
self._difficulty = random.choice(["easy", "medium", "hard"])
|
| 63 |
|
|
|
|
| 64 |
self._current_task = TASK_GETTERS[self._difficulty]()
|
| 65 |
|
| 66 |
return DebugObservation(
|
|
|
|
| 70 |
instructions=self._current_task["instructions"],
|
| 71 |
test_cases_description=self._current_task["test_cases_description"],
|
| 72 |
reward=None,
|
| 73 |
+
cumulative_reward=0.0,
|
| 74 |
+
best_reward=0.0,
|
| 75 |
passed_tests=None,
|
| 76 |
total_tests=len(self._current_task["test_cases"]),
|
| 77 |
feedback=None,
|
|
|
|
| 79 |
)
|
| 80 |
|
| 81 |
def step(self, action: DebugAction) -> DebugObservation:
|
| 82 |
+
"""Submit fixed_code. Returns observation with reward, cumulative_reward, feedback, done."""
|
|
|
|
|
|
|
|
|
|
| 83 |
if self._done:
|
| 84 |
return DebugObservation(
|
| 85 |
task_id=self._current_task["task_id"] if self._current_task else "none",
|
| 86 |
difficulty=self._difficulty,
|
| 87 |
buggy_code=self._current_task["buggy_code"] if self._current_task else "",
|
| 88 |
+
instructions="Episode done. Call reset() to start a new episode.",
|
| 89 |
test_cases_description="",
|
| 90 |
reward=self._best_reward,
|
| 91 |
+
cumulative_reward=self._cumulative_reward,
|
| 92 |
+
best_reward=self._best_reward,
|
| 93 |
passed_tests=None,
|
| 94 |
total_tests=0,
|
| 95 |
+
feedback="Episode ended. Call reset() to start a new task.",
|
| 96 |
done=True,
|
| 97 |
)
|
| 98 |
|
| 99 |
self._step_count += 1
|
| 100 |
|
| 101 |
+
# ── Invalid action penalty ─────────────────────────────────────────
|
| 102 |
code = action.fixed_code.strip() if action.fixed_code else ""
|
| 103 |
if not code:
|
| 104 |
done = self._step_count >= MAX_STEPS
|
| 105 |
self._done = done
|
| 106 |
+
self._cumulative_reward += 0.0
|
| 107 |
return DebugObservation(
|
| 108 |
task_id=self._current_task["task_id"],
|
| 109 |
difficulty=self._difficulty,
|
|
|
|
| 111 |
instructions=self._current_task["instructions"],
|
| 112 |
test_cases_description=self._current_task["test_cases_description"],
|
| 113 |
reward=0.0,
|
| 114 |
+
cumulative_reward=self._cumulative_reward,
|
| 115 |
+
best_reward=self._best_reward,
|
| 116 |
passed_tests=0,
|
| 117 |
total_tests=len(self._current_task["test_cases"]),
|
| 118 |
+
feedback="❌ Invalid action: fixed_code is empty. Submit valid Python code.",
|
| 119 |
done=done,
|
| 120 |
)
|
| 121 |
|
| 122 |
+
# ── Grade the submission ───────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
grader = GRADERS[self._difficulty]
|
| 124 |
if self._difficulty == "hard":
|
| 125 |
reward, passed, total, feedback, _ = grader(
|
|
|
|
| 131 |
)
|
| 132 |
|
| 133 |
self._current_reward = reward
|
| 134 |
+
self._cumulative_reward += reward
|
| 135 |
self._best_reward = max(self._best_reward, reward)
|
| 136 |
|
|
|
|
| 137 |
done = (reward == 1.0) or (self._step_count >= MAX_STEPS)
|
| 138 |
self._done = done
|
| 139 |
|
|
|
|
| 144 |
instructions=self._current_task["instructions"],
|
| 145 |
test_cases_description=self._current_task["test_cases_description"],
|
| 146 |
reward=reward,
|
| 147 |
+
cumulative_reward=self._cumulative_reward,
|
| 148 |
+
best_reward=self._best_reward,
|
| 149 |
passed_tests=passed,
|
| 150 |
total_tests=total,
|
| 151 |
feedback=feedback,
|
|
|
|
| 162 |
difficulty=self._difficulty,
|
| 163 |
max_steps=MAX_STEPS,
|
| 164 |
current_reward=self._current_reward,
|
| 165 |
+
cumulative_reward=self._cumulative_reward,
|
| 166 |
best_reward=self._best_reward,
|
| 167 |
done=self._done,
|
| 168 |
+
)
|
server/graders/__pycache__/grader_easy.cpython-310.pyc
CHANGED
|
Binary files a/server/graders/__pycache__/grader_easy.cpython-310.pyc and b/server/graders/__pycache__/grader_easy.cpython-310.pyc differ
|
|
|
server/graders/__pycache__/grader_hard.cpython-310.pyc
CHANGED
|
Binary files a/server/graders/__pycache__/grader_hard.cpython-310.pyc and b/server/graders/__pycache__/grader_hard.cpython-310.pyc differ
|
|
|
server/graders/__pycache__/grader_medium.cpython-310.pyc
CHANGED
|
Binary files a/server/graders/__pycache__/grader_medium.cpython-310.pyc and b/server/graders/__pycache__/grader_medium.cpython-310.pyc differ
|
|
|
server/graders/grader_hard.py
CHANGED
|
@@ -6,68 +6,85 @@ from typing import Tuple, List, Optional
|
|
| 6 |
from .grader_easy import grade_easy
|
| 7 |
|
| 8 |
|
| 9 |
-
def _score_explanation(explanation: Optional[str], keywords: List[str]) -> Tuple[float, str]:
|
| 10 |
"""
|
| 11 |
-
Score explanation
|
| 12 |
-
-
|
| 13 |
-
-
|
| 14 |
-
-
|
| 15 |
"""
|
| 16 |
-
if not explanation or len(explanation.strip()) <
|
| 17 |
-
return 0.0, "❌ No explanation provided. Hard tasks require
|
| 18 |
|
| 19 |
-
|
| 20 |
-
hits = [kw for kw in keywords if kw.lower() in
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
else:
|
| 25 |
-
|
| 26 |
-
if len(hits) == 0:
|
| 27 |
-
score = 0.0
|
| 28 |
-
elif len(hits) >= needed:
|
| 29 |
-
score = 1.0
|
| 30 |
-
else:
|
| 31 |
-
score = round(len(hits) / needed, 2)
|
| 32 |
|
| 33 |
-
if score =
|
| 34 |
-
feedback = f"✅ Explanation excellent!
|
| 35 |
elif score > 0:
|
| 36 |
-
missing = [kw for kw in keywords if kw.lower() not in
|
| 37 |
feedback = (
|
| 38 |
-
f"⚠️ Partial explanation (score={score}).
|
| 39 |
-
f"Also
|
| 40 |
)
|
| 41 |
else:
|
| 42 |
-
feedback = (
|
| 43 |
-
f"❌ Explanation missing key concepts. "
|
| 44 |
-
f"Explain: {', '.join(keywords[:3])}"
|
| 45 |
-
)
|
| 46 |
|
| 47 |
return round(score, 2), feedback
|
| 48 |
|
| 49 |
|
| 50 |
def grade_hard(fixed_code: str, task: dict, explanation: Optional[str] = None) -> Tuple[float, int, int, str, List[dict]]:
|
| 51 |
"""
|
| 52 |
-
Grade
|
| 53 |
-
Reward = 0.7 × test_score + 0.3 × explanation_score
|
| 54 |
"""
|
| 55 |
test_reward, passed, total, code_feedback, results = grade_easy(fixed_code, task)
|
| 56 |
keywords = task.get("explanation_keywords", [])
|
| 57 |
-
|
|
|
|
| 58 |
final_reward = round(0.7 * test_reward + 0.3 * exp_score, 2)
|
| 59 |
|
| 60 |
feedback = (
|
| 61 |
-
f"--- Code Score (70%
|
| 62 |
f"{code_feedback}\n\n"
|
| 63 |
-
f"--- Explanation Score (30%
|
| 64 |
f"{exp_feedback}\n\n"
|
| 65 |
f"=== Final Reward: {final_reward:.2f} ==="
|
| 66 |
)
|
| 67 |
|
| 68 |
if passed == total and exp_score < 1.0:
|
| 69 |
-
feedback += f"\n💡 Code
|
| 70 |
elif passed < total and not explanation:
|
| 71 |
-
feedback += "\n💡 Fix the code AND
|
| 72 |
|
| 73 |
return final_reward, passed, total, feedback, results
|
|
|
|
| 6 |
from .grader_easy import grade_easy
|
| 7 |
|
| 8 |
|
| 9 |
+
def _score_explanation(explanation: Optional[str], keywords: List[str], instructions: str) -> Tuple[float, str]:
|
| 10 |
"""
|
| 11 |
+
Score explanation semantically:
|
| 12 |
+
- Length check (must be meaningful)
|
| 13 |
+
- Keyword matching (concept coverage)
|
| 14 |
+
- Partial credit for any relevant mention
|
| 15 |
"""
|
| 16 |
+
if not explanation or len(explanation.strip()) < 15:
|
| 17 |
+
return 0.0, "❌ No explanation provided. Hard tasks require explanation field."
|
| 18 |
|
| 19 |
+
exp_lower = explanation.lower()
|
| 20 |
+
hits = [kw for kw in keywords if kw.lower() in exp_lower]
|
| 21 |
|
| 22 |
+
# Also check for common synonyms
|
| 23 |
+
synonym_map = {
|
| 24 |
+
"visited": ["seen", "visited", "track", "memo"],
|
| 25 |
+
"iteration order": ["order", "direction", "forward", "backward", "reverse"],
|
| 26 |
+
"overwrite": ["overwrite", "override", "update", "modify"],
|
| 27 |
+
"reverse": ["reverse", "backward", "right to left", "descending"],
|
| 28 |
+
"0/1": ["0/1", "zero one", "binary", "knapsack"],
|
| 29 |
+
"high": ["high", "upper", "boundary", "bound"],
|
| 30 |
+
"return high": ["return high", "high boundary"],
|
| 31 |
+
"floor": ["floor", "integer", "truncat"],
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
synonym_hits = set(hits)
|
| 35 |
+
for kw in keywords:
|
| 36 |
+
kw_lower = kw.lower()
|
| 37 |
+
if kw_lower in synonym_map:
|
| 38 |
+
for syn in synonym_map[kw_lower]:
|
| 39 |
+
if syn in exp_lower:
|
| 40 |
+
synonym_hits.add(kw)
|
| 41 |
+
break
|
| 42 |
+
|
| 43 |
+
total_hits = len(synonym_hits)
|
| 44 |
+
needed = max(1, len(keywords) // 2)
|
| 45 |
+
|
| 46 |
+
if total_hits == 0:
|
| 47 |
+
score = 0.1 if len(explanation.strip()) > 50 else 0.0 # minimal credit for any long attempt
|
| 48 |
+
elif total_hits >= needed:
|
| 49 |
+
score = 1.0
|
| 50 |
else:
|
| 51 |
+
score = round(total_hits / needed, 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
+
if score >= 1.0:
|
| 54 |
+
feedback = f"✅ Explanation excellent! Covered: {', '.join(synonym_hits)}"
|
| 55 |
elif score > 0:
|
| 56 |
+
missing = [kw for kw in keywords if kw.lower() not in exp_lower]
|
| 57 |
feedback = (
|
| 58 |
+
f"⚠️ Partial explanation (score={score}). Covered: {', '.join(synonym_hits) or 'none'}. "
|
| 59 |
+
f"Also mention: {', '.join(missing[:3])}"
|
| 60 |
)
|
| 61 |
else:
|
| 62 |
+
feedback = f"❌ Explanation too vague. Explain: {', '.join(keywords[:3])}"
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
return round(score, 2), feedback
|
| 65 |
|
| 66 |
|
| 67 |
def grade_hard(fixed_code: str, task: dict, explanation: Optional[str] = None) -> Tuple[float, int, int, str, List[dict]]:
    """
    Grade hard task: Reward = 0.7 × test_score + 0.3 × explanation_score

    Args:
        fixed_code: The agent's submitted code, graded by grade_easy.
        task: Task dict; "explanation_keywords" and "instructions" are read
            here, and the whole dict is passed on to grade_easy.
        explanation: The agent's explanation of the bug (may be None).

    Returns:
        (final_reward, passed, total, feedback, results) where passed, total
        and results come from grade_easy and feedback combines the code and
        explanation sections plus an optional hint.
    """
    test_reward, passed, total, code_feedback, results = grade_easy(fixed_code, task)
    keywords = task.get("explanation_keywords", [])
    instructions = task.get("instructions", "")
    exp_score, exp_feedback = _score_explanation(explanation, keywords, instructions)
    final_reward = round(0.7 * test_reward + 0.3 * exp_score, 2)

    feedback = (
        f"--- Code Score (70%): {test_reward:.2f} ---\n"
        f"{code_feedback}\n\n"
        f"--- Explanation Score (30%): {exp_score:.2f} ---\n"
        f"{exp_feedback}\n\n"
        f"=== Final Reward: {final_reward:.2f} ==="
    )

    # A whitespace-only explanation is as good as none for hint purposes.
    has_explanation = bool(explanation and explanation.strip())

    if passed == total and exp_score < 1.0:
        if keywords:
            feedback += f"\n💡 Code correct! Boost score by mentioning: {', '.join(keywords[:3])}"
        else:
            # No keywords configured: avoid a hint that ends in an empty list.
            feedback += "\n💡 Code correct! Add a clearer explanation to boost the score."
    elif passed < total and not has_explanation:
        feedback += "\n💡 Fix the code AND add explanation for max reward."

    return final_reward, passed, total, feedback, results
|