Spaces:
Running
Running
Commit ·
8485798
1
Parent(s): c21c4ee
Final complete version - all fixes applied
Browse files- README.md +25 -12
- inference.py +109 -182
- server/__pycache__/__init__.cpython-39.pyc +0 -0
- server/__pycache__/app.cpython-310.pyc +0 -0
- server/__pycache__/app.cpython-39.pyc +0 -0
- server/__pycache__/environment.cpython-310.pyc +0 -0
- server/app.py +33 -71
- server/graders/__pycache__/grader_easy.cpython-310.pyc +0 -0
- server/graders/__pycache__/grader_hard.cpython-310.pyc +0 -0
- server/graders/__pycache__/grader_medium.cpython-310.pyc +0 -0
- server/graders/grader_easy.py +17 -31
- server/graders/grader_hard.py +14 -94
- server/tasks/__pycache__/__init__.cpython-39.pyc +0 -0
- server/tasks/__pycache__/task_easy.cpython-310.pyc +0 -0
- server/tasks/__pycache__/task_easy.cpython-39.pyc +0 -0
- server/tasks/__pycache__/task_hard.cpython-310.pyc +0 -0
- server/tasks/__pycache__/task_hard.cpython-39.pyc +0 -0
- server/tasks/__pycache__/task_medium.cpython-310.pyc +0 -0
- server/tasks/__pycache__/task_medium.cpython-39.pyc +0 -0
- server/tasks/task_hard.py +1 -1
- server/tasks/task_medium.py +1 -1
- spec.md +37 -95
- tests/test_graders.py +30 -49
README.md
CHANGED
|
@@ -1,12 +1,3 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Code Debug Env
|
| 3 |
-
emoji: 🐛
|
| 4 |
-
colorFrom: blue
|
| 5 |
-
colorTo: green
|
| 6 |
-
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
-
---
|
| 9 |
-
|
| 10 |
# Code Debug Environment
|
| 11 |
|
| 12 |
An [OpenEnv](https://github.com/meta-pytorch/OpenEnv)-compatible RL environment where an LLM agent diagnoses and fixes buggy Python code across three difficulty levels.
|
|
@@ -114,9 +105,10 @@ Explanation is scored by matching key algorithmic concepts. Partial credit is gi
|
|
| 114 |
|
| 115 |
### Install
|
| 116 |
```bash
|
| 117 |
-
git clone https://github.com/
|
| 118 |
cd code-debug-env
|
| 119 |
pip install -e .
|
|
|
|
| 120 |
git clone https://github.com/meta-pytorch/OpenEnv.git
|
| 121 |
export PYTHONPATH=$PYTHONPATH:OpenEnv:OpenEnv/src:.
|
| 122 |
```
|
|
@@ -171,11 +163,32 @@ python inference.py --url http://localhost:7860 --difficulty hard
|
|
| 171 |
|
| 172 |
## Pre-Submission Validation
|
| 173 |
|
|
|
|
|
|
|
| 174 |
```bash
|
|
|
|
| 175 |
python validator/pre_submit_check.py --url http://localhost:7860
|
| 176 |
|
| 177 |
# Or against your HF Space:
|
| 178 |
-
python validator/pre_submit_check.py --url https://
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
```
|
| 180 |
|
| 181 |
---
|
|
@@ -206,4 +219,4 @@ code-debug-env/
|
|
| 206 |
│ └── Dockerfile
|
| 207 |
└── validator/
|
| 208 |
└── pre_submit_check.py
|
| 209 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Code Debug Environment
|
| 2 |
|
| 3 |
An [OpenEnv](https://github.com/meta-pytorch/OpenEnv)-compatible RL environment where an LLM agent diagnoses and fixes buggy Python code across three difficulty levels.
|
|
|
|
| 105 |
|
| 106 |
### Install
|
| 107 |
```bash
|
| 108 |
+
git clone https://github.com/YOUR_USERNAME/code-debug-env
|
| 109 |
cd code-debug-env
|
| 110 |
pip install -e .
|
| 111 |
+
# Also clone OpenEnv so its sources can be added to PYTHONPATH
|
| 112 |
git clone https://github.com/meta-pytorch/OpenEnv.git
|
| 113 |
export PYTHONPATH=$PYTHONPATH:OpenEnv:OpenEnv/src:.
|
| 114 |
```
|
|
|
|
| 163 |
|
| 164 |
## Pre-Submission Validation
|
| 165 |
|
| 166 |
+
Run before submitting to catch any disqualifying issues:
|
| 167 |
+
|
| 168 |
```bash
|
| 169 |
+
# Start the environment first, then:
|
| 170 |
python validator/pre_submit_check.py --url http://localhost:7860
|
| 171 |
|
| 172 |
# Or against your HF Space:
|
| 173 |
+
python validator/pre_submit_check.py --url https://YOUR_SPACE.hf.space
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
---
|
| 177 |
+
|
| 178 |
+
## Deploy to Hugging Face Spaces
|
| 179 |
+
|
| 180 |
+
```bash
|
| 181 |
+
# Login
|
| 182 |
+
huggingface-cli login
|
| 183 |
+
|
| 184 |
+
# Create space and push
|
| 185 |
+
huggingface-cli repo create code-debug-env --type space --space_sdk docker
|
| 186 |
+
cd code-debug-env
|
| 187 |
+
git init
|
| 188 |
+
git remote add origin https://huggingface.co/spaces/YOUR_USERNAME/code-debug-env
|
| 189 |
+
git add .
|
| 190 |
+
git commit -m "Initial commit"
|
| 191 |
+
git push origin main
|
| 192 |
```
|
| 193 |
|
| 194 |
---
|
|
|
|
| 219 |
│ └── Dockerfile
|
| 220 |
└── validator/
|
| 221 |
└── pre_submit_check.py
|
| 222 |
+
```
|
inference.py
CHANGED
|
@@ -1,262 +1,189 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
| 17 |
from openai import OpenAI
|
| 18 |
from typing import List, Optional
|
| 19 |
|
| 20 |
-
# ──
|
| 21 |
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
|
| 22 |
-
MODEL_NAME = os.environ.get("MODEL_NAME",
|
| 23 |
-
HF_TOKEN = os.environ.get("HF_TOKEN",
|
| 24 |
-
ENV_URL = os.environ.get("ENV_URL",
|
| 25 |
BENCHMARK = "code-debug-env"
|
| 26 |
MAX_STEPS = 5
|
| 27 |
|
| 28 |
-
# ─── OpenAI Client ───────────────────────────────────────────────────────────
|
| 29 |
client = OpenAI(api_key=HF_TOKEN or "dummy", base_url=API_BASE_URL)
|
| 30 |
|
| 31 |
-
# ──
|
| 32 |
-
def log_start(task_id
|
| 33 |
print(f"[START] task={task_id} env={env} model={model}", flush=True)
|
| 34 |
|
| 35 |
-
def log_step(step
|
| 36 |
-
|
| 37 |
-
done_val = str(done).lower()
|
| 38 |
-
print(f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}", flush=True)
|
| 39 |
|
| 40 |
-
def log_end(success
|
| 41 |
-
|
| 42 |
-
print(f"[END] success={str(success).lower()} steps={steps} rewards={rewards_str}", flush=True)
|
| 43 |
|
| 44 |
-
# ──
|
| 45 |
-
def env_reset(
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
return
|
| 49 |
|
| 50 |
-
def env_step(
|
| 51 |
payload = {"fixed_code": fixed_code}
|
| 52 |
if explanation:
|
| 53 |
payload["explanation"] = explanation
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
return
|
| 57 |
-
|
| 58 |
-
# ─── LLM Agent ───────────────────────────────────────────────────────────────
|
| 59 |
-
SYSTEM_PROMPT = """You are an expert Python debugging agent. Your job is to find and fix bugs in Python functions.
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
- Return the COMPLETE fixed function, not just the changed line
|
| 64 |
-
- The fixed_code must be syntactically valid Python
|
| 65 |
-
- For hard tasks, the explanation field MUST describe: what the bug was, why it caused failures, and how your fix resolves it
|
| 66 |
|
| 67 |
-
|
| 68 |
{
|
| 69 |
-
"fixed_code": "<complete corrected Python function>",
|
| 70 |
-
"explanation": "<for hard tasks:
|
| 71 |
}
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
"""
|
| 81 |
|
| 82 |
-
def call_llm(buggy_code
|
| 83 |
-
|
| 84 |
-
previous_code: str = None) -> dict:
|
| 85 |
|
| 86 |
-
user_content = f"""Task difficulty: {difficulty}
|
| 87 |
-
Instructions: {instructions}
|
| 88 |
-
|
| 89 |
-
Buggy code to fix:
|
| 90 |
-
```python
|
| 91 |
-
{buggy_code}
|
| 92 |
-
```
|
| 93 |
-
"""
|
| 94 |
if feedback and attempt > 1:
|
| 95 |
-
|
| 96 |
-
PREVIOUS ATTEMPT FAILED. Here is the feedback showing what went wrong:
|
| 97 |
-
{feedback}
|
| 98 |
-
|
| 99 |
-
Your previous fix was:
|
| 100 |
-
```python
|
| 101 |
-
{previous_code or 'unknown'}
|
| 102 |
-
```
|
| 103 |
-
|
| 104 |
-
IMPORTANT: Your previous fix did not work. Carefully analyze the feedback above.
|
| 105 |
-
Look at the Input, Expected, and Got values for each failing test.
|
| 106 |
-
Try a completely different approach to fix the bug.
|
| 107 |
-
"""
|
| 108 |
|
| 109 |
if difficulty == "hard":
|
| 110 |
-
|
| 111 |
-
Remember: For hard tasks you MUST include a detailed explanation field describing:
|
| 112 |
-
- What the algorithmic bug was
|
| 113 |
-
- Why it caused incorrect results
|
| 114 |
-
- How your fix resolves it
|
| 115 |
-
Explanation quality affects 30% of your reward.
|
| 116 |
-
"""
|
| 117 |
-
|
| 118 |
-
messages = [
|
| 119 |
-
{"role": "system", "content": SYSTEM_PROMPT},
|
| 120 |
-
{"role": "user", "content": user_content},
|
| 121 |
-
]
|
| 122 |
|
| 123 |
try:
|
| 124 |
-
|
| 125 |
model=MODEL_NAME,
|
| 126 |
-
messages=
|
| 127 |
max_tokens=1500,
|
| 128 |
-
temperature=0.
|
| 129 |
)
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
#
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
"explanation": parsed.get("explanation", None),
|
| 144 |
-
}
|
| 145 |
-
except json.JSONDecodeError:
|
| 146 |
-
# Try to extract code from malformed response
|
| 147 |
-
if "def " in content:
|
| 148 |
-
lines = content.split("\n")
|
| 149 |
-
code_lines = []
|
| 150 |
-
in_code = False
|
| 151 |
-
for line in lines:
|
| 152 |
-
if line.strip().startswith("def "):
|
| 153 |
-
in_code = True
|
| 154 |
-
if in_code:
|
| 155 |
-
code_lines.append(line)
|
| 156 |
-
if code_lines:
|
| 157 |
-
return {"fixed_code": "\n".join(code_lines), "explanation": None}
|
| 158 |
-
return {"fixed_code": buggy_code, "explanation": None}
|
| 159 |
except Exception as e:
|
| 160 |
-
print(f"# LLM
|
| 161 |
return {"fixed_code": buggy_code, "explanation": None}
|
| 162 |
|
| 163 |
-
# ──
|
| 164 |
-
def run_episode(env_url
|
| 165 |
-
|
| 166 |
-
obs
|
| 167 |
task_id = obs["task_id"]
|
| 168 |
buggy_code = obs["buggy_code"]
|
| 169 |
instructions = obs["instructions"]
|
| 170 |
|
| 171 |
-
log_start(task_id
|
| 172 |
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
rewards: List[float] = []
|
| 176 |
-
steps_taken = 0
|
| 177 |
-
success = False
|
| 178 |
|
| 179 |
for attempt in range(1, MAX_STEPS + 1):
|
| 180 |
steps_taken = attempt
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
instructions=instructions,
|
| 185 |
-
difficulty=difficulty,
|
| 186 |
-
feedback=last_feedback,
|
| 187 |
-
attempt=attempt,
|
| 188 |
-
previous_code=last_fixed_code,
|
| 189 |
-
)
|
| 190 |
-
|
| 191 |
-
fixed_code = agent_action["fixed_code"]
|
| 192 |
-
last_fixed_code = fixed_code
|
| 193 |
-
|
| 194 |
-
if not fixed_code or not fixed_code.strip():
|
| 195 |
-
log_step(step=attempt, action="empty_submission",
|
| 196 |
-
reward=0.0, done=False, error="empty_code")
|
| 197 |
rewards.append(0.0)
|
| 198 |
continue
|
| 199 |
|
| 200 |
try:
|
| 201 |
-
result = env_step(env_url,
|
| 202 |
-
explanation=agent_action.get("explanation"))
|
| 203 |
except Exception as e:
|
| 204 |
-
log_step(
|
| 205 |
-
reward=0.0, done=False, error=str(e)[:60])
|
| 206 |
rewards.append(0.0)
|
| 207 |
continue
|
| 208 |
|
| 209 |
reward = result.get("reward", 0.0)
|
| 210 |
done = result.get("done", False)
|
| 211 |
-
|
| 212 |
-
last_feedback = obs_r.get("feedback", "")
|
| 213 |
|
| 214 |
-
log_step(
|
| 215 |
-
reward=reward, done=done, error=None)
|
| 216 |
rewards.append(reward)
|
| 217 |
|
| 218 |
if reward >= 1.0:
|
| 219 |
success = True
|
| 220 |
-
|
| 221 |
if done:
|
| 222 |
break
|
| 223 |
|
| 224 |
-
log_end(success
|
| 225 |
return success, steps_taken, rewards
|
| 226 |
|
|
|
|
| 227 |
def main():
|
| 228 |
-
parser = argparse.ArgumentParser(
|
| 229 |
-
parser.add_argument("--url", default=ENV_URL
|
| 230 |
-
parser.add_argument("--difficulty", default=None,
|
| 231 |
-
choices=["easy", "medium", "hard", "all"])
|
| 232 |
args = parser.parse_args()
|
| 233 |
-
|
| 234 |
|
| 235 |
try:
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
print(f"# Environment healthy at {env_url}", flush=True)
|
| 239 |
except Exception as e:
|
| 240 |
print(f"# Health check failed: {e}", file=sys.stderr)
|
| 241 |
sys.exit(1)
|
| 242 |
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
all_rewards = []
|
| 247 |
-
all_successes = []
|
| 248 |
|
| 249 |
-
for
|
| 250 |
-
|
| 251 |
all_rewards.extend(rewards)
|
| 252 |
-
|
| 253 |
time.sleep(0.5)
|
| 254 |
|
| 255 |
-
avg = round(sum(all_rewards)
|
| 256 |
-
print(
|
| 257 |
-
f"# SUMMARY: {sum(all_successes)}/{len(difficulties)} tasks solved | avg_reward={avg}",
|
| 258 |
-
flush=True
|
| 259 |
-
)
|
| 260 |
|
| 261 |
if __name__ == "__main__":
|
| 262 |
-
main()
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
inference.py - Code Debug Environment Baseline Agent
|
| 4 |
+
|
| 5 |
+
Required env vars: API_BASE_URL, MODEL_NAME, HF_TOKEN
|
| 6 |
+
Usage:
|
| 7 |
+
python inference.py
|
| 8 |
+
python inference.py --url https://Souravdanyal-code-debug-env.hf.space
|
| 9 |
+
python inference.py --difficulty easy
|
| 10 |
+
|
| 11 |
+
STDOUT FORMAT (required by evaluator):
|
| 12 |
+
[START] task=<id> env=<benchmark> model=<model>
|
| 13 |
+
[STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 14 |
+
[END] success=<true|false> steps=<n> rewards=<r1,r2,...>
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import os, sys, json, time, argparse, requests
|
| 18 |
from openai import OpenAI
|
| 19 |
from typing import List, Optional
|
| 20 |
|
| 21 |
+
# ── Config ────────────────────────────────────────────────────────────────────
|
| 22 |
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
|
| 23 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
|
| 24 |
+
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 25 |
+
ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
|
| 26 |
BENCHMARK = "code-debug-env"
|
| 27 |
MAX_STEPS = 5
|
| 28 |
|
|
|
|
| 29 |
client = OpenAI(api_key=HF_TOKEN or "dummy", base_url=API_BASE_URL)
|
| 30 |
|
| 31 |
+
# ── Logging ───────────────────────────────────────────────────────────────────
|
| 32 |
+
def log_start(task_id, env, model):
    """Print the ``[START]`` line that opens an episode in the evaluator log.

    Args:
        task_id: Identifier of the task being attempted.
        env: Benchmark / environment name.
        model: Name of the model driving the agent.
    """
    header = f"[START] task={task_id} env={env} model={model}"
    # flush so the evaluator sees the line immediately, even through a pipe
    print(header, flush=True)
|
| 34 |
|
| 35 |
+
def log_step(step, action, reward, done, error):
    """Print one ``[STEP]`` line describing a single attempt.

    Args:
        step: 1-based attempt number.
        action: Short label for what was submitted.
        reward: Numeric reward, rendered with two decimals.
        done: Whether the episode finished; rendered as ``true``/``false``.
        error: Error text, or a falsy value which is rendered as ``null``.
    """
    done_text = str(done).lower()
    error_text = error or 'null'
    print(
        f"[STEP] step={step} action={action} reward={reward:.2f} "
        f"done={done_text} error={error_text}",
        flush=True,
    )
|
|
|
|
|
|
|
| 37 |
|
| 38 |
+
def log_end(success, steps, rewards):
    """Print the ``[END]`` line that closes an episode.

    Args:
        success: Whether the task was solved; rendered as ``true``/``false``.
        steps: Number of attempts taken.
        rewards: Iterable of per-step rewards, rendered as a comma-separated
            list with two decimals each.
    """
    success_text = str(success).lower()
    rewards_text = ','.join(f'{r:.2f}' for r in rewards)
    print(f"[END] success={success_text} steps={steps} rewards={rewards_text}", flush=True)
|
|
|
|
| 40 |
|
| 41 |
+
# ── Env client ────────────────────────────────────────────────────────────────
|
| 42 |
+
def env_reset(url, difficulty):
    """POST to ``{url}/reset`` and return the decoded JSON reply.

    Args:
        url: Base URL of the environment server (no trailing slash).
        difficulty: Difficulty value forwarded in the request body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    endpoint = f"{url}/reset"
    response = requests.post(endpoint, json={"difficulty": difficulty}, timeout=30)
    response.raise_for_status()
    return response.json()
|
| 46 |
|
| 47 |
+
def env_step(url, fixed_code, explanation=None):
    """POST a submission to ``{url}/step`` and return the decoded JSON reply.

    Args:
        url: Base URL of the environment server (no trailing slash).
        fixed_code: Source code being submitted.
        explanation: Optional explanation; only sent when truthy.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    body = {"fixed_code": fixed_code}
    if explanation:
        body.update(explanation=explanation)
    response = requests.post(f"{url}/step", json=body, timeout=30)
    response.raise_for_status()
    return response.json()
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
+
# ── LLM ──────────────────────────────────────────────────────────────────────
|
| 56 |
+
SYSTEM_PROMPT = """You are an expert Python debugging agent. Fix bugs in Python functions.
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
+
RESPONSE FORMAT — strictly JSON only, no markdown:
|
| 59 |
{
|
| 60 |
+
"fixed_code": "<complete corrected Python function including imports>",
|
| 61 |
+
"explanation": "<for hard tasks: explain the bug, root cause, and fix>"
|
| 62 |
}
|
| 63 |
|
| 64 |
+
RULES:
|
| 65 |
+
- Return COMPLETE function with all imports (e.g. from collections import deque)
|
| 66 |
+
- fixed_code must be valid Python
|
| 67 |
+
- For hard tasks explanation MUST mention the algorithmic concept
|
| 68 |
+
|
| 69 |
+
COMMON BUGS:
|
| 70 |
+
- Graph/BFS: missing visited set → infinite loop on cycles → add visited=set()
|
| 71 |
+
- Knapsack DP: wrong loop order (forward=unbounded, backward=0/1 knapsack)
|
| 72 |
+
- Binary search: wrong boundary → return high not low, or high=n//2 not n
|
| 73 |
+
- Off-by-one: lst[2] should be lst[1] for second element
|
| 74 |
+
- Wrong operator: + instead of -, * instead of /
|
| 75 |
+
|
| 76 |
+
IF PREVIOUS ATTEMPT FAILED:
|
| 77 |
+
- Read the Input/Expected/Got carefully
|
| 78 |
+
- Try a completely different fix
|
| 79 |
+
- For TimeoutError: you have an infinite loop, add a visited set
|
| 80 |
"""
|
| 81 |
|
| 82 |
+
def _extract_json_object(raw):
    """Return the outermost ``{...}`` span of *raw*, or *raw* unchanged if there is none."""
    start = raw.find("{")
    end = raw.rfind("}") + 1
    if start >= 0 and end > start:
        return raw[start:end]
    return raw


def call_llm(buggy_code, instructions, difficulty, feedback=None, attempt=1, prev_code=None):
    """Ask the model for a fix and return it as a normalised action dict.

    Args:
        buggy_code: Source code the model should repair.
        instructions: Task instructions shown to the model.
        difficulty: Difficulty label; ``"hard"`` adds a reminder to explain.
        feedback: Feedback from the previous attempt, if any. Only included
            in the prompt when ``attempt > 1``.
        attempt: 1-based attempt number; later attempts use a higher
            sampling temperature.
        prev_code: Code submitted on the previous attempt, echoed back to
            the model together with ``feedback``.

    Returns:
        ``{"fixed_code": str, "explanation": str | None}``. On any failure
        (API error, unparsable reply) the original ``buggy_code`` is returned
        with ``explanation`` set to ``None`` so the episode can continue.
    """
    content = (
        f"Difficulty: {difficulty}\nInstructions: {instructions}\n\n"
        f"Buggy code:\n```python\n{buggy_code}\n```\n"
    )

    if feedback and attempt > 1:
        content += (
            f"\nPREVIOUS FIX FAILED. Feedback:\n{feedback}\n\n"
            f"Your previous code:\n```python\n{prev_code or ''}\n```\n"
            "Try a different approach.\n"
        )

    if difficulty == "hard":
        content += "\nIMPORTANT: Include a detailed explanation field mentioning the algorithmic concept.\n"

    try:
        resp = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": content},
            ],
            max_tokens=1500,
            temperature=0.1 if attempt == 1 else 0.4,
        )
        # content may be None (e.g. refusals); treat that as an empty reply
        raw = (resp.choices[0].message.content or "").strip()

        # Slice from the first "{" to the last "}" instead of splitting on
        # markdown fences: surrounding ```json ... ``` wrappers fall outside
        # the braces, while backticks *inside* fixed_code are left intact.
        parsed = json.loads(_extract_json_object(raw))
        if not isinstance(parsed, dict):
            raise ValueError("LLM reply is not a JSON object")

        fixed_code = parsed.get("fixed_code", "")
        explanation = parsed.get("explanation")
        return {
            # non-string values would crash the caller's .strip(); report them as empty
            "fixed_code": fixed_code if isinstance(fixed_code, str) else "",
            "explanation": explanation if isinstance(explanation, str) else None,
        }
    except Exception as e:  # top-level boundary: never let one bad reply abort the episode
        print(f"# LLM error: {e}", file=sys.stderr)
        return {"fixed_code": buggy_code, "explanation": None}
|
| 114 |
|
| 115 |
+
# ── Episode ───────────────────────────────────────────────────────────────────
|
| 116 |
+
def run_episode(env_url, difficulty):
    """Play one episode against the environment and log it for the evaluator.

    Resets the environment, then makes up to ``MAX_STEPS`` attempts. Each
    attempt asks the model for a fix and submits it; the feedback and code of
    the previous attempt are passed to the next model call.

    Args:
        env_url: Base URL of the environment server.
        difficulty: Difficulty requested on reset.

    Returns:
        ``(success, steps_taken, rewards)`` where ``success`` is True once any
        step reached a reward of at least 1.0.
    """
    observation = env_reset(env_url, difficulty)["observation"]
    task_id = observation["task_id"]
    buggy_code = observation["buggy_code"]
    instructions = observation["instructions"]

    log_start(task_id, BENCHMARK, MODEL_NAME)

    rewards = []
    steps_taken = 0
    success = False
    feedback = None
    previous_code = None

    attempt = 0
    while attempt < MAX_STEPS:
        attempt += 1
        steps_taken = attempt

        action = call_llm(buggy_code, instructions, difficulty, feedback, attempt, previous_code)
        candidate = action["fixed_code"]
        previous_code = candidate

        # nothing usable came back: record a zero-reward step and retry
        if not (candidate and candidate.strip()):
            log_step(attempt, "empty_submission", 0.0, False, "empty_code")
            rewards.append(0.0)
            continue

        try:
            result = env_step(env_url, candidate, action.get("explanation"))
        except Exception as exc:
            # a failed request costs this attempt but not the whole episode
            log_step(attempt, "step_failed", 0.0, False, str(exc)[:60])
            rewards.append(0.0)
            continue

        reward = result.get("reward", 0.0)
        done = result.get("done", False)
        feedback = result.get("observation", {}).get("feedback", "")

        log_step(attempt, f"fix_{difficulty}_attempt{attempt}", reward, done, None)
        rewards.append(reward)

        if reward >= 1.0:
            success = True
        if done:
            break

    log_end(success, steps_taken, rewards)
    return success, steps_taken, rewards
|
| 160 |
|
| 161 |
+
# ── Main ──────────────────────────────────────────────────────────────────────
|
| 162 |
def main():
    """Command-line entry point: health-check the server, then run the episodes.

    ``--url`` selects the environment server (default ``ENV_URL``) and
    ``--difficulty`` selects one level; omitting it or passing ``all`` runs
    easy, medium and hard in that order. Exits with status 1 when the health
    check fails.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", default=ENV_URL)
    parser.add_argument("--difficulty", default=None, choices=["easy","medium","hard","all"])
    args = parser.parse_args()
    url = args.url.rstrip("/")

    try:
        health = requests.get(f"{url}/health", timeout=10)
        health.raise_for_status()
        print(f"# Environment healthy at {url}", flush=True)
    except Exception as e:
        print(f"# Health check failed: {e}", file=sys.stderr)
        sys.exit(1)

    if args.difficulty in (None,"all"):
        diffs = ["easy","medium","hard"]
    else:
        diffs = [args.difficulty]

    all_rewards = []
    successes = []
    for level in diffs:
        solved, _steps, episode_rewards = run_episode(url, level)
        all_rewards.extend(episode_rewards)
        successes.append(solved)
        time.sleep(0.5)  # brief pause between episodes

    if all_rewards:
        avg = round(sum(all_rewards)/len(all_rewards), 3)
    else:
        avg = 0.0
    print(f"# SUMMARY: {sum(successes)}/{len(diffs)} tasks solved | avg_reward={avg}", flush=True)
|
|
|
|
|
|
|
|
|
|
| 187 |
|
| 188 |
if __name__ == "__main__":
|
| 189 |
+
main()
|
server/__pycache__/__init__.cpython-39.pyc
DELETED
|
Binary file (161 Bytes)
|
|
|
server/__pycache__/app.cpython-310.pyc
CHANGED
|
Binary files a/server/__pycache__/app.cpython-310.pyc and b/server/__pycache__/app.cpython-310.pyc differ
|
|
|
server/__pycache__/app.cpython-39.pyc
DELETED
|
Binary file (4.22 kB)
|
|
|
server/__pycache__/environment.cpython-310.pyc
CHANGED
|
Binary files a/server/__pycache__/environment.cpython-310.pyc and b/server/__pycache__/environment.cpython-310.pyc differ
|
|
|
server/app.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
-
# server/app.py
|
| 2 |
-
# FastAPI server exposing the OpenEnv standard endpoints.
|
| 3 |
# Port 7860 required for Hugging Face Spaces.
|
| 4 |
|
| 5 |
from fastapi import FastAPI, HTTPException
|
|
@@ -14,108 +13,74 @@ from models import DebugAction, DebugObservation, DebugState
|
|
| 14 |
|
| 15 |
app = FastAPI(
|
| 16 |
title="Code Debug Environment",
|
| 17 |
-
description=
|
| 18 |
-
"An OpenEnv environment where LLM agents fix buggy Python code. "
|
| 19 |
-
"3 difficulty levels: easy (1 bug), medium (2 bugs), hard (algorithmic + explanation)."
|
| 20 |
-
),
|
| 21 |
version="1.0.0",
|
| 22 |
)
|
| 23 |
|
| 24 |
-
app.add_middleware(
|
| 25 |
-
CORSMiddleware,
|
| 26 |
-
allow_origins=["*"],
|
| 27 |
-
allow_methods=["*"],
|
| 28 |
-
allow_headers=["*"],
|
| 29 |
-
)
|
| 30 |
|
| 31 |
-
# One global environment instance (single session)
|
| 32 |
-
# For concurrent sessions, instantiate per-request with a session dict
|
| 33 |
env = CodeDebugEnvironment()
|
| 34 |
|
| 35 |
|
| 36 |
-
# ─── Request Models ─────────────────────────────────────────────────────────
|
| 37 |
-
|
| 38 |
class ResetRequest(BaseModel):
|
| 39 |
-
difficulty: Optional[str] = None
|
| 40 |
-
|
| 41 |
|
| 42 |
class StepRequest(BaseModel):
|
| 43 |
fixed_code: str
|
| 44 |
explanation: Optional[str] = None
|
| 45 |
|
| 46 |
-
|
| 47 |
-
# ─── Response wrapper matching OpenEnv StepResult shape ──────────────────────
|
| 48 |
-
|
| 49 |
class StepResponse(BaseModel):
|
| 50 |
observation: dict
|
| 51 |
reward: float
|
| 52 |
done: bool
|
| 53 |
|
| 54 |
|
| 55 |
-
# ─── Endpoints ───────────────────────────────────────────────────────────────
|
| 56 |
-
|
| 57 |
@app.get("/", response_class=HTMLResponse)
|
| 58 |
async def root():
|
| 59 |
-
"""Homepage
|
| 60 |
html_path = os.path.join(os.path.dirname(__file__), "static", "index.html")
|
| 61 |
-
with open(html_path, "r") as f:
|
| 62 |
return f.read()
|
| 63 |
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
@app.get("/health")
|
| 66 |
async def health():
|
| 67 |
-
"""Health check
|
| 68 |
return {"status": "ok", "environment": "code-debug-env", "version": "1.0.0"}
|
| 69 |
|
| 70 |
|
| 71 |
@app.post("/reset")
|
| 72 |
async def reset(request: ResetRequest = ResetRequest()) -> dict:
|
| 73 |
-
"""
|
| 74 |
-
Reset the environment to start a new episode.
|
| 75 |
-
Optionally pass difficulty: 'easy' | 'medium' | 'hard'
|
| 76 |
-
"""
|
| 77 |
try:
|
| 78 |
-
|
| 79 |
-
return {
|
| 80 |
-
"observation": observation.model_dump(),
|
| 81 |
-
"reward": 0.0,
|
| 82 |
-
"done": False,
|
| 83 |
-
}
|
| 84 |
except Exception as e:
|
| 85 |
raise HTTPException(status_code=500, detail=f"Reset failed: {str(e)}")
|
| 86 |
|
| 87 |
|
| 88 |
@app.post("/step")
|
| 89 |
async def step(request: StepRequest) -> StepResponse:
|
| 90 |
-
"""
|
| 91 |
-
Submit a code fix (and optional explanation for hard tasks).
|
| 92 |
-
Returns observation with reward (0.0-1.0), feedback, and done flag.
|
| 93 |
-
"""
|
| 94 |
if not request.fixed_code or not request.fixed_code.strip():
|
| 95 |
raise HTTPException(status_code=400, detail="fixed_code must not be empty.")
|
| 96 |
-
|
| 97 |
try:
|
| 98 |
-
action = DebugAction(
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
)
|
| 102 |
-
observation = env.step(action)
|
| 103 |
-
return StepResponse(
|
| 104 |
-
observation=observation.model_dump(),
|
| 105 |
-
reward=observation.reward or 0.0,
|
| 106 |
-
done=observation.done,
|
| 107 |
-
)
|
| 108 |
except TimeoutError:
|
| 109 |
-
# Code execution timed out — return 0 reward instead of 500
|
| 110 |
return StepResponse(
|
| 111 |
-
observation={"task_id": "unknown", "difficulty": "unknown",
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
"done": False},
|
| 117 |
-
reward=0.0,
|
| 118 |
-
done=False,
|
| 119 |
)
|
| 120 |
except Exception as e:
|
| 121 |
raise HTTPException(status_code=500, detail=f"Step failed: {str(e)}")
|
|
@@ -123,30 +88,27 @@ async def step(request: StepRequest) -> StepResponse:
|
|
| 123 |
|
| 124 |
@app.get("/state")
|
| 125 |
async def state() -> dict:
|
| 126 |
-
"""Return
|
| 127 |
try:
|
| 128 |
-
|
| 129 |
-
return s.model_dump()
|
| 130 |
except Exception as e:
|
| 131 |
raise HTTPException(status_code=500, detail=f"State failed: {str(e)}")
|
| 132 |
|
| 133 |
|
| 134 |
@app.get("/tasks")
|
| 135 |
async def list_tasks() -> dict:
|
| 136 |
-
"""List
|
| 137 |
from server.tasks.task_easy import EASY_TASKS
|
| 138 |
from server.tasks.task_medium import MEDIUM_TASKS
|
| 139 |
from server.tasks.task_hard import HARD_TASKS
|
| 140 |
return {
|
| 141 |
-
"easy":
|
| 142 |
"medium": [t["task_id"] for t in MEDIUM_TASKS],
|
| 143 |
-
"hard":
|
| 144 |
-
"total":
|
| 145 |
}
|
| 146 |
|
| 147 |
-
|
| 148 |
if __name__ == "__main__":
|
| 149 |
-
import sys
|
| 150 |
import uvicorn
|
| 151 |
-
|
| 152 |
-
uvicorn.run("server.app:app", host="127.0.0.1", port=7860, reload=True)
|
|
|
|
| 1 |
+
# server/app.py — FastAPI server for Code Debug Environment
|
|
|
|
| 2 |
# Port 7860 required for Hugging Face Spaces.
|
| 3 |
|
| 4 |
from fastapi import FastAPI, HTTPException
|
|
|
|
| 13 |
|
| 14 |
app = FastAPI(
|
| 15 |
title="Code Debug Environment",
|
| 16 |
+
description="OpenEnv RL environment where LLM agents fix buggy Python code. 3 difficulty levels: easy, medium, hard.",
|
|
|
|
|
|
|
|
|
|
| 17 |
version="1.0.0",
|
| 18 |
)
|
| 19 |
|
| 20 |
+
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
|
|
|
|
|
|
| 22 |
env = CodeDebugEnvironment()
|
| 23 |
|
| 24 |
|
|
|
|
|
|
|
| 25 |
# Request body accepted by POST /reset.
class ResetRequest(BaseModel):
    # Requested difficulty; None when the client does not send one.
    difficulty: Optional[str] = None
|
|
|
|
| 27 |
|
| 28 |
# Request body accepted by POST /step.
class StepRequest(BaseModel):
    # Submitted source code (required).
    fixed_code: str
    # Optional free-text explanation accompanying the submission.
    explanation: Optional[str] = None
|
| 31 |
|
|
|
|
|
|
|
|
|
|
| 32 |
# Response body returned by POST /step.
class StepResponse(BaseModel):
    # Observation serialised as a plain dict.
    observation: dict
    # Reward for the submitted step.
    reward: float
    # Whether the episode has finished.
    done: bool
|
| 36 |
|
| 37 |
|
|
|
|
|
|
|
| 38 |
@app.get("/", response_class=HTMLResponse)
async def root():
    """Homepage with live interactive tester."""
    # index.html lives in the "static" directory next to this module
    static_dir = os.path.join(os.path.dirname(__file__), "static")
    index_file = os.path.join(static_dir, "index.html")
    with open(index_file, "r", encoding="utf-8") as handle:
        page = handle.read()
    return page
|
| 44 |
|
| 45 |
|
| 46 |
+
@app.get("/favicon.ico", include_in_schema=False)
async def favicon():
    # Serve an empty icon so browser favicon requests get a valid response.
    from fastapi.responses import Response

    empty_icon = Response(content=b"", media_type="image/x-icon")
    return empty_icon
|
| 50 |
+
|
| 51 |
+
|
| 52 |
@app.get("/health")
async def health():
    """Health check — must return 200 for submission validation."""
    # static payload: liveness marker plus environment name and version
    payload = {"status": "ok", "environment": "code-debug-env", "version": "1.0.0"}
    return payload
|
| 56 |
|
| 57 |
|
| 58 |
@app.post("/reset")
async def reset(request: ResetRequest = ResetRequest()) -> dict:
    """Reset environment to start a new episode. Pass difficulty: easy | medium | hard"""
    try:
        first_observation = env.reset(difficulty=request.difficulty)
        # a fresh episode always starts with zero reward and is not done
        return {
            "observation": first_observation.model_dump(),
            "reward": 0.0,
            "done": False,
        }
    except Exception as e:
        # any failure inside the environment is reported as a 500
        message = f"Reset failed: {str(e)}"
        raise HTTPException(status_code=500, detail=message)
|
| 66 |
|
| 67 |
|
| 68 |
@app.post("/step")
async def step(request: StepRequest) -> StepResponse:
    """Submit fixed code. Returns reward (0.0-1.0), feedback, done flag."""
    submitted = request.fixed_code
    # reject missing or whitespace-only submissions before touching the environment
    if not (submitted and submitted.strip()):
        raise HTTPException(status_code=400, detail="fixed_code must not be empty.")
    try:
        action = DebugAction(fixed_code=request.fixed_code, explanation=request.explanation)
        result = env.step(action)
        return StepResponse(
            observation=result.model_dump(),
            reward=result.reward or 0.0,
            done=result.done,
        )
    except TimeoutError:
        # a timeout is answered with a zero-reward placeholder observation
        # rather than an HTTP error
        timeout_observation = {
            "task_id": "unknown",
            "difficulty": "unknown",
            "buggy_code": "",
            "instructions": "",
            "test_cases_description": "",
            "reward": 0.0,
            "passed_tests": 0,
            "total_tests": 3,
            "done": False,
            "feedback": "TimeoutError: Infinite loop detected. Add a visited set for graph traversal.",
        }
        return StepResponse(observation=timeout_observation, reward=0.0, done=False)
    except Exception as e:
        message = f"Step failed: {str(e)}"
        raise HTTPException(status_code=500, detail=message)
|
|
|
|
| 88 |
|
| 89 |
@app.get("/state")
async def state() -> dict:
    """Return current episode state."""
    try:
        current = env.state
        return current.model_dump()
    except Exception as e:
        # any failure while reading or serialising the state becomes a 500
        message = f"State failed: {str(e)}"
        raise HTTPException(status_code=500, detail=message)
|
| 96 |
|
| 97 |
|
| 98 |
@app.get("/tasks")
async def list_tasks() -> dict:
    """List all 45 task IDs across difficulty levels."""
    from server.tasks.task_easy import EASY_TASKS
    from server.tasks.task_medium import MEDIUM_TASKS
    from server.tasks.task_hard import HARD_TASKS

    # one list of task ids per level, in easy → medium → hard order
    levels = (("easy", EASY_TASKS), ("medium", MEDIUM_TASKS), ("hard", HARD_TASKS))
    listing = {
        level: [task["task_id"] for task in tasks]
        for level, tasks in levels
    }
    listing["total"] = len(EASY_TASKS) + len(MEDIUM_TASKS) + len(HARD_TASKS)
    return listing
|
| 110 |
|
| 111 |
+
|
| 112 |
if __name__ == "__main__":
|
|
|
|
| 113 |
import uvicorn
|
| 114 |
+
uvicorn.run("server.app:app", host="127.0.0.1", port=7860, reload=True)
|
|
|
server/graders/__pycache__/grader_easy.cpython-310.pyc
CHANGED
|
Binary files a/server/graders/__pycache__/grader_easy.cpython-310.pyc and b/server/graders/__pycache__/grader_easy.cpython-310.pyc differ
|
|
|
server/graders/__pycache__/grader_hard.cpython-310.pyc
CHANGED
|
Binary files a/server/graders/__pycache__/grader_hard.cpython-310.pyc and b/server/graders/__pycache__/grader_hard.cpython-310.pyc differ
|
|
|
server/graders/__pycache__/grader_medium.cpython-310.pyc
CHANGED
|
Binary files a/server/graders/__pycache__/grader_medium.cpython-310.pyc and b/server/graders/__pycache__/grader_medium.cpython-310.pyc differ
|
|
|
server/graders/grader_easy.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# server/graders/grader_easy.py
|
| 2 |
-
# Grades easy tasks:
|
| 3 |
-
# Reward is proportional to tests passed (0.33, 0.
|
| 4 |
|
| 5 |
import traceback
|
| 6 |
import signal
|
|
@@ -8,37 +8,32 @@ from typing import Tuple, List
|
|
| 8 |
|
| 9 |
|
| 10 |
def _timeout_handler(signum, frame):
|
| 11 |
-
raise TimeoutError("Code
|
| 12 |
|
| 13 |
|
| 14 |
def _run_code_safely(code: str, func_name: str, test_input):
|
| 15 |
-
"""
|
| 16 |
-
Executes the submitted code in an isolated namespace and calls the function.
|
| 17 |
-
Returns (output, error_message).
|
| 18 |
-
Times out after 5 seconds to prevent infinite loops.
|
| 19 |
-
"""
|
| 20 |
namespace = {}
|
| 21 |
try:
|
| 22 |
exec(compile(code, "<submitted>", "exec"), namespace)
|
| 23 |
except SyntaxError as e:
|
| 24 |
return None, f"SyntaxError: {e}"
|
| 25 |
except Exception as e:
|
| 26 |
-
return None, f"
|
| 27 |
|
| 28 |
func = namespace.get(func_name)
|
| 29 |
if func is None:
|
| 30 |
-
funcs = [v for v in namespace.values() if callable(v) and not v.__name__.startswith("_")]
|
| 31 |
if not funcs:
|
| 32 |
return None, "No callable function found in submitted code."
|
| 33 |
func = funcs[0]
|
| 34 |
|
| 35 |
try:
|
| 36 |
-
# Set 5 second timeout to catch infinite loops
|
| 37 |
try:
|
| 38 |
signal.signal(signal.SIGALRM, _timeout_handler)
|
| 39 |
signal.alarm(5)
|
| 40 |
except (AttributeError, OSError):
|
| 41 |
-
pass # Windows
|
| 42 |
|
| 43 |
if isinstance(test_input, list) and len(test_input) > 0 and isinstance(test_input[0], list):
|
| 44 |
result = func(*test_input)
|
|
@@ -51,13 +46,14 @@ def _run_code_safely(code: str, func_name: str, test_input):
|
|
| 51 |
result = func(test_input)
|
| 52 |
|
| 53 |
try:
|
| 54 |
-
signal.alarm(0)
|
| 55 |
except (AttributeError, OSError):
|
| 56 |
pass
|
| 57 |
|
| 58 |
return result, None
|
|
|
|
| 59 |
except TimeoutError as e:
|
| 60 |
-
return None,
|
| 61 |
except Exception as e:
|
| 62 |
try:
|
| 63 |
signal.alarm(0)
|
|
@@ -67,7 +63,6 @@ def _run_code_safely(code: str, func_name: str, test_input):
|
|
| 67 |
|
| 68 |
|
| 69 |
def _extract_func_name(code: str) -> str:
|
| 70 |
-
"""Extract the first function name defined in the code."""
|
| 71 |
for line in code.splitlines():
|
| 72 |
line = line.strip()
|
| 73 |
if line.startswith("def "):
|
|
@@ -77,14 +72,8 @@ def _extract_func_name(code: str) -> str:
|
|
| 77 |
|
| 78 |
def grade_easy(fixed_code: str, task: dict) -> Tuple[float, int, int, str, List[dict]]:
|
| 79 |
"""
|
| 80 |
-
Grade
|
| 81 |
-
|
| 82 |
-
Returns:
|
| 83 |
-
reward (float): 0.0 to 1.0
|
| 84 |
-
passed (int): number of tests passed
|
| 85 |
-
total (int): total test cases
|
| 86 |
-
feedback (str): detailed feedback message
|
| 87 |
-
results (list): per-test results
|
| 88 |
"""
|
| 89 |
test_cases = task["test_cases"]
|
| 90 |
total = len(test_cases)
|
|
@@ -99,21 +88,18 @@ def grade_easy(fixed_code: str, task: dict) -> Tuple[float, int, int, str, List[
|
|
| 99 |
got, error = _run_code_safely(fixed_code, func_name, inp)
|
| 100 |
|
| 101 |
if error:
|
| 102 |
-
results.append({"test_id": i
|
| 103 |
feedback_lines.append(f"Test {i+1}: ❌ Error\n Input : {inp!r}\n Expected : {expected!r}\n Error : {error}")
|
| 104 |
elif got == expected:
|
| 105 |
passed += 1
|
| 106 |
-
results.append({"test_id": i
|
| 107 |
feedback_lines.append(f"Test {i+1}: ✅ Passed\n Input : {inp!r}\n Expected : {expected!r}\n Got : {got!r}")
|
| 108 |
else:
|
| 109 |
-
results.append({"test_id": i
|
| 110 |
feedback_lines.append(f"Test {i+1}: ❌ Failed\n Input : {inp!r}\n Expected : {expected!r}\n Got : {got!r}")
|
| 111 |
|
| 112 |
reward = round(passed / total, 2)
|
| 113 |
feedback = "\n".join(feedback_lines)
|
| 114 |
-
if passed == total
|
| 115 |
-
feedback += "\n🎉 All tests passed! Full reward."
|
| 116 |
-
else:
|
| 117 |
-
feedback += f"\n{passed}/{total} tests passed. Review the failing cases."
|
| 118 |
|
| 119 |
-
return reward, passed, total, feedback, results
|
|
|
|
| 1 |
# server/graders/grader_easy.py
|
| 2 |
+
# Grades easy and medium tasks: runs code against test cases.
|
| 3 |
+
# Reward is proportional to tests passed (0.33, 0.67, 1.0).
|
| 4 |
|
| 5 |
import traceback
|
| 6 |
import signal
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
def _timeout_handler(signum, frame):
|
| 11 |
+
raise TimeoutError("Code timed out — likely infinite loop. Check for missing visited set in graph traversal.")
|
| 12 |
|
| 13 |
|
| 14 |
def _run_code_safely(code: str, func_name: str, test_input):
|
| 15 |
+
"""Run submitted code safely with 5s timeout. Returns (result, error)."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
namespace = {}
|
| 17 |
try:
|
| 18 |
exec(compile(code, "<submitted>", "exec"), namespace)
|
| 19 |
except SyntaxError as e:
|
| 20 |
return None, f"SyntaxError: {e}"
|
| 21 |
except Exception as e:
|
| 22 |
+
return None, f"CompileError: {e}"
|
| 23 |
|
| 24 |
func = namespace.get(func_name)
|
| 25 |
if func is None:
|
| 26 |
+
funcs = [v for v in namespace.values() if callable(v) and not str(v.__name__).startswith("_")]
|
| 27 |
if not funcs:
|
| 28 |
return None, "No callable function found in submitted code."
|
| 29 |
func = funcs[0]
|
| 30 |
|
| 31 |
try:
|
|
|
|
| 32 |
try:
|
| 33 |
signal.signal(signal.SIGALRM, _timeout_handler)
|
| 34 |
signal.alarm(5)
|
| 35 |
except (AttributeError, OSError):
|
| 36 |
+
pass # Windows has no SIGALRM
|
| 37 |
|
| 38 |
if isinstance(test_input, list) and len(test_input) > 0 and isinstance(test_input[0], list):
|
| 39 |
result = func(*test_input)
|
|
|
|
| 46 |
result = func(test_input)
|
| 47 |
|
| 48 |
try:
|
| 49 |
+
signal.alarm(0)
|
| 50 |
except (AttributeError, OSError):
|
| 51 |
pass
|
| 52 |
|
| 53 |
return result, None
|
| 54 |
+
|
| 55 |
except TimeoutError as e:
|
| 56 |
+
return None, str(e)
|
| 57 |
except Exception as e:
|
| 58 |
try:
|
| 59 |
signal.alarm(0)
|
|
|
|
| 63 |
|
| 64 |
|
| 65 |
def _extract_func_name(code: str) -> str:
|
|
|
|
| 66 |
for line in code.splitlines():
|
| 67 |
line = line.strip()
|
| 68 |
if line.startswith("def "):
|
|
|
|
| 72 |
|
| 73 |
def grade_easy(fixed_code: str, task: dict) -> Tuple[float, int, int, str, List[dict]]:
|
| 74 |
"""
|
| 75 |
+
Grade submission against test cases.
|
| 76 |
+
Returns: (reward, passed, total, feedback, results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
"""
|
| 78 |
test_cases = task["test_cases"]
|
| 79 |
total = len(test_cases)
|
|
|
|
| 88 |
got, error = _run_code_safely(fixed_code, func_name, inp)
|
| 89 |
|
| 90 |
if error:
|
| 91 |
+
results.append({"test_id": i+1, "passed": False, "expected": str(expected), "got": f"ERROR"})
|
| 92 |
feedback_lines.append(f"Test {i+1}: ❌ Error\n Input : {inp!r}\n Expected : {expected!r}\n Error : {error}")
|
| 93 |
elif got == expected:
|
| 94 |
passed += 1
|
| 95 |
+
results.append({"test_id": i+1, "passed": True, "expected": str(expected), "got": str(got)})
|
| 96 |
feedback_lines.append(f"Test {i+1}: ✅ Passed\n Input : {inp!r}\n Expected : {expected!r}\n Got : {got!r}")
|
| 97 |
else:
|
| 98 |
+
results.append({"test_id": i+1, "passed": False, "expected": str(expected), "got": str(got)})
|
| 99 |
feedback_lines.append(f"Test {i+1}: ❌ Failed\n Input : {inp!r}\n Expected : {expected!r}\n Got : {got!r}")
|
| 100 |
|
| 101 |
reward = round(passed / total, 2)
|
| 102 |
feedback = "\n".join(feedback_lines)
|
| 103 |
+
feedback += "\n🎉 All tests passed! Full reward." if passed == total else f"\n{passed}/{total} tests passed."
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
+
return reward, passed, total, feedback, results
|
server/graders/grader_hard.py
CHANGED
|
@@ -8,13 +8,10 @@ from .grader_easy import grade_easy
|
|
| 8 |
|
| 9 |
def _score_explanation(explanation: Optional[str], keywords: List[str]) -> Tuple[float, str]:
|
| 10 |
"""
|
| 11 |
-
|
| 12 |
-
Returns (score 0.0-1.0, feedback string).
|
| 13 |
-
|
| 14 |
-
Scoring:
|
| 15 |
- No explanation → 0.0
|
| 16 |
-
-
|
| 17 |
-
- Half or more keywords →
|
| 18 |
"""
|
| 19 |
if not explanation or len(explanation.strip()) < 10:
|
| 20 |
return 0.0, "❌ No explanation provided. Hard tasks require an explanation field."
|
|
@@ -23,31 +20,28 @@ def _score_explanation(explanation: Optional[str], keywords: List[str]) -> Tuple
|
|
| 23 |
hits = [kw for kw in keywords if kw.lower() in explanation_lower]
|
| 24 |
|
| 25 |
if not keywords:
|
| 26 |
-
# No keywords defined — give full credit for any explanation
|
| 27 |
score = 1.0 if len(explanation.strip()) > 20 else 0.5
|
| 28 |
else:
|
| 29 |
-
|
| 30 |
-
needed_for_full = max(1, len(keywords) // 2)
|
| 31 |
if len(hits) == 0:
|
| 32 |
score = 0.0
|
| 33 |
-
elif len(hits) >=
|
| 34 |
score = 1.0
|
| 35 |
else:
|
| 36 |
-
|
| 37 |
-
score = round(len(hits) / needed_for_full, 2)
|
| 38 |
|
| 39 |
if score == 1.0:
|
| 40 |
-
feedback = f"✅ Explanation excellent! Mentioned
|
| 41 |
elif score > 0:
|
| 42 |
missing = [kw for kw in keywords if kw.lower() not in explanation_lower]
|
| 43 |
feedback = (
|
| 44 |
-
f"⚠️ Partial explanation (score={score}). Mentioned: {', '.join(hits)
|
| 45 |
-
f"
|
| 46 |
)
|
| 47 |
else:
|
| 48 |
feedback = (
|
| 49 |
f"❌ Explanation missing key concepts. "
|
| 50 |
-
f"
|
| 51 |
)
|
| 52 |
|
| 53 |
return round(score, 2), feedback
|
|
@@ -56,23 +50,11 @@ def _score_explanation(explanation: Optional[str], keywords: List[str]) -> Tuple
|
|
| 56 |
def grade_hard(fixed_code: str, task: dict, explanation: Optional[str] = None) -> Tuple[float, int, int, str, List[dict]]:
|
| 57 |
"""
|
| 58 |
Grade a hard task submission.
|
| 59 |
-
Reward = 0.7
|
| 60 |
-
|
| 61 |
-
Returns:
|
| 62 |
-
reward (float): 0.0 to 1.0
|
| 63 |
-
passed (int)
|
| 64 |
-
total (int)
|
| 65 |
-
feedback (str)
|
| 66 |
-
results (list)
|
| 67 |
"""
|
| 68 |
-
# Grade code using easy grader (same test execution logic)
|
| 69 |
test_reward, passed, total, code_feedback, results = grade_easy(fixed_code, task)
|
| 70 |
-
|
| 71 |
-
# Grade explanation
|
| 72 |
keywords = task.get("explanation_keywords", [])
|
| 73 |
exp_score, exp_feedback = _score_explanation(explanation, keywords)
|
| 74 |
-
|
| 75 |
-
# Combined reward
|
| 76 |
final_reward = round(0.7 * test_reward + 0.3 * exp_score, 2)
|
| 77 |
|
| 78 |
feedback = (
|
|
@@ -83,71 +65,9 @@ def grade_hard(fixed_code: str, task: dict, explanation: Optional[str] = None) -
|
|
| 83 |
f"=== Final Reward: {final_reward:.2f} ==="
|
| 84 |
)
|
| 85 |
|
| 86 |
-
if passed < total and not explanation:
|
| 87 |
-
feedback += "\n💡 Tip: Fix the code bugs AND provide a clear explanation for max reward."
|
| 88 |
-
|
| 89 |
if passed == total and exp_score < 1.0:
|
| 90 |
-
feedback += f"\n💡
|
|
|
|
|
|
|
| 91 |
|
| 92 |
return final_reward, passed, total, feedback, results
|
| 93 |
-
"""
|
| 94 |
-
Scores the explanation by checking for required conceptual keywords.
|
| 95 |
-
Returns (score 0.0-1.0, feedback string).
|
| 96 |
-
"""
|
| 97 |
-
if not explanation or len(explanation.strip()) < 10:
|
| 98 |
-
return 0.0, "❌ No explanation provided. Hard tasks require an explanation field."
|
| 99 |
-
|
| 100 |
-
explanation_lower = explanation.lower()
|
| 101 |
-
hits = [kw for kw in keywords if kw.lower() in explanation_lower]
|
| 102 |
-
score = min(1.0, len(hits) / max(1, len(keywords) // 2)) # need at least half the keywords
|
| 103 |
-
|
| 104 |
-
if score == 1.0:
|
| 105 |
-
feedback = f"✅ Explanation excellent! Mentioned key concepts: {', '.join(hits)}"
|
| 106 |
-
elif score > 0:
|
| 107 |
-
feedback = (
|
| 108 |
-
f"⚠️ Partial explanation. Mentioned: {', '.join(hits) if hits else 'none'}. "
|
| 109 |
-
f"Consider discussing: {', '.join(kw for kw in keywords if kw.lower() not in explanation_lower)[:3]}"
|
| 110 |
-
)
|
| 111 |
-
else:
|
| 112 |
-
feedback = (
|
| 113 |
-
f"❌ Explanation missing key concepts. "
|
| 114 |
-
f"Try to explain: {', '.join(keywords[:3])} in your analysis."
|
| 115 |
-
)
|
| 116 |
-
|
| 117 |
-
return round(score, 2), feedback
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
def grade_hard(fixed_code: str, task: dict, explanation: Optional[str] = None) -> Tuple[float, int, int, str, List[dict]]:
|
| 121 |
-
"""
|
| 122 |
-
Grade a hard task submission.
|
| 123 |
-
Reward = 0.7 * test_score + 0.3 * explanation_score
|
| 124 |
-
|
| 125 |
-
Returns:
|
| 126 |
-
reward (float): 0.0 to 1.0
|
| 127 |
-
passed (int)
|
| 128 |
-
total (int)
|
| 129 |
-
feedback (str)
|
| 130 |
-
results (list)
|
| 131 |
-
"""
|
| 132 |
-
# Grade code
|
| 133 |
-
test_reward, passed, total, code_feedback, results = grade_easy(fixed_code, task)
|
| 134 |
-
|
| 135 |
-
# Grade explanation
|
| 136 |
-
keywords = task.get("explanation_keywords", [])
|
| 137 |
-
exp_score, exp_feedback = _score_explanation(explanation, keywords)
|
| 138 |
-
|
| 139 |
-
# Combined reward
|
| 140 |
-
final_reward = round(0.7 * test_reward + 0.3 * exp_score, 2)
|
| 141 |
-
|
| 142 |
-
feedback = (
|
| 143 |
-
f"--- Code Score (70% weight): {test_reward:.2f} ---\n"
|
| 144 |
-
f"{code_feedback}\n\n"
|
| 145 |
-
f"--- Explanation Score (30% weight): {exp_score:.2f} ---\n"
|
| 146 |
-
f"{exp_feedback}\n\n"
|
| 147 |
-
f"=== Final Reward: {final_reward:.2f} ==="
|
| 148 |
-
)
|
| 149 |
-
|
| 150 |
-
if passed < total and not explanation:
|
| 151 |
-
feedback += "\n💡 Tip: Fix the code bugs AND provide a clear explanation for max reward."
|
| 152 |
-
|
| 153 |
-
return final_reward, passed, total, feedback, results
|
|
|
|
| 8 |
|
| 9 |
def _score_explanation(explanation: Optional[str], keywords: List[str]) -> Tuple[float, str]:
|
| 10 |
"""
|
| 11 |
+
Score explanation by checking for required conceptual keywords.
|
|
|
|
|
|
|
|
|
|
| 12 |
- No explanation → 0.0
|
| 13 |
+
- 1+ keyword hit → partial credit proportional to hits
|
| 14 |
+
- Half or more keywords → 1.0
|
| 15 |
"""
|
| 16 |
if not explanation or len(explanation.strip()) < 10:
|
| 17 |
return 0.0, "❌ No explanation provided. Hard tasks require an explanation field."
|
|
|
|
| 20 |
hits = [kw for kw in keywords if kw.lower() in explanation_lower]
|
| 21 |
|
| 22 |
if not keywords:
|
|
|
|
| 23 |
score = 1.0 if len(explanation.strip()) > 20 else 0.5
|
| 24 |
else:
|
| 25 |
+
needed = max(1, len(keywords) // 2)
|
|
|
|
| 26 |
if len(hits) == 0:
|
| 27 |
score = 0.0
|
| 28 |
+
elif len(hits) >= needed:
|
| 29 |
score = 1.0
|
| 30 |
else:
|
| 31 |
+
score = round(len(hits) / needed, 2)
|
|
|
|
| 32 |
|
| 33 |
if score == 1.0:
|
| 34 |
+
feedback = f"✅ Explanation excellent! Mentioned: {', '.join(hits)}"
|
| 35 |
elif score > 0:
|
| 36 |
missing = [kw for kw in keywords if kw.lower() not in explanation_lower]
|
| 37 |
feedback = (
|
| 38 |
+
f"⚠️ Partial explanation (score={score}). Mentioned: {', '.join(hits) or 'none'}. "
|
| 39 |
+
f"Also discuss: {', '.join(missing[:3])}"
|
| 40 |
)
|
| 41 |
else:
|
| 42 |
feedback = (
|
| 43 |
f"❌ Explanation missing key concepts. "
|
| 44 |
+
f"Explain: {', '.join(keywords[:3])}"
|
| 45 |
)
|
| 46 |
|
| 47 |
return round(score, 2), feedback
|
|
|
|
| 50 |
def grade_hard(fixed_code: str, task: dict, explanation: Optional[str] = None) -> Tuple[float, int, int, str, List[dict]]:
|
| 51 |
"""
|
| 52 |
Grade a hard task submission.
|
| 53 |
+
Reward = 0.7 × test_score + 0.3 × explanation_score
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
"""
|
|
|
|
| 55 |
test_reward, passed, total, code_feedback, results = grade_easy(fixed_code, task)
|
|
|
|
|
|
|
| 56 |
keywords = task.get("explanation_keywords", [])
|
| 57 |
exp_score, exp_feedback = _score_explanation(explanation, keywords)
|
|
|
|
|
|
|
| 58 |
final_reward = round(0.7 * test_reward + 0.3 * exp_score, 2)
|
| 59 |
|
| 60 |
feedback = (
|
|
|
|
| 65 |
f"=== Final Reward: {final_reward:.2f} ==="
|
| 66 |
)
|
| 67 |
|
|
|
|
|
|
|
|
|
|
| 68 |
if passed == total and exp_score < 1.0:
|
| 69 |
+
feedback += f"\n💡 Code is correct! Improve explanation by mentioning: {', '.join(keywords[:3])}"
|
| 70 |
+
elif passed < total and not explanation:
|
| 71 |
+
feedback += "\n💡 Fix the code AND provide a clear explanation for max reward."
|
| 72 |
|
| 73 |
return final_reward, passed, total, feedback, results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/tasks/__pycache__/__init__.cpython-39.pyc
DELETED
|
Binary file (449 Bytes)
|
|
|
server/tasks/__pycache__/task_easy.cpython-310.pyc
CHANGED
|
Binary files a/server/tasks/__pycache__/task_easy.cpython-310.pyc and b/server/tasks/__pycache__/task_easy.cpython-310.pyc differ
|
|
|
server/tasks/__pycache__/task_easy.cpython-39.pyc
DELETED
|
Binary file (7.37 kB)
|
|
|
server/tasks/__pycache__/task_hard.cpython-310.pyc
CHANGED
|
Binary files a/server/tasks/__pycache__/task_hard.cpython-310.pyc and b/server/tasks/__pycache__/task_hard.cpython-310.pyc differ
|
|
|
server/tasks/__pycache__/task_hard.cpython-39.pyc
DELETED
|
Binary file (16.5 kB)
|
|
|
server/tasks/__pycache__/task_medium.cpython-310.pyc
CHANGED
|
Binary files a/server/tasks/__pycache__/task_medium.cpython-310.pyc and b/server/tasks/__pycache__/task_medium.cpython-310.pyc differ
|
|
|
server/tasks/__pycache__/task_medium.cpython-39.pyc
DELETED
|
Binary file (10.5 kB)
|
|
|
server/tasks/task_hard.py
CHANGED
|
@@ -625,4 +625,4 @@ def get_task_by_id(task_id: str) -> dict:
|
|
| 625 |
for t in HARD_TASKS:
|
| 626 |
if t["task_id"] == task_id:
|
| 627 |
return t.copy()
|
| 628 |
-
return random.choice(HARD_TASKS).copy()
|
|
|
|
| 625 |
for t in HARD_TASKS:
|
| 626 |
if t["task_id"] == task_id:
|
| 627 |
return t.copy()
|
| 628 |
+
return random.choice(HARD_TASKS).copy()
|
server/tasks/task_medium.py
CHANGED
|
@@ -504,4 +504,4 @@ def get_task_by_id(task_id: str) -> dict:
|
|
| 504 |
for t in MEDIUM_TASKS:
|
| 505 |
if t["task_id"] == task_id:
|
| 506 |
return t.copy()
|
| 507 |
-
return random.choice(MEDIUM_TASKS).copy()
|
|
|
|
| 504 |
for t in MEDIUM_TASKS:
|
| 505 |
if t["task_id"] == task_id:
|
| 506 |
return t.copy()
|
| 507 |
+
return random.choice(MEDIUM_TASKS).copy()
|
spec.md
CHANGED
|
@@ -1,23 +1,24 @@
|
|
| 1 |
-
# Code Debug Environment — Specification
|
| 2 |
|
| 3 |
## Overview
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
-
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
## API Specification
|
| 10 |
-
|
| 11 |
-
### POST /reset
|
| 12 |
-
Start a new episode.
|
| 13 |
-
|
| 14 |
-
**Request:**
|
| 15 |
```json
|
| 16 |
-
{"
|
| 17 |
```
|
| 18 |
|
| 19 |
-
|
| 20 |
```json
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
{
|
| 22 |
"observation": {
|
| 23 |
"task_id": "easy_003",
|
|
@@ -25,112 +26,53 @@ Start a new episode.
|
|
| 25 |
"buggy_code": "def find_max(nums):\n return min(nums)",
|
| 26 |
"instructions": "The function has exactly one bug. Fix it.",
|
| 27 |
"test_cases_description": "Finds max value in a list",
|
| 28 |
-
"reward": null,
|
| 29 |
-
"
|
| 30 |
-
"total_tests": 3,
|
| 31 |
-
"feedback": null,
|
| 32 |
-
"done": false
|
| 33 |
},
|
| 34 |
-
"reward": 0.0,
|
| 35 |
-
"done": false
|
| 36 |
}
|
| 37 |
```
|
| 38 |
|
| 39 |
-
---
|
| 40 |
-
|
| 41 |
### POST /step
|
| 42 |
-
Submit a code fix.
|
| 43 |
-
|
| 44 |
-
**Request:**
|
| 45 |
```json
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
"explanation": "Optional for hard tasks"
|
| 49 |
-
}
|
| 50 |
-
```
|
| 51 |
|
| 52 |
-
|
| 53 |
-
```json
|
| 54 |
{
|
| 55 |
"observation": {
|
| 56 |
-
"task_id": "easy_003",
|
| 57 |
-
"
|
| 58 |
-
"passed_tests": 3,
|
| 59 |
-
"total_tests": 3,
|
| 60 |
"feedback": "Test 1: ✅ Passed\n Input: [1,2,3]\n Expected: 3\n Got: 3",
|
| 61 |
"done": true
|
| 62 |
},
|
| 63 |
-
"reward": 1.0,
|
| 64 |
-
"done": true
|
| 65 |
}
|
| 66 |
```
|
| 67 |
|
| 68 |
-
---
|
| 69 |
-
|
| 70 |
### GET /state
|
| 71 |
-
Returns current episode state.
|
| 72 |
-
|
| 73 |
```json
|
| 74 |
-
{
|
| 75 |
-
|
| 76 |
-
"task_id": "easy_003",
|
| 77 |
-
"difficulty": "easy",
|
| 78 |
-
"step_count": 1,
|
| 79 |
-
"max_steps": 5,
|
| 80 |
-
"current_reward": 1.0,
|
| 81 |
-
"best_reward": 1.0,
|
| 82 |
-
"done": true
|
| 83 |
-
}
|
| 84 |
```
|
| 85 |
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
### GET /health
|
| 89 |
```json
|
| 90 |
-
{"
|
| 91 |
```
|
| 92 |
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
-
##
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
```
|
| 99 |
-
reward = passed_tests / total_tests
|
| 100 |
-
```
|
| 101 |
-
- 3/3 → 1.00
|
| 102 |
-
- 2/3 → 0.67
|
| 103 |
-
- 1/3 → 0.33
|
| 104 |
-
- 0/3 → 0.00
|
| 105 |
-
|
| 106 |
-
### Hard
|
| 107 |
-
```
|
| 108 |
-
reward = 0.7 × test_score + 0.3 × explanation_score
|
| 109 |
-
```
|
| 110 |
-
|
| 111 |
-
### Invalid Actions
|
| 112 |
-
- Empty code → reward = 0.0 + feedback message
|
| 113 |
-
- Non-Python code → reward = 0.0 + feedback message
|
| 114 |
-
|
| 115 |
-
---
|
| 116 |
|
| 117 |
## Episode Rules
|
| 118 |
-
|
| 119 |
- Max 5 steps per episode
|
| 120 |
-
-
|
| 121 |
-
-
|
| 122 |
-
- Feedback shows Input, Expected, Got for each test
|
| 123 |
-
|
| 124 |
-
---
|
| 125 |
-
|
| 126 |
-
## Task Domains
|
| 127 |
-
|
| 128 |
-
| Domain | Examples |
|
| 129 |
-
|---|---|
|
| 130 |
-
| List operations | second element, max, flatten |
|
| 131 |
-
| String algorithms | palindrome, reverse, word count |
|
| 132 |
-
| Math | fibonacci, factorial, square root |
|
| 133 |
-
| Sorting | bubble sort, binary search |
|
| 134 |
-
| Data processing | JSON parsing, API validation |
|
| 135 |
-
| Graph algorithms | BFS, cycle detection |
|
| 136 |
-
| Dynamic programming | knapsack, longest subsequence |
|
|
|
|
| 1 |
+
# Code Debug Environment — API Specification
|
| 2 |
|
| 3 |
## Overview
|
| 4 |
+
OpenEnv-compatible RL environment where LLM agents fix buggy Python code.
|
| 5 |
+
- 45 tasks: 15 easy + 15 medium + 15 hard
|
| 6 |
+
- Partial rewards: 0.33, 0.67, 1.0 based on test cases passed
|
| 7 |
+
- Hard tasks: reward = 0.7×code + 0.3×explanation
|
| 8 |
|
| 9 |
+
## Endpoints
|
| 10 |
|
| 11 |
+
### GET /health
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
```json
|
| 13 |
+
{"status": "ok", "environment": "code-debug-env", "version": "1.0.0"}
|
| 14 |
```
|
| 15 |
|
| 16 |
+
### POST /reset
|
| 17 |
```json
|
| 18 |
+
// Request
|
| 19 |
+
{"difficulty": "easy"} // or "medium", "hard", omit for random
|
| 20 |
+
|
| 21 |
+
// Response
|
| 22 |
{
|
| 23 |
"observation": {
|
| 24 |
"task_id": "easy_003",
|
|
|
|
| 26 |
"buggy_code": "def find_max(nums):\n return min(nums)",
|
| 27 |
"instructions": "The function has exactly one bug. Fix it.",
|
| 28 |
"test_cases_description": "Finds max value in a list",
|
| 29 |
+
"reward": null, "passed_tests": null, "total_tests": 3,
|
| 30 |
+
"feedback": null, "done": false
|
|
|
|
|
|
|
|
|
|
| 31 |
},
|
| 32 |
+
"reward": 0.0, "done": false
|
|
|
|
| 33 |
}
|
| 34 |
```
|
| 35 |
|
|
|
|
|
|
|
| 36 |
### POST /step
|
|
|
|
|
|
|
|
|
|
| 37 |
```json
|
| 38 |
+
// Request
|
| 39 |
+
{"fixed_code": "def find_max(nums):\n return max(nums)", "explanation": "optional for hard"}
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
+
// Response
|
|
|
|
| 42 |
{
|
| 43 |
"observation": {
|
| 44 |
+
"task_id": "easy_003", "reward": 1.0,
|
| 45 |
+
"passed_tests": 3, "total_tests": 3,
|
|
|
|
|
|
|
| 46 |
"feedback": "Test 1: ✅ Passed\n Input: [1,2,3]\n Expected: 3\n Got: 3",
|
| 47 |
"done": true
|
| 48 |
},
|
| 49 |
+
"reward": 1.0, "done": true
|
|
|
|
| 50 |
}
|
| 51 |
```
|
| 52 |
|
|
|
|
|
|
|
| 53 |
### GET /state
|
|
|
|
|
|
|
| 54 |
```json
|
| 55 |
+
{"episode_id": "uuid", "task_id": "easy_003", "difficulty": "easy",
|
| 56 |
+
"step_count": 1, "max_steps": 5, "current_reward": 1.0, "best_reward": 1.0, "done": true}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
```
|
| 58 |
|
| 59 |
+
### GET /tasks
|
|
|
|
|
|
|
| 60 |
```json
|
| 61 |
+
{"easy": ["easy_001",...], "medium": ["medium_001",...], "hard": ["hard_001",...], "total": 45}
|
| 62 |
```
|
| 63 |
|
| 64 |
+
## Reward Design
|
| 65 |
+
| Task | Formula |
|
| 66 |
+
|------|---------|
|
| 67 |
+
| Easy | passed/3 |
|
| 68 |
+
| Medium | passed/3 |
|
| 69 |
+
| Hard | 0.7×code_score + 0.3×explanation_score |
|
| 70 |
|
| 71 |
+
## Invalid Actions
|
| 72 |
+
- Empty code → reward=0.0 + penalty feedback
|
| 73 |
+
- Infinite loop → TimeoutError → reward=0.0 + hint to add visited set
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
## Episode Rules
|
|
|
|
| 76 |
- Max 5 steps per episode
|
| 77 |
+
- Ends when reward=1.0 OR max steps reached
|
| 78 |
+
- 3 deterministic test cases per task
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_graders.py
CHANGED
|
@@ -1,9 +1,5 @@
|
|
| 1 |
-
# tests/test_graders.py
|
| 2 |
-
|
| 3 |
-
# Run: python -m pytest tests/ -v
|
| 4 |
-
|
| 5 |
-
import sys
|
| 6 |
-
import os
|
| 7 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 8 |
|
| 9 |
from server.graders.grader_easy import grade_easy
|
|
@@ -14,56 +10,41 @@ from server.tasks.task_medium import MEDIUM_TASKS
|
|
| 14 |
from server.tasks.task_hard import HARD_TASKS
|
| 15 |
|
| 16 |
|
| 17 |
-
def
|
| 18 |
-
assert len(EASY_TASKS) == 15
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
def test_medium_tasks_count():
|
| 22 |
-
assert len(MEDIUM_TASKS) == 15, f"Expected 15 medium tasks, got {len(MEDIUM_TASKS)}"
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
def test_hard_tasks_count():
|
| 26 |
-
assert len(HARD_TASKS) == 15, f"Expected 15 hard tasks, got {len(HARD_TASKS)}"
|
| 27 |
-
|
| 28 |
|
| 29 |
-
def
|
| 30 |
-
for
|
| 31 |
-
|
| 32 |
-
assert
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
def
|
| 36 |
-
for
|
| 37 |
-
|
| 38 |
-
|
|
|
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
def
|
| 42 |
-
for
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
reward, passed, total, _, _ = grade_hard(task["fixed_code"], task, explanation)
|
| 46 |
-
assert reward >= 0.9, f"{task['task_id']} should score >= 0.9, got {reward}"
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
def test_reward_range():
|
| 50 |
-
for task in EASY_TASKS + MEDIUM_TASKS:
|
| 51 |
-
reward, _, _, _, _ = grade_easy(task["buggy_code"], task)
|
| 52 |
-
assert 0.0 <= reward <= 1.0, f"Reward out of range: {reward}"
|
| 53 |
-
|
| 54 |
|
| 55 |
def test_empty_code_returns_zero():
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
assert reward == 0.0
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
def test_buggy_code_scores_less_than_1():
|
| 62 |
-
for task in EASY_TASKS[:5]:
|
| 63 |
-
reward, _, _, _, _ = grade_easy(task["buggy_code"], task)
|
| 64 |
-
assert reward < 1.0, f"{task['task_id']} buggy code should not score 1.0"
|
| 65 |
-
|
| 66 |
|
| 67 |
if __name__ == "__main__":
|
| 68 |
import pytest
|
| 69 |
-
pytest.main([__file__, "-v"])
|
|
|
|
| 1 |
+
# tests/test_graders.py — Run: python -m pytest tests/ -v
|
| 2 |
+
import sys, os
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 4 |
|
| 5 |
from server.graders.grader_easy import grade_easy
|
|
|
|
| 10 |
from server.tasks.task_hard import HARD_TASKS
|
| 11 |
|
| 12 |
|
| 13 |
+
def test_task_counts():
|
| 14 |
+
assert len(EASY_TASKS) == 15
|
| 15 |
+
assert len(MEDIUM_TASKS) == 15
|
| 16 |
+
assert len(HARD_TASKS) == 15
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
+
def test_easy_correct_scores_1():
|
| 19 |
+
for t in EASY_TASKS:
|
| 20 |
+
r, _, _, _, _ = grade_easy(t["fixed_code"], t)
|
| 21 |
+
assert r == 1.0, f"{t['task_id']} expected 1.0 got {r}"
|
| 22 |
|
| 23 |
+
def test_medium_correct_scores_1():
|
| 24 |
+
for t in MEDIUM_TASKS:
|
| 25 |
+
r, _, _, _, _ = grade_medium(t["fixed_code"], t)
|
| 26 |
+
assert r == 1.0, f"{t['task_id']} expected 1.0 got {r}"
|
| 27 |
|
| 28 |
+
def test_hard_correct_scores_high():
|
| 29 |
+
for t in HARD_TASKS:
|
| 30 |
+
keywords = t.get("explanation_keywords", [])
|
| 31 |
+
r, _, _, _, _ = grade_hard(t["fixed_code"], t, " ".join(keywords))
|
| 32 |
+
assert r >= 0.9, f"{t['task_id']} expected >=0.9 got {r}"
|
| 33 |
|
| 34 |
+
def test_reward_in_range():
|
| 35 |
+
for t in EASY_TASKS:
|
| 36 |
+
r, _, _, _, _ = grade_easy(t["buggy_code"], t)
|
| 37 |
+
assert 0.0 <= r <= 1.0
|
| 38 |
|
| 39 |
+
def test_buggy_scores_less_than_1():
|
| 40 |
+
for t in EASY_TASKS[:5]:
|
| 41 |
+
r, _, _, _, _ = grade_easy(t["buggy_code"], t)
|
| 42 |
+
assert r < 1.0, f"{t['task_id']} buggy code should not score 1.0"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
def test_empty_code_returns_zero():
|
| 45 |
+
r, _, _, _, _ = grade_easy("", EASY_TASKS[0])
|
| 46 |
+
assert r == 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
if __name__ == "__main__":
|
| 49 |
import pytest
|
| 50 |
+
pytest.main([__file__, "-v"])
|