Spaces:
Running
Running
Commit ·
c01667e
1
Parent(s): 08efbe6
Fix log format, add SPEC.md, tests, invalid action penalty, max_steps=5
Browse files- inference.py +74 -162
- openenv.yaml +79 -10
- server/environment.py +37 -2
- spec.md +136 -0
- tests/test_graders.py +69 -0
inference.py
CHANGED
|
@@ -1,21 +1,12 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
-
# inference.py
|
| 3 |
-
#
|
| 4 |
-
# Baseline inference script for the Code Debug Environment.
|
| 5 |
-
# Must be run from the project root.
|
| 6 |
-
#
|
| 7 |
-
# Required environment variables:
|
| 8 |
-
# API_BASE_URL — LLM API endpoint (OpenAI-compatible)
|
| 9 |
-
# MODEL_NAME — Model identifier
|
| 10 |
-
# HF_TOKEN — Hugging Face / API key
|
| 11 |
#
|
|
|
|
| 12 |
# Usage:
|
| 13 |
# python inference.py
|
| 14 |
-
# python inference.py --url https://
|
| 15 |
# python inference.py --difficulty easy
|
| 16 |
-
#
|
| 17 |
-
# Log format: [START], [STEP], [END] — strictly followed for evaluation scoring.
|
| 18 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 19 |
|
| 20 |
import os
|
| 21 |
import sys
|
|
@@ -24,96 +15,52 @@ import time
|
|
| 24 |
import argparse
|
| 25 |
import requests
|
| 26 |
from openai import OpenAI
|
|
|
|
| 27 |
|
| 28 |
# ─── Configuration ────────────────────────────────────────────────────────────
|
| 29 |
-
|
| 30 |
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
|
| 31 |
-
MODEL_NAME
|
| 32 |
-
HF_TOKEN
|
| 33 |
-
ENV_URL
|
| 34 |
-
|
| 35 |
-
MAX_STEPS
|
| 36 |
-
DIFFICULTIES = ["easy", "medium", "hard"]
|
| 37 |
-
|
| 38 |
|
| 39 |
# ─── OpenAI Client ───────────────────────────────────────────────────────────
|
|
|
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
# ─── Logging (strict format required by evaluator) ───────────────────────────
|
| 48 |
-
|
| 49 |
-
def log_start(task_id: str, difficulty: str, episode: int):
|
| 50 |
-
print(json.dumps({
|
| 51 |
-
"type": "START",
|
| 52 |
-
"episode": episode,
|
| 53 |
-
"task_id": task_id,
|
| 54 |
-
"difficulty": difficulty,
|
| 55 |
-
"timestamp": time.time(),
|
| 56 |
-
}), flush=True)
|
| 57 |
-
|
| 58 |
|
| 59 |
-
def
|
| 60 |
-
print(
|
| 61 |
-
"type": "STEP",
|
| 62 |
-
"task_id": task_id,
|
| 63 |
-
"step": step,
|
| 64 |
-
"action": action_summary,
|
| 65 |
-
"reward": reward,
|
| 66 |
-
"done": done,
|
| 67 |
-
"timestamp": time.time(),
|
| 68 |
-
}), flush=True)
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
def log_end(
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
"episode": episode,
|
| 75 |
-
"task_id": task_id,
|
| 76 |
-
"difficulty": difficulty,
|
| 77 |
-
"final_reward": final_reward,
|
| 78 |
-
"steps_taken": steps_taken,
|
| 79 |
-
"timestamp": time.time(),
|
| 80 |
-
}), flush=True)
|
| 81 |
-
|
| 82 |
|
| 83 |
# ─── Environment Client ───────────────────────────────────────────────────────
|
| 84 |
-
|
| 85 |
def env_reset(env_url: str, difficulty: str) -> dict:
|
| 86 |
-
resp = requests.post(
|
| 87 |
-
f"{env_url}/reset",
|
| 88 |
-
json={"difficulty": difficulty},
|
| 89 |
-
timeout=30,
|
| 90 |
-
)
|
| 91 |
resp.raise_for_status()
|
| 92 |
return resp.json()
|
| 93 |
|
| 94 |
-
|
| 95 |
def env_step(env_url: str, fixed_code: str, explanation: str = None) -> dict:
|
| 96 |
payload = {"fixed_code": fixed_code}
|
| 97 |
if explanation:
|
| 98 |
payload["explanation"] = explanation
|
| 99 |
-
resp = requests.post(
|
| 100 |
-
f"{env_url}/step",
|
| 101 |
-
json=payload,
|
| 102 |
-
timeout=30,
|
| 103 |
-
)
|
| 104 |
-
resp.raise_for_status()
|
| 105 |
-
return resp.json()
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
def env_state(env_url: str) -> dict:
|
| 109 |
-
resp = requests.get(f"{env_url}/state", timeout=10)
|
| 110 |
resp.raise_for_status()
|
| 111 |
return resp.json()
|
| 112 |
|
| 113 |
-
|
| 114 |
# ─── LLM Agent ───────────────────────────────────────────────────────────────
|
| 115 |
-
|
| 116 |
-
SYSTEM_PROMPT = """You are an expert Python debugging agent.
|
| 117 |
You will be given buggy Python code and must fix it.
|
| 118 |
|
| 119 |
For easy tasks: fix the single bug.
|
|
@@ -133,11 +80,8 @@ Rules:
|
|
| 133 |
- Do NOT include markdown fences or any text outside the JSON object.
|
| 134 |
"""
|
| 135 |
|
| 136 |
-
|
| 137 |
def call_llm(buggy_code: str, instructions: str, difficulty: str,
|
| 138 |
feedback: str = None, attempt: int = 1) -> dict:
|
| 139 |
-
"""Call the LLM and return parsed {fixed_code, explanation}."""
|
| 140 |
-
|
| 141 |
user_content = f"""Task difficulty: {difficulty}
|
| 142 |
Instructions: {instructions}
|
| 143 |
|
|
@@ -156,130 +100,98 @@ Buggy code:
|
|
| 156 |
|
| 157 |
try:
|
| 158 |
response = client.chat.completions.create(
|
| 159 |
-
model=MODEL_NAME,
|
| 160 |
-
messages=messages,
|
| 161 |
-
max_tokens=1000,
|
| 162 |
-
temperature=0.1,
|
| 163 |
)
|
| 164 |
content = response.choices[0].message.content.strip()
|
| 165 |
-
|
| 166 |
-
# Strip markdown fences if present
|
| 167 |
if content.startswith("```"):
|
| 168 |
lines = content.split("\n")
|
| 169 |
content = "\n".join(lines[1:-1]) if lines[-1] == "```" else "\n".join(lines[1:])
|
| 170 |
-
|
| 171 |
parsed = json.loads(content)
|
| 172 |
-
return {
|
| 173 |
-
"fixed_code": parsed.get("fixed_code", ""),
|
| 174 |
-
"explanation": parsed.get("explanation", None),
|
| 175 |
-
}
|
| 176 |
except json.JSONDecodeError:
|
| 177 |
-
# Fallback: return original code if parsing fails
|
| 178 |
return {"fixed_code": buggy_code, "explanation": None}
|
| 179 |
except Exception as e:
|
| 180 |
-
print(f"LLM call failed: {e}", file=sys.stderr)
|
| 181 |
return {"fixed_code": buggy_code, "explanation": None}
|
| 182 |
|
| 183 |
-
|
| 184 |
# ─── Main Episode Loop ────────────────────────────────────────────────────────
|
| 185 |
-
|
| 186 |
-
def run_episode(env_url: str, difficulty: str, episode_num: int) -> float:
|
| 187 |
-
"""Run one full episode. Returns final reward."""
|
| 188 |
-
|
| 189 |
-
# Reset
|
| 190 |
reset_data = env_reset(env_url, difficulty)
|
| 191 |
obs = reset_data["observation"]
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
buggy_code = obs["buggy_code"]
|
| 195 |
instructions = obs["instructions"]
|
| 196 |
|
| 197 |
-
log_start(task_id,
|
| 198 |
|
| 199 |
last_feedback = None
|
| 200 |
-
|
| 201 |
-
|
|
|
|
| 202 |
|
| 203 |
for attempt in range(1, MAX_STEPS + 1):
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
# Call LLM
|
| 207 |
agent_action = call_llm(
|
| 208 |
-
buggy_code=buggy_code,
|
| 209 |
-
|
| 210 |
-
difficulty=difficulty,
|
| 211 |
-
feedback=last_feedback,
|
| 212 |
-
attempt=attempt,
|
| 213 |
)
|
|
|
|
| 214 |
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
|
| 222 |
reward = result.get("reward", 0.0)
|
| 223 |
-
done
|
| 224 |
-
|
| 225 |
-
last_feedback =
|
| 226 |
-
|
| 227 |
-
log_step(
|
| 228 |
-
task_id=task_id,
|
| 229 |
-
step=attempt,
|
| 230 |
-
action_summary=f"Submitted fix attempt {attempt} ({len(agent_action['fixed_code'])} chars)",
|
| 231 |
-
reward=reward,
|
| 232 |
-
done=done,
|
| 233 |
-
)
|
| 234 |
|
| 235 |
-
|
|
|
|
| 236 |
|
|
|
|
|
|
|
| 237 |
if done:
|
| 238 |
break
|
| 239 |
|
| 240 |
-
log_end(
|
| 241 |
-
return
|
| 242 |
-
|
| 243 |
|
| 244 |
def main():
|
| 245 |
parser = argparse.ArgumentParser(description="Code Debug Environment Baseline Agent")
|
| 246 |
parser.add_argument("--url", default=ENV_URL, help="Environment base URL")
|
| 247 |
-
parser.add_argument("--difficulty", default=None, choices=["easy", "medium", "hard", "all"]
|
| 248 |
-
help="Difficulty to run. 'all' runs one episode per difficulty.")
|
| 249 |
args = parser.parse_args()
|
| 250 |
-
|
| 251 |
env_url = args.url.rstrip("/")
|
| 252 |
|
| 253 |
-
# Health check
|
| 254 |
try:
|
| 255 |
health = requests.get(f"{env_url}/health", timeout=10)
|
| 256 |
health.raise_for_status()
|
| 257 |
-
print(
|
| 258 |
except Exception as e:
|
| 259 |
-
print(
|
| 260 |
sys.exit(1)
|
| 261 |
|
| 262 |
-
|
| 263 |
-
if args.difficulty == "all" or args.difficulty is None:
|
| 264 |
-
episodes = [("easy", 1), ("medium", 2), ("hard", 3)]
|
| 265 |
-
else:
|
| 266 |
-
episodes = [(args.difficulty, 1)]
|
| 267 |
|
| 268 |
all_rewards = []
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
print(json.dumps({
|
| 276 |
-
"type": "SUMMARY",
|
| 277 |
-
"total_episodes": len(all_rewards),
|
| 278 |
-
"results": all_rewards,
|
| 279 |
-
"average_reward": round(sum(r["reward"] for r in all_rewards) / len(all_rewards), 3),
|
| 280 |
-
"timestamp": time.time(),
|
| 281 |
-
}), flush=True)
|
| 282 |
|
|
|
|
|
|
|
| 283 |
|
| 284 |
if __name__ == "__main__":
|
| 285 |
-
main()
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
+
# inference.py — Code Debug Environment Baseline Agent
|
| 3 |
+
# Log format strictly follows [START] [STEP] [END] as required by evaluator.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
#
|
| 5 |
+
# Required env vars: API_BASE_URL, MODEL_NAME, HF_TOKEN
|
| 6 |
# Usage:
|
| 7 |
# python inference.py
|
| 8 |
+
# python inference.py --url https://Souravdanyal-code-debug-env.hf.space
|
| 9 |
# python inference.py --difficulty easy
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
import os
|
| 12 |
import sys
|
|
|
|
| 15 |
import argparse
|
| 16 |
import requests
|
| 17 |
from openai import OpenAI
|
| 18 |
+
from typing import List, Optional
|
| 19 |
|
| 20 |
# ─── Configuration ────────────────────────────────────────────────────────────
|
|
|
|
| 21 |
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
|
| 22 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
|
| 23 |
+
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 24 |
+
ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
|
| 25 |
+
BENCHMARK = "code-debug-env"
|
| 26 |
+
MAX_STEPS = 5
|
|
|
|
|
|
|
| 27 |
|
| 28 |
# ─── OpenAI Client ───────────────────────────────────────────────────────────
|
| 29 |
+
client = OpenAI(api_key=HF_TOKEN or "dummy", base_url=API_BASE_URL)
|
| 30 |
|
| 31 |
+
# ─── Logging — STRICT FORMAT REQUIRED BY EVALUATOR ───────────────────────────
|
| 32 |
+
# [START] task=<task_id> env=<benchmark> model=<model_name>
|
| 33 |
+
# [STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 34 |
+
# [END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
+
def log_start(task_id: str, env: str, model: str) -> None:
    """Print the ``[START]`` record that opens an episode's log.

    Args:
        task_id: Identifier of the task being attempted.
        env: Benchmark/environment name.
        model: Name of the model producing the fixes.
    """
    record = f"[START] task={task_id} env={env} model={model}"
    # Flush immediately so the record is visible even if the process dies later.
    print(record, flush=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
+
def _one_line(text: str) -> str:
    """Fold *text* onto a single line by joining its non-empty lines with spaces."""
    return " ".join(part for part in str(text).splitlines() if part)


def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Print one ``[STEP]`` record in the evaluator's strict single-line format.

    Args:
        step: Step number within the episode.
        action: Short label describing the submitted action.
        reward: Reward for this step; printed with two decimals.
        done: Whether the episode has ended; printed as ``true``/``false``.
        error: Error message, or ``None``/empty to print ``null``.
    """
    # Exception text can contain line breaks; a record that spans several
    # lines would no longer match the [STEP] format, so fold it first.
    action_val = _one_line(action)
    error_val = (_one_line(error) if error else "") or "null"
    done_val = str(done).lower()
    print(
        f"[STEP] step={step} action={action_val} reward={reward:.2f} "
        f"done={done_val} error={error_val}",
        flush=True,
    )
|
| 43 |
|
| 44 |
+
def log_end(success: bool, steps: int, rewards: List[float]) -> None:
    """Print the ``[END]`` record that closes an episode's log.

    Args:
        success: Whether the episode counted as solved; printed as ``true``/``false``.
        steps: Number of steps taken.
        rewards: Per-step rewards; printed comma-separated with two decimals each.
    """
    formatted = [f"{value:.2f}" for value in rewards]
    rewards_str = ",".join(formatted)
    success_val = str(success).lower()
    print(f"[END] success={success_val} steps={steps} rewards={rewards_str}", flush=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
# ─── Environment Client ───────────────────────────────────────────────────────
|
|
|
|
| 49 |
def env_reset(env_url: str, difficulty: str) -> dict:
    """POST the requested difficulty to ``/reset`` and return the decoded JSON body.

    Args:
        env_url: Environment base URL.
        difficulty: Difficulty sent in the request body.

    Raises:
        requests.HTTPError: If the response carries an error status
            (via ``raise_for_status``).
    """
    endpoint = f"{env_url}/reset"
    body = {"difficulty": difficulty}
    response = requests.post(endpoint, json=body, timeout=30)
    response.raise_for_status()
    return response.json()
|
| 53 |
|
|
|
|
| 54 |
def env_step(env_url: str, fixed_code: str, explanation: str = None) -> dict:
    """POST a fix to ``/step`` and return the decoded JSON body.

    Args:
        env_url: Environment base URL.
        fixed_code: The corrected code to submit.
        explanation: Optional explanation; sent only when non-empty.

    Raises:
        requests.HTTPError: If the response carries an error status
            (via ``raise_for_status``).
    """
    body = {"fixed_code": fixed_code}
    # Omit the key entirely rather than sending an empty/None explanation.
    if explanation:
        body.update(explanation=explanation)
    response = requests.post(f"{env_url}/step", json=body, timeout=30)
    response.raise_for_status()
    return response.json()
|
| 61 |
|
|
|
|
| 62 |
# ─── LLM Agent ───────────────────────────────────────────────────────────────
|
| 63 |
+
SYSTEM_PROMPT = """You are an expert Python debugging agent.
|
|
|
|
| 64 |
You will be given buggy Python code and must fix it.
|
| 65 |
|
| 66 |
For easy tasks: fix the single bug.
|
|
|
|
| 80 |
- Do NOT include markdown fences or any text outside the JSON object.
|
| 81 |
"""
|
| 82 |
|
|
|
|
| 83 |
def call_llm(buggy_code: str, instructions: str, difficulty: str,
|
| 84 |
feedback: str = None, attempt: int = 1) -> dict:
|
|
|
|
|
|
|
| 85 |
user_content = f"""Task difficulty: {difficulty}
|
| 86 |
Instructions: {instructions}
|
| 87 |
|
|
|
|
| 100 |
|
| 101 |
try:
|
| 102 |
response = client.chat.completions.create(
|
| 103 |
+
model=MODEL_NAME, messages=messages, max_tokens=1000, temperature=0.1,
|
|
|
|
|
|
|
|
|
|
| 104 |
)
|
| 105 |
content = response.choices[0].message.content.strip()
|
|
|
|
|
|
|
| 106 |
if content.startswith("```"):
|
| 107 |
lines = content.split("\n")
|
| 108 |
content = "\n".join(lines[1:-1]) if lines[-1] == "```" else "\n".join(lines[1:])
|
|
|
|
| 109 |
parsed = json.loads(content)
|
| 110 |
+
return {"fixed_code": parsed.get("fixed_code", ""), "explanation": parsed.get("explanation", None)}
|
|
|
|
|
|
|
|
|
|
| 111 |
except json.JSONDecodeError:
|
|
|
|
| 112 |
return {"fixed_code": buggy_code, "explanation": None}
|
| 113 |
except Exception as e:
|
| 114 |
+
print(f"# LLM call failed: {e}", file=sys.stderr)
|
| 115 |
return {"fixed_code": buggy_code, "explanation": None}
|
| 116 |
|
|
|
|
| 117 |
# ─── Main Episode Loop ────────────────────────────────────────────────────────
|
| 118 |
+
def run_episode(env_url: str, difficulty: str) -> tuple:
    """Run one episode against the environment and log it in evaluator format.

    Resets the environment for ``difficulty``, then asks the LLM for a fix up
    to ``MAX_STEPS`` times. Each fix is submitted via ``env_step`` and the
    returned feedback is passed to the next LLM call.

    Args:
        env_url: Environment base URL.
        difficulty: Difficulty passed to ``env_reset``.

    Returns:
        ``(success, steps_taken, rewards)``: whether any step reached a reward
        of at least 1.0, the number of the last attempt made, and the list of
        per-attempt rewards (0.0 for attempts that were never graded).
    """
    reset_data = env_reset(env_url, difficulty)
    obs = reset_data["observation"]
    task_id = obs["task_id"]
    buggy_code = obs["buggy_code"]
    instructions = obs["instructions"]

    log_start(task_id=task_id, env=BENCHMARK, model=MODEL_NAME)

    last_feedback = None
    rewards: List[float] = []
    steps_taken = 0
    success = False

    for attempt in range(1, MAX_STEPS + 1):
        steps_taken = attempt
        agent_action = call_llm(
            buggy_code=buggy_code, instructions=instructions,
            difficulty=difficulty, feedback=last_feedback, attempt=attempt,
        )
        fixed_code = agent_action["fixed_code"]

        # The model's JSON is untrusted: "fixed_code" may be null or a
        # non-string value. Treat that as an empty submission instead of
        # crashing on .strip().
        if not isinstance(fixed_code, str) or not fixed_code.strip():
            log_step(step=attempt, action="empty_submission", reward=0.0, done=False, error="empty_code")
            rewards.append(0.0)
            continue

        try:
            result = env_step(env_url, fixed_code=fixed_code, explanation=agent_action.get("explanation"))
        except Exception as e:
            # Collapse whitespace so a multi-line exception message cannot
            # break the one-record-per-line log format.
            message = " ".join(str(e).split())[:60]
            log_step(step=attempt, action="step_failed", reward=0.0, done=False, error=message)
            rewards.append(0.0)
            continue

        # JSON null decodes to None; fall back to neutral values so the
        # formatting in log_step/log_end cannot fail on it.
        reward = result.get("reward") or 0.0
        done = bool(result.get("done", False))
        obs_r = result.get("observation") or {}
        last_feedback = obs_r.get("feedback", "")

        log_step(step=attempt, action=f"fix_{difficulty}_attempt{attempt}", reward=reward, done=done, error=None)
        rewards.append(reward)

        if reward >= 1.0:
            success = True
        if done:
            break

    log_end(success=success, steps=steps_taken, rewards=rewards)
    return success, steps_taken, rewards
|
|
|
|
| 167 |
|
| 168 |
def main():
    """CLI entry point: check the environment's health, run episodes, print a summary.

    Exits with status 1 when the ``/health`` request fails.
    """
    cli = argparse.ArgumentParser(description="Code Debug Environment Baseline Agent")
    cli.add_argument("--url", default=ENV_URL, help="Environment base URL")
    cli.add_argument("--difficulty", default=None, choices=["easy", "medium", "hard", "all"])
    options = cli.parse_args()
    env_url = options.url.rstrip("/")

    try:
        probe = requests.get(f"{env_url}/health", timeout=10)
        probe.raise_for_status()
        print(f"# Environment healthy at {env_url}", flush=True)
    except Exception as e:
        print(f"# Health check failed: {e}", file=sys.stderr)
        sys.exit(1)

    # Omitting the flag and passing "all" both mean one episode per difficulty.
    if options.difficulty in ("all", None):
        difficulties = ["easy", "medium", "hard"]
    else:
        difficulties = [options.difficulty]

    all_rewards = []
    all_successes = []
    for difficulty in difficulties:
        success, _steps, rewards = run_episode(env_url, difficulty)
        all_rewards += rewards
        all_successes.append(success)
        time.sleep(0.5)

    if all_rewards:
        avg = round(sum(all_rewards) / len(all_rewards), 3)
    else:
        avg = 0.0
    print(f"# SUMMARY: {sum(all_successes)}/{len(difficulties)} tasks solved | avg_reward={avg}", flush=True)
|
| 195 |
|
| 196 |
if __name__ == "__main__":
|
| 197 |
+
main()
|
openenv.yaml
CHANGED
|
@@ -4,12 +4,21 @@ type: typed
|
|
| 4 |
description: >
|
| 5 |
A real-world RL environment where an LLM agent diagnoses and fixes
|
| 6 |
buggy Python code across three difficulty levels (easy, medium, hard).
|
| 7 |
-
Tasks are drawn from real-world domains: data processing,
|
| 8 |
-
|
| 9 |
-
|
|
|
|
| 10 |
|
| 11 |
version: 1.0.0
|
| 12 |
-
author: Souravdanyal
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
runtime:
|
| 15 |
type: docker
|
|
@@ -20,29 +29,89 @@ app:
|
|
| 20 |
host: 0.0.0.0
|
| 21 |
port: 7860
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
tasks:
|
| 24 |
- id: easy
|
| 25 |
-
description: "Fix a single off-by-one or
|
| 26 |
difficulty: easy
|
| 27 |
-
max_steps:
|
| 28 |
reward_range: [0.0, 1.0]
|
|
|
|
|
|
|
| 29 |
|
| 30 |
- id: medium
|
| 31 |
-
description: "Fix two bugs (logic + edge case) so all test cases pass"
|
| 32 |
difficulty: medium
|
| 33 |
-
max_steps:
|
| 34 |
reward_range: [0.0, 1.0]
|
|
|
|
|
|
|
| 35 |
|
| 36 |
- id: hard
|
| 37 |
-
description: "Fix an algorithmic bug AND provide a correct explanation"
|
| 38 |
difficulty: hard
|
| 39 |
-
max_steps:
|
| 40 |
reward_range: [0.0, 1.0]
|
|
|
|
|
|
|
| 41 |
|
| 42 |
reward_range: [0.0, 1.0]
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
api:
|
| 45 |
reset: /reset
|
| 46 |
step: /step
|
| 47 |
state: /state
|
| 48 |
health: /health
|
|
|
|
|
|
| 4 |
description: >
|
| 5 |
A real-world RL environment where an LLM agent diagnoses and fixes
|
| 6 |
buggy Python code across three difficulty levels (easy, medium, hard).
|
| 7 |
+
Tasks are drawn from real-world domains: data processing, string algorithms,
|
| 8 |
+
API validation, sorting, dynamic programming, and graph algorithms.
|
| 9 |
+
Rewards are partial and proportional to test cases passed, with bonuses
|
| 10 |
+
for correct explanations on hard tasks.
|
| 11 |
|
| 12 |
version: 1.0.0
|
| 13 |
+
author: Souravdanyal
|
| 14 |
+
|
| 15 |
+
tags:
|
| 16 |
+
- code-debugging
|
| 17 |
+
- python
|
| 18 |
+
- reinforcement-learning
|
| 19 |
+
- openenv
|
| 20 |
+
- llm-agent
|
| 21 |
+
- software-engineering
|
| 22 |
|
| 23 |
runtime:
|
| 24 |
type: docker
|
|
|
|
| 29 |
host: 0.0.0.0
|
| 30 |
port: 7860
|
| 31 |
|
| 32 |
+
config:
|
| 33 |
+
episode_timeout: 300
|
| 34 |
+
max_steps: 5
|
| 35 |
+
|
| 36 |
tasks:
|
| 37 |
- id: easy
|
| 38 |
+
description: "Fix a single off-by-one, operator, or return bug in a Python function"
|
| 39 |
difficulty: easy
|
| 40 |
+
max_steps: 5
|
| 41 |
reward_range: [0.0, 1.0]
|
| 42 |
+
grader: deterministic
|
| 43 |
+
num_tasks: 15
|
| 44 |
|
| 45 |
- id: medium
|
| 46 |
+
description: "Fix two bugs (logic bug + edge case) so all test cases pass"
|
| 47 |
difficulty: medium
|
| 48 |
+
max_steps: 5
|
| 49 |
reward_range: [0.0, 1.0]
|
| 50 |
+
grader: deterministic
|
| 51 |
+
num_tasks: 15
|
| 52 |
|
| 53 |
- id: hard
|
| 54 |
+
description: "Fix an algorithmic bug AND provide a correct explanation of the root cause"
|
| 55 |
difficulty: hard
|
| 56 |
+
max_steps: 5
|
| 57 |
reward_range: [0.0, 1.0]
|
| 58 |
+
grader: deterministic
|
| 59 |
+
num_tasks: 15
|
| 60 |
|
| 61 |
reward_range: [0.0, 1.0]
|
| 62 |
|
| 63 |
+
action_space:
|
| 64 |
+
type: dict
|
| 65 |
+
description: "Agent submits fixed Python code and optional explanation"
|
| 66 |
+
fields:
|
| 67 |
+
fixed_code:
|
| 68 |
+
type: string
|
| 69 |
+
required: true
|
| 70 |
+
description: "Complete corrected Python function as a string. Must be valid Python."
|
| 71 |
+
explanation:
|
| 72 |
+
type: string
|
| 73 |
+
required: false
|
| 74 |
+
description: "Optional (required: false), but expected on hard tasks, where it is scored. Explain the bug, root cause, and why the fix is correct."
|
| 75 |
+
|
| 76 |
+
observation_space:
|
| 77 |
+
type: dict
|
| 78 |
+
description: "Environment observation returned after reset() and step()"
|
| 79 |
+
fields:
|
| 80 |
+
task_id:
|
| 81 |
+
type: string
|
| 82 |
+
description: "Unique identifier for the current task instance (e.g. easy_003)"
|
| 83 |
+
difficulty:
|
| 84 |
+
type: enum
|
| 85 |
+
values: [easy, medium, hard]
|
| 86 |
+
description: "Task difficulty level"
|
| 87 |
+
buggy_code:
|
| 88 |
+
type: string
|
| 89 |
+
description: "The buggy Python function the agent must fix"
|
| 90 |
+
instructions:
|
| 91 |
+
type: string
|
| 92 |
+
description: "Natural language description of what is wrong and what to fix"
|
| 93 |
+
test_cases_description:
|
| 94 |
+
type: string
|
| 95 |
+
description: "Description of what the test cases check"
|
| 96 |
+
reward:
|
| 97 |
+
type: float
|
| 98 |
+
description: "Score from 0.0 to 1.0 for this step (null on reset)"
|
| 99 |
+
passed_tests:
|
| 100 |
+
type: integer
|
| 101 |
+
description: "Number of test cases passed (null on reset)"
|
| 102 |
+
total_tests:
|
| 103 |
+
type: integer
|
| 104 |
+
description: "Total number of test cases (always 3)"
|
| 105 |
+
feedback:
|
| 106 |
+
type: string
|
| 107 |
+
description: "Detailed per-test feedback showing input, expected, and got values"
|
| 108 |
+
done:
|
| 109 |
+
type: boolean
|
| 110 |
+
description: "True when episode is complete (perfect score or max steps reached)"
|
| 111 |
+
|
| 112 |
api:
|
| 113 |
reset: /reset
|
| 114 |
step: /step
|
| 115 |
state: /state
|
| 116 |
health: /health
|
| 117 |
+
tasks: /tasks
|
server/environment.py
CHANGED
|
@@ -29,7 +29,7 @@ GRADERS = {
|
|
| 29 |
"hard": grade_hard,
|
| 30 |
}
|
| 31 |
|
| 32 |
-
MAX_STEPS =
|
| 33 |
|
| 34 |
|
| 35 |
class CodeDebugEnvironment(Environment):
|
|
@@ -101,6 +101,41 @@ class CodeDebugEnvironment(Environment):
|
|
| 101 |
|
| 102 |
self._step_count += 1
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
# Grade the submission
|
| 105 |
grader = GRADERS[self._difficulty]
|
| 106 |
if self._difficulty == "hard":
|
|
@@ -144,4 +179,4 @@ class CodeDebugEnvironment(Environment):
|
|
| 144 |
current_reward=self._current_reward,
|
| 145 |
best_reward=self._best_reward,
|
| 146 |
done=self._done,
|
| 147 |
-
)
|
|
|
|
| 29 |
"hard": grade_hard,
|
| 30 |
}
|
| 31 |
|
| 32 |
+
MAX_STEPS = 5
|
| 33 |
|
| 34 |
|
| 35 |
class CodeDebugEnvironment(Environment):
|
|
|
|
| 101 |
|
| 102 |
self._step_count += 1
|
| 103 |
|
| 104 |
+
# ── Invalid action penalty ──────────────────────────────────────────
|
| 105 |
+
code = action.fixed_code.strip() if action.fixed_code else ""
|
| 106 |
+
if not code:
|
| 107 |
+
done = self._step_count >= MAX_STEPS
|
| 108 |
+
self._done = done
|
| 109 |
+
return DebugObservation(
|
| 110 |
+
task_id=self._current_task["task_id"],
|
| 111 |
+
difficulty=self._difficulty,
|
| 112 |
+
buggy_code=self._current_task["buggy_code"],
|
| 113 |
+
instructions=self._current_task["instructions"],
|
| 114 |
+
test_cases_description=self._current_task["test_cases_description"],
|
| 115 |
+
reward=0.0,
|
| 116 |
+
passed_tests=0,
|
| 117 |
+
total_tests=len(self._current_task["test_cases"]),
|
| 118 |
+
feedback="❌ Invalid action: fixed_code is empty. Penalty applied. Submit valid Python code.",
|
| 119 |
+
done=done,
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
+
# Check for obvious non-Python (very short, or containing none of 'def ', 'lambda', '=')
|
| 123 |
+
if len(code) < 5 or ("def " not in code and "lambda" not in code and "=" not in code):
|
| 124 |
+
done = self._step_count >= MAX_STEPS
|
| 125 |
+
self._done = done
|
| 126 |
+
return DebugObservation(
|
| 127 |
+
task_id=self._current_task["task_id"],
|
| 128 |
+
difficulty=self._difficulty,
|
| 129 |
+
buggy_code=self._current_task["buggy_code"],
|
| 130 |
+
instructions=self._current_task["instructions"],
|
| 131 |
+
test_cases_description=self._current_task["test_cases_description"],
|
| 132 |
+
reward=0.0,
|
| 133 |
+
passed_tests=0,
|
| 134 |
+
total_tests=len(self._current_task["test_cases"]),
|
| 135 |
+
feedback="❌ Invalid action: submission does not appear to be valid Python. Penalty applied.",
|
| 136 |
+
done=done,
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
# Grade the submission
|
| 140 |
grader = GRADERS[self._difficulty]
|
| 141 |
if self._difficulty == "hard":
|
|
|
|
| 179 |
current_reward=self._current_reward,
|
| 180 |
best_reward=self._best_reward,
|
| 181 |
done=self._done,
|
| 182 |
+
)
|
spec.md
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Code Debug Environment — Specification
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
|
| 5 |
+
The Code Debug Environment is an OpenEnv-compatible RL environment where an LLM agent diagnoses and fixes buggy Python code across three difficulty levels.
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## API Specification
|
| 10 |
+
|
| 11 |
+
### POST /reset
|
| 12 |
+
Start a new episode.
|
| 13 |
+
|
| 14 |
+
**Request:**
|
| 15 |
+
```json
|
| 16 |
+
{"difficulty": "easy"}
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
**Response:**
|
| 20 |
+
```json
|
| 21 |
+
{
|
| 22 |
+
"observation": {
|
| 23 |
+
"task_id": "easy_003",
|
| 24 |
+
"difficulty": "easy",
|
| 25 |
+
"buggy_code": "def find_max(nums):\n return min(nums)",
|
| 26 |
+
"instructions": "The function has exactly one bug. Fix it.",
|
| 27 |
+
"test_cases_description": "Finds max value in a list",
|
| 28 |
+
"reward": null,
|
| 29 |
+
"passed_tests": null,
|
| 30 |
+
"total_tests": 3,
|
| 31 |
+
"feedback": null,
|
| 32 |
+
"done": false
|
| 33 |
+
},
|
| 34 |
+
"reward": 0.0,
|
| 35 |
+
"done": false
|
| 36 |
+
}
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
---
|
| 40 |
+
|
| 41 |
+
### POST /step
|
| 42 |
+
Submit a code fix.
|
| 43 |
+
|
| 44 |
+
**Request:**
|
| 45 |
+
```json
|
| 46 |
+
{
|
| 47 |
+
"fixed_code": "def find_max(nums):\n return max(nums)",
|
| 48 |
+
"explanation": "Optional; scored only on hard tasks"
|
| 49 |
+
}
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
**Response:**
|
| 53 |
+
```json
|
| 54 |
+
{
|
| 55 |
+
"observation": {
|
| 56 |
+
"task_id": "easy_003",
|
| 57 |
+
"reward": 1.0,
|
| 58 |
+
"passed_tests": 3,
|
| 59 |
+
"total_tests": 3,
|
| 60 |
+
"feedback": "Test 1: ✅ Passed\n Input: [1,2,3]\n Expected: 3\n Got: 3",
|
| 61 |
+
"done": true
|
| 62 |
+
},
|
| 63 |
+
"reward": 1.0,
|
| 64 |
+
"done": true
|
| 65 |
+
}
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
---
|
| 69 |
+
|
| 70 |
+
### GET /state
|
| 71 |
+
Returns current episode state.
|
| 72 |
+
|
| 73 |
+
```json
|
| 74 |
+
{
|
| 75 |
+
"episode_id": "uuid",
|
| 76 |
+
"task_id": "easy_003",
|
| 77 |
+
"difficulty": "easy",
|
| 78 |
+
"step_count": 1,
|
| 79 |
+
"max_steps": 5,
|
| 80 |
+
"current_reward": 1.0,
|
| 81 |
+
"best_reward": 1.0,
|
| 82 |
+
"done": true
|
| 83 |
+
}
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
---
|
| 87 |
+
|
| 88 |
+
### GET /health
|
| 89 |
+
```json
|
| 90 |
+
{"status": "ok", "environment": "code-debug-env", "version": "1.0.0"}
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
---
|
| 94 |
+
|
| 95 |
+
## Reward Function
|
| 96 |
+
|
| 97 |
+
### Easy & Medium
|
| 98 |
+
```
|
| 99 |
+
reward = passed_tests / total_tests
|
| 100 |
+
```
|
| 101 |
+
- 3/3 → 1.00
|
| 102 |
+
- 2/3 → 0.67
|
| 103 |
+
- 1/3 → 0.33
|
| 104 |
+
- 0/3 → 0.00
|
| 105 |
+
|
| 106 |
+
### Hard
|
| 107 |
+
```
|
| 108 |
+
reward = 0.7 × test_score + 0.3 × explanation_score
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
### Invalid Actions
|
| 112 |
+
- Empty code → reward = 0.0 + feedback message
|
| 113 |
+
- Non-Python code → reward = 0.0 + feedback message
|
| 114 |
+
|
| 115 |
+
---
|
| 116 |
+
|
| 117 |
+
## Episode Rules
|
| 118 |
+
|
| 119 |
+
- Max 5 steps per episode
|
| 120 |
+
- Episode ends when reward = 1.0 OR max steps reached
|
| 121 |
+
- Each step runs fixed_code against 3 deterministic test cases
|
| 122 |
+
- Feedback shows Input, Expected, Got for each test
|
| 123 |
+
|
| 124 |
+
---
|
| 125 |
+
|
| 126 |
+
## Task Domains
|
| 127 |
+
|
| 128 |
+
| Domain | Examples |
|
| 129 |
+
|---|---|
|
| 130 |
+
| List operations | second element, max, flatten |
|
| 131 |
+
| String algorithms | palindrome, reverse, word count |
|
| 132 |
+
| Math | fibonacci, factorial, square root |
|
| 133 |
+
| Sorting | bubble sort, binary search |
|
| 134 |
+
| Data processing | JSON parsing, API validation |
|
| 135 |
+
| Graph algorithms | BFS, cycle detection |
|
| 136 |
+
| Dynamic programming | knapsack, longest subsequence |
|
tests/test_graders.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# tests/test_graders.py
|
| 2 |
+
# Basic tests to verify all graders work correctly.
|
| 3 |
+
# Run: python -m pytest tests/ -v
|
| 4 |
+
|
| 5 |
+
import sys
|
| 6 |
+
import os
|
| 7 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 8 |
+
|
| 9 |
+
from server.graders.grader_easy import grade_easy
|
| 10 |
+
from server.graders.grader_medium import grade_medium
|
| 11 |
+
from server.graders.grader_hard import grade_hard
|
| 12 |
+
from server.tasks.task_easy import EASY_TASKS
|
| 13 |
+
from server.tasks.task_medium import MEDIUM_TASKS
|
| 14 |
+
from server.tasks.task_hard import HARD_TASKS
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def test_easy_tasks_count():
    """The easy task bank must contain exactly 15 tasks."""
    expected = 15
    assert len(EASY_TASKS) == expected, f"Expected 15 easy tasks, got {len(EASY_TASKS)}"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def test_medium_tasks_count():
    """The medium task bank must contain exactly 15 tasks."""
    expected = 15
    assert len(MEDIUM_TASKS) == expected, f"Expected 15 medium tasks, got {len(MEDIUM_TASKS)}"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def test_hard_tasks_count():
    """The hard task bank must contain exactly 15 tasks."""
    expected = 15
    assert len(HARD_TASKS) == expected, f"Expected 15 hard tasks, got {len(HARD_TASKS)}"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def test_easy_correct_fix_scores_1():
    """Each easy task's reference fix must earn the full reward of 1.0."""
    for task in EASY_TASKS:
        graded = grade_easy(task["fixed_code"], task)
        # Unpack all five fields so a change in the grader's result shape is caught here.
        reward, _passed, _total, _, _ = graded
        assert reward == 1.0, f"{task['task_id']} should score 1.0, got {reward}"
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def test_medium_correct_fix_scores_1():
    """Each medium task's reference fix must earn the full reward of 1.0."""
    for task in MEDIUM_TASKS:
        graded = grade_medium(task["fixed_code"], task)
        # Unpack all five fields so a change in the grader's result shape is caught here.
        reward, _passed, _total, _, _ = graded
        assert reward == 1.0, f"{task['task_id']} should score 1.0, got {reward}"
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def test_hard_correct_fix_scores_high():
    """Each hard task's reference fix, explained with its own keywords, must score >= 0.9."""
    for task in HARD_TASKS:
        # Build an explanation out of the task's own keywords (none if the key is absent).
        explanation = " ".join(task.get("explanation_keywords", []))
        graded = grade_hard(task["fixed_code"], task, explanation)
        reward, _passed, _total, _, _ = graded
        assert reward >= 0.9, f"{task['task_id']} should score >= 0.9, got {reward}"
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def test_reward_range():
    """Grading a task's buggy code must still yield a reward within [0.0, 1.0]."""
    # Pair each task bank with the grader for its own difficulty; running
    # medium tasks through grade_easy would leave grade_medium's range unchecked.
    graded_banks = (
        (grade_easy, EASY_TASKS),
        (grade_medium, MEDIUM_TASKS),
    )
    for grader, tasks in graded_banks:
        for task in tasks:
            reward, _, _, _, _ = grader(task["buggy_code"], task)
            assert 0.0 <= reward <= 1.0, f"Reward out of range: {reward}"
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def test_empty_code_returns_zero():
    """An empty submission graded against the first easy task must earn 0.0."""
    first_task = EASY_TASKS[0]
    graded = grade_easy("", first_task)
    # Unpack all five fields so a change in the grader's result shape is caught here.
    reward, _passed, _total, _feedback, _ = graded
    assert reward == 0.0
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def test_buggy_code_scores_less_than_1():
    """The unmodified buggy code of the first five easy tasks must not earn a perfect score."""
    sample = EASY_TASKS[:5]
    for task in sample:
        graded = grade_easy(task["buggy_code"], task)
        reward, _, _, _, _ = graded
        assert reward < 1.0, f"{task['task_id']} buggy code should not score 1.0"
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
if __name__ == "__main__":
|
| 68 |
+
import pytest
|
| 69 |
+
pytest.main([__file__, "-v"])
|