Souravdanyal committed on
Commit
8485798
·
1 Parent(s): c21c4ee

Final complete version - all fixes applied

Browse files
README.md CHANGED
@@ -1,12 +1,3 @@
1
- ---
2
- title: Code Debug Env
3
- emoji: 🐛
4
- colorFrom: blue
5
- colorTo: green
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
  # Code Debug Environment
11
 
12
  An [OpenEnv](https://github.com/meta-pytorch/OpenEnv)-compatible RL environment where an LLM agent diagnoses and fixes buggy Python code across three difficulty levels.
@@ -114,9 +105,10 @@ Explanation is scored by matching key algorithmic concepts. Partial credit is gi
114
 
115
  ### Install
116
  ```bash
117
- git clone https://github.com/Souravdanyal/code-debug-env
118
  cd code-debug-env
119
  pip install -e .
 
120
  git clone https://github.com/meta-pytorch/OpenEnv.git
121
  export PYTHONPATH=$PYTHONPATH:OpenEnv:OpenEnv/src:.
122
  ```
@@ -171,11 +163,32 @@ python inference.py --url http://localhost:7860 --difficulty hard
171
 
172
  ## Pre-Submission Validation
173
 
 
 
174
  ```bash
 
175
  python validator/pre_submit_check.py --url http://localhost:7860
176
 
177
  # Or against your HF Space:
178
- python validator/pre_submit_check.py --url https://Souravdanyal-code-debug-env.hf.space
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  ```
180
 
181
  ---
@@ -206,4 +219,4 @@ code-debug-env/
206
  │ └── Dockerfile
207
  └── validator/
208
  └── pre_submit_check.py
209
- ```
 
 
 
 
 
 
 
 
 
 
1
  # Code Debug Environment
2
 
3
  An [OpenEnv](https://github.com/meta-pytorch/OpenEnv)-compatible RL environment where an LLM agent diagnoses and fixes buggy Python code across three difficulty levels.
 
105
 
106
  ### Install
107
  ```bash
108
+ git clone https://github.com/YOUR_USERNAME/code-debug-env
109
  cd code-debug-env
110
  pip install -e .
111
+ # Also clone OpenEnv for PYTHONPATH
112
  git clone https://github.com/meta-pytorch/OpenEnv.git
113
  export PYTHONPATH=$PYTHONPATH:OpenEnv:OpenEnv/src:.
114
  ```
 
163
 
164
  ## Pre-Submission Validation
165
 
166
+ Run before submitting to catch any disqualifying issues:
167
+
168
  ```bash
169
+ # Start the environment first, then:
170
  python validator/pre_submit_check.py --url http://localhost:7860
171
 
172
  # Or against your HF Space:
173
+ python validator/pre_submit_check.py --url https://YOUR_SPACE.hf.space
174
+ ```
175
+
176
+ ---
177
+
178
+ ## Deploy to Hugging Face Spaces
179
+
180
+ ```bash
181
+ # Login
182
+ huggingface-cli login
183
+
184
+ # Create space and push
185
+ huggingface-cli repo create code-debug-env --type space --space_sdk docker
186
+ cd code-debug-env
187
+ git init
188
+ git remote add origin https://huggingface.co/spaces/YOUR_USERNAME/code-debug-env
189
+ git add .
190
+ git commit -m "Initial commit"
191
+ git push origin main
192
  ```
193
 
194
  ---
 
219
  │ └── Dockerfile
220
  └── validator/
221
  └── pre_submit_check.py
222
+ ```
inference.py CHANGED
@@ -1,262 +1,189 @@
1
  #!/usr/bin/env python3
2
- # inference.py — Code Debug Environment Baseline Agent
3
- # Log format strictly follows [START] [STEP] [END] as required by evaluator.
4
- #
5
- # Required env vars: API_BASE_URL, MODEL_NAME, HF_TOKEN
6
- # Usage:
7
- # python inference.py
8
- # python inference.py --url https://Souravdanyal-code-debug-env.hf.space
9
- # python inference.py --difficulty easy
10
-
11
- import os
12
- import sys
13
- import json
14
- import time
15
- import argparse
16
- import requests
 
17
  from openai import OpenAI
18
  from typing import List, Optional
19
 
20
- # ── Configuration ────────────────────────────────────────────────────────────
21
  API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
22
- MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
23
- HF_TOKEN = os.environ.get("HF_TOKEN", "")
24
- ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
25
  BENCHMARK = "code-debug-env"
26
  MAX_STEPS = 5
27
 
28
- # ─── OpenAI Client ───────────────────────────────────────────────────────────
29
  client = OpenAI(api_key=HF_TOKEN or "dummy", base_url=API_BASE_URL)
30
 
31
- # ── Logging — STRICT FORMAT REQUIRED BY EVALUATOR ───────────────────────────
32
- def log_start(task_id: str, env: str, model: str) -> None:
33
  print(f"[START] task={task_id} env={env} model={model}", flush=True)
34
 
35
- def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
36
- error_val = error if error else "null"
37
- done_val = str(done).lower()
38
- print(f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}", flush=True)
39
 
40
- def log_end(success: bool, steps: int, rewards: List[float]) -> None:
41
- rewards_str = ",".join(f"{r:.2f}" for r in rewards)
42
- print(f"[END] success={str(success).lower()} steps={steps} rewards={rewards_str}", flush=True)
43
 
44
- # ── Environment Client ───────────────────────────────────────────────────────
45
- def env_reset(env_url: str, difficulty: str) -> dict:
46
- resp = requests.post(f"{env_url}/reset", json={"difficulty": difficulty}, timeout=30)
47
- resp.raise_for_status()
48
- return resp.json()
49
 
50
- def env_step(env_url: str, fixed_code: str, explanation: str = None) -> dict:
51
  payload = {"fixed_code": fixed_code}
52
  if explanation:
53
  payload["explanation"] = explanation
54
- resp = requests.post(f"{env_url}/step", json=payload, timeout=30)
55
- resp.raise_for_status()
56
- return resp.json()
57
-
58
- # ─── LLM Agent ───────────────────────────────────────────────────────────────
59
- SYSTEM_PROMPT = """You are an expert Python debugging agent. Your job is to find and fix bugs in Python functions.
60
 
61
- CRITICAL RULES:
62
- - You MUST respond ONLY with valid JSON no markdown, no explanation outside JSON
63
- - Return the COMPLETE fixed function, not just the changed line
64
- - The fixed_code must be syntactically valid Python
65
- - For hard tasks, the explanation field MUST describe: what the bug was, why it caused failures, and how your fix resolves it
66
 
67
- Response format (strictly):
68
  {
69
- "fixed_code": "<complete corrected Python function>",
70
- "explanation": "<for hard tasks: detailed explanation of bug and fix>"
71
  }
72
 
73
- DEBUGGING STRATEGY:
74
- 1. Read the instructions carefully they tell you exactly what type of bug exists
75
- 2. Trace through the logic with the test inputs mentally
76
- 3. For easy tasks: find the ONE wrong operator, value, or return statement
77
- 4. For medium tasks: find BOTH bugs — usually one logic bug + one edge case
78
- 5. For hard tasks: find the algorithmic flaw + write a clear explanation
79
- 6. If your previous attempt failed, READ THE FEEDBACK it shows exactly which inputs failed and what output was expected
 
 
 
 
 
 
 
 
 
80
  """
81
 
82
- def call_llm(buggy_code: str, instructions: str, difficulty: str,
83
- feedback: str = None, attempt: int = 1,
84
- previous_code: str = None) -> dict:
85
 
86
- user_content = f"""Task difficulty: {difficulty}
87
- Instructions: {instructions}
88
-
89
- Buggy code to fix:
90
- ```python
91
- {buggy_code}
92
- ```
93
- """
94
  if feedback and attempt > 1:
95
- user_content += f"""
96
- PREVIOUS ATTEMPT FAILED. Here is the feedback showing what went wrong:
97
- {feedback}
98
-
99
- Your previous fix was:
100
- ```python
101
- {previous_code or 'unknown'}
102
- ```
103
-
104
- IMPORTANT: Your previous fix did not work. Carefully analyze the feedback above.
105
- Look at the Input, Expected, and Got values for each failing test.
106
- Try a completely different approach to fix the bug.
107
- """
108
 
109
  if difficulty == "hard":
110
- user_content += """
111
- Remember: For hard tasks you MUST include a detailed explanation field describing:
112
- - What the algorithmic bug was
113
- - Why it caused incorrect results
114
- - How your fix resolves it
115
- Explanation quality affects 30% of your reward.
116
- """
117
-
118
- messages = [
119
- {"role": "system", "content": SYSTEM_PROMPT},
120
- {"role": "user", "content": user_content},
121
- ]
122
 
123
  try:
124
- response = client.chat.completions.create(
125
  model=MODEL_NAME,
126
- messages=messages,
127
  max_tokens=1500,
128
- temperature=0.2 if attempt == 1 else 0.5,
129
  )
130
- content = response.choices[0].message.content.strip()
131
-
132
- # Strip markdown fences
133
- if content.startswith("```"):
134
- lines = content.split("\n")
135
- content = "\n".join(lines[1:-1]) if lines[-1].strip() == "```" else "\n".join(lines[1:])
136
- # Strip json prefix
137
- if content.startswith("json"):
138
- content = content[4:].strip()
139
-
140
- parsed = json.loads(content)
141
- return {
142
- "fixed_code": parsed.get("fixed_code", ""),
143
- "explanation": parsed.get("explanation", None),
144
- }
145
- except json.JSONDecodeError:
146
- # Try to extract code from malformed response
147
- if "def " in content:
148
- lines = content.split("\n")
149
- code_lines = []
150
- in_code = False
151
- for line in lines:
152
- if line.strip().startswith("def "):
153
- in_code = True
154
- if in_code:
155
- code_lines.append(line)
156
- if code_lines:
157
- return {"fixed_code": "\n".join(code_lines), "explanation": None}
158
- return {"fixed_code": buggy_code, "explanation": None}
159
  except Exception as e:
160
- print(f"# LLM call failed: {e}", file=sys.stderr)
161
  return {"fixed_code": buggy_code, "explanation": None}
162
 
163
- # ── Main Episode Loop ────────────────────────────────────────────────────────
164
- def run_episode(env_url: str, difficulty: str) -> tuple:
165
- reset_data = env_reset(env_url, difficulty)
166
- obs = reset_data["observation"]
167
  task_id = obs["task_id"]
168
  buggy_code = obs["buggy_code"]
169
  instructions = obs["instructions"]
170
 
171
- log_start(task_id=task_id, env=BENCHMARK, model=MODEL_NAME)
172
 
173
- last_feedback = None
174
- last_fixed_code = None
175
- rewards: List[float] = []
176
- steps_taken = 0
177
- success = False
178
 
179
  for attempt in range(1, MAX_STEPS + 1):
180
  steps_taken = attempt
 
 
 
181
 
182
- agent_action = call_llm(
183
- buggy_code=buggy_code,
184
- instructions=instructions,
185
- difficulty=difficulty,
186
- feedback=last_feedback,
187
- attempt=attempt,
188
- previous_code=last_fixed_code,
189
- )
190
-
191
- fixed_code = agent_action["fixed_code"]
192
- last_fixed_code = fixed_code
193
-
194
- if not fixed_code or not fixed_code.strip():
195
- log_step(step=attempt, action="empty_submission",
196
- reward=0.0, done=False, error="empty_code")
197
  rewards.append(0.0)
198
  continue
199
 
200
  try:
201
- result = env_step(env_url, fixed_code=fixed_code,
202
- explanation=agent_action.get("explanation"))
203
  except Exception as e:
204
- log_step(step=attempt, action="step_failed",
205
- reward=0.0, done=False, error=str(e)[:60])
206
  rewards.append(0.0)
207
  continue
208
 
209
  reward = result.get("reward", 0.0)
210
  done = result.get("done", False)
211
- obs_r = result.get("observation", {})
212
- last_feedback = obs_r.get("feedback", "")
213
 
214
- log_step(step=attempt, action=f"fix_{difficulty}_attempt{attempt}",
215
- reward=reward, done=done, error=None)
216
  rewards.append(reward)
217
 
218
  if reward >= 1.0:
219
  success = True
220
-
221
  if done:
222
  break
223
 
224
- log_end(success=success, steps=steps_taken, rewards=rewards)
225
  return success, steps_taken, rewards
226
 
 
227
  def main():
228
- parser = argparse.ArgumentParser(description="Code Debug Environment Baseline Agent")
229
- parser.add_argument("--url", default=ENV_URL, help="Environment base URL")
230
- parser.add_argument("--difficulty", default=None,
231
- choices=["easy", "medium", "hard", "all"])
232
  args = parser.parse_args()
233
- env_url = args.url.rstrip("/")
234
 
235
  try:
236
- health = requests.get(f"{env_url}/health", timeout=10)
237
- health.raise_for_status()
238
- print(f"# Environment healthy at {env_url}", flush=True)
239
  except Exception as e:
240
  print(f"# Health check failed: {e}", file=sys.stderr)
241
  sys.exit(1)
242
 
243
- difficulties = ["easy", "medium", "hard"] if (
244
- args.difficulty in ("all", None)) else [args.difficulty]
245
-
246
- all_rewards = []
247
- all_successes = []
248
 
249
- for difficulty in difficulties:
250
- success, steps, rewards = run_episode(env_url, difficulty)
251
  all_rewards.extend(rewards)
252
- all_successes.append(success)
253
  time.sleep(0.5)
254
 
255
- avg = round(sum(all_rewards) / len(all_rewards), 3) if all_rewards else 0.0
256
- print(
257
- f"# SUMMARY: {sum(all_successes)}/{len(difficulties)} tasks solved | avg_reward={avg}",
258
- flush=True
259
- )
260
 
261
  if __name__ == "__main__":
262
- main()
 
1
  #!/usr/bin/env python3
2
+ """
3
+ inference.py - Code Debug Environment Baseline Agent
4
+
5
+ Required env vars: API_BASE_URL, MODEL_NAME, HF_TOKEN
6
+ Usage:
7
+ python inference.py
8
+ python inference.py --url https://Souravdanyal-code-debug-env.hf.space
9
+ python inference.py --difficulty easy
10
+
11
+ STDOUT FORMAT (required by evaluator):
12
+ [START] task=<id> env=<benchmark> model=<model>
13
+ [STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>
14
+ [END] success=<true|false> steps=<n> rewards=<r1,r2,...>
15
+ """
16
+
17
+ import os, sys, json, time, argparse, requests
18
  from openai import OpenAI
19
  from typing import List, Optional
20
 
21
+ # ── Config ────────────────────────────────────────────────────────────────────
22
  API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
23
+ MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
24
+ HF_TOKEN = os.environ.get("HF_TOKEN", "")
25
+ ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
26
  BENCHMARK = "code-debug-env"
27
  MAX_STEPS = 5
28
 
 
29
  client = OpenAI(api_key=HF_TOKEN or "dummy", base_url=API_BASE_URL)
30
 
31
+ # ── Logging ───────────────────────────────────────────────────────────────────
32
def _one_line(value):
    """Return ``str(value)`` with every whitespace run collapsed to one space.

    Each log record must occupy exactly one stdout line, so a newline inside
    a field (for example in an exception message) would split the record.
    """
    return " ".join(str(value).split())


def log_start(task_id, env, model):
    """Print the ``[START]`` record that opens an episode.

    Args:
        task_id: Identifier of the task being attempted.
        env: Benchmark/environment name.
        model: Name of the model driving the agent.
    """
    print(
        f"[START] task={_one_line(task_id)} env={_one_line(env)} "
        f"model={_one_line(model)}",
        flush=True,
    )


def log_step(step, action, reward, done, error):
    """Print one ``[STEP]`` record.

    Args:
        step: 1-based attempt number.
        action: Short label describing what was submitted.
        reward: Numeric reward for the step; ``None`` is logged as 0.00.
        done: Whether the episode is finished (logged lower-case).
        error: Error text, or a falsy value when there was no error
            (logged as ``null``).
    """
    # Whitespace-only or empty error text is reported as "null" as well.
    error_text = _one_line(error) if error else ""
    reward_val = 0.0 if reward is None else reward
    print(
        f"[STEP] step={step} action={_one_line(action)} reward={reward_val:.2f} "
        f"done={str(done).lower()} error={error_text or 'null'}",
        flush=True,
    )


def log_end(success, steps, rewards):
    """Print the ``[END]`` record that closes an episode.

    Args:
        success: Whether the episode reached a full reward.
        steps: Number of attempts made.
        rewards: Iterable of per-step rewards, logged comma-separated
            with two decimals each.
    """
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(
        f"[END] success={str(success).lower()} steps={steps} rewards={rewards_str}",
        flush=True,
    )
 
40
 
41
+ # ── Env client ────────────────────────────────────────────────────────────────
42
def env_reset(url, difficulty):
    """Start a new episode on the environment server.

    Sends ``POST {url}/reset`` with ``{"difficulty": difficulty}`` as the
    JSON body (30 second timeout), raises via ``raise_for_status()`` on an
    HTTP error status, and returns the decoded JSON response.
    """
    response = requests.post(
        f"{url}/reset",
        json={"difficulty": difficulty},
        timeout=30,
    )
    response.raise_for_status()
    return response.json()
46
 
47
def env_step(url, fixed_code, explanation=None):
    """Submit a candidate fix to the environment server.

    Sends ``POST {url}/step`` with ``fixed_code`` in the JSON body (30 second
    timeout); ``explanation`` is included only when it is truthy. Raises via
    ``raise_for_status()`` on an HTTP error status and returns the decoded
    JSON response.
    """
    body = {"fixed_code": fixed_code}
    if explanation:
        body["explanation"] = explanation
    response = requests.post(f"{url}/step", json=body, timeout=30)
    response.raise_for_status()
    return response.json()
 
 
 
54
 
55
+ # ── LLM ──────────────────────────────────────────────────────────────────────
56
+ SYSTEM_PROMPT = """You are an expert Python debugging agent. Fix bugs in Python functions.
 
 
 
57
 
58
+ RESPONSE FORMAT strictly JSON only, no markdown:
59
  {
60
+ "fixed_code": "<complete corrected Python function including imports>",
61
+ "explanation": "<for hard tasks: explain the bug, root cause, and fix>"
62
  }
63
 
64
+ RULES:
65
+ - Return COMPLETE function with all imports (e.g. from collections import deque)
66
+ - fixed_code must be valid Python
67
+ - For hard tasks explanation MUST mention the algorithmic concept
68
+
69
+ COMMON BUGS:
70
+ - Graph/BFS: missing visited set infinite loop on cycles add visited=set()
71
+ - Knapsack DP: wrong loop order (forward=unbounded, backward=0/1 knapsack)
72
+ - Binary search: wrong boundary → return high not low, or high=n//2 not n
73
+ - Off-by-one: lst[2] should be lst[1] for second element
74
+ - Wrong operator: + instead of -, * instead of /
75
+
76
+ IF PREVIOUS ATTEMPT FAILED:
77
+ - Read the Input/Expected/Got carefully
78
+ - Try a completely different fix
79
+ - For TimeoutError: you have an infinite loop, add a visited set
80
  """
81
 
82
def call_llm(buggy_code, instructions, difficulty, feedback=None, attempt=1, prev_code=None):
    """Ask the chat model for a fix and return it as a dict.

    Builds a user prompt from the task, appends the previous feedback and
    code on retries (``attempt > 1`` with truthy ``feedback``), and appends
    an explanation reminder for ``"hard"`` tasks. The reply is stripped of a
    leading markdown fence, narrowed to the outermost ``{...}`` span and
    parsed as JSON.

    Returns:
        ``{"fixed_code": ..., "explanation": ...}``. On any exception while
        calling the model or parsing its reply, the error is printed to
        stderr and ``buggy_code`` is returned unchanged with a ``None``
        explanation.
    """
    prompt_parts = [
        f"Difficulty: {difficulty}\nInstructions: {instructions}\n\n"
        f"Buggy code:\n```python\n{buggy_code}\n```\n"
    ]

    if feedback and attempt > 1:
        prompt_parts.append(
            f"\nPREVIOUS FIX FAILED. Feedback:\n{feedback}\n\n"
            f"Your previous code:\n```python\n{prev_code or ''}\n```\n"
            "Try a different approach.\n"
        )

    if difficulty == "hard":
        prompt_parts.append(
            "\nIMPORTANT: Include a detailed explanation field mentioning the algorithmic concept.\n"
        )

    user_message = "".join(prompt_parts)

    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_message},
            ],
            max_tokens=1500,
            # First attempt is near-deterministic; retries sample more widely.
            temperature=0.4 if attempt != 1 else 0.1,
        )
        text = completion.choices[0].message.content.strip()

        # Drop a leading markdown fence and its optional "json" language tag.
        if "```" in text:
            if text.startswith("```"):
                text = text.split("```")[1]
            if text.startswith("json\n"):
                text = text[5:]

        # Keep only the outermost {...} span when one exists.
        first_brace = text.find("{")
        last_brace = text.rfind("}")
        if first_brace >= 0 and last_brace + 1 > first_brace:
            text = text[first_brace:last_brace + 1]

        reply = json.loads(text)
        return {
            "fixed_code": reply.get("fixed_code", ""),
            "explanation": reply.get("explanation"),
        }
    except Exception as e:
        print(f"# LLM error: {e}", file=sys.stderr)
        return {"fixed_code": buggy_code, "explanation": None}
114
 
115
+ # ── Episode ───────────────────────────────────────────────────────────────────
116
def run_episode(env_url, difficulty):
    """Run one episode of at most ``MAX_STEPS`` attempts.

    Resets the environment, then repeatedly asks the model for a fix and
    submits it, feeding the returned feedback and the previous code into the
    next attempt. Emits the ``[START]``, ``[STEP]`` and ``[END]`` log records.

    Returns:
        ``(success, steps_taken, rewards)`` where ``success`` is True once
        any step earned a reward of at least 1.0.
    """
    initial = env_reset(env_url, difficulty)["observation"]
    task_id = initial["task_id"]
    buggy_code = initial["buggy_code"]
    instructions = initial["instructions"]

    log_start(task_id, BENCHMARK, MODEL_NAME)

    rewards = []
    steps_taken = 0
    success = False
    feedback = None
    previous_code = None

    for attempt in range(1, MAX_STEPS + 1):
        steps_taken = attempt
        action = call_llm(
            buggy_code, instructions, difficulty, feedback, attempt, previous_code
        )
        previous_code = action["fixed_code"]

        # An empty submission is logged as a zero-reward step and not sent.
        if not (previous_code and previous_code.strip()):
            log_step(attempt, "empty_submission", 0.0, False, "empty_code")
            rewards.append(0.0)
            continue

        try:
            result = env_step(env_url, previous_code, action.get("explanation"))
        except Exception as e:
            # A failed request costs the attempt but does not end the episode.
            log_step(attempt, "step_failed", 0.0, False, str(e)[:60])
            rewards.append(0.0)
            continue

        reward = result.get("reward", 0.0)
        done = result.get("done", False)
        feedback = result.get("observation", {}).get("feedback", "")

        log_step(attempt, f"fix_{difficulty}_attempt{attempt}", reward, done, None)
        rewards.append(reward)

        if reward >= 1.0:
            success = True
        if done:
            break

    log_end(success, steps_taken, rewards)
    return success, steps_taken, rewards
160
 
161
+ # ── Main ──────────────────────────────────────────────────────────────────────
162
def main():
    """Command-line entry point.

    Parses ``--url`` and ``--difficulty``, exits with status 1 if the
    environment's ``/health`` endpoint cannot be reached, then runs one
    episode per selected difficulty and prints a summary line.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--url", default=ENV_URL)
    cli.add_argument(
        "--difficulty",
        default=None,
        choices=["easy", "medium", "hard", "all"],
    )
    args = cli.parse_args()
    url = args.url.rstrip("/")

    try:
        health = requests.get(f"{url}/health", timeout=10)
        health.raise_for_status()
        print(f"# Environment healthy at {url}", flush=True)
    except Exception as e:
        print(f"# Health check failed: {e}", file=sys.stderr)
        sys.exit(1)

    # No --difficulty (or "all") means every level, in increasing order.
    if args.difficulty in (None, "all"):
        diffs = ["easy", "medium", "hard"]
    else:
        diffs = [args.difficulty]

    all_rewards = []
    successes = []

    for level in diffs:
        ok, _, rewards = run_episode(url, level)
        all_rewards.extend(rewards)
        successes.append(ok)
        time.sleep(0.5)

    if all_rewards:
        avg = round(sum(all_rewards) / len(all_rewards), 3)
    else:
        avg = 0.0
    print(
        f"# SUMMARY: {sum(successes)}/{len(diffs)} tasks solved | avg_reward={avg}",
        flush=True,
    )
 
 
 
187
 
188
  if __name__ == "__main__":
189
+ main()
server/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (161 Bytes)
 
server/__pycache__/app.cpython-310.pyc CHANGED
Binary files a/server/__pycache__/app.cpython-310.pyc and b/server/__pycache__/app.cpython-310.pyc differ
 
server/__pycache__/app.cpython-39.pyc DELETED
Binary file (4.22 kB)
 
server/__pycache__/environment.cpython-310.pyc CHANGED
Binary files a/server/__pycache__/environment.cpython-310.pyc and b/server/__pycache__/environment.cpython-310.pyc differ
 
server/app.py CHANGED
@@ -1,5 +1,4 @@
1
- # server/app.py
2
- # FastAPI server exposing the OpenEnv standard endpoints.
3
  # Port 7860 required for Hugging Face Spaces.
4
 
5
  from fastapi import FastAPI, HTTPException
@@ -14,108 +13,74 @@ from models import DebugAction, DebugObservation, DebugState
14
 
15
  app = FastAPI(
16
  title="Code Debug Environment",
17
- description=(
18
- "An OpenEnv environment where LLM agents fix buggy Python code. "
19
- "3 difficulty levels: easy (1 bug), medium (2 bugs), hard (algorithmic + explanation)."
20
- ),
21
  version="1.0.0",
22
  )
23
 
24
- app.add_middleware(
25
- CORSMiddleware,
26
- allow_origins=["*"],
27
- allow_methods=["*"],
28
- allow_headers=["*"],
29
- )
30
 
31
- # One global environment instance (single session)
32
- # For concurrent sessions, instantiate per-request with a session dict
33
  env = CodeDebugEnvironment()
34
 
35
 
36
- # ─── Request Models ─────────────────────────────────────────────────────────
37
-
38
  class ResetRequest(BaseModel):
39
- difficulty: Optional[str] = None # "easy" | "medium" | "hard" | None (random)
40
-
41
 
42
  class StepRequest(BaseModel):
43
  fixed_code: str
44
  explanation: Optional[str] = None
45
 
46
-
47
- # ─── Response wrapper matching OpenEnv StepResult shape ──────────────────────
48
-
49
  class StepResponse(BaseModel):
50
  observation: dict
51
  reward: float
52
  done: bool
53
 
54
 
55
- # ─── Endpoints ───────────────────────────────────────────────────────────────
56
-
57
  @app.get("/", response_class=HTMLResponse)
58
  async def root():
59
- """Homepage shows environment info and API endpoints."""
60
  html_path = os.path.join(os.path.dirname(__file__), "static", "index.html")
61
- with open(html_path, "r") as f:
62
  return f.read()
63
 
64
 
 
 
 
 
 
 
65
  @app.get("/health")
66
  async def health():
67
- """Health check endpoint — must return 200 for submission validation."""
68
  return {"status": "ok", "environment": "code-debug-env", "version": "1.0.0"}
69
 
70
 
71
  @app.post("/reset")
72
  async def reset(request: ResetRequest = ResetRequest()) -> dict:
73
- """
74
- Reset the environment to start a new episode.
75
- Optionally pass difficulty: 'easy' | 'medium' | 'hard'
76
- """
77
  try:
78
- observation = env.reset(difficulty=request.difficulty)
79
- return {
80
- "observation": observation.model_dump(),
81
- "reward": 0.0,
82
- "done": False,
83
- }
84
  except Exception as e:
85
  raise HTTPException(status_code=500, detail=f"Reset failed: {str(e)}")
86
 
87
 
88
  @app.post("/step")
89
  async def step(request: StepRequest) -> StepResponse:
90
- """
91
- Submit a code fix (and optional explanation for hard tasks).
92
- Returns observation with reward (0.0-1.0), feedback, and done flag.
93
- """
94
  if not request.fixed_code or not request.fixed_code.strip():
95
  raise HTTPException(status_code=400, detail="fixed_code must not be empty.")
96
-
97
  try:
98
- action = DebugAction(
99
- fixed_code=request.fixed_code,
100
- explanation=request.explanation,
101
- )
102
- observation = env.step(action)
103
- return StepResponse(
104
- observation=observation.model_dump(),
105
- reward=observation.reward or 0.0,
106
- done=observation.done,
107
- )
108
  except TimeoutError:
109
- # Code execution timed out — return 0 reward instead of 500
110
  return StepResponse(
111
- observation={"task_id": "unknown", "difficulty": "unknown",
112
- "buggy_code": "", "instructions": "",
113
- "test_cases_description": "", "reward": 0.0,
114
- "passed_tests": 0, "total_tests": 3,
115
- "feedback": "TimeoutError: Code execution timed out. Possible infinite loop.",
116
- "done": False},
117
- reward=0.0,
118
- done=False,
119
  )
120
  except Exception as e:
121
  raise HTTPException(status_code=500, detail=f"Step failed: {str(e)}")
@@ -123,30 +88,27 @@ async def step(request: StepRequest) -> StepResponse:
123
 
124
  @app.get("/state")
125
  async def state() -> dict:
126
- """Return the current episode state."""
127
  try:
128
- s = env.state
129
- return s.model_dump()
130
  except Exception as e:
131
  raise HTTPException(status_code=500, detail=f"State failed: {str(e)}")
132
 
133
 
134
  @app.get("/tasks")
135
  async def list_tasks() -> dict:
136
- """List available task IDs per difficulty (for inspection)."""
137
  from server.tasks.task_easy import EASY_TASKS
138
  from server.tasks.task_medium import MEDIUM_TASKS
139
  from server.tasks.task_hard import HARD_TASKS
140
  return {
141
- "easy": [t["task_id"] for t in EASY_TASKS],
142
  "medium": [t["task_id"] for t in MEDIUM_TASKS],
143
- "hard": [t["task_id"] for t in HARD_TASKS],
144
- "total": len(EASY_TASKS) + len(MEDIUM_TASKS) + len(HARD_TASKS),
145
  }
146
 
147
- # ─── Run directly with: python server/app.py ─────────────────────────────────
148
  if __name__ == "__main__":
149
- import sys
150
  import uvicorn
151
- sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
152
- uvicorn.run("server.app:app", host="127.0.0.1", port=7860, reload=True)
 
1
+ # server/app.py — FastAPI server for Code Debug Environment
 
2
  # Port 7860 required for Hugging Face Spaces.
3
 
4
  from fastapi import FastAPI, HTTPException
 
13
 
14
  app = FastAPI(
15
  title="Code Debug Environment",
16
+ description="OpenEnv RL environment where LLM agents fix buggy Python code. 3 difficulty levels: easy, medium, hard.",
 
 
 
17
  version="1.0.0",
18
  )
19
 
20
+ app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 
 
 
 
 
21
 
 
 
22
  env = CodeDebugEnvironment()
23
 
24
 
 
 
25
class ResetRequest(BaseModel):
    """Request body for ``POST /reset``."""

    # Forwarded to env.reset(); the endpoint documents easy | medium | hard.
    difficulty: Optional[str] = None
 
27
 
28
class StepRequest(BaseModel):
    """Request body for ``POST /step``."""

    # Submitted code; the endpoint rejects empty or whitespace-only values.
    fixed_code: str
    # Optional free-text explanation passed through to DebugAction.
    explanation: Optional[str] = None
31
 
 
 
 
32
class StepResponse(BaseModel):
    """Response body returned by ``POST /step``."""

    # Observation serialised to a plain dict.
    observation: dict
    # Reward for the submitted step.
    reward: float
    # Whether the episode has finished.
    done: bool
36
 
37
 
 
 
38
@app.get("/", response_class=HTMLResponse)
async def root():
    """Serve ``static/index.html`` (next to this module) as the homepage."""
    static_dir = os.path.join(os.path.dirname(__file__), "static")
    index_file = os.path.join(static_dir, "index.html")
    with open(index_file, "r", encoding="utf-8") as page:
        return page.read()
44
 
45
 
46
@app.get("/favicon.ico", include_in_schema=False)
async def favicon():
    """Answer favicon requests with an empty ``image/x-icon`` body."""
    from fastapi.responses import Response

    empty_icon = Response(content=b"", media_type="image/x-icon")
    return empty_icon
50
+
51
+
52
@app.get("/health")
async def health():
    """Liveness probe: return a fixed status payload."""
    payload = {
        "status": "ok",
        "environment": "code-debug-env",
        "version": "1.0.0",
    }
    return payload
56
 
57
 
58
@app.post("/reset")
async def reset(request: ResetRequest = ResetRequest()) -> dict:
    """Start a new episode, optionally at a given difficulty.

    Returns the initial observation with a reward of 0.0 and ``done`` False.
    Any exception is reported as HTTP 500.
    """
    try:
        first_obs = env.reset(difficulty=request.difficulty)
        payload = {
            "observation": first_obs.model_dump(),
            "reward": 0.0,
            "done": False,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Reset failed: {str(e)}")
    return payload
66
 
67
 
68
@app.post("/step")
async def step(request: StepRequest) -> StepResponse:
    """Grade a submitted fix.

    Returns the observation together with its reward and done flag.

    Raises:
        HTTPException: 400 when ``fixed_code`` is empty or whitespace-only;
            500 when the environment raises anything other than
            ``TimeoutError``.
    """
    if not request.fixed_code or not request.fixed_code.strip():
        raise HTTPException(status_code=400, detail="fixed_code must not be empty.")
    try:
        action = DebugAction(fixed_code=request.fixed_code, explanation=request.explanation)
        obs = env.step(action)
        return StepResponse(observation=obs.model_dump(), reward=obs.reward or 0.0, done=obs.done)
    except TimeoutError:
        # A timeout is reported as a zero-reward step rather than a 500 so
        # the agent can retry. The feedback stays task-neutral: a timeout can
        # come from any non-terminating loop, not only from graph traversal.
        # NOTE(review): the placeholder observation carries no real task
        # data ("unknown", total_tests=3) — confirm clients tolerate that.
        return StepResponse(
            observation={"task_id": "unknown", "difficulty": "unknown", "buggy_code": "",
                         "instructions": "", "test_cases_description": "", "reward": 0.0,
                         "passed_tests": 0, "total_tests": 3, "done": False,
                         "feedback": "TimeoutError: Code execution timed out (likely an infinite loop). "
                                     "Check loop termination conditions; for graph traversal, add a visited set."},
            reward=0.0, done=False,
        )
    except Exception as e:
        # Chain the cause so server logs keep the original traceback.
        raise HTTPException(status_code=500, detail=f"Step failed: {str(e)}") from e
 
88
 
89
@app.get("/state")
async def state() -> dict:
    """Return the environment's current state as a dict (HTTP 500 on error)."""
    try:
        snapshot = env.state
        return snapshot.model_dump()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"State failed: {str(e)}")
96
 
97
 
98
@app.get("/tasks")
async def list_tasks() -> dict:
    """List the task IDs for each difficulty level plus the overall count."""
    from server.tasks.task_easy import EASY_TASKS
    from server.tasks.task_medium import MEDIUM_TASKS
    from server.tasks.task_hard import HARD_TASKS

    by_level = {"easy": EASY_TASKS, "medium": MEDIUM_TASKS, "hard": HARD_TASKS}
    listing = {
        level: [task["task_id"] for task in tasks]
        for level, tasks in by_level.items()
    }
    listing["total"] = sum(len(tasks) for tasks in by_level.values())
    return listing
110
 
111
+
112
  if __name__ == "__main__":
 
113
  import uvicorn
114
+ uvicorn.run("server.app:app", host="127.0.0.1", port=7860, reload=True)
 
server/graders/__pycache__/grader_easy.cpython-310.pyc CHANGED
Binary files a/server/graders/__pycache__/grader_easy.cpython-310.pyc and b/server/graders/__pycache__/grader_easy.cpython-310.pyc differ
 
server/graders/__pycache__/grader_hard.cpython-310.pyc CHANGED
Binary files a/server/graders/__pycache__/grader_hard.cpython-310.pyc and b/server/graders/__pycache__/grader_hard.cpython-310.pyc differ
 
server/graders/__pycache__/grader_medium.cpython-310.pyc CHANGED
Binary files a/server/graders/__pycache__/grader_medium.cpython-310.pyc and b/server/graders/__pycache__/grader_medium.cpython-310.pyc differ
 
server/graders/grader_easy.py CHANGED
@@ -1,6 +1,6 @@
1
  # server/graders/grader_easy.py
2
- # Grades easy tasks: 1 bug, 3 test cases.
3
- # Reward is proportional to tests passed (0.33, 0.66, 1.0).
4
 
5
  import traceback
6
  import signal
@@ -8,37 +8,32 @@ from typing import Tuple, List
8
 
9
 
10
  def _timeout_handler(signum, frame):
11
- raise TimeoutError("Code execution timed out (infinite loop or slow code)")
12
 
13
 
14
  def _run_code_safely(code: str, func_name: str, test_input):
15
- """
16
- Executes the submitted code in an isolated namespace and calls the function.
17
- Returns (output, error_message).
18
- Times out after 5 seconds to prevent infinite loops.
19
- """
20
  namespace = {}
21
  try:
22
  exec(compile(code, "<submitted>", "exec"), namespace)
23
  except SyntaxError as e:
24
  return None, f"SyntaxError: {e}"
25
  except Exception as e:
26
- return None, f"Compile error: {e}"
27
 
28
  func = namespace.get(func_name)
29
  if func is None:
30
- funcs = [v for v in namespace.values() if callable(v) and not v.__name__.startswith("_")]
31
  if not funcs:
32
  return None, "No callable function found in submitted code."
33
  func = funcs[0]
34
 
35
  try:
36
- # Set 5 second timeout to catch infinite loops
37
  try:
38
  signal.signal(signal.SIGALRM, _timeout_handler)
39
  signal.alarm(5)
40
  except (AttributeError, OSError):
41
- pass # Windows doesn't support SIGALRM, skip timeout
42
 
43
  if isinstance(test_input, list) and len(test_input) > 0 and isinstance(test_input[0], list):
44
  result = func(*test_input)
@@ -51,13 +46,14 @@ def _run_code_safely(code: str, func_name: str, test_input):
51
  result = func(test_input)
52
 
53
  try:
54
- signal.alarm(0) # Cancel timeout
55
  except (AttributeError, OSError):
56
  pass
57
 
58
  return result, None
 
59
  except TimeoutError as e:
60
- return None, f"TimeoutError: {e}"
61
  except Exception as e:
62
  try:
63
  signal.alarm(0)
@@ -67,7 +63,6 @@ def _run_code_safely(code: str, func_name: str, test_input):
67
 
68
 
69
  def _extract_func_name(code: str) -> str:
70
- """Extract the first function name defined in the code."""
71
  for line in code.splitlines():
72
  line = line.strip()
73
  if line.startswith("def "):
@@ -77,14 +72,8 @@ def _extract_func_name(code: str) -> str:
77
 
78
  def grade_easy(fixed_code: str, task: dict) -> Tuple[float, int, int, str, List[dict]]:
79
  """
80
- Grade an easy task submission.
81
-
82
- Returns:
83
- reward (float): 0.0 to 1.0
84
- passed (int): number of tests passed
85
- total (int): total test cases
86
- feedback (str): detailed feedback message
87
- results (list): per-test results
88
  """
89
  test_cases = task["test_cases"]
90
  total = len(test_cases)
@@ -99,21 +88,18 @@ def grade_easy(fixed_code: str, task: dict) -> Tuple[float, int, int, str, List[
99
  got, error = _run_code_safely(fixed_code, func_name, inp)
100
 
101
  if error:
102
- results.append({"test_id": i + 1, "passed": False, "expected": str(expected), "got": f"ERROR: {error}"})
103
  feedback_lines.append(f"Test {i+1}: ❌ Error\n Input : {inp!r}\n Expected : {expected!r}\n Error : {error}")
104
  elif got == expected:
105
  passed += 1
106
- results.append({"test_id": i + 1, "passed": True, "expected": str(expected), "got": str(got)})
107
  feedback_lines.append(f"Test {i+1}: ✅ Passed\n Input : {inp!r}\n Expected : {expected!r}\n Got : {got!r}")
108
  else:
109
- results.append({"test_id": i + 1, "passed": False, "expected": str(expected), "got": str(got)})
110
  feedback_lines.append(f"Test {i+1}: ❌ Failed\n Input : {inp!r}\n Expected : {expected!r}\n Got : {got!r}")
111
 
112
  reward = round(passed / total, 2)
113
  feedback = "\n".join(feedback_lines)
114
- if passed == total:
115
- feedback += "\n🎉 All tests passed! Full reward."
116
- else:
117
- feedback += f"\n{passed}/{total} tests passed. Review the failing cases."
118
 
119
- return reward, passed, total, feedback, results
 
1
  # server/graders/grader_easy.py
2
+ # Grades easy and medium tasks: runs code against test cases.
3
+ # Reward is proportional to tests passed (0.33, 0.67, 1.0).
4
 
5
  import traceback
6
  import signal
 
8
 
9
 
10
  def _timeout_handler(signum, frame):
11
+ raise TimeoutError("Code timed out — likely infinite loop. Check for missing visited set in graph traversal.")
12
 
13
 
14
  def _run_code_safely(code: str, func_name: str, test_input):
15
+ """Run submitted code safely with 5s timeout. Returns (result, error)."""
 
 
 
 
16
  namespace = {}
17
  try:
18
  exec(compile(code, "<submitted>", "exec"), namespace)
19
  except SyntaxError as e:
20
  return None, f"SyntaxError: {e}"
21
  except Exception as e:
22
+ return None, f"CompileError: {e}"
23
 
24
  func = namespace.get(func_name)
25
  if func is None:
26
+ funcs = [v for v in namespace.values() if callable(v) and not str(v.__name__).startswith("_")]
27
  if not funcs:
28
  return None, "No callable function found in submitted code."
29
  func = funcs[0]
30
 
31
  try:
 
32
  try:
33
  signal.signal(signal.SIGALRM, _timeout_handler)
34
  signal.alarm(5)
35
  except (AttributeError, OSError):
36
+ pass # Windows has no SIGALRM
37
 
38
  if isinstance(test_input, list) and len(test_input) > 0 and isinstance(test_input[0], list):
39
  result = func(*test_input)
 
46
  result = func(test_input)
47
 
48
  try:
49
+ signal.alarm(0)
50
  except (AttributeError, OSError):
51
  pass
52
 
53
  return result, None
54
+
55
  except TimeoutError as e:
56
+ return None, str(e)
57
  except Exception as e:
58
  try:
59
  signal.alarm(0)
 
63
 
64
 
65
  def _extract_func_name(code: str) -> str:
 
66
  for line in code.splitlines():
67
  line = line.strip()
68
  if line.startswith("def "):
 
72
 
73
  def grade_easy(fixed_code: str, task: dict) -> Tuple[float, int, int, str, List[dict]]:
74
  """
75
+ Grade submission against test cases.
76
+ Returns: (reward, passed, total, feedback, results)
 
 
 
 
 
 
77
  """
78
  test_cases = task["test_cases"]
79
  total = len(test_cases)
 
88
  got, error = _run_code_safely(fixed_code, func_name, inp)
89
 
90
  if error:
91
+ results.append({"test_id": i+1, "passed": False, "expected": str(expected), "got": f"ERROR"})
92
  feedback_lines.append(f"Test {i+1}: ❌ Error\n Input : {inp!r}\n Expected : {expected!r}\n Error : {error}")
93
  elif got == expected:
94
  passed += 1
95
+ results.append({"test_id": i+1, "passed": True, "expected": str(expected), "got": str(got)})
96
  feedback_lines.append(f"Test {i+1}: ✅ Passed\n Input : {inp!r}\n Expected : {expected!r}\n Got : {got!r}")
97
  else:
98
+ results.append({"test_id": i+1, "passed": False, "expected": str(expected), "got": str(got)})
99
  feedback_lines.append(f"Test {i+1}: ❌ Failed\n Input : {inp!r}\n Expected : {expected!r}\n Got : {got!r}")
100
 
101
  reward = round(passed / total, 2)
102
  feedback = "\n".join(feedback_lines)
103
+ feedback += "\n🎉 All tests passed! Full reward." if passed == total else f"\n{passed}/{total} tests passed."
 
 
 
104
 
105
+ return reward, passed, total, feedback, results
server/graders/grader_hard.py CHANGED
@@ -8,13 +8,10 @@ from .grader_easy import grade_easy
8
 
9
  def _score_explanation(explanation: Optional[str], keywords: List[str]) -> Tuple[float, str]:
10
  """
11
- Scores the explanation by checking for required conceptual keywords.
12
- Returns (score 0.0-1.0, feedback string).
13
-
14
- Scoring:
15
  - No explanation → 0.0
16
- - At least 1 keyword hit → partial credit
17
- - Half or more keywords → full credit 1.0
18
  """
19
  if not explanation or len(explanation.strip()) < 10:
20
  return 0.0, "❌ No explanation provided. Hard tasks require an explanation field."
@@ -23,31 +20,28 @@ def _score_explanation(explanation: Optional[str], keywords: List[str]) -> Tuple
23
  hits = [kw for kw in keywords if kw.lower() in explanation_lower]
24
 
25
  if not keywords:
26
- # No keywords defined — give full credit for any explanation
27
  score = 1.0 if len(explanation.strip()) > 20 else 0.5
28
  else:
29
- # Need at least 1 hit for partial, half for full
30
- needed_for_full = max(1, len(keywords) // 2)
31
  if len(hits) == 0:
32
  score = 0.0
33
- elif len(hits) >= needed_for_full:
34
  score = 1.0
35
  else:
36
- # Partial credit proportional to hits
37
- score = round(len(hits) / needed_for_full, 2)
38
 
39
  if score == 1.0:
40
- feedback = f"✅ Explanation excellent! Mentioned key concepts: {', '.join(hits)}"
41
  elif score > 0:
42
  missing = [kw for kw in keywords if kw.lower() not in explanation_lower]
43
  feedback = (
44
- f"⚠️ Partial explanation (score={score}). Mentioned: {', '.join(hits) if hits else 'none'}. "
45
- f"Consider also discussing: {', '.join(missing[:3])}"
46
  )
47
  else:
48
  feedback = (
49
  f"❌ Explanation missing key concepts. "
50
- f"Try to explain: {', '.join(keywords[:3])} in your analysis."
51
  )
52
 
53
  return round(score, 2), feedback
@@ -56,23 +50,11 @@ def _score_explanation(explanation: Optional[str], keywords: List[str]) -> Tuple
56
  def grade_hard(fixed_code: str, task: dict, explanation: Optional[str] = None) -> Tuple[float, int, int, str, List[dict]]:
57
  """
58
  Grade a hard task submission.
59
- Reward = 0.7 * test_score + 0.3 * explanation_score
60
-
61
- Returns:
62
- reward (float): 0.0 to 1.0
63
- passed (int)
64
- total (int)
65
- feedback (str)
66
- results (list)
67
  """
68
- # Grade code using easy grader (same test execution logic)
69
  test_reward, passed, total, code_feedback, results = grade_easy(fixed_code, task)
70
-
71
- # Grade explanation
72
  keywords = task.get("explanation_keywords", [])
73
  exp_score, exp_feedback = _score_explanation(explanation, keywords)
74
-
75
- # Combined reward
76
  final_reward = round(0.7 * test_reward + 0.3 * exp_score, 2)
77
 
78
  feedback = (
@@ -83,71 +65,9 @@ def grade_hard(fixed_code: str, task: dict, explanation: Optional[str] = None) -
83
  f"=== Final Reward: {final_reward:.2f} ==="
84
  )
85
 
86
- if passed < total and not explanation:
87
- feedback += "\n💡 Tip: Fix the code bugs AND provide a clear explanation for max reward."
88
-
89
  if passed == total and exp_score < 1.0:
90
- feedback += f"\n💡 Tip: Your code is correct! Improve explanation by mentioning: {', '.join(keywords[:3])}"
 
 
91
 
92
  return final_reward, passed, total, feedback, results
93
- """
94
- Scores the explanation by checking for required conceptual keywords.
95
- Returns (score 0.0-1.0, feedback string).
96
- """
97
- if not explanation or len(explanation.strip()) < 10:
98
- return 0.0, "❌ No explanation provided. Hard tasks require an explanation field."
99
-
100
- explanation_lower = explanation.lower()
101
- hits = [kw for kw in keywords if kw.lower() in explanation_lower]
102
- score = min(1.0, len(hits) / max(1, len(keywords) // 2)) # need at least half the keywords
103
-
104
- if score == 1.0:
105
- feedback = f"✅ Explanation excellent! Mentioned key concepts: {', '.join(hits)}"
106
- elif score > 0:
107
- feedback = (
108
- f"⚠️ Partial explanation. Mentioned: {', '.join(hits) if hits else 'none'}. "
109
- f"Consider discussing: {', '.join(kw for kw in keywords if kw.lower() not in explanation_lower)[:3]}"
110
- )
111
- else:
112
- feedback = (
113
- f"❌ Explanation missing key concepts. "
114
- f"Try to explain: {', '.join(keywords[:3])} in your analysis."
115
- )
116
-
117
- return round(score, 2), feedback
118
-
119
-
120
- def grade_hard(fixed_code: str, task: dict, explanation: Optional[str] = None) -> Tuple[float, int, int, str, List[dict]]:
121
- """
122
- Grade a hard task submission.
123
- Reward = 0.7 * test_score + 0.3 * explanation_score
124
-
125
- Returns:
126
- reward (float): 0.0 to 1.0
127
- passed (int)
128
- total (int)
129
- feedback (str)
130
- results (list)
131
- """
132
- # Grade code
133
- test_reward, passed, total, code_feedback, results = grade_easy(fixed_code, task)
134
-
135
- # Grade explanation
136
- keywords = task.get("explanation_keywords", [])
137
- exp_score, exp_feedback = _score_explanation(explanation, keywords)
138
-
139
- # Combined reward
140
- final_reward = round(0.7 * test_reward + 0.3 * exp_score, 2)
141
-
142
- feedback = (
143
- f"--- Code Score (70% weight): {test_reward:.2f} ---\n"
144
- f"{code_feedback}\n\n"
145
- f"--- Explanation Score (30% weight): {exp_score:.2f} ---\n"
146
- f"{exp_feedback}\n\n"
147
- f"=== Final Reward: {final_reward:.2f} ==="
148
- )
149
-
150
- if passed < total and not explanation:
151
- feedback += "\n💡 Tip: Fix the code bugs AND provide a clear explanation for max reward."
152
-
153
- return final_reward, passed, total, feedback, results
 
8
 
9
  def _score_explanation(explanation: Optional[str], keywords: List[str]) -> Tuple[float, str]:
10
  """
11
+ Score explanation by checking for required conceptual keywords.
 
 
 
12
  - No explanation → 0.0
13
+ - 1+ keyword hit → partial credit proportional to hits
14
+ - Half or more keywords → 1.0
15
  """
16
  if not explanation or len(explanation.strip()) < 10:
17
  return 0.0, "❌ No explanation provided. Hard tasks require an explanation field."
 
20
  hits = [kw for kw in keywords if kw.lower() in explanation_lower]
21
 
22
  if not keywords:
 
23
  score = 1.0 if len(explanation.strip()) > 20 else 0.5
24
  else:
25
+ needed = max(1, len(keywords) // 2)
 
26
  if len(hits) == 0:
27
  score = 0.0
28
+ elif len(hits) >= needed:
29
  score = 1.0
30
  else:
31
+ score = round(len(hits) / needed, 2)
 
32
 
33
  if score == 1.0:
34
+ feedback = f"✅ Explanation excellent! Mentioned: {', '.join(hits)}"
35
  elif score > 0:
36
  missing = [kw for kw in keywords if kw.lower() not in explanation_lower]
37
  feedback = (
38
+ f"⚠️ Partial explanation (score={score}). Mentioned: {', '.join(hits) or 'none'}. "
39
+ f"Also discuss: {', '.join(missing[:3])}"
40
  )
41
  else:
42
  feedback = (
43
  f"❌ Explanation missing key concepts. "
44
+ f"Explain: {', '.join(keywords[:3])}"
45
  )
46
 
47
  return round(score, 2), feedback
 
50
  def grade_hard(fixed_code: str, task: dict, explanation: Optional[str] = None) -> Tuple[float, int, int, str, List[dict]]:
51
  """
52
  Grade a hard task submission.
53
+ Reward = 0.7 × test_score + 0.3 × explanation_score
 
 
 
 
 
 
 
54
  """
 
55
  test_reward, passed, total, code_feedback, results = grade_easy(fixed_code, task)
 
 
56
  keywords = task.get("explanation_keywords", [])
57
  exp_score, exp_feedback = _score_explanation(explanation, keywords)
 
 
58
  final_reward = round(0.7 * test_reward + 0.3 * exp_score, 2)
59
 
60
  feedback = (
 
65
  f"=== Final Reward: {final_reward:.2f} ==="
66
  )
67
 
 
 
 
68
  if passed == total and exp_score < 1.0:
69
+ feedback += f"\n💡 Code is correct! Improve explanation by mentioning: {', '.join(keywords[:3])}"
70
+ elif passed < total and not explanation:
71
+ feedback += "\n💡 Fix the code AND provide a clear explanation for max reward."
72
 
73
  return final_reward, passed, total, feedback, results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/tasks/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (449 Bytes)
 
server/tasks/__pycache__/task_easy.cpython-310.pyc CHANGED
Binary files a/server/tasks/__pycache__/task_easy.cpython-310.pyc and b/server/tasks/__pycache__/task_easy.cpython-310.pyc differ
 
server/tasks/__pycache__/task_easy.cpython-39.pyc DELETED
Binary file (7.37 kB)
 
server/tasks/__pycache__/task_hard.cpython-310.pyc CHANGED
Binary files a/server/tasks/__pycache__/task_hard.cpython-310.pyc and b/server/tasks/__pycache__/task_hard.cpython-310.pyc differ
 
server/tasks/__pycache__/task_hard.cpython-39.pyc DELETED
Binary file (16.5 kB)
 
server/tasks/__pycache__/task_medium.cpython-310.pyc CHANGED
Binary files a/server/tasks/__pycache__/task_medium.cpython-310.pyc and b/server/tasks/__pycache__/task_medium.cpython-310.pyc differ
 
server/tasks/__pycache__/task_medium.cpython-39.pyc DELETED
Binary file (10.5 kB)
 
server/tasks/task_hard.py CHANGED
@@ -625,4 +625,4 @@ def get_task_by_id(task_id: str) -> dict:
625
  for t in HARD_TASKS:
626
  if t["task_id"] == task_id:
627
  return t.copy()
628
- return random.choice(HARD_TASKS).copy()
 
625
  for t in HARD_TASKS:
626
  if t["task_id"] == task_id:
627
  return t.copy()
628
+ return random.choice(HARD_TASKS).copy()
server/tasks/task_medium.py CHANGED
@@ -504,4 +504,4 @@ def get_task_by_id(task_id: str) -> dict:
504
  for t in MEDIUM_TASKS:
505
  if t["task_id"] == task_id:
506
  return t.copy()
507
- return random.choice(MEDIUM_TASKS).copy()
 
504
  for t in MEDIUM_TASKS:
505
  if t["task_id"] == task_id:
506
  return t.copy()
507
+ return random.choice(MEDIUM_TASKS).copy()
spec.md CHANGED
@@ -1,23 +1,24 @@
1
- # Code Debug Environment — Specification
2
 
3
  ## Overview
 
 
 
 
4
 
5
- The Code Debug Environment is an OpenEnv-compatible RL environment where an LLM agent diagnoses and fixes buggy Python code across three difficulty levels.
6
 
7
- ---
8
-
9
- ## API Specification
10
-
11
- ### POST /reset
12
- Start a new episode.
13
-
14
- **Request:**
15
  ```json
16
- {"difficulty": "easy"}
17
  ```
18
 
19
- **Response:**
20
  ```json
 
 
 
 
21
  {
22
  "observation": {
23
  "task_id": "easy_003",
@@ -25,112 +26,53 @@ Start a new episode.
25
  "buggy_code": "def find_max(nums):\n return min(nums)",
26
  "instructions": "The function has exactly one bug. Fix it.",
27
  "test_cases_description": "Finds max value in a list",
28
- "reward": null,
29
- "passed_tests": null,
30
- "total_tests": 3,
31
- "feedback": null,
32
- "done": false
33
  },
34
- "reward": 0.0,
35
- "done": false
36
  }
37
  ```
38
 
39
- ---
40
-
41
  ### POST /step
42
- Submit a code fix.
43
-
44
- **Request:**
45
  ```json
46
- {
47
- "fixed_code": "def find_max(nums):\n return max(nums)",
48
- "explanation": "Optional for hard tasks"
49
- }
50
- ```
51
 
52
- **Response:**
53
- ```json
54
  {
55
  "observation": {
56
- "task_id": "easy_003",
57
- "reward": 1.0,
58
- "passed_tests": 3,
59
- "total_tests": 3,
60
  "feedback": "Test 1: ✅ Passed\n Input: [1,2,3]\n Expected: 3\n Got: 3",
61
  "done": true
62
  },
63
- "reward": 1.0,
64
- "done": true
65
  }
66
  ```
67
 
68
- ---
69
-
70
  ### GET /state
71
- Returns current episode state.
72
-
73
  ```json
74
- {
75
- "episode_id": "uuid",
76
- "task_id": "easy_003",
77
- "difficulty": "easy",
78
- "step_count": 1,
79
- "max_steps": 5,
80
- "current_reward": 1.0,
81
- "best_reward": 1.0,
82
- "done": true
83
- }
84
  ```
85
 
86
- ---
87
-
88
- ### GET /health
89
  ```json
90
- {"status": "ok", "environment": "code-debug-env", "version": "1.0.0"}
91
  ```
92
 
93
- ---
 
 
 
 
 
94
 
95
- ## Reward Function
96
-
97
- ### Easy & Medium
98
- ```
99
- reward = passed_tests / total_tests
100
- ```
101
- - 3/3 → 1.00
102
- - 2/3 → 0.67
103
- - 1/3 → 0.33
104
- - 0/3 → 0.00
105
-
106
- ### Hard
107
- ```
108
- reward = 0.7 × test_score + 0.3 × explanation_score
109
- ```
110
-
111
- ### Invalid Actions
112
- - Empty code → reward = 0.0 + feedback message
113
- - Non-Python code → reward = 0.0 + feedback message
114
-
115
- ---
116
 
117
  ## Episode Rules
118
-
119
  - Max 5 steps per episode
120
- - Episode ends when reward = 1.0 OR max steps reached
121
- - Each step runs fixed_code against 3 deterministic test cases
122
- - Feedback shows Input, Expected, Got for each test
123
-
124
- ---
125
-
126
- ## Task Domains
127
-
128
- | Domain | Examples |
129
- |---|---|
130
- | List operations | second element, max, flatten |
131
- | String algorithms | palindrome, reverse, word count |
132
- | Math | fibonacci, factorial, square root |
133
- | Sorting | bubble sort, binary search |
134
- | Data processing | JSON parsing, API validation |
135
- | Graph algorithms | BFS, cycle detection |
136
- | Dynamic programming | knapsack, longest subsequence |
 
1
+ # Code Debug Environment — API Specification
2
 
3
  ## Overview
4
+ OpenEnv-compatible RL environment where LLM agents fix buggy Python code.
5
+ - 45 tasks: 15 easy + 15 medium + 15 hard
6
+ - Partial rewards: 0.0, 0.33, 0.67, or 1.0, proportional to the number of test cases passed (out of 3)
7
+ - Hard tasks: reward = 0.7×code + 0.3×explanation
8
 
9
+ ## Endpoints
10
 
11
+ ### GET /health
 
 
 
 
 
 
 
12
  ```json
13
+ {"status": "ok", "environment": "code-debug-env", "version": "1.0.0"}
14
  ```
15
 
16
+ ### POST /reset
17
  ```json
18
+ // Request
19
+ {"difficulty": "easy"} // or "medium", "hard", omit for random
20
+
21
+ // Response
22
  {
23
  "observation": {
24
  "task_id": "easy_003",
 
26
  "buggy_code": "def find_max(nums):\n return min(nums)",
27
  "instructions": "The function has exactly one bug. Fix it.",
28
  "test_cases_description": "Finds max value in a list",
29
+ "reward": null, "passed_tests": null, "total_tests": 3,
30
+ "feedback": null, "done": false
 
 
 
31
  },
32
+ "reward": 0.0, "done": false
 
33
  }
34
  ```
35
 
 
 
36
  ### POST /step
 
 
 
37
  ```json
38
+ // Request
39
+ {"fixed_code": "def find_max(nums):\n return max(nums)", "explanation": "required for full reward on hard tasks"}
 
 
 
40
 
41
+ // Response
 
42
  {
43
  "observation": {
44
+ "task_id": "easy_003", "reward": 1.0,
45
+ "passed_tests": 3, "total_tests": 3,
 
 
46
  "feedback": "Test 1: ✅ Passed\n Input: [1,2,3]\n Expected: 3\n Got: 3",
47
  "done": true
48
  },
49
+ "reward": 1.0, "done": true
 
50
  }
51
  ```
52
 
 
 
53
  ### GET /state
 
 
54
  ```json
55
+ {"episode_id": "uuid", "task_id": "easy_003", "difficulty": "easy",
56
+ "step_count": 1, "max_steps": 5, "current_reward": 1.0, "best_reward": 1.0, "done": true}
 
 
 
 
 
 
 
 
57
  ```
58
 
59
+ ### GET /tasks
 
 
60
  ```json
61
+ {"easy": ["easy_001",...], "medium": ["medium_001",...], "hard": ["hard_001",...], "total": 45}
62
  ```
63
 
64
+ ## Reward Design
65
+ | Task | Formula |
66
+ |------|---------|
67
+ | Easy | passed/3 |
68
+ | Medium | passed/3 |
69
+ | Hard | 0.7×code_score + 0.3×explanation_score |
70
 
71
+ ## Invalid Actions
72
+ - Empty code → reward=0.0 + penalty feedback
73
+ - Infinite loop → the affected test case times out after 5 seconds (TimeoutError) and counts as failed; feedback includes a hint to add a visited set
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  ## Episode Rules
 
76
  - Max 5 steps per episode
77
+ - Ends when reward=1.0 OR max steps reached
78
+ - 3 deterministic test cases per task
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/test_graders.py CHANGED
@@ -1,9 +1,5 @@
1
- # tests/test_graders.py
2
- # Basic tests to verify all graders work correctly.
3
- # Run: python -m pytest tests/ -v
4
-
5
- import sys
6
- import os
7
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
8
 
9
  from server.graders.grader_easy import grade_easy
@@ -14,56 +10,41 @@ from server.tasks.task_medium import MEDIUM_TASKS
14
  from server.tasks.task_hard import HARD_TASKS
15
 
16
 
17
- def test_easy_tasks_count():
18
- assert len(EASY_TASKS) == 15, f"Expected 15 easy tasks, got {len(EASY_TASKS)}"
19
-
20
-
21
- def test_medium_tasks_count():
22
- assert len(MEDIUM_TASKS) == 15, f"Expected 15 medium tasks, got {len(MEDIUM_TASKS)}"
23
-
24
-
25
- def test_hard_tasks_count():
26
- assert len(HARD_TASKS) == 15, f"Expected 15 hard tasks, got {len(HARD_TASKS)}"
27
-
28
 
29
- def test_easy_correct_fix_scores_1():
30
- for task in EASY_TASKS:
31
- reward, passed, total, _, _ = grade_easy(task["fixed_code"], task)
32
- assert reward == 1.0, f"{task['task_id']} should score 1.0, got {reward}"
33
 
 
 
 
 
34
 
35
- def test_medium_correct_fix_scores_1():
36
- for task in MEDIUM_TASKS:
37
- reward, passed, total, _, _ = grade_medium(task["fixed_code"], task)
38
- assert reward == 1.0, f"{task['task_id']} should score 1.0, got {reward}"
 
39
 
 
 
 
 
40
 
41
- def test_hard_correct_fix_scores_high():
42
- for task in HARD_TASKS:
43
- keywords = task.get("explanation_keywords", [])
44
- explanation = " ".join(keywords)
45
- reward, passed, total, _, _ = grade_hard(task["fixed_code"], task, explanation)
46
- assert reward >= 0.9, f"{task['task_id']} should score >= 0.9, got {reward}"
47
-
48
-
49
- def test_reward_range():
50
- for task in EASY_TASKS + MEDIUM_TASKS:
51
- reward, _, _, _, _ = grade_easy(task["buggy_code"], task)
52
- assert 0.0 <= reward <= 1.0, f"Reward out of range: {reward}"
53
-
54
 
55
  def test_empty_code_returns_zero():
56
- task = EASY_TASKS[0]
57
- reward, passed, total, feedback, _ = grade_easy("", task)
58
- assert reward == 0.0
59
-
60
-
61
- def test_buggy_code_scores_less_than_1():
62
- for task in EASY_TASKS[:5]:
63
- reward, _, _, _, _ = grade_easy(task["buggy_code"], task)
64
- assert reward < 1.0, f"{task['task_id']} buggy code should not score 1.0"
65
-
66
 
67
  if __name__ == "__main__":
68
  import pytest
69
- pytest.main([__file__, "-v"])
 
1
+ # tests/test_graders.py — Run: python -m pytest tests/ -v
2
+ import sys, os
 
 
 
 
3
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
4
 
5
  from server.graders.grader_easy import grade_easy
 
10
  from server.tasks.task_hard import HARD_TASKS
11
 
12
 
13
+ def test_task_counts():
14
+ assert len(EASY_TASKS) == 15
15
+ assert len(MEDIUM_TASKS) == 15
16
+ assert len(HARD_TASKS) == 15
 
 
 
 
 
 
 
17
 
18
+ def test_easy_correct_scores_1():
19
+ for t in EASY_TASKS:
20
+ r, _, _, _, _ = grade_easy(t["fixed_code"], t)
21
+ assert r == 1.0, f"{t['task_id']} expected 1.0 got {r}"
22
 
23
+ def test_medium_correct_scores_1():
24
+ for t in MEDIUM_TASKS:
25
+ r, _, _, _, _ = grade_medium(t["fixed_code"], t)
26
+ assert r == 1.0, f"{t['task_id']} expected 1.0 got {r}"
27
 
28
+ def test_hard_correct_scores_high():
29
+ for t in HARD_TASKS:
30
+ keywords = t.get("explanation_keywords", [])
31
+ r, _, _, _, _ = grade_hard(t["fixed_code"], t, " ".join(keywords))
32
+ assert r >= 0.9, f"{t['task_id']} expected >=0.9 got {r}"
33
 
34
+ def test_reward_in_range():
35
+ for t in EASY_TASKS:
36
+ r, _, _, _, _ = grade_easy(t["buggy_code"], t)
37
+ assert 0.0 <= r <= 1.0
38
 
39
+ def test_buggy_scores_less_than_1():
40
+ for t in EASY_TASKS[:5]:
41
+ r, _, _, _, _ = grade_easy(t["buggy_code"], t)
42
+ assert r < 1.0, f"{t['task_id']} buggy code should not score 1.0"
 
 
 
 
 
 
 
 
 
43
 
44
  def test_empty_code_returns_zero():
45
+ r, _, _, _, _ = grade_easy("", EASY_TASKS[0])
46
+ assert r == 0.0
 
 
 
 
 
 
 
 
47
 
48
  if __name__ == "__main__":
49
  import pytest
50
+ pytest.main([__file__, "-v"])