Souravdanyal committed on
Commit
c01667e
·
1 Parent(s): 08efbe6

Fix log format, add SPEC.md, tests, invalid action penalty, max_steps=5

Browse files
Files changed (5) hide show
  1. inference.py +74 -162
  2. openenv.yaml +79 -10
  3. server/environment.py +37 -2
  4. spec.md +136 -0
  5. tests/test_graders.py +69 -0
inference.py CHANGED
@@ -1,21 +1,12 @@
1
  #!/usr/bin/env python3
2
- # inference.py
3
- # ─────────────────────────────────────────────────────────────────────────────
4
- # Baseline inference script for the Code Debug Environment.
5
- # Must be run from the project root.
6
- #
7
- # Required environment variables:
8
- # API_BASE_URL — LLM API endpoint (OpenAI-compatible)
9
- # MODEL_NAME — Model identifier
10
- # HF_TOKEN — Hugging Face / API key
11
  #
 
12
  # Usage:
13
  # python inference.py
14
- # python inference.py --url https://your-hf-space.hf.space
15
  # python inference.py --difficulty easy
16
- #
17
- # Log format: [START], [STEP], [END] — strictly followed for evaluation scoring.
18
- # ─────────────────────────────────────────────────────────────────────────────
19
 
20
  import os
21
  import sys
@@ -24,96 +15,52 @@ import time
24
  import argparse
25
  import requests
26
  from openai import OpenAI
 
27
 
28
  # ─── Configuration ────────────────────────────────────────────────────────────
29
-
30
  API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
31
- MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
32
- HF_TOKEN = os.environ.get("HF_TOKEN", "")
33
- ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
34
-
35
- MAX_STEPS = 3
36
- DIFFICULTIES = ["easy", "medium", "hard"]
37
-
38
 
39
  # ─── OpenAI Client ───────────────────────────────────────────────────────────
 
40
 
41
- client = OpenAI(
42
- api_key=HF_TOKEN or "dummy",
43
- base_url=API_BASE_URL,
44
- )
45
-
46
-
47
- # ─── Logging (strict format required by evaluator) ───────────────────────────
48
-
49
- def log_start(task_id: str, difficulty: str, episode: int):
50
- print(json.dumps({
51
- "type": "START",
52
- "episode": episode,
53
- "task_id": task_id,
54
- "difficulty": difficulty,
55
- "timestamp": time.time(),
56
- }), flush=True)
57
-
58
 
59
- def log_step(task_id: str, step: int, action_summary: str, reward: float, done: bool):
60
- print(json.dumps({
61
- "type": "STEP",
62
- "task_id": task_id,
63
- "step": step,
64
- "action": action_summary,
65
- "reward": reward,
66
- "done": done,
67
- "timestamp": time.time(),
68
- }), flush=True)
69
 
 
 
 
 
70
 
71
- def log_end(task_id: str, difficulty: str, final_reward: float, steps_taken: int, episode: int):
72
- print(json.dumps({
73
- "type": "END",
74
- "episode": episode,
75
- "task_id": task_id,
76
- "difficulty": difficulty,
77
- "final_reward": final_reward,
78
- "steps_taken": steps_taken,
79
- "timestamp": time.time(),
80
- }), flush=True)
81
-
82
 
83
  # ─── Environment Client ───────────────────────────────────────────────────────
84
-
85
  def env_reset(env_url: str, difficulty: str) -> dict:
86
- resp = requests.post(
87
- f"{env_url}/reset",
88
- json={"difficulty": difficulty},
89
- timeout=30,
90
- )
91
  resp.raise_for_status()
92
  return resp.json()
93
 
94
-
95
  def env_step(env_url: str, fixed_code: str, explanation: str = None) -> dict:
96
  payload = {"fixed_code": fixed_code}
97
  if explanation:
98
  payload["explanation"] = explanation
99
- resp = requests.post(
100
- f"{env_url}/step",
101
- json=payload,
102
- timeout=30,
103
- )
104
- resp.raise_for_status()
105
- return resp.json()
106
-
107
-
108
- def env_state(env_url: str) -> dict:
109
- resp = requests.get(f"{env_url}/state", timeout=10)
110
  resp.raise_for_status()
111
  return resp.json()
112
 
113
-
114
  # ─── LLM Agent ───────────────────────────────────────────────────────────────
115
-
116
- SYSTEM_PROMPT = """You are an expert Python debugging agent.
117
  You will be given buggy Python code and must fix it.
118
 
119
  For easy tasks: fix the single bug.
@@ -133,11 +80,8 @@ Rules:
133
  - Do NOT include markdown fences or any text outside the JSON object.
134
  """
135
 
136
-
137
  def call_llm(buggy_code: str, instructions: str, difficulty: str,
138
  feedback: str = None, attempt: int = 1) -> dict:
139
- """Call the LLM and return parsed {fixed_code, explanation}."""
140
-
141
  user_content = f"""Task difficulty: {difficulty}
142
  Instructions: {instructions}
143
 
@@ -156,130 +100,98 @@ Buggy code:
156
 
157
  try:
158
  response = client.chat.completions.create(
159
- model=MODEL_NAME,
160
- messages=messages,
161
- max_tokens=1000,
162
- temperature=0.1,
163
  )
164
  content = response.choices[0].message.content.strip()
165
-
166
- # Strip markdown fences if present
167
  if content.startswith("```"):
168
  lines = content.split("\n")
169
  content = "\n".join(lines[1:-1]) if lines[-1] == "```" else "\n".join(lines[1:])
170
-
171
  parsed = json.loads(content)
172
- return {
173
- "fixed_code": parsed.get("fixed_code", ""),
174
- "explanation": parsed.get("explanation", None),
175
- }
176
  except json.JSONDecodeError:
177
- # Fallback: return original code if parsing fails
178
  return {"fixed_code": buggy_code, "explanation": None}
179
  except Exception as e:
180
- print(f"LLM call failed: {e}", file=sys.stderr)
181
  return {"fixed_code": buggy_code, "explanation": None}
182
 
183
-
184
  # ─── Main Episode Loop ────────────────────────────────────────────────────────
185
-
186
- def run_episode(env_url: str, difficulty: str, episode_num: int) -> float:
187
- """Run one full episode. Returns final reward."""
188
-
189
- # Reset
190
  reset_data = env_reset(env_url, difficulty)
191
  obs = reset_data["observation"]
192
-
193
- task_id = obs["task_id"]
194
- buggy_code = obs["buggy_code"]
195
  instructions = obs["instructions"]
196
 
197
- log_start(task_id, difficulty, episode_num)
198
 
199
  last_feedback = None
200
- final_reward = 0.0
201
- step_num = 0
 
202
 
203
  for attempt in range(1, MAX_STEPS + 1):
204
- step_num = attempt
205
-
206
- # Call LLM
207
  agent_action = call_llm(
208
- buggy_code=buggy_code,
209
- instructions=instructions,
210
- difficulty=difficulty,
211
- feedback=last_feedback,
212
- attempt=attempt,
213
  )
 
214
 
215
- # Submit to environment
216
- result = env_step(
217
- env_url,
218
- fixed_code=agent_action["fixed_code"],
219
- explanation=agent_action.get("explanation"),
220
- )
 
 
 
 
 
221
 
222
  reward = result.get("reward", 0.0)
223
- done = result.get("done", False)
224
- obs_result = result.get("observation", {})
225
- last_feedback = obs_result.get("feedback", "")
226
-
227
- log_step(
228
- task_id=task_id,
229
- step=attempt,
230
- action_summary=f"Submitted fix attempt {attempt} ({len(agent_action['fixed_code'])} chars)",
231
- reward=reward,
232
- done=done,
233
- )
234
 
235
- final_reward = reward
 
236
 
 
 
237
  if done:
238
  break
239
 
240
- log_end(task_id, difficulty, final_reward, step_num, episode_num)
241
- return final_reward
242
-
243
 
244
  def main():
245
  parser = argparse.ArgumentParser(description="Code Debug Environment Baseline Agent")
246
  parser.add_argument("--url", default=ENV_URL, help="Environment base URL")
247
- parser.add_argument("--difficulty", default=None, choices=["easy", "medium", "hard", "all"],
248
- help="Difficulty to run. 'all' runs one episode per difficulty.")
249
  args = parser.parse_args()
250
-
251
  env_url = args.url.rstrip("/")
252
 
253
- # Health check
254
  try:
255
  health = requests.get(f"{env_url}/health", timeout=10)
256
  health.raise_for_status()
257
- print(json.dumps({"type": "INFO", "message": f"Environment healthy at {env_url}"}), flush=True)
258
  except Exception as e:
259
- print(json.dumps({"type": "ERROR", "message": f"Health check failed: {e}"}), flush=True)
260
  sys.exit(1)
261
 
262
- # Determine episodes to run
263
- if args.difficulty == "all" or args.difficulty is None:
264
- episodes = [("easy", 1), ("medium", 2), ("hard", 3)]
265
- else:
266
- episodes = [(args.difficulty, 1)]
267
 
268
  all_rewards = []
269
- for episode_num, (difficulty, ep_id) in enumerate(episodes, start=1):
270
- reward = run_episode(env_url, difficulty, episode_num) # use episode_num, not ep_id
271
- all_rewards.append({"difficulty": difficulty, "reward": reward})
272
- time.sleep(0.5) # Small pause between episodes
273
-
274
- # Summary
275
- print(json.dumps({
276
- "type": "SUMMARY",
277
- "total_episodes": len(all_rewards),
278
- "results": all_rewards,
279
- "average_reward": round(sum(r["reward"] for r in all_rewards) / len(all_rewards), 3),
280
- "timestamp": time.time(),
281
- }), flush=True)
282
 
 
 
283
 
284
  if __name__ == "__main__":
285
- main()
 
1
  #!/usr/bin/env python3
2
+ # inference.py — Code Debug Environment Baseline Agent
3
+ # Log format strictly follows [START] [STEP] [END] as required by evaluator.
 
 
 
 
 
 
 
4
  #
5
+ # Required env vars: API_BASE_URL, MODEL_NAME, HF_TOKEN
6
  # Usage:
7
  # python inference.py
8
+ # python inference.py --url https://Souravdanyal-code-debug-env.hf.space
9
  # python inference.py --difficulty easy
 
 
 
10
 
11
  import os
12
  import sys
 
15
  import argparse
16
  import requests
17
  from openai import OpenAI
18
+ from typing import List, Optional
19
 
20
  # ─── Configuration ────────────────────────────────────────────────────────────
 
21
  API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
22
+ MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
23
+ HF_TOKEN = os.environ.get("HF_TOKEN", "")
24
+ ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
25
+ BENCHMARK = "code-debug-env"
26
+ MAX_STEPS = 5
 
 
27
 
28
  # ─── OpenAI Client ───────────────────────────────────────────────────────────
29
+ client = OpenAI(api_key=HF_TOKEN or "dummy", base_url=API_BASE_URL)
30
 
31
+ # ─── Logging — STRICT FORMAT REQUIRED BY EVALUATOR ───────────────────────────
32
+ # [START] task=<task_id> env=<benchmark> model=<model_name>
33
+ # [STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>
34
+ # [END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
def log_start(task_id: str, env: str, model: str) -> None:
    """Print the ``[START]`` marker line that opens an episode.

    The emitted line is ``[START] task=<task_id> env=<env> model=<model>``;
    stdout is flushed right away so the line is not held back by buffering.
    """
    fields = (f"task={task_id}", f"env={env}", f"model={model}")
    # print() joins its arguments with a single space, giving the exact layout.
    print("[START]", *fields, flush=True)
 
 
 
 
 
 
 
 
38
 
39
def _single_line(text: object) -> str:
    """Collapse every run of whitespace (newlines included) in *text* to one space."""
    return " ".join(str(text).split())


def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Print one ``[STEP]`` line for a single attempt.

    Layout:
    ``[STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>``

    Args:
        step: 1-based attempt number.
        action: Short label for what was submitted.
        reward: Reward for this step; printed with two decimals.
        done: Whether the episode finished; printed lowercase.
        error: Error text, or a falsy value when there was none (printed as
            ``null``).

    ``action`` and ``error`` are flattened to a single line first: exception
    messages can span several lines, and an embedded newline would split the
    record in two and break the line-oriented format.
    """
    # An error that is empty or whitespace-only is reported as "null" as well.
    error_val = (_single_line(error) if error else "") or "null"
    done_val = str(done).lower()
    print(
        f"[STEP] step={step} action={_single_line(action)} reward={reward:.2f} "
        f"done={done_val} error={error_val}",
        flush=True,
    )
43
 
44
def log_end(success: bool, steps: int, rewards: List[float]) -> None:
    """Print the ``[END]`` line that closes an episode.

    The line is ``[END] success=<true|false> steps=<n> rewards=<r1,r2,...>``
    with each reward rendered to two decimals and joined by commas; an empty
    reward list leaves the ``rewards=`` field empty. Stdout is flushed.
    """
    outcome = str(success).lower()
    reward_field = ",".join(map("{:.2f}".format, rewards))
    print("[END]", f"success={outcome}", f"steps={steps}", f"rewards={reward_field}", flush=True)
 
 
 
 
 
 
 
 
47
 
48
  # ─── Environment Client ───────────────────────────────────────────────────────
 
49
  def env_reset(env_url: str, difficulty: str) -> dict:
50
+ resp = requests.post(f"{env_url}/reset", json={"difficulty": difficulty}, timeout=30)
 
 
 
 
51
  resp.raise_for_status()
52
  return resp.json()
53
 
 
54
  def env_step(env_url: str, fixed_code: str, explanation: str = None) -> dict:
55
  payload = {"fixed_code": fixed_code}
56
  if explanation:
57
  payload["explanation"] = explanation
58
+ resp = requests.post(f"{env_url}/step", json=payload, timeout=30)
 
 
 
 
 
 
 
 
 
 
59
  resp.raise_for_status()
60
  return resp.json()
61
 
 
62
  # ─── LLM Agent ───────────────────────────────────────────────────────────────
63
+ SYSTEM_PROMPT = """You are an expert Python debugging agent.
 
64
  You will be given buggy Python code and must fix it.
65
 
66
  For easy tasks: fix the single bug.
 
80
  - Do NOT include markdown fences or any text outside the JSON object.
81
  """
82
 
 
83
  def call_llm(buggy_code: str, instructions: str, difficulty: str,
84
  feedback: str = None, attempt: int = 1) -> dict:
 
 
85
  user_content = f"""Task difficulty: {difficulty}
86
  Instructions: {instructions}
87
 
 
100
 
101
  try:
102
  response = client.chat.completions.create(
103
+ model=MODEL_NAME, messages=messages, max_tokens=1000, temperature=0.1,
 
 
 
104
  )
105
  content = response.choices[0].message.content.strip()
 
 
106
  if content.startswith("```"):
107
  lines = content.split("\n")
108
  content = "\n".join(lines[1:-1]) if lines[-1] == "```" else "\n".join(lines[1:])
 
109
  parsed = json.loads(content)
110
+ return {"fixed_code": parsed.get("fixed_code", ""), "explanation": parsed.get("explanation", None)}
 
 
 
111
  except json.JSONDecodeError:
 
112
  return {"fixed_code": buggy_code, "explanation": None}
113
  except Exception as e:
114
+ print(f"# LLM call failed: {e}", file=sys.stderr)
115
  return {"fixed_code": buggy_code, "explanation": None}
116
 
 
117
  # ─── Main Episode Loop ────────────────────────────────────────────────────────
118
def run_episode(env_url: str, difficulty: str) -> tuple:
    """Run one episode and log it as ``[START]`` / ``[STEP]`` / ``[END]`` lines.

    Resets the environment for ``difficulty``, then makes up to ``MAX_STEPS``
    attempts. Each attempt asks the LLM for a fix, submits it with
    ``env_step`` and logs one ``[STEP]`` line. An empty fix is logged as
    ``empty_submission`` and is not sent to the environment; a failed
    ``env_step`` call is logged as ``step_failed``. Both count as a step with
    reward 0.0 and the loop moves on to the next attempt.

    Args:
        env_url: Base URL of the environment server (no trailing slash).
        difficulty: Difficulty passed to ``/reset`` and to the LLM prompt.

    Returns:
        ``(success, steps_taken, rewards)``: ``success`` is True once any
        step reached a reward of at least 1.0, ``steps_taken`` is the number
        of the last attempt made, and ``rewards`` holds one float per attempt.

    Raises:
        Propagates whatever ``env_reset`` raises, and ``KeyError`` if the
        reset response lacks ``observation`` or one of its expected fields.
    """
    reset_data = env_reset(env_url, difficulty)
    obs = reset_data["observation"]
    task_id = obs["task_id"]
    buggy_code = obs["buggy_code"]
    instructions = obs["instructions"]

    log_start(task_id=task_id, env=BENCHMARK, model=MODEL_NAME)

    last_feedback = None
    rewards: List[float] = []
    steps_taken = 0
    success = False

    for attempt in range(1, MAX_STEPS + 1):
        steps_taken = attempt
        agent_action = call_llm(
            buggy_code=buggy_code, instructions=instructions,
            difficulty=difficulty, feedback=last_feedback, attempt=attempt,
        )
        fixed_code = agent_action["fixed_code"]

        if not fixed_code or not fixed_code.strip():
            log_step(step=attempt, action="empty_submission", reward=0.0, done=False, error="empty_code")
            rewards.append(0.0)
            continue

        try:
            result = env_step(env_url, fixed_code=fixed_code, explanation=agent_action.get("explanation"))
        except Exception as e:
            # Flatten before truncating: exception text can span several
            # lines, which would break the one-line [STEP] record.
            message = " ".join(str(e).split())[:60]
            log_step(step=attempt, action="step_failed", reward=0.0, done=False, error=message)
            rewards.append(0.0)
            continue

        # A JSON null arrives as None; `.get(key, default)` would pass it
        # through and `{reward:.2f}` would then raise TypeError.
        reward = float(result.get("reward") or 0.0)
        done = bool(result.get("done", False))
        obs_r = result.get("observation") or {}
        last_feedback = obs_r.get("feedback", "")

        log_step(step=attempt, action=f"fix_{difficulty}_attempt{attempt}", reward=reward, done=done, error=None)
        rewards.append(reward)

        if reward >= 1.0:
            success = True
        if done:
            break

    log_end(success=success, steps=steps_taken, rewards=rewards)
    return success, steps_taken, rewards
 
167
 
168
  def main():
169
  parser = argparse.ArgumentParser(description="Code Debug Environment Baseline Agent")
170
  parser.add_argument("--url", default=ENV_URL, help="Environment base URL")
171
+ parser.add_argument("--difficulty", default=None, choices=["easy", "medium", "hard", "all"])
 
172
  args = parser.parse_args()
 
173
  env_url = args.url.rstrip("/")
174
 
 
175
  try:
176
  health = requests.get(f"{env_url}/health", timeout=10)
177
  health.raise_for_status()
178
+ print(f"# Environment healthy at {env_url}", flush=True)
179
  except Exception as e:
180
+ print(f"# Health check failed: {e}", file=sys.stderr)
181
  sys.exit(1)
182
 
183
+ difficulties = ["easy", "medium", "hard"] if (args.difficulty in ("all", None)) else [args.difficulty]
 
 
 
 
184
 
185
  all_rewards = []
186
+ all_successes = []
187
+ for difficulty in difficulties:
188
+ success, steps, rewards = run_episode(env_url, difficulty)
189
+ all_rewards.extend(rewards)
190
+ all_successes.append(success)
191
+ time.sleep(0.5)
 
 
 
 
 
 
 
192
 
193
+ avg = round(sum(all_rewards) / len(all_rewards), 3) if all_rewards else 0.0
194
+ print(f"# SUMMARY: {sum(all_successes)}/{len(difficulties)} tasks solved | avg_reward={avg}", flush=True)
195
 
196
  if __name__ == "__main__":
197
+ main()
openenv.yaml CHANGED
@@ -4,12 +4,21 @@ type: typed
4
  description: >
5
  A real-world RL environment where an LLM agent diagnoses and fixes
6
  buggy Python code across three difficulty levels (easy, medium, hard).
7
- Tasks are drawn from real-world domains: data processing, API handlers,
8
- and algorithmic functions. Rewards are partial and proportional to how
9
- many test cases pass, with bonuses for correct explanations on hard tasks.
 
10
 
11
  version: 1.0.0
12
- author: Souravdanyal
 
 
 
 
 
 
 
 
13
 
14
  runtime:
15
  type: docker
@@ -20,29 +29,89 @@ app:
20
  host: 0.0.0.0
21
  port: 7860
22
 
 
 
 
 
23
  tasks:
24
  - id: easy
25
- description: "Fix a single off-by-one or operator bug in a Python function"
26
  difficulty: easy
27
- max_steps: 3
28
  reward_range: [0.0, 1.0]
 
 
29
 
30
  - id: medium
31
- description: "Fix two bugs (logic + edge case) so all test cases pass"
32
  difficulty: medium
33
- max_steps: 3
34
  reward_range: [0.0, 1.0]
 
 
35
 
36
  - id: hard
37
- description: "Fix an algorithmic bug AND provide a correct explanation"
38
  difficulty: hard
39
- max_steps: 3
40
  reward_range: [0.0, 1.0]
 
 
41
 
42
  reward_range: [0.0, 1.0]
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  api:
45
  reset: /reset
46
  step: /step
47
  state: /state
48
  health: /health
 
 
4
  description: >
5
  A real-world RL environment where an LLM agent diagnoses and fixes
6
  buggy Python code across three difficulty levels (easy, medium, hard).
7
+ Tasks are drawn from real-world domains: data processing, string algorithms,
8
+ API validation, sorting, dynamic programming, and graph algorithms.
9
+ Rewards are partial and proportional to test cases passed, with bonuses
10
+ for correct explanations on hard tasks.
11
 
12
  version: 1.0.0
13
+ author: Souravdanyal
14
+
15
+ tags:
16
+ - code-debugging
17
+ - python
18
+ - reinforcement-learning
19
+ - openenv
20
+ - llm-agent
21
+ - software-engineering
22
 
23
  runtime:
24
  type: docker
 
29
  host: 0.0.0.0
30
  port: 7860
31
 
32
+ config:
33
+ episode_timeout: 300
34
+ max_steps: 5
35
+
36
  tasks:
37
  - id: easy
38
+ description: "Fix a single off-by-one, operator, or return bug in a Python function"
39
  difficulty: easy
40
+ max_steps: 5
41
  reward_range: [0.0, 1.0]
42
+ grader: deterministic
43
+ num_tasks: 15
44
 
45
  - id: medium
46
+ description: "Fix two bugs (logic bug + edge case) so all test cases pass"
47
  difficulty: medium
48
+ max_steps: 5
49
  reward_range: [0.0, 1.0]
50
+ grader: deterministic
51
+ num_tasks: 15
52
 
53
  - id: hard
54
+ description: "Fix an algorithmic bug AND provide a correct explanation of the root cause"
55
  difficulty: hard
56
+ max_steps: 5
57
  reward_range: [0.0, 1.0]
58
+ grader: deterministic
59
+ num_tasks: 15
60
 
61
  reward_range: [0.0, 1.0]
62
 
63
+ action_space:
64
+ type: dict
65
+ description: "Agent submits fixed Python code and optional explanation"
66
+ fields:
67
+ fixed_code:
68
+ type: string
69
+ required: true
70
+ description: "Complete corrected Python function as a string. Must be valid Python."
71
+ explanation:
72
+ type: string
73
+ required: false
74
+ description: "Required for hard tasks. Explain the bug, root cause, and why the fix is correct."
75
+
76
+ observation_space:
77
+ type: dict
78
+ description: "Environment observation returned after reset() and step()"
79
+ fields:
80
+ task_id:
81
+ type: string
82
+ description: "Unique identifier for the current task instance (e.g. easy_003)"
83
+ difficulty:
84
+ type: enum
85
+ values: [easy, medium, hard]
86
+ description: "Task difficulty level"
87
+ buggy_code:
88
+ type: string
89
+ description: "The buggy Python function the agent must fix"
90
+ instructions:
91
+ type: string
92
+ description: "Natural language description of what is wrong and what to fix"
93
+ test_cases_description:
94
+ type: string
95
+ description: "Description of what the test cases check"
96
+ reward:
97
+ type: float
98
+ description: "Score from 0.0 to 1.0 for this step (null on reset)"
99
+ passed_tests:
100
+ type: integer
101
+ description: "Number of test cases passed (null on reset)"
102
+ total_tests:
103
+ type: integer
104
+ description: "Total number of test cases (always 3)"
105
+ feedback:
106
+ type: string
107
+ description: "Detailed per-test feedback showing input, expected, and got values"
108
+ done:
109
+ type: boolean
110
+ description: "True when episode is complete (perfect score or max steps reached)"
111
+
112
  api:
113
  reset: /reset
114
  step: /step
115
  state: /state
116
  health: /health
117
+ tasks: /tasks
server/environment.py CHANGED
@@ -29,7 +29,7 @@ GRADERS = {
29
  "hard": grade_hard,
30
  }
31
 
32
- MAX_STEPS = 3
33
 
34
 
35
  class CodeDebugEnvironment(Environment):
@@ -101,6 +101,41 @@ class CodeDebugEnvironment(Environment):
101
 
102
  self._step_count += 1
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  # Grade the submission
105
  grader = GRADERS[self._difficulty]
106
  if self._difficulty == "hard":
@@ -144,4 +179,4 @@ class CodeDebugEnvironment(Environment):
144
  current_reward=self._current_reward,
145
  best_reward=self._best_reward,
146
  done=self._done,
147
- )
 
29
  "hard": grade_hard,
30
  }
31
 
32
+ MAX_STEPS = 5
33
 
34
 
35
  class CodeDebugEnvironment(Environment):
 
101
 
102
  self._step_count += 1
103
 
104
+ # ── Invalid action penalty ──────────────────────────────────────────
105
+ code = action.fixed_code.strip() if action.fixed_code else ""
106
+ if not code:
107
+ done = self._step_count >= MAX_STEPS
108
+ self._done = done
109
+ return DebugObservation(
110
+ task_id=self._current_task["task_id"],
111
+ difficulty=self._difficulty,
112
+ buggy_code=self._current_task["buggy_code"],
113
+ instructions=self._current_task["instructions"],
114
+ test_cases_description=self._current_task["test_cases_description"],
115
+ reward=0.0,
116
+ passed_tests=0,
117
+ total_tests=len(self._current_task["test_cases"]),
118
+ feedback="❌ Invalid action: fixed_code is empty. Penalty applied. Submit valid Python code.",
119
+ done=done,
120
+ )
121
+
122
+ # Check for obvious non-Python (very short or no 'def' keyword)
123
+ if len(code) < 5 or ("def " not in code and "lambda" not in code and "=" not in code):
124
+ done = self._step_count >= MAX_STEPS
125
+ self._done = done
126
+ return DebugObservation(
127
+ task_id=self._current_task["task_id"],
128
+ difficulty=self._difficulty,
129
+ buggy_code=self._current_task["buggy_code"],
130
+ instructions=self._current_task["instructions"],
131
+ test_cases_description=self._current_task["test_cases_description"],
132
+ reward=0.0,
133
+ passed_tests=0,
134
+ total_tests=len(self._current_task["test_cases"]),
135
+ feedback="❌ Invalid action: submission does not appear to be valid Python. Penalty applied.",
136
+ done=done,
137
+ )
138
+
139
  # Grade the submission
140
  grader = GRADERS[self._difficulty]
141
  if self._difficulty == "hard":
 
179
  current_reward=self._current_reward,
180
  best_reward=self._best_reward,
181
  done=self._done,
182
+ )
spec.md ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code Debug Environment — Specification
2
+
3
+ ## Overview
4
+
5
+ The Code Debug Environment is an OpenEnv-compatible RL environment where an LLM agent diagnoses and fixes buggy Python code across three difficulty levels.
6
+
7
+ ---
8
+
9
+ ## API Specification
10
+
11
+ ### POST /reset
12
+ Start a new episode.
13
+
14
+ **Request:**
15
+ ```json
16
+ {"difficulty": "easy"}
17
+ ```
18
+
19
+ **Response:**
20
+ ```json
21
+ {
22
+ "observation": {
23
+ "task_id": "easy_003",
24
+ "difficulty": "easy",
25
+ "buggy_code": "def find_max(nums):\n return min(nums)",
26
+ "instructions": "The function has exactly one bug. Fix it.",
27
+ "test_cases_description": "Finds max value in a list",
28
+ "reward": null,
29
+ "passed_tests": null,
30
+ "total_tests": 3,
31
+ "feedback": null,
32
+ "done": false
33
+ },
34
+ "reward": 0.0,
35
+ "done": false
36
+ }
37
+ ```
38
+
39
+ ---
40
+
41
+ ### POST /step
42
+ Submit a code fix.
43
+
44
+ **Request:**
45
+ ```json
46
+ {
47
+ "fixed_code": "def find_max(nums):\n return max(nums)",
48
+ "explanation": "Optional; scored only on hard tasks"
49
+ }
50
+ ```
51
+
52
+ **Response:**
53
+ ```json
54
+ {
55
+ "observation": {
56
+ "task_id": "easy_003",
57
+ "reward": 1.0,
58
+ "passed_tests": 3,
59
+ "total_tests": 3,
60
+ "feedback": "Test 1: ✅ Passed\n Input: [1,2,3]\n Expected: 3\n Got: 3",
61
+ "done": true
62
+ },
63
+ "reward": 1.0,
64
+ "done": true
65
+ }
66
+ ```
67
+
68
+ ---
69
+
70
+ ### GET /state
71
+ Returns current episode state.
72
+
73
+ ```json
74
+ {
75
+ "episode_id": "uuid",
76
+ "task_id": "easy_003",
77
+ "difficulty": "easy",
78
+ "step_count": 1,
79
+ "max_steps": 5,
80
+ "current_reward": 1.0,
81
+ "best_reward": 1.0,
82
+ "done": true
83
+ }
84
+ ```
85
+
86
+ ---
87
+
88
+ ### GET /health
89
+ ```json
90
+ {"status": "ok", "environment": "code-debug-env", "version": "1.0.0"}
91
+ ```
92
+
93
+ ---
94
+
95
+ ## Reward Function
96
+
97
+ ### Easy & Medium
98
+ ```
99
+ reward = passed_tests / total_tests
100
+ ```
101
+ - 3/3 → 1.00
102
+ - 2/3 → 0.67
103
+ - 1/3 → 0.33
104
+ - 0/3 → 0.00
105
+
106
+ ### Hard
107
+ ```
108
+ reward = 0.7 × test_score + 0.3 × explanation_score
109
+ ```
110
+
111
+ ### Invalid Actions
112
+ - Empty code → reward = 0.0 + feedback message
113
+ - Non-Python code → reward = 0.0 + feedback message
114
+
115
+ ---
116
+
117
+ ## Episode Rules
118
+
119
+ - Max 5 steps per episode
120
+ - Episode ends when reward = 1.0 OR max steps reached
121
+ - Each step runs fixed_code against 3 deterministic test cases
122
+ - Feedback shows Input, Expected, Got for each test
123
+
124
+ ---
125
+
126
+ ## Task Domains
127
+
128
+ | Domain | Examples |
129
+ |---|---|
130
+ | List operations | second element, max, flatten |
131
+ | String algorithms | palindrome, reverse, word count |
132
+ | Math | fibonacci, factorial, square root |
133
+ | Sorting | bubble sort, binary search |
134
+ | Data processing | JSON parsing, API validation |
135
+ | Graph algorithms | BFS, cycle detection |
136
+ | Dynamic programming | knapsack, longest subsequence |
tests/test_graders.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tests/test_graders.py
2
+ # Basic tests to verify all graders work correctly.
3
+ # Run: python -m pytest tests/ -v
4
+
5
+ import sys
6
+ import os
7
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
8
+
9
+ from server.graders.grader_easy import grade_easy
10
+ from server.graders.grader_medium import grade_medium
11
+ from server.graders.grader_hard import grade_hard
12
+ from server.tasks.task_easy import EASY_TASKS
13
+ from server.tasks.task_medium import MEDIUM_TASKS
14
+ from server.tasks.task_hard import HARD_TASKS
15
+
16
+
17
def test_easy_tasks_count():
    """The easy task bank must hold exactly 15 tasks."""
    count = len(EASY_TASKS)
    assert count == 15, f"Expected 15 easy tasks, got {count}"
19
+
20
+
21
def test_medium_tasks_count():
    """The medium task bank must hold exactly 15 tasks."""
    count = len(MEDIUM_TASKS)
    assert count == 15, f"Expected 15 medium tasks, got {count}"
23
+
24
+
25
def test_hard_tasks_count():
    """The hard task bank must hold exactly 15 tasks."""
    count = len(HARD_TASKS)
    assert count == 15, f"Expected 15 hard tasks, got {count}"
27
+
28
+
29
def test_easy_correct_fix_scores_1():
    """Grading each easy task's own ``fixed_code`` must yield a reward of 1.0."""
    for easy_task in EASY_TASKS:
        reference_fix = easy_task["fixed_code"]
        reward, _passed, _total, _, _ = grade_easy(reference_fix, easy_task)
        assert reward == 1.0, f"{easy_task['task_id']} should score 1.0, got {reward}"
33
+
34
+
35
def test_medium_correct_fix_scores_1():
    """Grading each medium task's own ``fixed_code`` must yield a reward of 1.0."""
    for medium_task in MEDIUM_TASKS:
        reference_fix = medium_task["fixed_code"]
        reward, _passed, _total, _, _ = grade_medium(reference_fix, medium_task)
        assert reward == 1.0, f"{medium_task['task_id']} should score 1.0, got {reward}"
39
+
40
+
41
def test_hard_correct_fix_scores_high():
    """A hard task's ``fixed_code`` plus all its keywords must score at least 0.9.

    The explanation passed to the grader is the task's
    ``explanation_keywords`` joined with spaces (an empty string when the
    task has none).
    """
    for hard_task in HARD_TASKS:
        explanation = " ".join(hard_task.get("explanation_keywords", []))
        reward, _passed, _total, _, _ = grade_hard(hard_task["fixed_code"], hard_task, explanation)
        assert reward >= 0.9, f"{hard_task['task_id']} should score >= 0.9, got {reward}"
47
+
48
+
49
def test_reward_range():
    """Rewards for the unmodified buggy code must stay within [0.0, 1.0].

    Each task bank is paired with its own grader. Previously the medium
    tasks were run through ``grade_easy``, so ``grade_medium``'s reward range
    was never exercised by this test.
    """
    for grader, tasks in ((grade_easy, EASY_TASKS), (grade_medium, MEDIUM_TASKS)):
        for task in tasks:
            reward, _, _, _, _ = grader(task["buggy_code"], task)
            assert 0.0 <= reward <= 1.0, f"Reward out of range: {reward}"
53
+
54
+
55
def test_empty_code_returns_zero():
    """Grading an empty submission against the first easy task must give 0.0."""
    first_task = EASY_TASKS[0]
    reward, _passed, _total, _feedback, _ = grade_easy("", first_task)
    assert reward == 0.0
59
+
60
+
61
def test_buggy_code_scores_less_than_1():
    """The original buggy code of the first five easy tasks must score below 1.0."""
    sample = EASY_TASKS[:5]
    for easy_task in sample:
        reward, _, _, _, _ = grade_easy(easy_task["buggy_code"], easy_task)
        assert reward < 1.0, f"{easy_task['task_id']} buggy code should not score 1.0"
65
+
66
+
67
+ if __name__ == "__main__":
68
+ import pytest
69
+ pytest.main([__file__, "-v"])