Souravdanyal committed on
Commit
d298b6d
·
1 Parent(s): de1fd70

Fix log format, timeout protection, SPEC.md, tests, max_steps=5, remove pycache

Browse files
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ __pycache__/
2
+ .vscode/
3
+ __pycache__/
4
+ .vscode/
.vscode/settings.json DELETED
@@ -1,4 +0,0 @@
1
- {
2
- "python-envs.defaultEnvManager": "ms-python.python:conda",
3
- "python-envs.defaultPackageManager": "ms-python.python:conda"
4
- }
 
 
 
 
 
__pycache__/models.cpython-310.pyc DELETED
Binary file (2.74 kB)
 
inference.py CHANGED
@@ -63,7 +63,6 @@ CRITICAL RULES:
63
  - Return the COMPLETE fixed function, not just the changed line
64
  - The fixed_code must be syntactically valid Python
65
  - For hard tasks, the explanation field MUST describe: what the bug was, why it caused failures, and how your fix resolves it
66
- - ALWAYS preserve the original function signature and structure
67
 
68
  Response format (strictly):
69
  {
@@ -75,16 +74,9 @@ DEBUGGING STRATEGY:
75
  1. Read the instructions carefully — they tell you exactly what type of bug exists
76
  2. Trace through the logic with the test inputs mentally
77
  3. For easy tasks: find the ONE wrong operator, value, or return statement
78
- 4. For medium tasks: find BOTH bugs — usually one logic bug + one edge case (swapped if/else, wrong operators)
79
- 5. For hard tasks: find the algorithmic flaw (loop bounds, iteration order, missing checks) + write a clear explanation
80
  6. If your previous attempt failed, READ THE FEEDBACK — it shows exactly which inputs failed and what output was expected
81
-
82
- COMMON BUG PATTERNS:
83
- - Easy: Wrong comparison (==, !=, <, >), off-by-one errors, wrong return value
84
- - Medium: Swapped if/else logic, missing edge case check, two related operators wrong
85
- - Hard: Wrong iteration order (forward vs backward), missing visited set, incorrect DP initialization, boundary conditions
86
-
87
- IMPORTANT: Do not add imports, libraries, or change the algorithm unless absolutely necessary. Fix the bugs in the existing code.
88
  """
89
 
90
  def call_llm(buggy_code: str, instructions: str, difficulty: str,
@@ -112,29 +104,15 @@ Your previous fix was:
112
  IMPORTANT: Your previous fix did not work. Carefully analyze the feedback above.
113
  Look at the Input, Expected, and Got values for each failing test.
114
  Try a completely different approach to fix the bug.
115
- """
116
-
117
- if difficulty == "medium":
118
- user_content += """
119
- MEDIUM TASK TIPS:
120
- - Look for EXACTLY TWO bugs (not one, not three)
121
- - Common patterns: swapped if/else branches, += vs =, wrong comparison operator
122
- - Check: Does the logic make sense? Are edge cases handled?
123
- - Example bugs: "if item in freq: freq[item] = 1" should be += 1, and "else: freq[item] = freq[item] + 1" should be = 1
124
  """
125
 
126
  if difficulty == "hard":
127
  user_content += """
128
- HARD TASK TIPS:
129
- - Algorithmic bugs often involve: iteration order, loop bounds, missing state tracking
130
- - Common patterns: forward vs backward iteration (DP), missing visited set (graphs), wrong initialization
131
- - Your explanation MUST mention the specific algorithmic concept (e.g., "backward iteration", "visited set", "dp initialization")
132
- - Explanation quality affects 30% of your reward — be specific about what was wrong and why
133
-
134
  Remember: For hard tasks you MUST include a detailed explanation field describing:
135
- - What the algorithmic bug was (be specific: "inner loop iterates forward instead of backward")
136
- - Why it caused incorrect results (e.g., "allows items to be used multiple times")
137
- - How your fix resolves it (e.g., "reversing iteration ensures each item used once")
 
138
  """
139
 
140
  messages = [
@@ -223,12 +201,9 @@ def run_episode(env_url: str, difficulty: str) -> tuple:
223
  result = env_step(env_url, fixed_code=fixed_code,
224
  explanation=agent_action.get("explanation"))
225
  except Exception as e:
226
- error_msg = str(e)[:200]
227
  log_step(step=attempt, action="step_failed",
228
- reward=0.0, done=False, error=error_msg[:60])
229
  rewards.append(0.0)
230
- # Pass error feedback to LLM for next attempt
231
- last_feedback = f"❌ Server Error: {error_msg}\n\nYour code likely caused a runtime error or timeout. Check for:\n- Infinite loops\n- Syntax errors\n- Runtime exceptions (IndexError, KeyError, etc.)\n- Edge cases not handled"
232
  continue
233
 
234
  reward = result.get("reward", 0.0)
 
63
  - Return the COMPLETE fixed function, not just the changed line
64
  - The fixed_code must be syntactically valid Python
65
  - For hard tasks, the explanation field MUST describe: what the bug was, why it caused failures, and how your fix resolves it
 
66
 
67
  Response format (strictly):
68
  {
 
74
  1. Read the instructions carefully — they tell you exactly what type of bug exists
75
  2. Trace through the logic with the test inputs mentally
76
  3. For easy tasks: find the ONE wrong operator, value, or return statement
77
+ 4. For medium tasks: find BOTH bugs — usually one logic bug + one edge case
78
+ 5. For hard tasks: find the algorithmic flaw + write a clear explanation
79
  6. If your previous attempt failed, READ THE FEEDBACK — it shows exactly which inputs failed and what output was expected
 
 
 
 
 
 
 
80
  """
81
 
82
  def call_llm(buggy_code: str, instructions: str, difficulty: str,
 
104
  IMPORTANT: Your previous fix did not work. Carefully analyze the feedback above.
105
  Look at the Input, Expected, and Got values for each failing test.
106
  Try a completely different approach to fix the bug.
 
 
 
 
 
 
 
 
 
107
  """
108
 
109
  if difficulty == "hard":
110
  user_content += """
 
 
 
 
 
 
111
  Remember: For hard tasks you MUST include a detailed explanation field describing:
112
+ - What the algorithmic bug was
113
+ - Why it caused incorrect results
114
+ - How your fix resolves it
115
+ Explanation quality affects 30% of your reward.
116
  """
117
 
118
  messages = [
 
201
  result = env_step(env_url, fixed_code=fixed_code,
202
  explanation=agent_action.get("explanation"))
203
  except Exception as e:
 
204
  log_step(step=attempt, action="step_failed",
205
+ reward=0.0, done=False, error=str(e)[:60])
206
  rewards.append(0.0)
 
 
207
  continue
208
 
209
  reward = result.get("reward", 0.0)
pyproject.toml DELETED
@@ -1,26 +0,0 @@
1
- [build-system]
2
- requires = ["setuptools>=68", "wheel"]
3
- build-backend = "setuptools.backends.legacy:build"
4
-
5
- [project]
6
- name = "code-debug-env"
7
- version = "1.0.0"
8
- description = "OpenEnv environment for LLM-based code debugging"
9
- requires-python = ">=3.10"
10
- dependencies = [
11
- "fastapi>=0.110.0",
12
- "uvicorn[standard]>=0.29.0",
13
- "pydantic>=2.0.0",
14
- "openai>=1.0.0",
15
- "requests>=2.31.0",
16
- "openenv-core>=0.2.0",
17
- ]
18
-
19
- [project.optional-dependencies]
20
- dev = [
21
- "pytest>=8.0.0",
22
- "httpx>=0.27.0",
23
- ]
24
-
25
- [tool.setuptools.packages.find]
26
- where = ["."]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/app.py CHANGED
@@ -105,27 +105,19 @@ async def step(request: StepRequest) -> StepResponse:
105
  reward=observation.reward or 0.0,
106
  done=observation.done,
107
  )
108
- except TimeoutError as e:
109
  # Code execution timed out — return 0 reward instead of 500
110
- import traceback
111
- print(f"[ERROR] TimeoutError in step: {e}\n{traceback.format_exc()}", flush=True)
112
  return StepResponse(
113
- observation={"task_id": env._current_task.get("task_id", "unknown") if env._current_task else "unknown",
114
- "difficulty": env._difficulty,
115
- "buggy_code": env._current_task.get("buggy_code", "") if env._current_task else "",
116
- "instructions": env._current_task.get("instructions", "") if env._current_task else "",
117
- "test_cases_description": env._current_task.get("test_cases_description", "") if env._current_task else "",
118
- "reward": 0.0,
119
- "passed_tests": 0,
120
- "total_tests": len(env._current_task.get("test_cases", [])) if env._current_task else 3,
121
- "feedback": "TimeoutError: Code execution timed out. Possible infinite loop or very slow code.",
122
  "done": False},
123
  reward=0.0,
124
  done=False,
125
  )
126
  except Exception as e:
127
- import traceback
128
- print(f"[ERROR] Exception in step: {e}\n{traceback.format_exc()}", flush=True)
129
  raise HTTPException(status_code=500, detail=f"Step failed: {str(e)}")
130
 
131
 
@@ -152,7 +144,6 @@ async def list_tasks() -> dict:
152
  "total": len(EASY_TASKS) + len(MEDIUM_TASKS) + len(HARD_TASKS),
153
  }
154
 
155
-
156
  # ─── Run directly with: python server/app.py ─────────────────────────────────
157
  if __name__ == "__main__":
158
  import sys
 
105
  reward=observation.reward or 0.0,
106
  done=observation.done,
107
  )
108
+ except TimeoutError:
109
  # Code execution timed out — return 0 reward instead of 500
 
 
110
  return StepResponse(
111
+ observation={"task_id": "unknown", "difficulty": "unknown",
112
+ "buggy_code": "", "instructions": "",
113
+ "test_cases_description": "", "reward": 0.0,
114
+ "passed_tests": 0, "total_tests": 3,
115
+ "feedback": "TimeoutError: Code execution timed out. Possible infinite loop.",
 
 
 
 
116
  "done": False},
117
  reward=0.0,
118
  done=False,
119
  )
120
  except Exception as e:
 
 
121
  raise HTTPException(status_code=500, detail=f"Step failed: {str(e)}")
122
 
123
 
 
144
  "total": len(EASY_TASKS) + len(MEDIUM_TASKS) + len(HARD_TASKS),
145
  }
146
 
 
147
  # ─── Run directly with: python server/app.py ─────────────────────────────────
148
  if __name__ == "__main__":
149
  import sys
server/environment.py CHANGED
@@ -137,35 +137,14 @@ class CodeDebugEnvironment(Environment):
137
  )
138
 
139
  # Grade the submission
140
- try:
141
- grader = GRADERS[self._difficulty]
142
- if self._difficulty == "hard":
143
- reward, passed, total, feedback, _ = grader(
144
- action.fixed_code, self._current_task, action.explanation
145
- )
146
- else:
147
- reward, passed, total, feedback, _ = grader(
148
- action.fixed_code, self._current_task
149
- )
150
- except Exception as e:
151
- # Catch any grading errors and return helpful feedback
152
- import traceback
153
- error_detail = traceback.format_exc()
154
- print(f"[ERROR] Grading failed for {self._current_task['task_id']}: {e}\n{error_detail}", flush=True)
155
-
156
- done = self._step_count >= MAX_STEPS
157
- self._done = done
158
- return DebugObservation(
159
- task_id=self._current_task["task_id"],
160
- difficulty=self._difficulty,
161
- buggy_code=self._current_task["buggy_code"],
162
- instructions=self._current_task["instructions"],
163
- test_cases_description=self._current_task["test_cases_description"],
164
- reward=0.0,
165
- passed_tests=0,
166
- total_tests=len(self._current_task.get("test_cases", [])),
167
- feedback=f"❌ Grading Error: {type(e).__name__}: {str(e)[:100]}\nYour code caused an unexpected error during grading. Check for infinite loops, type errors, or invalid operations.",
168
- done=done,
169
  )
170
 
171
  self._current_reward = reward
 
137
  )
138
 
139
  # Grade the submission
140
+ grader = GRADERS[self._difficulty]
141
+ if self._difficulty == "hard":
142
+ reward, passed, total, feedback, _ = grader(
143
+ action.fixed_code, self._current_task, action.explanation
144
+ )
145
+ else:
146
+ reward, passed, total, feedback, _ = grader(
147
+ action.fixed_code, self._current_task
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  )
149
 
150
  self._current_reward = reward