Spaces:
Running
Running
Commit ·
d298b6d
1
Parent(s): de1fd70
Fix log format, timeout protection, SPEC.md, tests, max_steps=5, remove pycache
Browse files
- .gitignore +4 -0
- .vscode/settings.json +0 -4
- __pycache__/models.cpython-310.pyc +0 -0
- inference.py +7 -32
- pyproject.toml +0 -26
- server/app.py +6 -15
- server/environment.py +8 -29
.gitignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
.vscode/
|
| 3 |
+
__pycache__/
|
| 4 |
+
.vscode/
|
.vscode/settings.json
DELETED
|
@@ -1,4 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"python-envs.defaultEnvManager": "ms-python.python:conda",
|
| 3 |
-
"python-envs.defaultPackageManager": "ms-python.python:conda"
|
| 4 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__pycache__/models.cpython-310.pyc
DELETED
|
Binary file (2.74 kB)
|
|
|
inference.py
CHANGED
|
@@ -63,7 +63,6 @@ CRITICAL RULES:
|
|
| 63 |
- Return the COMPLETE fixed function, not just the changed line
|
| 64 |
- The fixed_code must be syntactically valid Python
|
| 65 |
- For hard tasks, the explanation field MUST describe: what the bug was, why it caused failures, and how your fix resolves it
|
| 66 |
-
- ALWAYS preserve the original function signature and structure
|
| 67 |
|
| 68 |
Response format (strictly):
|
| 69 |
{
|
|
@@ -75,16 +74,9 @@ DEBUGGING STRATEGY:
|
|
| 75 |
1. Read the instructions carefully — they tell you exactly what type of bug exists
|
| 76 |
2. Trace through the logic with the test inputs mentally
|
| 77 |
3. For easy tasks: find the ONE wrong operator, value, or return statement
|
| 78 |
-
4. For medium tasks: find BOTH bugs — usually one logic bug + one edge case
|
| 79 |
-
5. For hard tasks: find the algorithmic flaw
|
| 80 |
6. If your previous attempt failed, READ THE FEEDBACK — it shows exactly which inputs failed and what output was expected
|
| 81 |
-
|
| 82 |
-
COMMON BUG PATTERNS:
|
| 83 |
-
- Easy: Wrong comparison (==, !=, <, >), off-by-one errors, wrong return value
|
| 84 |
-
- Medium: Swapped if/else logic, missing edge case check, two related operators wrong
|
| 85 |
-
- Hard: Wrong iteration order (forward vs backward), missing visited set, incorrect DP initialization, boundary conditions
|
| 86 |
-
|
| 87 |
-
IMPORTANT: Do not add imports, libraries, or change the algorithm unless absolutely necessary. Fix the bugs in the existing code.
|
| 88 |
"""
|
| 89 |
|
| 90 |
def call_llm(buggy_code: str, instructions: str, difficulty: str,
|
|
@@ -112,29 +104,15 @@ Your previous fix was:
|
|
| 112 |
IMPORTANT: Your previous fix did not work. Carefully analyze the feedback above.
|
| 113 |
Look at the Input, Expected, and Got values for each failing test.
|
| 114 |
Try a completely different approach to fix the bug.
|
| 115 |
-
"""
|
| 116 |
-
|
| 117 |
-
if difficulty == "medium":
|
| 118 |
-
user_content += """
|
| 119 |
-
MEDIUM TASK TIPS:
|
| 120 |
-
- Look for EXACTLY TWO bugs (not one, not three)
|
| 121 |
-
- Common patterns: swapped if/else branches, += vs =, wrong comparison operator
|
| 122 |
-
- Check: Does the logic make sense? Are edge cases handled?
|
| 123 |
-
- Example bugs: "if item in freq: freq[item] = 1" should be += 1, and "else: freq[item] = freq[item] + 1" should be = 1
|
| 124 |
"""
|
| 125 |
|
| 126 |
if difficulty == "hard":
|
| 127 |
user_content += """
|
| 128 |
-
HARD TASK TIPS:
|
| 129 |
-
- Algorithmic bugs often involve: iteration order, loop bounds, missing state tracking
|
| 130 |
-
- Common patterns: forward vs backward iteration (DP), missing visited set (graphs), wrong initialization
|
| 131 |
-
- Your explanation MUST mention the specific algorithmic concept (e.g., "backward iteration", "visited set", "dp initialization")
|
| 132 |
-
- Explanation quality affects 30% of your reward β be specific about what was wrong and why
|
| 133 |
-
|
| 134 |
Remember: For hard tasks you MUST include a detailed explanation field describing:
|
| 135 |
-
- What the algorithmic bug was
|
| 136 |
-
- Why it caused incorrect results
|
| 137 |
-
- How your fix resolves it
|
|
|
|
| 138 |
"""
|
| 139 |
|
| 140 |
messages = [
|
|
@@ -223,12 +201,9 @@ def run_episode(env_url: str, difficulty: str) -> tuple:
|
|
| 223 |
result = env_step(env_url, fixed_code=fixed_code,
|
| 224 |
explanation=agent_action.get("explanation"))
|
| 225 |
except Exception as e:
|
| 226 |
-
error_msg = str(e)[:200]
|
| 227 |
log_step(step=attempt, action="step_failed",
|
| 228 |
-
reward=0.0, done=False, error=error_msg)
|
| 229 |
rewards.append(0.0)
|
| 230 |
-
# Pass error feedback to LLM for next attempt
|
| 231 |
-
last_feedback = f"❌ Server Error: {error_msg}\n\nYour code likely caused a runtime error or timeout. Check for:\n- Infinite loops\n- Syntax errors\n- Runtime exceptions (IndexError, KeyError, etc.)\n- Edge cases not handled"
|
| 232 |
continue
|
| 233 |
|
| 234 |
reward = result.get("reward", 0.0)
|
|
|
|
| 63 |
- Return the COMPLETE fixed function, not just the changed line
|
| 64 |
- The fixed_code must be syntactically valid Python
|
| 65 |
- For hard tasks, the explanation field MUST describe: what the bug was, why it caused failures, and how your fix resolves it
|
|
|
|
| 66 |
|
| 67 |
Response format (strictly):
|
| 68 |
{
|
|
|
|
| 74 |
1. Read the instructions carefully — they tell you exactly what type of bug exists
|
| 75 |
2. Trace through the logic with the test inputs mentally
|
| 76 |
3. For easy tasks: find the ONE wrong operator, value, or return statement
|
| 77 |
+
4. For medium tasks: find BOTH bugs — usually one logic bug + one edge case
|
| 78 |
+
5. For hard tasks: find the algorithmic flaw + write a clear explanation
|
| 79 |
6. If your previous attempt failed, READ THE FEEDBACK — it shows exactly which inputs failed and what output was expected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
"""
|
| 81 |
|
| 82 |
def call_llm(buggy_code: str, instructions: str, difficulty: str,
|
|
|
|
| 104 |
IMPORTANT: Your previous fix did not work. Carefully analyze the feedback above.
|
| 105 |
Look at the Input, Expected, and Got values for each failing test.
|
| 106 |
Try a completely different approach to fix the bug.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
"""
|
| 108 |
|
| 109 |
if difficulty == "hard":
|
| 110 |
user_content += """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
Remember: For hard tasks you MUST include a detailed explanation field describing:
|
| 112 |
+
- What the algorithmic bug was
|
| 113 |
+
- Why it caused incorrect results
|
| 114 |
+
- How your fix resolves it
|
| 115 |
+
Explanation quality affects 30% of your reward.
|
| 116 |
"""
|
| 117 |
|
| 118 |
messages = [
|
|
|
|
| 201 |
result = env_step(env_url, fixed_code=fixed_code,
|
| 202 |
explanation=agent_action.get("explanation"))
|
| 203 |
except Exception as e:
|
|
|
|
| 204 |
log_step(step=attempt, action="step_failed",
|
| 205 |
+
reward=0.0, done=False, error=str(e)[:60])
|
| 206 |
rewards.append(0.0)
|
|
|
|
|
|
|
| 207 |
continue
|
| 208 |
|
| 209 |
reward = result.get("reward", 0.0)
|
pyproject.toml
DELETED
|
@@ -1,26 +0,0 @@
|
|
| 1 |
-
[build-system]
|
| 2 |
-
requires = ["setuptools>=68", "wheel"]
|
| 3 |
-
build-backend = "setuptools.backends.legacy:build"
|
| 4 |
-
|
| 5 |
-
[project]
|
| 6 |
-
name = "code-debug-env"
|
| 7 |
-
version = "1.0.0"
|
| 8 |
-
description = "OpenEnv environment for LLM-based code debugging"
|
| 9 |
-
requires-python = ">=3.10"
|
| 10 |
-
dependencies = [
|
| 11 |
-
"fastapi>=0.110.0",
|
| 12 |
-
"uvicorn[standard]>=0.29.0",
|
| 13 |
-
"pydantic>=2.0.0",
|
| 14 |
-
"openai>=1.0.0",
|
| 15 |
-
"requests>=2.31.0",
|
| 16 |
-
"openenv-core>=0.2.0",
|
| 17 |
-
]
|
| 18 |
-
|
| 19 |
-
[project.optional-dependencies]
|
| 20 |
-
dev = [
|
| 21 |
-
"pytest>=8.0.0",
|
| 22 |
-
"httpx>=0.27.0",
|
| 23 |
-
]
|
| 24 |
-
|
| 25 |
-
[tool.setuptools.packages.find]
|
| 26 |
-
where = ["."]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/app.py
CHANGED
|
@@ -105,27 +105,19 @@ async def step(request: StepRequest) -> StepResponse:
|
|
| 105 |
reward=observation.reward or 0.0,
|
| 106 |
done=observation.done,
|
| 107 |
)
|
| 108 |
-
except TimeoutError as e:
|
| 109 |
# Code execution timed out — return 0 reward instead of 500
|
| 110 |
-
import traceback
|
| 111 |
-
print(f"[ERROR] TimeoutError in step: {e}\n{traceback.format_exc()}", flush=True)
|
| 112 |
return StepResponse(
|
| 113 |
-
observation={"task_id":
|
| 114 |
-
"
|
| 115 |
-
"
|
| 116 |
-
"
|
| 117 |
-
"
|
| 118 |
-
"reward": 0.0,
|
| 119 |
-
"passed_tests": 0,
|
| 120 |
-
"total_tests": len(env._current_task.get("test_cases", [])) if env._current_task else 3,
|
| 121 |
-
"feedback": "TimeoutError: Code execution timed out. Possible infinite loop or very slow code.",
|
| 122 |
"done": False},
|
| 123 |
reward=0.0,
|
| 124 |
done=False,
|
| 125 |
)
|
| 126 |
except Exception as e:
|
| 127 |
-
import traceback
|
| 128 |
-
print(f"[ERROR] Exception in step: {e}\n{traceback.format_exc()}", flush=True)
|
| 129 |
raise HTTPException(status_code=500, detail=f"Step failed: {str(e)}")
|
| 130 |
|
| 131 |
|
|
@@ -152,7 +144,6 @@ async def list_tasks() -> dict:
|
|
| 152 |
"total": len(EASY_TASKS) + len(MEDIUM_TASKS) + len(HARD_TASKS),
|
| 153 |
}
|
| 154 |
|
| 155 |
-
|
| 156 |
# ─── Run directly with: python server/app.py ─────────────────────────────────
|
| 157 |
if __name__ == "__main__":
|
| 158 |
import sys
|
|
|
|
| 105 |
reward=observation.reward or 0.0,
|
| 106 |
done=observation.done,
|
| 107 |
)
|
| 108 |
+
except TimeoutError:
|
| 109 |
# Code execution timed out — return 0 reward instead of 500
|
|
|
|
|
|
|
| 110 |
return StepResponse(
|
| 111 |
+
observation={"task_id": "unknown", "difficulty": "unknown",
|
| 112 |
+
"buggy_code": "", "instructions": "",
|
| 113 |
+
"test_cases_description": "", "reward": 0.0,
|
| 114 |
+
"passed_tests": 0, "total_tests": 3,
|
| 115 |
+
"feedback": "TimeoutError: Code execution timed out. Possible infinite loop.",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
"done": False},
|
| 117 |
reward=0.0,
|
| 118 |
done=False,
|
| 119 |
)
|
| 120 |
except Exception as e:
|
|
|
|
|
|
|
| 121 |
raise HTTPException(status_code=500, detail=f"Step failed: {str(e)}")
|
| 122 |
|
| 123 |
|
|
|
|
| 144 |
"total": len(EASY_TASKS) + len(MEDIUM_TASKS) + len(HARD_TASKS),
|
| 145 |
}
|
| 146 |
|
|
|
|
| 147 |
# ─── Run directly with: python server/app.py ─────────────────────────────────
|
| 148 |
if __name__ == "__main__":
|
| 149 |
import sys
|
server/environment.py
CHANGED
|
@@ -137,35 +137,14 @@ class CodeDebugEnvironment(Environment):
|
|
| 137 |
)
|
| 138 |
|
| 139 |
# Grade the submission
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
action.fixed_code, self._current_task
|
| 149 |
-
)
|
| 150 |
-
except Exception as e:
|
| 151 |
-
# Catch any grading errors and return helpful feedback
|
| 152 |
-
import traceback
|
| 153 |
-
error_detail = traceback.format_exc()
|
| 154 |
-
print(f"[ERROR] Grading failed for {self._current_task['task_id']}: {e}\n{error_detail}", flush=True)
|
| 155 |
-
|
| 156 |
-
done = self._step_count >= MAX_STEPS
|
| 157 |
-
self._done = done
|
| 158 |
-
return DebugObservation(
|
| 159 |
-
task_id=self._current_task["task_id"],
|
| 160 |
-
difficulty=self._difficulty,
|
| 161 |
-
buggy_code=self._current_task["buggy_code"],
|
| 162 |
-
instructions=self._current_task["instructions"],
|
| 163 |
-
test_cases_description=self._current_task["test_cases_description"],
|
| 164 |
-
reward=0.0,
|
| 165 |
-
passed_tests=0,
|
| 166 |
-
total_tests=len(self._current_task.get("test_cases", [])),
|
| 167 |
-
feedback=f"❌ Grading Error: {type(e).__name__}: {str(e)[:100]}\nYour code caused an unexpected error during grading. Check for infinite loops, type errors, or invalid operations.",
|
| 168 |
-
done=done,
|
| 169 |
)
|
| 170 |
|
| 171 |
self._current_reward = reward
|
|
|
|
| 137 |
)
|
| 138 |
|
| 139 |
# Grade the submission
|
| 140 |
+
grader = GRADERS[self._difficulty]
|
| 141 |
+
if self._difficulty == "hard":
|
| 142 |
+
reward, passed, total, feedback, _ = grader(
|
| 143 |
+
action.fixed_code, self._current_task, action.explanation
|
| 144 |
+
)
|
| 145 |
+
else:
|
| 146 |
+
reward, passed, total, feedback, _ = grader(
|
| 147 |
+
action.fixed_code, self._current_task
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
)
|
| 149 |
|
| 150 |
self._current_reward = reward
|