Spaces:

Souravdanyal
/

code-debug-env

Running

App Files Files Community

Souravdanyal committed on Apr 5

Commit

d298b6d

1 Parent(s): de1fd70

Fix log format, timeout protection, SPEC.md, tests, max_steps=5, remove pycache

Browse files

Files changed (7) hide show

.gitignore +4 -0
.vscode/settings.json +0 -4
__pycache__/models.cpython-310.pyc +0 -0
inference.py +7 -32
pyproject.toml +0 -26
server/app.py +6 -15
server/environment.py +8 -29

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+__pycache__/
+.vscode/
+__pycache__/
+.vscode/

.vscode/settings.json DELETED Viewed

@@ -1,4 +0,0 @@
-{
-    "python-envs.defaultEnvManager": "ms-python.python:conda",
-    "python-envs.defaultPackageManager": "ms-python.python:conda"
-}

__pycache__/models.cpython-310.pyc DELETED Viewed

Binary file (2.74 kB)

inference.py CHANGED Viewed

@@ -63,7 +63,6 @@ CRITICAL RULES:
 - Return the COMPLETE fixed function, not just the changed line
 - The fixed_code must be syntactically valid Python
 - For hard tasks, the explanation field MUST describe: what the bug was, why it caused failures, and how your fix resolves it
-- ALWAYS preserve the original function signature and structure
 Response format (strictly):
 {
@@ -75,16 +74,9 @@ DEBUGGING STRATEGY:
 1. Read the instructions carefully — they tell you exactly what type of bug exists
 2. Trace through the logic with the test inputs mentally
 3. For easy tasks: find the ONE wrong operator, value, or return statement
-4. For medium tasks: find BOTH bugs — usually one logic bug + one edge case (swapped if/else, wrong operators)
-5. For hard tasks: find the algorithmic flaw (loop bounds, iteration order, missing checks) + write a clear explanation
 6. If your previous attempt failed, READ THE FEEDBACK — it shows exactly which inputs failed and what output was expected
-COMMON BUG PATTERNS:
-- Easy: Wrong comparison (==, !=, <, >), off-by-one errors, wrong return value
-- Medium: Swapped if/else logic, missing edge case check, two related operators wrong
-- Hard: Wrong iteration order (forward vs backward), missing visited set, incorrect DP initialization, boundary conditions
-IMPORTANT: Do not add imports, libraries, or change the algorithm unless absolutely necessary. Fix the bugs in the existing code.
 """
 def call_llm(buggy_code: str, instructions: str, difficulty: str,
@@ -112,29 +104,15 @@ Your previous fix was:
 IMPORTANT: Your previous fix did not work. Carefully analyze the feedback above.
 Look at the Input, Expected, and Got values for each failing test.
 Try a completely different approach to fix the bug.
-"""
-    if difficulty == "medium":
-        user_content += """
-MEDIUM TASK TIPS:
-- Look for EXACTLY TWO bugs (not one, not three)
-- Common patterns: swapped if/else branches, += vs =, wrong comparison operator
-- Check: Does the logic make sense? Are edge cases handled?
-- Example bugs: "if item in freq: freq[item] = 1" should be += 1, and "else: freq[item] = freq[item] + 1" should be = 1
 """
     if difficulty == "hard":
         user_content += """
-HARD TASK TIPS:
-- Algorithmic bugs often involve: iteration order, loop bounds, missing state tracking
-- Common patterns: forward vs backward iteration (DP), missing visited set (graphs), wrong initialization
-- Your explanation MUST mention the specific algorithmic concept (e.g., "backward iteration", "visited set", "dp initialization")
-- Explanation quality affects 30% of your reward — be specific about what was wrong and why
 Remember: For hard tasks you MUST include a detailed explanation field describing:
-- What the algorithmic bug was (be specific: "inner loop iterates forward instead of backward")
-- Why it caused incorrect results (e.g., "allows items to be used multiple times")
-- How your fix resolves it (e.g., "reversing iteration ensures each item used once")
 """
     messages = [
@@ -223,12 +201,9 @@ def run_episode(env_url: str, difficulty: str) -> tuple:
             result = env_step(env_url, fixed_code=fixed_code,
                               explanation=agent_action.get("explanation"))
         except Exception as e:
-            error_msg = str(e)[:200]
             log_step(step=attempt, action="step_failed",
-                     reward=0.0, done=False, error=error_msg[:60])
             rewards.append(0.0)
-            # Pass error feedback to LLM for next attempt
-            last_feedback = f"❌ Server Error: {error_msg}\n\nYour code likely caused a runtime error or timeout. Check for:\n- Infinite loops\n- Syntax errors\n- Runtime exceptions (IndexError, KeyError, etc.)\n- Edge cases not handled"
             continue
         reward = result.get("reward", 0.0)

 - Return the COMPLETE fixed function, not just the changed line
 - The fixed_code must be syntactically valid Python
 - For hard tasks, the explanation field MUST describe: what the bug was, why it caused failures, and how your fix resolves it
 Response format (strictly):
 {
 1. Read the instructions carefully — they tell you exactly what type of bug exists
 2. Trace through the logic with the test inputs mentally
 3. For easy tasks: find the ONE wrong operator, value, or return statement
+4. For medium tasks: find BOTH bugs — usually one logic bug + one edge case
+5. For hard tasks: find the algorithmic flaw + write a clear explanation
 6. If your previous attempt failed, READ THE FEEDBACK — it shows exactly which inputs failed and what output was expected
 """
 def call_llm(buggy_code: str, instructions: str, difficulty: str,
 IMPORTANT: Your previous fix did not work. Carefully analyze the feedback above.
 Look at the Input, Expected, and Got values for each failing test.
 Try a completely different approach to fix the bug.
 """
     if difficulty == "hard":
         user_content += """
 Remember: For hard tasks you MUST include a detailed explanation field describing:
+- What the algorithmic bug was
+- Why it caused incorrect results
+- How your fix resolves it
+Explanation quality affects 30% of your reward.
 """
     messages = [
             result = env_step(env_url, fixed_code=fixed_code,
                               explanation=agent_action.get("explanation"))
         except Exception as e:
             log_step(step=attempt, action="step_failed",
+                     reward=0.0, done=False, error=str(e)[:60])
             rewards.append(0.0)
             continue
         reward = result.get("reward", 0.0)

pyproject.toml DELETED Viewed

@@ -1,26 +0,0 @@
-[build-system]
-requires = ["setuptools>=68", "wheel"]
-build-backend = "setuptools.backends.legacy:build"
-[project]
-name = "code-debug-env"
-version = "1.0.0"
-description = "OpenEnv environment for LLM-based code debugging"
-requires-python = ">=3.10"
-dependencies = [
-    "fastapi>=0.110.0",
-    "uvicorn[standard]>=0.29.0",
-    "pydantic>=2.0.0",
-    "openai>=1.0.0",
-    "requests>=2.31.0",
-    "openenv-core>=0.2.0",
-]
-[project.optional-dependencies]
-dev = [
-    "pytest>=8.0.0",
-    "httpx>=0.27.0",
-]
-[tool.setuptools.packages.find]
-where = ["."]

server/app.py CHANGED Viewed

@@ -105,27 +105,19 @@ async def step(request: StepRequest) -> StepResponse:
             reward=observation.reward or 0.0,
             done=observation.done,
         )
-    except TimeoutError as e:
         # Code execution timed out — return 0 reward instead of 500
-        import traceback
-        print(f"[ERROR] TimeoutError in step: {e}\n{traceback.format_exc()}", flush=True)
         return StepResponse(
-            observation={"task_id": env._current_task.get("task_id", "unknown") if env._current_task else "unknown",
-                        "difficulty": env._difficulty,
-                        "buggy_code": env._current_task.get("buggy_code", "") if env._current_task else "",
-                        "instructions": env._current_task.get("instructions", "") if env._current_task else "",
-                        "test_cases_description": env._current_task.get("test_cases_description", "") if env._current_task else "",
-                        "reward": 0.0,
-                        "passed_tests": 0,
-                        "total_tests": len(env._current_task.get("test_cases", [])) if env._current_task else 3,
-                        "feedback": "TimeoutError: Code execution timed out. Possible infinite loop or very slow code.",
                         "done": False},
             reward=0.0,
             done=False,
         )
     except Exception as e:
-        import traceback
-        print(f"[ERROR] Exception in step: {e}\n{traceback.format_exc()}", flush=True)
         raise HTTPException(status_code=500, detail=f"Step failed: {str(e)}")
@@ -152,7 +144,6 @@ async def list_tasks() -> dict:
         "total": len(EASY_TASKS) + len(MEDIUM_TASKS) + len(HARD_TASKS),
     }
 # ─── Run directly with: python server/app.py ─────────────────────────────────
 if __name__ == "__main__":
     import sys

             reward=observation.reward or 0.0,
             done=observation.done,
         )
+    except TimeoutError:
         # Code execution timed out — return 0 reward instead of 500
         return StepResponse(
+            observation={"task_id": "unknown", "difficulty": "unknown",
+                        "buggy_code": "", "instructions": "",
+                        "test_cases_description": "", "reward": 0.0,
+                        "passed_tests": 0, "total_tests": 3,
+                        "feedback": "TimeoutError: Code execution timed out. Possible infinite loop.",
                         "done": False},
             reward=0.0,
             done=False,
         )
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Step failed: {str(e)}")
         "total": len(EASY_TASKS) + len(MEDIUM_TASKS) + len(HARD_TASKS),
     }
 # ─── Run directly with: python server/app.py ─────────────────────────────────
 if __name__ == "__main__":
     import sys

server/environment.py CHANGED Viewed

@@ -137,35 +137,14 @@ class CodeDebugEnvironment(Environment):
             )
         # Grade the submission
-        try:
-            grader = GRADERS[self._difficulty]
-            if self._difficulty == "hard":
-                reward, passed, total, feedback, _ = grader(
-                    action.fixed_code, self._current_task, action.explanation
-                )
-            else:
-                reward, passed, total, feedback, _ = grader(
-                    action.fixed_code, self._current_task
-                )
-        except Exception as e:
-            # Catch any grading errors and return helpful feedback
-            import traceback
-            error_detail = traceback.format_exc()
-            print(f"[ERROR] Grading failed for {self._current_task['task_id']}: {e}\n{error_detail}", flush=True)
-            done = self._step_count >= MAX_STEPS
-            self._done = done
-            return DebugObservation(
-                task_id=self._current_task["task_id"],
-                difficulty=self._difficulty,
-                buggy_code=self._current_task["buggy_code"],
-                instructions=self._current_task["instructions"],
-                test_cases_description=self._current_task["test_cases_description"],
-                reward=0.0,
-                passed_tests=0,
-                total_tests=len(self._current_task.get("test_cases", [])),
-                feedback=f"❌ Grading Error: {type(e).__name__}: {str(e)[:100]}\nYour code caused an unexpected error during grading. Check for infinite loops, type errors, or invalid operations.",
-                done=done,
             )
         self._current_reward = reward

             )
         # Grade the submission
+        grader = GRADERS[self._difficulty]
+        if self._difficulty == "hard":
+            reward, passed, total, feedback, _ = grader(
+                action.fixed_code, self._current_task, action.explanation
+            )
+        else:
+            reward, passed, total, feedback, _ = grader(
+                action.fixed_code, self._current_task
             )
         self._current_reward = reward