Spaces:

arnavk
/

openenv-code-debugger

Sleeping

arnavzz Claude Sonnet 4.6 committed on Apr 4

Commit

3faaaa0

1 Parent(s): b95e073

refactor: simplify and fix efficiency issues

- executor: embed code via repr() to avoid double file I/O; extract _failure()
helper to unify error result structure; use rsplit for stdout parsing;
remove unnecessary WHAT comments
- environment: cap episodes at 500 with LRU eviction to prevent memory leak;
unify observation building into single _build_observation(); store
tests_passed/total_tests in episode during step() so state() reads
directly instead of recalculating; extract _get_episode() helper
- app: pass req.action.code directly instead of model_dump()
- inference: simplify strip_fences() using removeprefix/removesuffix

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (4) hide show

code_debug_env/server/app.py +2 -27
code_debug_env/server/environment.py +46 -60
code_debug_env/server/executor.py +20 -60
inference.py +2 -6

code_debug_env/server/app.py CHANGED Viewed

@@ -1,10 +1,4 @@
-"""
-FastAPI server exposing the OpenEnv-compatible HTTP API.
-Port: 7860 (Hugging Face Spaces default)
-"""
 from fastapi import FastAPI, HTTPException
-from fastapi.responses import JSONResponse
 from ..models import (
     DebugState,
@@ -24,10 +18,6 @@ app = FastAPI(
 env = CodeDebugEnvironment()
-# ------------------------------------------------------------------
-# Health & metadata
-# ------------------------------------------------------------------
 @app.get("/health")
 async def health():
     return {"status": "healthy", "tasks_loaded": len(env.tasks)}
@@ -38,17 +28,12 @@ async def list_tasks():
     return env.list_tasks()
-# ------------------------------------------------------------------
-# OpenEnv core endpoints
-# ------------------------------------------------------------------
 @app.post("/reset", response_model=ResetResponse)
 async def reset(req: ResetRequest = None):
     if req is None:
         req = ResetRequest()
     try:
-        result = env.reset(task_id=req.task_id, seed=req.seed)
-        return result
     except KeyError as e:
         raise HTTPException(status_code=404, detail=str(e))
@@ -56,8 +41,7 @@ async def reset(req: ResetRequest = None):
 @app.post("/step/{episode_id}", response_model=StepResponse)
 async def step(episode_id: str, req: StepRequest):
     try:
-        result = env.step(episode_id, req.action.model_dump())
-        return result
     except KeyError as e:
         raise HTTPException(status_code=404, detail=str(e))
     except ValueError as e:
@@ -70,12 +54,3 @@ async def state(episode_id: str):
         return env.state(episode_id)
     except KeyError as e:
         raise HTTPException(status_code=404, detail=str(e))
-# ------------------------------------------------------------------
-# Entry point for local dev
-# ------------------------------------------------------------------
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run("code_debug_env.server.app:app", host="0.0.0.0", port=7860, reload=True)

 from fastapi import FastAPI, HTTPException
 from ..models import (
     DebugState,
 env = CodeDebugEnvironment()
 @app.get("/health")
 async def health():
     return {"status": "healthy", "tasks_loaded": len(env.tasks)}
     return env.list_tasks()
 @app.post("/reset", response_model=ResetResponse)
 async def reset(req: ResetRequest = None):
     if req is None:
         req = ResetRequest()
     try:
+        return env.reset(task_id=req.task_id, seed=req.seed)
     except KeyError as e:
         raise HTTPException(status_code=404, detail=str(e))
 @app.post("/step/{episode_id}", response_model=StepResponse)
 async def step(episode_id: str, req: StepRequest):
     try:
+        return env.step(episode_id, req.action.code)
     except KeyError as e:
         raise HTTPException(status_code=404, detail=str(e))
     except ValueError as e:
         return env.state(episode_id)
     except KeyError as e:
         raise HTTPException(status_code=404, detail=str(e))

code_debug_env/server/environment.py CHANGED Viewed

@@ -5,15 +5,20 @@ Core environment logic: task loading, reset, step, state.
 import json
 import random
 import uuid
 from pathlib import Path
 from .executor import run_code_safely
 class CodeDebugEnvironment:
     def __init__(self):
         self.tasks: dict[str, dict] = {}
-        self.episodes: dict[str, dict] = {}
         self._load_tasks()
     def _load_tasks(self):
@@ -23,10 +28,6 @@ class CodeDebugEnvironment:
                 task = json.load(f)
                 self.tasks[task["task_id"]] = task
-    # ------------------------------------------------------------------
-    # Public API
-    # ------------------------------------------------------------------
     def reset(self, task_id: str | None = None, seed: int | None = None) -> dict:
         if seed is not None:
             random.seed(seed)
@@ -40,38 +41,34 @@ class CodeDebugEnvironment:
         task = self.tasks[task_id]
         episode_id = str(uuid.uuid4())
         self.episodes[episode_id] = {
             "episode_id": episode_id,
             "task": task,
             "step_count": 0,
             "done": False,
             "rewards": [],
             "last_test_results": [],
         }
-        observation = self._initial_observation(task)
-        return {"episode_id": episode_id, "observation": observation}
-    def step(self, episode_id: str, action: dict) -> dict:
-        if episode_id not in self.episodes:
-            raise KeyError(f"Unknown episode_id: {episode_id!r}")
-        ep = self.episodes[episode_id]
         if ep["done"]:
             raise ValueError("Episode is already finished. Call reset() to start a new episode.")
         task = ep["task"]
-        submitted_code = action.get("code", "")
         ep["step_count"] += 1
-        test_results_raw, stdout, stderr = run_code_safely(
-            submitted_code,
-            task["test_code"],
-            timeout=10,
-        )
-        tests_passed = sum(1 for t in test_results_raw if t.get("passed", False))
-        total_tests = len(test_results_raw)
         reward = round(tests_passed / total_tests, 4) if total_tests > 0 else 0.0
         max_steps = task.get("max_steps", 5)
@@ -79,45 +76,31 @@ class CodeDebugEnvironment:
         ep["done"] = done
         ep["rewards"].append(reward)
-        ep["last_test_results"] = test_results_raw
-        observation = {
-            "task_id": task["task_id"],
-            "difficulty": task["difficulty"],
-            "description": task["description"],
-            "buggy_code": task["buggy_code"],
-            "test_descriptions": task["test_descriptions"],
-            "test_results": test_results_raw,
-            "stdout": stdout,
-            "stderr": stderr,
-            "step_count": ep["step_count"],
-            "max_steps": max_steps,
             "reward": reward,
             "done": done,
-            "total_tests": total_tests,
-            "tests_passed": tests_passed,
         }
-        return {"observation": observation, "reward": reward, "done": done, "info": {}}
     def state(self, episode_id: str) -> dict:
-        if episode_id not in self.episodes:
-            raise KeyError(f"Unknown episode_id: {episode_id!r}")
-        ep = self.episodes[episode_id]
-        task = ep["task"]
-        last_results = ep.get("last_test_results", [])
         return {
             "episode_id": episode_id,
-            "task_id": task["task_id"],
-            "difficulty": task["difficulty"],
             "step_count": ep["step_count"],
-            "max_steps": task.get("max_steps", 5),
             "last_reward": ep["rewards"][-1] if ep["rewards"] else 0.0,
             "cumulative_reward": round(sum(ep["rewards"]), 4),
-            "tests_passed": sum(1 for t in last_results if t.get("passed", False)),
-            "total_tests": len(last_results),
             "done": ep["done"],
         }
@@ -133,24 +116,27 @@ class CodeDebugEnvironment:
             for t in self.tasks.values()
         ]
-    # ------------------------------------------------------------------
-    # Internal helpers
-    # ------------------------------------------------------------------
-    def _initial_observation(self, task: dict) -> dict:
         return {
             "task_id": task["task_id"],
             "difficulty": task["difficulty"],
             "description": task["description"],
             "buggy_code": task["buggy_code"],
             "test_descriptions": task["test_descriptions"],
-            "test_results": [],
-            "stdout": "",
-            "stderr": "",
-            "step_count": 0,
             "max_steps": task.get("max_steps", 5),
-            "reward": 0.0,
-            "done": False,
-            "total_tests": len(task["test_descriptions"]),
-            "tests_passed": 0,
         }

 import json
 import random
 import uuid
+from collections import OrderedDict
 from pathlib import Path
 from .executor import run_code_safely
+# Cap in-memory episodes to prevent unbounded growth.
+_MAX_EPISODES = 500
 class CodeDebugEnvironment:
     def __init__(self):
         self.tasks: dict[str, dict] = {}
+        # OrderedDict used as a simple insertion-ordered (FIFO) cap: the oldest-inserted episode is evicted when the cap is hit.
+        self.episodes: OrderedDict[str, dict] = OrderedDict()
         self._load_tasks()
     def _load_tasks(self):
                 task = json.load(f)
                 self.tasks[task["task_id"]] = task
     def reset(self, task_id: str | None = None, seed: int | None = None) -> dict:
         if seed is not None:
             random.seed(seed)
         task = self.tasks[task_id]
         episode_id = str(uuid.uuid4())
+        if len(self.episodes) >= _MAX_EPISODES:
+            self.episodes.popitem(last=False)
         self.episodes[episode_id] = {
             "episode_id": episode_id,
             "task": task,
             "step_count": 0,
             "done": False,
             "rewards": [],
+            "tests_passed": 0,
+            "total_tests": len(task["test_descriptions"]),
             "last_test_results": [],
         }
+        return {"episode_id": episode_id, "observation": self._build_observation(episode_id)}
+    def step(self, episode_id: str, code: str) -> dict:
+        ep = self._get_episode(episode_id)
         if ep["done"]:
             raise ValueError("Episode is already finished. Call reset() to start a new episode.")
         task = ep["task"]
         ep["step_count"] += 1
+        test_results, stdout, stderr = run_code_safely(code, task["test_code"], timeout=10)
+        tests_passed = sum(1 for t in test_results if t.get("passed", False))
+        total_tests = len(test_results)
         reward = round(tests_passed / total_tests, 4) if total_tests > 0 else 0.0
         max_steps = task.get("max_steps", 5)
         ep["done"] = done
         ep["rewards"].append(reward)
+        ep["tests_passed"] = tests_passed
+        ep["total_tests"] = total_tests
+        ep["last_test_results"] = test_results
+        ep["last_stdout"] = stdout
+        ep["last_stderr"] = stderr
+        return {
+            "observation": self._build_observation(episode_id),
             "reward": reward,
             "done": done,
+            "info": {},
         }
     def state(self, episode_id: str) -> dict:
+        ep = self._get_episode(episode_id)
         return {
             "episode_id": episode_id,
+            "task_id": ep["task"]["task_id"],
+            "difficulty": ep["task"]["difficulty"],
             "step_count": ep["step_count"],
+            "max_steps": ep["task"].get("max_steps", 5),
             "last_reward": ep["rewards"][-1] if ep["rewards"] else 0.0,
             "cumulative_reward": round(sum(ep["rewards"]), 4),
+            "tests_passed": ep["tests_passed"],
+            "total_tests": ep["total_tests"],
             "done": ep["done"],
         }
             for t in self.tasks.values()
         ]
+    def _get_episode(self, episode_id: str) -> dict:
+        if episode_id not in self.episodes:
+            raise KeyError(f"Unknown episode_id: {episode_id!r}")
+        return self.episodes[episode_id]
+    def _build_observation(self, episode_id: str) -> dict:
+        ep = self.episodes[episode_id]
+        task = ep["task"]
         return {
             "task_id": task["task_id"],
             "difficulty": task["difficulty"],
             "description": task["description"],
             "buggy_code": task["buggy_code"],
             "test_descriptions": task["test_descriptions"],
+            "test_results": ep["last_test_results"],
+            "stdout": ep.get("last_stdout", ""),
+            "stderr": ep.get("last_stderr", ""),
+            "step_count": ep["step_count"],
             "max_steps": task.get("max_steps", 5),
+            "reward": ep["rewards"][-1] if ep["rewards"] else 0.0,
+            "done": ep["done"],
+            "total_tests": ep["total_tests"],
+            "tests_passed": ep["tests_passed"],
         }

code_debug_env/server/executor.py CHANGED Viewed

@@ -1,9 +1,5 @@
 """
-Safe code execution engine.
-Runs submitted code in a subprocess with timeout.
-Writes code to a temp directory, generates a test harness,
-and parses JSON results from stdout.
 """
 import json
@@ -22,40 +18,26 @@ def run_code_safely(
     """
     Execute submitted code against test cases in an isolated subprocess.
-    Args:
-        submitted_code: The Python code the agent submitted as a fix.
-        test_code: Python snippet that populates a `results` list with test dicts.
-        timeout: Max seconds before killing the subprocess.
-    Returns:
-        (test_results, stdout_extra, stderr) where test_results is a list of
-        {"test_name", "passed", "expected", "actual", "error"} dicts.
     """
     with tempfile.TemporaryDirectory() as tmpdir:
-        solution_path = Path(tmpdir) / "solution.py"
         harness_path = Path(tmpdir) / "harness.py"
-        # Write the submitted code as a module
-        solution_path.write_text(submitted_code, encoding="utf-8")
-        # Build the test harness
         harness = textwrap.dedent(f"""\
             import sys, json, traceback
-            sys.path.insert(0, r"{tmpdir}")
             results = []
-            # Execute the submitted solution in this namespace
             try:
-                exec(open(r"{solution_path}", encoding="utf-8").read())
-            except Exception as e:
-                # If the solution itself fails to load, all tests fail
                 print(json.dumps([{{"test_name": "load", "passed": False,
                        "expected": "code loads", "actual": "",
                        "error": traceback.format_exc()}}]))
                 sys.exit(0)
-            # Run the test code (populates `results`)
             {textwrap.indent(test_code, "            ").strip()}
             print(json.dumps(results))
@@ -74,44 +56,22 @@ def run_code_safely(
             stdout = proc.stdout.strip()
             stderr = proc.stderr.strip()
-            # Parse test results from last line of stdout (the JSON array)
             if stdout:
-                # The JSON array should be the last line
-                lines = stdout.split("\n")
-                json_line = lines[-1]
-                extra_output = "\n".join(lines[:-1]) if len(lines) > 1 else ""
                 try:
-                    test_results = json.loads(json_line)
-                    return test_results, extra_output, stderr
                 except json.JSONDecodeError:
-                    return [
-                        {
-                            "test_name": "parse_error",
-                            "passed": False,
-                            "expected": "valid JSON output",
-                            "actual": stdout[:200],
-                            "error": "Could not parse test results from subprocess output",
-                        }
-                    ], "", stderr
-            # No stdout at all — likely a crash
-            return [
-                {
-                    "test_name": "execution_error",
-                    "passed": False,
-                    "expected": "code runs",
-                    "actual": "",
-                    "error": stderr[:500] if stderr else "No output produced",
-                }
-            ], "", stderr
         except subprocess.TimeoutExpired:
-            return [
-                {
-                    "test_name": "timeout",
-                    "passed": False,
-                    "expected": f"completes within {timeout}s",
-                    "actual": "timed out",
-                    "error": f"Code execution exceeded {timeout} second timeout",
-                }
-            ], "", "Execution timed out"

 """
+Safe code execution engine using subprocess with timeout.
 """
 import json
     """
     Execute submitted code against test cases in an isolated subprocess.
+    Returns (test_results, extra_stdout, stderr).
+    test_results is a list of {test_name, passed, expected, actual, error} dicts.
     """
     with tempfile.TemporaryDirectory() as tmpdir:
         harness_path = Path(tmpdir) / "harness.py"
+        # Embed submitted code directly via repr() to avoid a second file read.
         harness = textwrap.dedent(f"""\
             import sys, json, traceback
             results = []
             try:
+                exec({repr(submitted_code)})
+            except Exception:
                 print(json.dumps([{{"test_name": "load", "passed": False,
                        "expected": "code loads", "actual": "",
                        "error": traceback.format_exc()}}]))
                 sys.exit(0)
             {textwrap.indent(test_code, "            ").strip()}
             print(json.dumps(results))
             stdout = proc.stdout.strip()
             stderr = proc.stderr.strip()
             if stdout:
+                # JSON array is always the last line; anything before it is extra output.
+                extra_output, json_line = stdout.rsplit("\n", maxsplit=1) if "\n" in stdout else ("", stdout)
                 try:
+                    return json.loads(json_line), extra_output, stderr
                 except json.JSONDecodeError:
+                    return _failure("parse_error", "valid JSON output", stdout[:200],
+                                    "Could not parse test results from subprocess output"), "", stderr
+            return _failure("execution_error", "code runs", "",
+                            stderr[:500] if stderr else "No output produced"), "", stderr
         except subprocess.TimeoutExpired:
+            return _failure("timeout", f"completes within {timeout}s", "timed out",
+                            f"Code execution exceeded {timeout} second timeout"), "", "Execution timed out"
+def _failure(name: str, expected: str, actual: str, error: str) -> list[dict]:
+    return [{"test_name": name, "passed": False, "expected": expected, "actual": actual, "error": error}]

inference.py CHANGED Viewed

@@ -111,12 +111,8 @@ def build_feedback_prompt(obs: dict) -> str:
 def strip_fences(text: str) -> str:
     text = text.strip()
-    if text.startswith("```python"):
-        text = text[len("```python"):].strip()
-    elif text.startswith("```"):
-        text = text[3:].strip()
-    if text.endswith("```"):
-        text = text[:-3].strip()
     return text

 def strip_fences(text: str) -> str:
     text = text.strip()
+    text = text.removeprefix("```python").removeprefix("```").strip()
+    text = text.removesuffix("```").strip()
     return text