Spaces:

Souravdanyal
/

code-debug-env

Running

App Files Community

Souravdanyal committed on Apr 5

Commit

8485798

1 Parent(s): c21c4ee

Final complete version - all fixes applied

Browse files

Files changed (23) hide show

README.md +25 -12
inference.py +109 -182
server/__pycache__/__init__.cpython-39.pyc +0 -0
server/__pycache__/app.cpython-310.pyc +0 -0
server/__pycache__/app.cpython-39.pyc +0 -0
server/__pycache__/environment.cpython-310.pyc +0 -0
server/app.py +33 -71
server/graders/__pycache__/grader_easy.cpython-310.pyc +0 -0
server/graders/__pycache__/grader_hard.cpython-310.pyc +0 -0
server/graders/__pycache__/grader_medium.cpython-310.pyc +0 -0
server/graders/grader_easy.py +17 -31
server/graders/grader_hard.py +14 -94
server/tasks/__pycache__/__init__.cpython-39.pyc +0 -0
server/tasks/__pycache__/task_easy.cpython-310.pyc +0 -0
server/tasks/__pycache__/task_easy.cpython-39.pyc +0 -0
server/tasks/__pycache__/task_hard.cpython-310.pyc +0 -0
server/tasks/__pycache__/task_hard.cpython-39.pyc +0 -0
server/tasks/__pycache__/task_medium.cpython-310.pyc +0 -0
server/tasks/__pycache__/task_medium.cpython-39.pyc +0 -0
server/tasks/task_hard.py +1 -1
server/tasks/task_medium.py +1 -1
spec.md +37 -95
tests/test_graders.py +30 -49

README.md CHANGED Viewed

@@ -1,12 +1,3 @@
----
-title: Code Debug Env
-emoji: 🐛
-colorFrom: blue
-colorTo: green
-sdk: docker
-pinned: false
----
 # Code Debug Environment
 An [OpenEnv](https://github.com/meta-pytorch/OpenEnv)-compatible RL environment where an LLM agent diagnoses and fixes buggy Python code across three difficulty levels.
@@ -114,9 +105,10 @@ Explanation is scored by matching key algorithmic concepts. Partial credit is gi
 ### Install
 ```bash
-git clone https://github.com/Souravdanyal/code-debug-env
 cd code-debug-env
 pip install -e .
 git clone https://github.com/meta-pytorch/OpenEnv.git
 export PYTHONPATH=$PYTHONPATH:OpenEnv:OpenEnv/src:.
 ```
@@ -171,11 +163,32 @@ python inference.py --url http://localhost:7860 --difficulty hard
 ## Pre-Submission Validation
 ```bash
 python validator/pre_submit_check.py --url http://localhost:7860
 # Or against your HF Space:
-python validator/pre_submit_check.py --url https://Souravdanyal-code-debug-env.hf.space
 ```
 ---
@@ -206,4 +219,4 @@ code-debug-env/
 │   └── Dockerfile
 └── validator/
     └── pre_submit_check.py
-```

 # Code Debug Environment
 An [OpenEnv](https://github.com/meta-pytorch/OpenEnv)-compatible RL environment where an LLM agent diagnoses and fixes buggy Python code across three difficulty levels.
 ### Install
 ```bash
+git clone https://github.com/YOUR_USERNAME/code-debug-env
 cd code-debug-env
 pip install -e .
+# Also clone OpenEnv for PYTHONPATH
 git clone https://github.com/meta-pytorch/OpenEnv.git
 export PYTHONPATH=$PYTHONPATH:OpenEnv:OpenEnv/src:.
 ```
 ## Pre-Submission Validation
+Run before submitting to catch any disqualifying issues:
 ```bash
+# Start the environment first, then:
 python validator/pre_submit_check.py --url http://localhost:7860
 # Or against your HF Space:
+python validator/pre_submit_check.py --url https://YOUR_SPACE.hf.space
+```
+---
+## Deploy to Hugging Face Spaces
+```bash
+# Login
+huggingface-cli login
+# Create space and push
+huggingface-cli repo create code-debug-env --type space --space_sdk docker
+cd code-debug-env
+git init
+git remote add origin https://huggingface.co/spaces/YOUR_USERNAME/code-debug-env
+git add .
+git commit -m "Initial commit"
+git push origin main
 ```
 ---
 │   └── Dockerfile
 └── validator/
     └── pre_submit_check.py
+```

inference.py CHANGED Viewed

@@ -1,262 +1,189 @@
 #!/usr/bin/env python3
-# inference.py — Code Debug Environment Baseline Agent
-# Log format strictly follows [START] [STEP] [END] as required by evaluator.
-#
-# Required env vars: API_BASE_URL, MODEL_NAME, HF_TOKEN
-# Usage:
-#   python inference.py
-#   python inference.py --url https://Souravdanyal-code-debug-env.hf.space
-#   python inference.py --difficulty easy
-import os
-import sys
-import json
-import time
-import argparse
-import requests
 from openai import OpenAI
 from typing import List, Optional
-# ─── Configuration ────────────────────────────────────────────────────────────
 API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
-MODEL_NAME   = os.environ.get("MODEL_NAME", "gpt-4o-mini")
-HF_TOKEN     = os.environ.get("HF_TOKEN", "")
-ENV_URL      = os.environ.get("ENV_URL", "http://localhost:7860")
 BENCHMARK    = "code-debug-env"
 MAX_STEPS    = 5
-# ─── OpenAI Client ───────────────────────────────────────────────────────────
 client = OpenAI(api_key=HF_TOKEN or "dummy", base_url=API_BASE_URL)
-# ─── Logging — STRICT FORMAT REQUIRED BY EVALUATOR ───────────────────────────
-def log_start(task_id: str, env: str, model: str) -> None:
     print(f"[START] task={task_id} env={env} model={model}", flush=True)
-def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
-    error_val = error if error else "null"
-    done_val = str(done).lower()
-    print(f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}", flush=True)
-def log_end(success: bool, steps: int, rewards: List[float]) -> None:
-    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
-    print(f"[END] success={str(success).lower()} steps={steps} rewards={rewards_str}", flush=True)
-# ─── Environment Client ───────────────────────────────────────────────────────
-def env_reset(env_url: str, difficulty: str) -> dict:
-    resp = requests.post(f"{env_url}/reset", json={"difficulty": difficulty}, timeout=30)
-    resp.raise_for_status()
-    return resp.json()
-def env_step(env_url: str, fixed_code: str, explanation: str = None) -> dict:
     payload = {"fixed_code": fixed_code}
     if explanation:
         payload["explanation"] = explanation
-    resp = requests.post(f"{env_url}/step", json=payload, timeout=30)
-    resp.raise_for_status()
-    return resp.json()
-# ─── LLM Agent ───────────────────────────────────────────────────────────────
-SYSTEM_PROMPT = """You are an expert Python debugging agent. Your job is to find and fix bugs in Python functions.
-CRITICAL RULES:
-- You MUST respond ONLY with valid JSON — no markdown, no explanation outside JSON
-- Return the COMPLETE fixed function, not just the changed line
-- The fixed_code must be syntactically valid Python
-- For hard tasks, the explanation field MUST describe: what the bug was, why it caused failures, and how your fix resolves it
-Response format (strictly):
 {
-  "fixed_code": "<complete corrected Python function>",
-  "explanation": "<for hard tasks: detailed explanation of bug and fix>"
 }
-DEBUGGING STRATEGY:
-1. Read the instructions carefully — they tell you exactly what type of bug exists
-2. Trace through the logic with the test inputs mentally
-3. For easy tasks: find the ONE wrong operator, value, or return statement
-4. For medium tasks: find BOTH bugs — usually one logic bug + one edge case
-5. For hard tasks: find the algorithmic flaw + write a clear explanation
-6. If your previous attempt failed, READ THE FEEDBACK — it shows exactly which inputs failed and what output was expected
 """
-def call_llm(buggy_code: str, instructions: str, difficulty: str,
-             feedback: str = None, attempt: int = 1,
-             previous_code: str = None) -> dict:
-    user_content = f"""Task difficulty: {difficulty}
-Instructions: {instructions}
-Buggy code to fix:
-```python
-{buggy_code}
-```
-"""
     if feedback and attempt > 1:
-        user_content += f"""
-PREVIOUS ATTEMPT FAILED. Here is the feedback showing what went wrong:
-{feedback}
-Your previous fix was:
-```python
-{previous_code or 'unknown'}
-```
-IMPORTANT: Your previous fix did not work. Carefully analyze the feedback above.
-Look at the Input, Expected, and Got values for each failing test.
-Try a completely different approach to fix the bug.
-"""
     if difficulty == "hard":
-        user_content += """
-Remember: For hard tasks you MUST include a detailed explanation field describing:
-- What the algorithmic bug was
-- Why it caused incorrect results
-- How your fix resolves it
-Explanation quality affects 30% of your reward.
-"""
-    messages = [
-        {"role": "system", "content": SYSTEM_PROMPT},
-        {"role": "user", "content": user_content},
-    ]
     try:
-        response = client.chat.completions.create(
             model=MODEL_NAME,
-            messages=messages,
             max_tokens=1500,
-            temperature=0.2 if attempt == 1 else 0.5,
         )
-        content = response.choices[0].message.content.strip()
-        # Strip markdown fences
-        if content.startswith("```"):
-            lines = content.split("\n")
-            content = "\n".join(lines[1:-1]) if lines[-1].strip() == "```" else "\n".join(lines[1:])
-        # Strip json prefix
-        if content.startswith("json"):
-            content = content[4:].strip()
-        parsed = json.loads(content)
-        return {
-            "fixed_code": parsed.get("fixed_code", ""),
-            "explanation": parsed.get("explanation", None),
-        }
-    except json.JSONDecodeError:
-        # Try to extract code from malformed response
-        if "def " in content:
-            lines = content.split("\n")
-            code_lines = []
-            in_code = False
-            for line in lines:
-                if line.strip().startswith("def "):
-                    in_code = True
-                if in_code:
-                    code_lines.append(line)
-            if code_lines:
-                return {"fixed_code": "\n".join(code_lines), "explanation": None}
-        return {"fixed_code": buggy_code, "explanation": None}
     except Exception as e:
-        print(f"# LLM call failed: {e}", file=sys.stderr)
         return {"fixed_code": buggy_code, "explanation": None}
-# ─── Main Episode Loop ────────────────────────────────────────────────────────
-def run_episode(env_url: str, difficulty: str) -> tuple:
-    reset_data = env_reset(env_url, difficulty)
-    obs = reset_data["observation"]
     task_id      = obs["task_id"]
     buggy_code   = obs["buggy_code"]
     instructions = obs["instructions"]
-    log_start(task_id=task_id, env=BENCHMARK, model=MODEL_NAME)
-    last_feedback = None
-    last_fixed_code = None
-    rewards: List[float] = []
-    steps_taken = 0
-    success = False
     for attempt in range(1, MAX_STEPS + 1):
         steps_taken = attempt
-        agent_action = call_llm(
-            buggy_code=buggy_code,
-            instructions=instructions,
-            difficulty=difficulty,
-            feedback=last_feedback,
-            attempt=attempt,
-            previous_code=last_fixed_code,
-        )
-        fixed_code = agent_action["fixed_code"]
-        last_fixed_code = fixed_code
-        if not fixed_code or not fixed_code.strip():
-            log_step(step=attempt, action="empty_submission",
-                     reward=0.0, done=False, error="empty_code")
             rewards.append(0.0)
             continue
         try:
-            result = env_step(env_url, fixed_code=fixed_code,
-                              explanation=agent_action.get("explanation"))
         except Exception as e:
-            log_step(step=attempt, action="step_failed",
-                     reward=0.0, done=False, error=str(e)[:60])
             rewards.append(0.0)
             continue
         reward = result.get("reward", 0.0)
         done   = result.get("done", False)
-        obs_r  = result.get("observation", {})
-        last_feedback = obs_r.get("feedback", "")
-        log_step(step=attempt, action=f"fix_{difficulty}_attempt{attempt}",
-                 reward=reward, done=done, error=None)
         rewards.append(reward)
         if reward >= 1.0:
             success = True
         if done:
             break
-    log_end(success=success, steps=steps_taken, rewards=rewards)
     return success, steps_taken, rewards
 def main():
-    parser = argparse.ArgumentParser(description="Code Debug Environment Baseline Agent")
-    parser.add_argument("--url", default=ENV_URL, help="Environment base URL")
-    parser.add_argument("--difficulty", default=None,
-                        choices=["easy", "medium", "hard", "all"])
     args = parser.parse_args()
-    env_url = args.url.rstrip("/")
     try:
-        health = requests.get(f"{env_url}/health", timeout=10)
-        health.raise_for_status()
-        print(f"# Environment healthy at {env_url}", flush=True)
     except Exception as e:
         print(f"# Health check failed: {e}", file=sys.stderr)
         sys.exit(1)
-    difficulties = ["easy", "medium", "hard"] if (
-        args.difficulty in ("all", None)) else [args.difficulty]
-    all_rewards = []
-    all_successes = []
-    for difficulty in difficulties:
-        success, steps, rewards = run_episode(env_url, difficulty)
         all_rewards.extend(rewards)
-        all_successes.append(success)
         time.sleep(0.5)
-    avg = round(sum(all_rewards) / len(all_rewards), 3) if all_rewards else 0.0
-    print(
-        f"# SUMMARY: {sum(all_successes)}/{len(difficulties)} tasks solved | avg_reward={avg}",
-        flush=True
-    )
 if __name__ == "__main__":
-    main()

 #!/usr/bin/env python3
+"""
+inference.py - Code Debug Environment Baseline Agent
+Required env vars: API_BASE_URL, MODEL_NAME, HF_TOKEN
+Usage:
+  python inference.py
+  python inference.py --url https://Souravdanyal-code-debug-env.hf.space
+  python inference.py --difficulty easy
+STDOUT FORMAT (required by evaluator):
+  [START] task=<id> env=<benchmark> model=<model>
+  [STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>
+  [END] success=<true|false> steps=<n> rewards=<r1,r2,...>
+"""
+import os, sys, json, time, argparse, requests
 from openai import OpenAI
 from typing import List, Optional
+# ── Config ────────────────────────────────────────────────────────────────────
 API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
+MODEL_NAME   = os.environ.get("MODEL_NAME",   "gpt-4o-mini")
+HF_TOKEN     = os.environ.get("HF_TOKEN",     "")
+ENV_URL      = os.environ.get("ENV_URL",      "http://localhost:7860")
 BENCHMARK    = "code-debug-env"
 MAX_STEPS    = 5
 client = OpenAI(api_key=HF_TOKEN or "dummy", base_url=API_BASE_URL)
+# ── Logging ───────────────────────────────────────────────────────────────────
+def log_start(task_id, env, model):
     print(f"[START] task={task_id} env={env} model={model}", flush=True)
+def log_step(step, action, reward, done, error):
+    print(f"[STEP] step={step} action={action} reward={reward:.2f} done={str(done).lower()} error={error or 'null'}", flush=True)
+def log_end(success, steps, rewards):
+    print(f"[END] success={str(success).lower()} steps={steps} rewards={','.join(f'{r:.2f}' for r in rewards)}", flush=True)
+# ── Env client ────────────────────────────────────────────────────────────────
+def env_reset(url, difficulty):
+    r = requests.post(f"{url}/reset", json={"difficulty": difficulty}, timeout=30)
+    r.raise_for_status()
+    return r.json()
+def env_step(url, fixed_code, explanation=None):
     payload = {"fixed_code": fixed_code}
     if explanation:
         payload["explanation"] = explanation
+    r = requests.post(f"{url}/step", json=payload, timeout=30)
+    r.raise_for_status()
+    return r.json()
+# ── LLM ──────────────────────────────────────────────────────────────────────
+SYSTEM_PROMPT = """You are an expert Python debugging agent. Fix bugs in Python functions.
+RESPONSE FORMAT — strictly JSON only, no markdown:
 {
+  "fixed_code": "<complete corrected Python function including imports>",
+  "explanation": "<for hard tasks: explain the bug, root cause, and fix>"
 }
+RULES:
+- Return COMPLETE function with all imports (e.g. from collections import deque)
+- fixed_code must be valid Python
+- For hard tasks explanation MUST mention the algorithmic concept
+COMMON BUGS:
+- Graph/BFS: missing visited set → infinite loop on cycles → add visited=set()
+- Knapsack DP: wrong loop order (forward=unbounded, backward=0/1 knapsack)
+- Binary search: wrong boundary → return high not low, or high=n//2 not n
+- Off-by-one: lst[2] should be lst[1] for second element
+- Wrong operator: + instead of -, * instead of /
+IF PREVIOUS ATTEMPT FAILED:
+- Read the Input/Expected/Got carefully
+- Try a completely different fix
+- For TimeoutError: you have an infinite loop, add a visited set
 """
+def call_llm(buggy_code, instructions, difficulty, feedback=None, attempt=1, prev_code=None):
+    content = f"Difficulty: {difficulty}\nInstructions: {instructions}\n\nBuggy code:\n```python\n{buggy_code}\n```\n"
     if feedback and attempt > 1:
+        content += f"\nPREVIOUS FIX FAILED. Feedback:\n{feedback}\n\nYour previous code:\n```python\n{prev_code or ''}\n```\nTry a different approach.\n"
     if difficulty == "hard":
+        content += "\nIMPORTANT: Include a detailed explanation field mentioning the algorithmic concept.\n"
     try:
+        resp = client.chat.completions.create(
             model=MODEL_NAME,
+            messages=[{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": content}],
             max_tokens=1500,
+            temperature=0.1 if attempt == 1 else 0.4,
         )
+        raw = resp.choices[0].message.content.strip()
+        # Clean markdown fences
+        if "```" in raw:
+            raw = raw.split("```")[1] if raw.startswith("```") else raw
+            if raw.startswith("json\n"):
+                raw = raw[5:]
+        # Find JSON object
+        start = raw.find("{")
+        end = raw.rfind("}") + 1
+        if start >= 0 and end > start:
+            raw = raw[start:end]
+        parsed = json.loads(raw)
+        return {"fixed_code": parsed.get("fixed_code", ""), "explanation": parsed.get("explanation")}
     except Exception as e:
+        print(f"# LLM error: {e}", file=sys.stderr)
         return {"fixed_code": buggy_code, "explanation": None}
+# ── Episode ───────────────────────────────────────────────────────────────────
+def run_episode(env_url, difficulty):
+    data = env_reset(env_url, difficulty)
+    obs  = data["observation"]
     task_id      = obs["task_id"]
     buggy_code   = obs["buggy_code"]
     instructions = obs["instructions"]
+    log_start(task_id, BENCHMARK, MODEL_NAME)
+    rewards, steps_taken, success = [], 0, False
+    last_feedback, last_code = None, None
     for attempt in range(1, MAX_STEPS + 1):
         steps_taken = attempt
+        action = call_llm(buggy_code, instructions, difficulty, last_feedback, attempt, last_code)
+        code = action["fixed_code"]
+        last_code = code
+        if not code or not code.strip():
+            log_step(attempt, "empty_submission", 0.0, False, "empty_code")
             rewards.append(0.0)
             continue
         try:
+            result = env_step(env_url, code, action.get("explanation"))
         except Exception as e:
+            log_step(attempt, "step_failed", 0.0, False, str(e)[:60])
             rewards.append(0.0)
             continue
         reward = result.get("reward", 0.0)
         done   = result.get("done", False)
+        last_feedback = result.get("observation", {}).get("feedback", "")
+        log_step(attempt, f"fix_{difficulty}_attempt{attempt}", reward, done, None)
         rewards.append(reward)
         if reward >= 1.0:
             success = True
         if done:
             break
+    log_end(success, steps_taken, rewards)
     return success, steps_taken, rewards
+# ── Main ──────────────────────────────────────────────────────────────────────
 def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--url", default=ENV_URL)
+    parser.add_argument("--difficulty", default=None, choices=["easy","medium","hard","all"])
     args = parser.parse_args()
+    url = args.url.rstrip("/")
     try:
+        requests.get(f"{url}/health", timeout=10).raise_for_status()
+        print(f"# Environment healthy at {url}", flush=True)
     except Exception as e:
         print(f"# Health check failed: {e}", file=sys.stderr)
         sys.exit(1)
+    diffs = ["easy","medium","hard"] if args.difficulty in (None,"all") else [args.difficulty]
+    all_rewards, successes = [], []
+    for d in diffs:
+        ok, _, rewards = run_episode(url, d)
         all_rewards.extend(rewards)
+        successes.append(ok)
         time.sleep(0.5)
+    avg = round(sum(all_rewards)/len(all_rewards), 3) if all_rewards else 0.0
+    print(f"# SUMMARY: {sum(successes)}/{len(diffs)} tasks solved | avg_reward={avg}", flush=True)
 if __name__ == "__main__":
+    main()

server/__pycache__/__init__.cpython-39.pyc DELETED Viewed

Binary file (161 Bytes)

server/__pycache__/app.cpython-310.pyc CHANGED Viewed

Binary files a/server/__pycache__/app.cpython-310.pyc and b/server/__pycache__/app.cpython-310.pyc differ

server/__pycache__/app.cpython-39.pyc DELETED Viewed

Binary file (4.22 kB)

server/__pycache__/environment.cpython-310.pyc CHANGED Viewed

Binary files a/server/__pycache__/environment.cpython-310.pyc and b/server/__pycache__/environment.cpython-310.pyc differ

server/app.py CHANGED Viewed

@@ -1,5 +1,4 @@
-# server/app.py
-# FastAPI server exposing the OpenEnv standard endpoints.
 # Port 7860 required for Hugging Face Spaces.
 from fastapi import FastAPI, HTTPException
@@ -14,108 +13,74 @@ from models import DebugAction, DebugObservation, DebugState
 app = FastAPI(
     title="Code Debug Environment",
-    description=(
-        "An OpenEnv environment where LLM agents fix buggy Python code. "
-        "3 difficulty levels: easy (1 bug), medium (2 bugs), hard (algorithmic + explanation)."
-    ),
     version="1.0.0",
 )
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-# One global environment instance (single session)
-# For concurrent sessions, instantiate per-request with a session dict
 env = CodeDebugEnvironment()
-# ─── Request Models ─────────────────────────────────────────────────────────
 class ResetRequest(BaseModel):
-    difficulty: Optional[str] = None  # "easy" | "medium" | "hard" | None (random)
 class StepRequest(BaseModel):
     fixed_code: str
     explanation: Optional[str] = None
-# ─── Response wrapper matching OpenEnv StepResult shape ──────────────────────
 class StepResponse(BaseModel):
     observation: dict
     reward: float
     done: bool
-# ─── Endpoints ───────────────────────────────────────────────────────────────
 @app.get("/", response_class=HTMLResponse)
 async def root():
-    """Homepage — shows environment info and API endpoints."""
     html_path = os.path.join(os.path.dirname(__file__), "static", "index.html")
-    with open(html_path, "r") as f:
         return f.read()
 @app.get("/health")
 async def health():
-    """Health check endpoint — must return 200 for submission validation."""
     return {"status": "ok", "environment": "code-debug-env", "version": "1.0.0"}
 @app.post("/reset")
 async def reset(request: ResetRequest = ResetRequest()) -> dict:
-    """
-    Reset the environment to start a new episode.
-    Optionally pass difficulty: 'easy' | 'medium' | 'hard'
-    """
     try:
-        observation = env.reset(difficulty=request.difficulty)
-        return {
-            "observation": observation.model_dump(),
-            "reward": 0.0,
-            "done": False,
-        }
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Reset failed: {str(e)}")
 @app.post("/step")
 async def step(request: StepRequest) -> StepResponse:
-    """
-    Submit a code fix (and optional explanation for hard tasks).
-    Returns observation with reward (0.0-1.0), feedback, and done flag.
-    """
     if not request.fixed_code or not request.fixed_code.strip():
         raise HTTPException(status_code=400, detail="fixed_code must not be empty.")
     try:
-        action = DebugAction(
-            fixed_code=request.fixed_code,
-            explanation=request.explanation,
-        )
-        observation = env.step(action)
-        return StepResponse(
-            observation=observation.model_dump(),
-            reward=observation.reward or 0.0,
-            done=observation.done,
-        )
     except TimeoutError:
-        # Code execution timed out — return 0 reward instead of 500
         return StepResponse(
-            observation={"task_id": "unknown", "difficulty": "unknown",
-                        "buggy_code": "", "instructions": "",
-                        "test_cases_description": "", "reward": 0.0,
-                        "passed_tests": 0, "total_tests": 3,
-                        "feedback": "TimeoutError: Code execution timed out. Possible infinite loop.",
-                        "done": False},
-            reward=0.0,
-            done=False,
         )
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Step failed: {str(e)}")
@@ -123,30 +88,27 @@ async def step(request: StepRequest) -> StepResponse:
 @app.get("/state")
 async def state() -> dict:
-    """Return the current episode state."""
     try:
-        s = env.state
-        return s.model_dump()
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"State failed: {str(e)}")
 @app.get("/tasks")
 async def list_tasks() -> dict:
-    """List available task IDs per difficulty (for inspection)."""
     from server.tasks.task_easy import EASY_TASKS
     from server.tasks.task_medium import MEDIUM_TASKS
     from server.tasks.task_hard import HARD_TASKS
     return {
-        "easy": [t["task_id"] for t in EASY_TASKS],
         "medium": [t["task_id"] for t in MEDIUM_TASKS],
-        "hard": [t["task_id"] for t in HARD_TASKS],
-        "total": len(EASY_TASKS) + len(MEDIUM_TASKS) + len(HARD_TASKS),
     }
-# ─── Run directly with: python server/app.py ─────────────────────────────────
 if __name__ == "__main__":
-    import sys
     import uvicorn
-    sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-    uvicorn.run("server.app:app", host="127.0.0.1", port=7860, reload=True)

+# server/app.py — FastAPI server for Code Debug Environment
 # Port 7860 required for Hugging Face Spaces.
 from fastapi import FastAPI, HTTPException
 app = FastAPI(
     title="Code Debug Environment",
+    description="OpenEnv RL environment where LLM agents fix buggy Python code. 3 difficulty levels: easy, medium, hard.",
     version="1.0.0",
 )
+app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 env = CodeDebugEnvironment()
 class ResetRequest(BaseModel):
+    difficulty: Optional[str] = None
 class StepRequest(BaseModel):
     fixed_code: str
     explanation: Optional[str] = None
 class StepResponse(BaseModel):
     observation: dict
     reward: float
     done: bool
 @app.get("/", response_class=HTMLResponse)
 async def root():
+    """Homepage with live interactive tester."""
     html_path = os.path.join(os.path.dirname(__file__), "static", "index.html")
+    with open(html_path, "r", encoding="utf-8") as f:
         return f.read()
+@app.get("/favicon.ico", include_in_schema=False)
+async def favicon():
+    from fastapi.responses import Response
+    return Response(content=b"", media_type="image/x-icon")
 @app.get("/health")
 async def health():
+    """Health check — must return 200 for submission validation."""
     return {"status": "ok", "environment": "code-debug-env", "version": "1.0.0"}
 @app.post("/reset")
 async def reset(request: ResetRequest = ResetRequest()) -> dict:
+    """Reset environment to start a new episode. Pass difficulty: easy | medium | hard"""
     try:
+        obs = env.reset(difficulty=request.difficulty)
+        return {"observation": obs.model_dump(), "reward": 0.0, "done": False}
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Reset failed: {str(e)}")
 @app.post("/step")
 async def step(request: StepRequest) -> StepResponse:
+    """Submit fixed code. Returns reward (0.0-1.0), feedback, done flag."""
     if not request.fixed_code or not request.fixed_code.strip():
         raise HTTPException(status_code=400, detail="fixed_code must not be empty.")
     try:
+        action = DebugAction(fixed_code=request.fixed_code, explanation=request.explanation)
+        obs = env.step(action)
+        return StepResponse(observation=obs.model_dump(), reward=obs.reward or 0.0, done=obs.done)
     except TimeoutError:
         return StepResponse(
+            observation={"task_id": "unknown", "difficulty": "unknown", "buggy_code": "",
+                         "instructions": "", "test_cases_description": "", "reward": 0.0,
+                         "passed_tests": 0, "total_tests": 3, "done": False,
+                         "feedback": "TimeoutError: Infinite loop detected. Add a visited set for graph traversal."},
+            reward=0.0, done=False,
         )
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Step failed: {str(e)}")
 @app.get("/state")
 async def state() -> dict:
+    """Return current episode state."""
     try:
+        return env.state.model_dump()
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"State failed: {str(e)}")
 @app.get("/tasks")
 async def list_tasks() -> dict:
+    """List all 45 task IDs across difficulty levels."""
     from server.tasks.task_easy import EASY_TASKS
     from server.tasks.task_medium import MEDIUM_TASKS
     from server.tasks.task_hard import HARD_TASKS
     return {
+        "easy":   [t["task_id"] for t in EASY_TASKS],
         "medium": [t["task_id"] for t in MEDIUM_TASKS],
+        "hard":   [t["task_id"] for t in HARD_TASKS],
+        "total":  len(EASY_TASKS) + len(MEDIUM_TASKS) + len(HARD_TASKS),
     }
 if __name__ == "__main__":
     import uvicorn
+    uvicorn.run("server.app:app", host="127.0.0.1", port=7860, reload=True)

server/graders/__pycache__/grader_easy.cpython-310.pyc CHANGED Viewed

Binary files a/server/graders/__pycache__/grader_easy.cpython-310.pyc and b/server/graders/__pycache__/grader_easy.cpython-310.pyc differ

server/graders/__pycache__/grader_hard.cpython-310.pyc CHANGED Viewed

Binary files a/server/graders/__pycache__/grader_hard.cpython-310.pyc and b/server/graders/__pycache__/grader_hard.cpython-310.pyc differ

server/graders/__pycache__/grader_medium.cpython-310.pyc CHANGED Viewed

Binary files a/server/graders/__pycache__/grader_medium.cpython-310.pyc and b/server/graders/__pycache__/grader_medium.cpython-310.pyc differ

server/graders/grader_easy.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # server/graders/grader_easy.py
-# Grades easy tasks: 1 bug, 3 test cases.
-# Reward is proportional to tests passed (0.33, 0.66, 1.0).
 import traceback
 import signal
@@ -8,37 +8,32 @@ from typing import Tuple, List
 def _timeout_handler(signum, frame):
-    raise TimeoutError("Code execution timed out (infinite loop or slow code)")
 def _run_code_safely(code: str, func_name: str, test_input):
-    """
-    Executes the submitted code in an isolated namespace and calls the function.
-    Returns (output, error_message).
-    Times out after 5 seconds to prevent infinite loops.
-    """
     namespace = {}
     try:
         exec(compile(code, "<submitted>", "exec"), namespace)
     except SyntaxError as e:
         return None, f"SyntaxError: {e}"
     except Exception as e:
-        return None, f"Compile error: {e}"
     func = namespace.get(func_name)
     if func is None:
-        funcs = [v for v in namespace.values() if callable(v) and not v.__name__.startswith("_")]
         if not funcs:
             return None, "No callable function found in submitted code."
         func = funcs[0]
     try:
-        # Set 5 second timeout to catch infinite loops
         try:
             signal.signal(signal.SIGALRM, _timeout_handler)
             signal.alarm(5)
         except (AttributeError, OSError):
-            pass  # Windows doesn't support SIGALRM, skip timeout
         if isinstance(test_input, list) and len(test_input) > 0 and isinstance(test_input[0], list):
             result = func(*test_input)
@@ -51,13 +46,14 @@ def _run_code_safely(code: str, func_name: str, test_input):
             result = func(test_input)
         try:
-            signal.alarm(0)  # Cancel timeout
         except (AttributeError, OSError):
             pass
         return result, None
     except TimeoutError as e:
-        return None, f"TimeoutError: {e}"
     except Exception as e:
         try:
             signal.alarm(0)
@@ -67,7 +63,6 @@ def _run_code_safely(code: str, func_name: str, test_input):
 def _extract_func_name(code: str) -> str:
-    """Extract the first function name defined in the code."""
     for line in code.splitlines():
         line = line.strip()
         if line.startswith("def "):
@@ -77,14 +72,8 @@ def _extract_func_name(code: str) -> str:
 def grade_easy(fixed_code: str, task: dict) -> Tuple[float, int, int, str, List[dict]]:
     """
-    Grade an easy task submission.
-    Returns:
-        reward (float): 0.0 to 1.0
-        passed (int): number of tests passed
-        total (int): total test cases
-        feedback (str): detailed feedback message
-        results (list): per-test results
     """
     test_cases = task["test_cases"]
     total = len(test_cases)
@@ -99,21 +88,18 @@ def grade_easy(fixed_code: str, task: dict) -> Tuple[float, int, int, str, List[
         got, error = _run_code_safely(fixed_code, func_name, inp)
         if error:
-            results.append({"test_id": i + 1, "passed": False, "expected": str(expected), "got": f"ERROR: {error}"})
             feedback_lines.append(f"Test {i+1}: ❌ Error\n   Input    : {inp!r}\n   Expected : {expected!r}\n   Error    : {error}")
         elif got == expected:
             passed += 1
-            results.append({"test_id": i + 1, "passed": True, "expected": str(expected), "got": str(got)})
             feedback_lines.append(f"Test {i+1}: ✅ Passed\n   Input    : {inp!r}\n   Expected : {expected!r}\n   Got      : {got!r}")
         else:
-            results.append({"test_id": i + 1, "passed": False, "expected": str(expected), "got": str(got)})
             feedback_lines.append(f"Test {i+1}: ❌ Failed\n   Input    : {inp!r}\n   Expected : {expected!r}\n   Got      : {got!r}")
     reward = round(passed / total, 2)
     feedback = "\n".join(feedback_lines)
-    if passed == total:
-        feedback += "\n🎉 All tests passed! Full reward."
-    else:
-        feedback += f"\n{passed}/{total} tests passed. Review the failing cases."
-    return reward, passed, total, feedback, results

 # server/graders/grader_easy.py
+# Grades easy and medium tasks: runs code against test cases.
+# Reward is proportional to tests passed (0.33, 0.67, 1.0).
 import traceback
 import signal
 def _timeout_handler(signum, frame):
+    raise TimeoutError("Code timed out — likely infinite loop. Check for missing visited set in graph traversal.")
 def _run_code_safely(code: str, func_name: str, test_input):
+    """Run submitted code via exec() with a 5s SIGALRM timeout (no timeout where SIGALRM is unavailable). Returns (result, error)."""
     namespace = {}
     try:
         exec(compile(code, "<submitted>", "exec"), namespace)
     except SyntaxError as e:
         return None, f"SyntaxError: {e}"
     except Exception as e:
+        return None, f"CompileError: {e}"
     func = namespace.get(func_name)
     if func is None:
+        funcs = [v for v in namespace.values() if callable(v) and not str(v.__name__).startswith("_")]
         if not funcs:
             return None, "No callable function found in submitted code."
         func = funcs[0]
     try:
         try:
             signal.signal(signal.SIGALRM, _timeout_handler)
             signal.alarm(5)
         except (AttributeError, OSError):
+            pass  # Windows has no SIGALRM
         if isinstance(test_input, list) and len(test_input) > 0 and isinstance(test_input[0], list):
             result = func(*test_input)
             result = func(test_input)
         try:
+            signal.alarm(0)
         except (AttributeError, OSError):
             pass
         return result, None
     except TimeoutError as e:
+        return None, str(e)
     except Exception as e:
         try:
             signal.alarm(0)
 def _extract_func_name(code: str) -> str:
     for line in code.splitlines():
         line = line.strip()
         if line.startswith("def "):
 def grade_easy(fixed_code: str, task: dict) -> Tuple[float, int, int, str, List[dict]]:
     """
+    Grade submission against test cases.
+    Returns: (reward, passed, total, feedback, results)
     """
     test_cases = task["test_cases"]
     total = len(test_cases)
         got, error = _run_code_safely(fixed_code, func_name, inp)
         if error:
+            results.append({"test_id": i+1, "passed": False, "expected": str(expected), "got": f"ERROR"})
             feedback_lines.append(f"Test {i+1}: ❌ Error\n   Input    : {inp!r}\n   Expected : {expected!r}\n   Error    : {error}")
         elif got == expected:
             passed += 1
+            results.append({"test_id": i+1, "passed": True, "expected": str(expected), "got": str(got)})
             feedback_lines.append(f"Test {i+1}: ✅ Passed\n   Input    : {inp!r}\n   Expected : {expected!r}\n   Got      : {got!r}")
         else:
+            results.append({"test_id": i+1, "passed": False, "expected": str(expected), "got": str(got)})
             feedback_lines.append(f"Test {i+1}: ❌ Failed\n   Input    : {inp!r}\n   Expected : {expected!r}\n   Got      : {got!r}")
     reward = round(passed / total, 2)
     feedback = "\n".join(feedback_lines)
+    feedback += "\n🎉 All tests passed! Full reward." if passed == total else f"\n{passed}/{total} tests passed."
+    return reward, passed, total, feedback, results

server/graders/grader_hard.py CHANGED Viewed

@@ -8,13 +8,10 @@ from .grader_easy import grade_easy
 def _score_explanation(explanation: Optional[str], keywords: List[str]) -> Tuple[float, str]:
     """
-    Scores the explanation by checking for required conceptual keywords.
-    Returns (score 0.0-1.0, feedback string).
-    Scoring:
     - No explanation → 0.0
-    - At least 1 keyword hit → partial credit
-    - Half or more keywords → full credit 1.0
     """
     if not explanation or len(explanation.strip()) < 10:
         return 0.0, "❌ No explanation provided. Hard tasks require an explanation field."
@@ -23,31 +20,28 @@ def _score_explanation(explanation: Optional[str], keywords: List[str]) -> Tuple
     hits = [kw for kw in keywords if kw.lower() in explanation_lower]
     if not keywords:
-        # No keywords defined — give full credit for any explanation
         score = 1.0 if len(explanation.strip()) > 20 else 0.5
     else:
-        # Need at least 1 hit for partial, half for full
-        needed_for_full = max(1, len(keywords) // 2)
         if len(hits) == 0:
             score = 0.0
-        elif len(hits) >= needed_for_full:
             score = 1.0
         else:
-            # Partial credit proportional to hits
-            score = round(len(hits) / needed_for_full, 2)
     if score == 1.0:
-        feedback = f"✅ Explanation excellent! Mentioned key concepts: {', '.join(hits)}"
     elif score > 0:
         missing = [kw for kw in keywords if kw.lower() not in explanation_lower]
         feedback = (
-            f"⚠️ Partial explanation (score={score}). Mentioned: {', '.join(hits) if hits else 'none'}. "
-            f"Consider also discussing: {', '.join(missing[:3])}"
         )
     else:
         feedback = (
             f"❌ Explanation missing key concepts. "
-            f"Try to explain: {', '.join(keywords[:3])} in your analysis."
         )
     return round(score, 2), feedback
@@ -56,23 +50,11 @@ def _score_explanation(explanation: Optional[str], keywords: List[str]) -> Tuple
 def grade_hard(fixed_code: str, task: dict, explanation: Optional[str] = None) -> Tuple[float, int, int, str, List[dict]]:
     """
     Grade a hard task submission.
-    Reward = 0.7 * test_score + 0.3 * explanation_score
-    Returns:
-        reward (float): 0.0 to 1.0
-        passed (int)
-        total (int)
-        feedback (str)
-        results (list)
     """
-    # Grade code using easy grader (same test execution logic)
     test_reward, passed, total, code_feedback, results = grade_easy(fixed_code, task)
-    # Grade explanation
     keywords = task.get("explanation_keywords", [])
     exp_score, exp_feedback = _score_explanation(explanation, keywords)
-    # Combined reward
     final_reward = round(0.7 * test_reward + 0.3 * exp_score, 2)
     feedback = (
@@ -83,71 +65,9 @@ def grade_hard(fixed_code: str, task: dict, explanation: Optional[str] = None) -
         f"=== Final Reward: {final_reward:.2f} ==="
     )
-    if passed < total and not explanation:
-        feedback += "\n💡 Tip: Fix the code bugs AND provide a clear explanation for max reward."
     if passed == total and exp_score < 1.0:
-        feedback += f"\n💡 Tip: Your code is correct! Improve explanation by mentioning: {', '.join(keywords[:3])}"
     return final_reward, passed, total, feedback, results
-    """
-    Scores the explanation by checking for required conceptual keywords.
-    Returns (score 0.0-1.0, feedback string).
-    """
-    if not explanation or len(explanation.strip()) < 10:
-        return 0.0, "❌ No explanation provided. Hard tasks require an explanation field."
-    explanation_lower = explanation.lower()
-    hits = [kw for kw in keywords if kw.lower() in explanation_lower]
-    score = min(1.0, len(hits) / max(1, len(keywords) // 2))  # need at least half the keywords
-    if score == 1.0:
-        feedback = f"✅ Explanation excellent! Mentioned key concepts: {', '.join(hits)}"
-    elif score > 0:
-        feedback = (
-            f"⚠️ Partial explanation. Mentioned: {', '.join(hits) if hits else 'none'}. "
-            f"Consider discussing: {', '.join(kw for kw in keywords if kw.lower() not in explanation_lower)[:3]}"
-        )
-    else:
-        feedback = (
-            f"❌ Explanation missing key concepts. "
-            f"Try to explain: {', '.join(keywords[:3])} in your analysis."
-        )
-    return round(score, 2), feedback
-def grade_hard(fixed_code: str, task: dict, explanation: Optional[str] = None) -> Tuple[float, int, int, str, List[dict]]:
-    """
-    Grade a hard task submission.
-    Reward = 0.7 * test_score + 0.3 * explanation_score
-    Returns:
-        reward (float): 0.0 to 1.0
-        passed (int)
-        total (int)
-        feedback (str)
-        results (list)
-    """
-    # Grade code
-    test_reward, passed, total, code_feedback, results = grade_easy(fixed_code, task)
-    # Grade explanation
-    keywords = task.get("explanation_keywords", [])
-    exp_score, exp_feedback = _score_explanation(explanation, keywords)
-    # Combined reward
-    final_reward = round(0.7 * test_reward + 0.3 * exp_score, 2)
-    feedback = (
-        f"--- Code Score (70% weight): {test_reward:.2f} ---\n"
-        f"{code_feedback}\n\n"
-        f"--- Explanation Score (30% weight): {exp_score:.2f} ---\n"
-        f"{exp_feedback}\n\n"
-        f"=== Final Reward: {final_reward:.2f} ==="
-    )
-    if passed < total and not explanation:
-        feedback += "\n💡 Tip: Fix the code bugs AND provide a clear explanation for max reward."
-    return final_reward, passed, total, feedback, results

 def _score_explanation(explanation: Optional[str], keywords: List[str]) -> Tuple[float, str]:
     """
+    Score explanation by checking for required conceptual keywords.
     - No explanation → 0.0
+    - 1+ keyword hit but fewer than needed → partial credit, hits / needed
+    - At least needed = max(1, len(keywords) // 2) keyword hits → 1.0
     """
     if not explanation or len(explanation.strip()) < 10:
         return 0.0, "❌ No explanation provided. Hard tasks require an explanation field."
     hits = [kw for kw in keywords if kw.lower() in explanation_lower]
     if not keywords:
         score = 1.0 if len(explanation.strip()) > 20 else 0.5
     else:
+        needed = max(1, len(keywords) // 2)
         if len(hits) == 0:
             score = 0.0
+        elif len(hits) >= needed:
             score = 1.0
         else:
+            score = round(len(hits) / needed, 2)
     if score == 1.0:
+        feedback = f"✅ Explanation excellent! Mentioned: {', '.join(hits)}"
     elif score > 0:
         missing = [kw for kw in keywords if kw.lower() not in explanation_lower]
         feedback = (
+            f"⚠️ Partial explanation (score={score}). Mentioned: {', '.join(hits) or 'none'}. "
+            f"Also discuss: {', '.join(missing[:3])}"
         )
     else:
         feedback = (
             f"❌ Explanation missing key concepts. "
+            f"Explain: {', '.join(keywords[:3])}"
         )
     return round(score, 2), feedback
 def grade_hard(fixed_code: str, task: dict, explanation: Optional[str] = None) -> Tuple[float, int, int, str, List[dict]]:
     """
     Grade a hard task submission.
+    Reward = 0.7 × test_score + 0.3 × explanation_score
     """
     test_reward, passed, total, code_feedback, results = grade_easy(fixed_code, task)
     keywords = task.get("explanation_keywords", [])
     exp_score, exp_feedback = _score_explanation(explanation, keywords)
     final_reward = round(0.7 * test_reward + 0.3 * exp_score, 2)
     feedback = (
         f"=== Final Reward: {final_reward:.2f} ==="
     )
     if passed == total and exp_score < 1.0:
+        feedback += f"\n💡 Code is correct! Improve explanation by mentioning: {', '.join(keywords[:3])}"
+    elif passed < total and not explanation:
+        feedback += "\n💡 Fix the code AND provide a clear explanation for max reward."
     return final_reward, passed, total, feedback, results

server/tasks/__pycache__/__init__.cpython-39.pyc DELETED Viewed

Binary file (449 Bytes)

server/tasks/__pycache__/task_easy.cpython-310.pyc CHANGED Viewed

Binary files a/server/tasks/__pycache__/task_easy.cpython-310.pyc and b/server/tasks/__pycache__/task_easy.cpython-310.pyc differ

server/tasks/__pycache__/task_easy.cpython-39.pyc DELETED Viewed

Binary file (7.37 kB)

server/tasks/__pycache__/task_hard.cpython-310.pyc CHANGED Viewed

Binary files a/server/tasks/__pycache__/task_hard.cpython-310.pyc and b/server/tasks/__pycache__/task_hard.cpython-310.pyc differ

server/tasks/__pycache__/task_hard.cpython-39.pyc DELETED Viewed

Binary file (16.5 kB)

server/tasks/__pycache__/task_medium.cpython-310.pyc CHANGED Viewed

Binary files a/server/tasks/__pycache__/task_medium.cpython-310.pyc and b/server/tasks/__pycache__/task_medium.cpython-310.pyc differ

server/tasks/__pycache__/task_medium.cpython-39.pyc DELETED Viewed

Binary file (10.5 kB)

server/tasks/task_hard.py CHANGED Viewed

@@ -625,4 +625,4 @@ def get_task_by_id(task_id: str) -> dict:
     for t in HARD_TASKS:
         if t["task_id"] == task_id:
             return t.copy()
-    return random.choice(HARD_TASKS).copy()

     for t in HARD_TASKS:
         if t["task_id"] == task_id:
             return t.copy()
+    return random.choice(HARD_TASKS).copy()

server/tasks/task_medium.py CHANGED Viewed

@@ -504,4 +504,4 @@ def get_task_by_id(task_id: str) -> dict:
     for t in MEDIUM_TASKS:
         if t["task_id"] == task_id:
             return t.copy()
-    return random.choice(MEDIUM_TASKS).copy()

     for t in MEDIUM_TASKS:
         if t["task_id"] == task_id:
             return t.copy()
+    return random.choice(MEDIUM_TASKS).copy()

spec.md CHANGED Viewed

@@ -1,23 +1,24 @@
-# Code Debug Environment — Specification
 ## Overview
-The Code Debug Environment is an OpenEnv-compatible RL environment where an LLM agent diagnoses and fixes buggy Python code across three difficulty levels.
----
-## API Specification
-### POST /reset
-Start a new episode.
-**Request:**
 ```json
-{"difficulty": "easy"}
 ```
-**Response:**
 ```json
 {
   "observation": {
     "task_id": "easy_003",
@@ -25,112 +26,53 @@ Start a new episode.
     "buggy_code": "def find_max(nums):\n    return min(nums)",
     "instructions": "The function has exactly one bug. Fix it.",
     "test_cases_description": "Finds max value in a list",
-    "reward": null,
-    "passed_tests": null,
-    "total_tests": 3,
-    "feedback": null,
-    "done": false
   },
-  "reward": 0.0,
-  "done": false
 }
 ```
----
 ### POST /step
-Submit a code fix.
-**Request:**
 ```json
-{
-  "fixed_code": "def find_max(nums):\n    return max(nums)",
-  "explanation": "Optional for hard tasks"
-}
-```
-**Response:**
-```json
 {
   "observation": {
-    "task_id": "easy_003",
-    "reward": 1.0,
-    "passed_tests": 3,
-    "total_tests": 3,
     "feedback": "Test 1: ✅ Passed\n   Input: [1,2,3]\n   Expected: 3\n   Got: 3",
     "done": true
   },
-  "reward": 1.0,
-  "done": true
 }
 ```
----
 ### GET /state
-Returns current episode state.
 ```json
-{
-  "episode_id": "uuid",
-  "task_id": "easy_003",
-  "difficulty": "easy",
-  "step_count": 1,
-  "max_steps": 5,
-  "current_reward": 1.0,
-  "best_reward": 1.0,
-  "done": true
-}
 ```
----
-### GET /health
 ```json
-{"status": "ok", "environment": "code-debug-env", "version": "1.0.0"}
 ```
----
-## Reward Function
-### Easy & Medium
-```
-reward = passed_tests / total_tests
-```
-- 3/3 → 1.00
-- 2/3 → 0.67
-- 1/3 → 0.33
-- 0/3 → 0.00
-### Hard
-```
-reward = 0.7 × test_score + 0.3 × explanation_score
-```
-### Invalid Actions
-- Empty code → reward = 0.0 + feedback message
-- Non-Python code → reward = 0.0 + feedback message
----
 ## Episode Rules
 - Max 5 steps per episode
-- Episode ends when reward = 1.0 OR max steps reached
-- Each step runs fixed_code against 3 deterministic test cases
-- Feedback shows Input, Expected, Got for each test
----
-## Task Domains
-| Domain | Examples |
-|---|---|
-| List operations | second element, max, flatten |
-| String algorithms | palindrome, reverse, word count |
-| Math | fibonacci, factorial, square root |
-| Sorting | bubble sort, binary search |
-| Data processing | JSON parsing, API validation |
-| Graph algorithms | BFS, cycle detection |
-| Dynamic programming | knapsack, longest subsequence |

+# Code Debug Environment — API Specification
 ## Overview
+OpenEnv-compatible RL environment where LLM agents fix buggy Python code.
+- 45 tasks: 15 easy + 15 medium + 15 hard
+- Partial rewards: 0.33, 0.67, 1.0 based on test cases passed
+- Hard tasks: reward = 0.7×code + 0.3×explanation
+## Endpoints
+### GET /health
 ```json
+{"status": "ok", "environment": "code-debug-env", "version": "1.0.0"}
 ```
+### POST /reset
 ```json
+// Request
+{"difficulty": "easy"}  // or "medium", "hard", omit for random
+// Response
 {
   "observation": {
     "task_id": "easy_003",
     "buggy_code": "def find_max(nums):\n    return min(nums)",
     "instructions": "The function has exactly one bug. Fix it.",
     "test_cases_description": "Finds max value in a list",
+    "reward": null, "passed_tests": null, "total_tests": 3,
+    "feedback": null, "done": false
   },
+  "reward": 0.0, "done": false
 }
 ```
 ### POST /step
 ```json
+// Request
+{"fixed_code": "def find_max(nums):\n    return max(nums)", "explanation": "optional for hard"}
+// Response
 {
   "observation": {
+    "task_id": "easy_003", "reward": 1.0,
+    "passed_tests": 3, "total_tests": 3,
     "feedback": "Test 1: ✅ Passed\n   Input: [1,2,3]\n   Expected: 3\n   Got: 3",
     "done": true
   },
+  "reward": 1.0, "done": true
 }
 ```
 ### GET /state
 ```json
+{"episode_id": "uuid", "task_id": "easy_003", "difficulty": "easy",
+ "step_count": 1, "max_steps": 5, "current_reward": 1.0, "best_reward": 1.0, "done": true}
 ```
+### GET /tasks
 ```json
+{"easy": ["easy_001",...], "medium": ["medium_001",...], "hard": ["hard_001",...], "total": 45}
 ```
+## Reward Design
+| Task | Formula |
+|------|---------|
+| Easy | passed/3 |
+| Medium | passed/3 |
+| Hard | 0.7×code_score + 0.3×explanation_score |
+## Invalid Actions
+- Empty code → reward=0.0 + penalty feedback
+- Infinite loop → TimeoutError → reward=0.0 + hint to add visited set
 ## Episode Rules
 - Max 5 steps per episode
+- Ends when reward=1.0 OR max steps reached
+- 3 deterministic test cases per task

tests/test_graders.py CHANGED Viewed

@@ -1,9 +1,5 @@
-# tests/test_graders.py
-# Basic tests to verify all graders work correctly.
-# Run: python -m pytest tests/ -v
-import sys
-import os
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from server.graders.grader_easy import grade_easy
@@ -14,56 +10,41 @@ from server.tasks.task_medium import MEDIUM_TASKS
 from server.tasks.task_hard import HARD_TASKS
-def test_easy_tasks_count():
-    assert len(EASY_TASKS) == 15, f"Expected 15 easy tasks, got {len(EASY_TASKS)}"
-def test_medium_tasks_count():
-    assert len(MEDIUM_TASKS) == 15, f"Expected 15 medium tasks, got {len(MEDIUM_TASKS)}"
-def test_hard_tasks_count():
-    assert len(HARD_TASKS) == 15, f"Expected 15 hard tasks, got {len(HARD_TASKS)}"
-def test_easy_correct_fix_scores_1():
-    for task in EASY_TASKS:
-        reward, passed, total, _, _ = grade_easy(task["fixed_code"], task)
-        assert reward == 1.0, f"{task['task_id']} should score 1.0, got {reward}"
-def test_medium_correct_fix_scores_1():
-    for task in MEDIUM_TASKS:
-        reward, passed, total, _, _ = grade_medium(task["fixed_code"], task)
-        assert reward == 1.0, f"{task['task_id']} should score 1.0, got {reward}"
-def test_hard_correct_fix_scores_high():
-    for task in HARD_TASKS:
-        keywords = task.get("explanation_keywords", [])
-        explanation = " ".join(keywords)
-        reward, passed, total, _, _ = grade_hard(task["fixed_code"], task, explanation)
-        assert reward >= 0.9, f"{task['task_id']} should score >= 0.9, got {reward}"
-def test_reward_range():
-    for task in EASY_TASKS + MEDIUM_TASKS:
-        reward, _, _, _, _ = grade_easy(task["buggy_code"], task)
-        assert 0.0 <= reward <= 1.0, f"Reward out of range: {reward}"
 def test_empty_code_returns_zero():
-    task = EASY_TASKS[0]
-    reward, passed, total, feedback, _ = grade_easy("", task)
-    assert reward == 0.0
-def test_buggy_code_scores_less_than_1():
-    for task in EASY_TASKS[:5]:
-        reward, _, _, _, _ = grade_easy(task["buggy_code"], task)
-        assert reward < 1.0, f"{task['task_id']} buggy code should not score 1.0"
 if __name__ == "__main__":
     import pytest
-    pytest.main([__file__, "-v"])

+# tests/test_graders.py — Run: python -m pytest tests/ -v
+import sys, os
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from server.graders.grader_easy import grade_easy
 from server.tasks.task_hard import HARD_TASKS
+def test_task_counts():
+    assert len(EASY_TASKS) == 15
+    assert len(MEDIUM_TASKS) == 15
+    assert len(HARD_TASKS) == 15
+def test_easy_correct_scores_1():
+    for t in EASY_TASKS:
+        r, _, _, _, _ = grade_easy(t["fixed_code"], t)
+        assert r == 1.0, f"{t['task_id']} expected 1.0 got {r}"
+def test_medium_correct_scores_1():
+    for t in MEDIUM_TASKS:
+        r, _, _, _, _ = grade_medium(t["fixed_code"], t)
+        assert r == 1.0, f"{t['task_id']} expected 1.0 got {r}"
+def test_hard_correct_scores_high():
+    for t in HARD_TASKS:
+        keywords = t.get("explanation_keywords", [])
+        r, _, _, _, _ = grade_hard(t["fixed_code"], t, " ".join(keywords))
+        assert r >= 0.9, f"{t['task_id']} expected >=0.9 got {r}"
+def test_reward_in_range():
+    for t in EASY_TASKS:
+        r, _, _, _, _ = grade_easy(t["buggy_code"], t)
+        assert 0.0 <= r <= 1.0
+def test_buggy_scores_less_than_1():
+    for t in EASY_TASKS[:5]:
+        r, _, _, _, _ = grade_easy(t["buggy_code"], t)
+        assert r < 1.0, f"{t['task_id']} buggy code should not score 1.0"
 def test_empty_code_returns_zero():
+    r, _, _, _, _ = grade_easy("", EASY_TASKS[0])
+    assert r == 0.0
 if __name__ == "__main__":
     import pytest
+    pytest.main([__file__, "-v"])