Spaces:

Souravdanyal
/

code-debug-env

Running

App Files Files Community

Souravdanyal committed on Apr 5

Commit

c01667e

1 Parent(s): 08efbe6

Fix log format, add SPEC.md, tests, invalid action penalty, max_steps=5

Browse files

Files changed (5) hide show

inference.py +74 -162
openenv.yaml +79 -10
server/environment.py +37 -2
spec.md +136 -0
tests/test_graders.py +69 -0

inference.py CHANGED Viewed

@@ -1,21 +1,12 @@
 #!/usr/bin/env python3
-# inference.py
-# ─────────────────────────────────────────────────────────────────────────────
-# Baseline inference script for the Code Debug Environment.
-# Must be run from the project root.
-#
-# Required environment variables:
-#   API_BASE_URL  — LLM API endpoint (OpenAI-compatible)
-#   MODEL_NAME    — Model identifier
-#   HF_TOKEN      — Hugging Face / API key
 #
 # Usage:
 #   python inference.py
-#   python inference.py --url https://your-hf-space.hf.space
 #   python inference.py --difficulty easy
-#
-# Log format: [START], [STEP], [END] — strictly followed for evaluation scoring.
-# ─────────────────────────────────────────────────────────────────────────────
 import os
 import sys
@@ -24,96 +15,52 @@ import time
 import argparse
 import requests
 from openai import OpenAI
 # ─── Configuration ────────────────────────────────────────────────────────────
 API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
-MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
-HF_TOKEN = os.environ.get("HF_TOKEN", "")
-ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
-MAX_STEPS = 3
-DIFFICULTIES = ["easy", "medium", "hard"]
 # ─── OpenAI Client ───────────────────────────────────────────────────────────
-client = OpenAI(
-    api_key=HF_TOKEN or "dummy",
-    base_url=API_BASE_URL,
-)
-# ─── Logging (strict format required by evaluator) ───────────────────────────
-def log_start(task_id: str, difficulty: str, episode: int):
-    print(json.dumps({
-        "type": "START",
-        "episode": episode,
-        "task_id": task_id,
-        "difficulty": difficulty,
-        "timestamp": time.time(),
-    }), flush=True)
-def log_step(task_id: str, step: int, action_summary: str, reward: float, done: bool):
-    print(json.dumps({
-        "type": "STEP",
-        "task_id": task_id,
-        "step": step,
-        "action": action_summary,
-        "reward": reward,
-        "done": done,
-        "timestamp": time.time(),
-    }), flush=True)
-def log_end(task_id: str, difficulty: str, final_reward: float, steps_taken: int, episode: int):
-    print(json.dumps({
-        "type": "END",
-        "episode": episode,
-        "task_id": task_id,
-        "difficulty": difficulty,
-        "final_reward": final_reward,
-        "steps_taken": steps_taken,
-        "timestamp": time.time(),
-    }), flush=True)
 # ─── Environment Client ───────────────────────────────────────────────────────
 def env_reset(env_url: str, difficulty: str) -> dict:
-    resp = requests.post(
-        f"{env_url}/reset",
-        json={"difficulty": difficulty},
-        timeout=30,
-    )
     resp.raise_for_status()
     return resp.json()
 def env_step(env_url: str, fixed_code: str, explanation: str = None) -> dict:
     payload = {"fixed_code": fixed_code}
     if explanation:
         payload["explanation"] = explanation
-    resp = requests.post(
-        f"{env_url}/step",
-        json=payload,
-        timeout=30,
-    )
-    resp.raise_for_status()
-    return resp.json()
-def env_state(env_url: str) -> dict:
-    resp = requests.get(f"{env_url}/state", timeout=10)
     resp.raise_for_status()
     return resp.json()
 # ─── LLM Agent ───────────────────────────────────────────────────────────────
-SYSTEM_PROMPT = """You are an expert Python debugging agent.
 You will be given buggy Python code and must fix it.
 For easy tasks: fix the single bug.
@@ -133,11 +80,8 @@ Rules:
 - Do NOT include markdown fences or any text outside the JSON object.
 """
 def call_llm(buggy_code: str, instructions: str, difficulty: str,
              feedback: str = None, attempt: int = 1) -> dict:
-    """Call the LLM and return parsed {fixed_code, explanation}."""
     user_content = f"""Task difficulty: {difficulty}
 Instructions: {instructions}
@@ -156,130 +100,98 @@ Buggy code:
     try:
         response = client.chat.completions.create(
-            model=MODEL_NAME,
-            messages=messages,
-            max_tokens=1000,
-            temperature=0.1,
         )
         content = response.choices[0].message.content.strip()
-        # Strip markdown fences if present
         if content.startswith("```"):
             lines = content.split("\n")
             content = "\n".join(lines[1:-1]) if lines[-1] == "```" else "\n".join(lines[1:])
         parsed = json.loads(content)
-        return {
-            "fixed_code": parsed.get("fixed_code", ""),
-            "explanation": parsed.get("explanation", None),
-        }
     except json.JSONDecodeError:
-        # Fallback: return original code if parsing fails
         return {"fixed_code": buggy_code, "explanation": None}
     except Exception as e:
-        print(f"LLM call failed: {e}", file=sys.stderr)
         return {"fixed_code": buggy_code, "explanation": None}
 # ─── Main Episode Loop ────────────────────────────────────────────────────────
-def run_episode(env_url: str, difficulty: str, episode_num: int) -> float:
-    """Run one full episode. Returns final reward."""
-    # Reset
     reset_data = env_reset(env_url, difficulty)
     obs = reset_data["observation"]
-    task_id = obs["task_id"]
-    buggy_code = obs["buggy_code"]
     instructions = obs["instructions"]
-    log_start(task_id, difficulty, episode_num)
     last_feedback = None
-    final_reward = 0.0
-    step_num = 0
     for attempt in range(1, MAX_STEPS + 1):
-        step_num = attempt
-        # Call LLM
         agent_action = call_llm(
-            buggy_code=buggy_code,
-            instructions=instructions,
-            difficulty=difficulty,
-            feedback=last_feedback,
-            attempt=attempt,
         )
-        # Submit to environment
-        result = env_step(
-            env_url,
-            fixed_code=agent_action["fixed_code"],
-            explanation=agent_action.get("explanation"),
-        )
         reward = result.get("reward", 0.0)
-        done = result.get("done", False)
-        obs_result = result.get("observation", {})
-        last_feedback = obs_result.get("feedback", "")
-        log_step(
-            task_id=task_id,
-            step=attempt,
-            action_summary=f"Submitted fix attempt {attempt} ({len(agent_action['fixed_code'])} chars)",
-            reward=reward,
-            done=done,
-        )
-        final_reward = reward
         if done:
             break
-    log_end(task_id, difficulty, final_reward, step_num, episode_num)
-    return final_reward
 def main():
     parser = argparse.ArgumentParser(description="Code Debug Environment Baseline Agent")
     parser.add_argument("--url", default=ENV_URL, help="Environment base URL")
-    parser.add_argument("--difficulty", default=None, choices=["easy", "medium", "hard", "all"],
-                        help="Difficulty to run. 'all' runs one episode per difficulty.")
     args = parser.parse_args()
     env_url = args.url.rstrip("/")
-    # Health check
     try:
         health = requests.get(f"{env_url}/health", timeout=10)
         health.raise_for_status()
-        print(json.dumps({"type": "INFO", "message": f"Environment healthy at {env_url}"}), flush=True)
     except Exception as e:
-        print(json.dumps({"type": "ERROR", "message": f"Health check failed: {e}"}), flush=True)
         sys.exit(1)
-    # Determine episodes to run
-    if args.difficulty == "all" or args.difficulty is None:
-        episodes = [("easy", 1), ("medium", 2), ("hard", 3)]
-    else:
-        episodes = [(args.difficulty, 1)]
     all_rewards = []
-    for episode_num, (difficulty, ep_id) in enumerate(episodes, start=1):
-        reward = run_episode(env_url, difficulty, episode_num)  # use episode_num, not ep_id
-        all_rewards.append({"difficulty": difficulty, "reward": reward})
-        time.sleep(0.5)  # Small pause between episodes
-    # Summary
-    print(json.dumps({
-        "type": "SUMMARY",
-        "total_episodes": len(all_rewards),
-        "results": all_rewards,
-        "average_reward": round(sum(r["reward"] for r in all_rewards) / len(all_rewards), 3),
-        "timestamp": time.time(),
-    }), flush=True)
 if __name__ == "__main__":
-    main()

 #!/usr/bin/env python3
+# inference.py — Code Debug Environment Baseline Agent
+# Log format strictly follows [START] [STEP] [END] as required by evaluator.
 #
+# Required env vars: API_BASE_URL, MODEL_NAME, HF_TOKEN
 # Usage:
 #   python inference.py
+#   python inference.py --url https://Souravdanyal-code-debug-env.hf.space
 #   python inference.py --difficulty easy
 import os
 import sys
 import argparse
 import requests
 from openai import OpenAI
+from typing import List, Optional
 # ─── Configuration ────────────────────────────────────────────────────────────
 API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
+MODEL_NAME   = os.environ.get("MODEL_NAME", "gpt-4o-mini")
+HF_TOKEN     = os.environ.get("HF_TOKEN", "")
+ENV_URL      = os.environ.get("ENV_URL", "http://localhost:7860")
+BENCHMARK    = "code-debug-env"
+MAX_STEPS    = 5
 # ─── OpenAI Client ───────────────────────────────────────────────────────────
+client = OpenAI(api_key=HF_TOKEN or "dummy", base_url=API_BASE_URL)
+# ─── Logging — STRICT FORMAT REQUIRED BY EVALUATOR ───────────────────────────
+# [START] task=<task_id> env=<benchmark> model=<model_name>
+# [STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>
+# [END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
+def log_start(task_id: str, env: str, model: str) -> None:
+    print(f"[START] task={task_id} env={env} model={model}", flush=True)
+def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
+    """Print one evaluator-format [STEP] record on a single stdout line.
+
+    ``reward`` is written with two decimals, ``done`` as lowercase
+    ``true``/``false``, and a missing or empty ``error`` as ``null``.
+    """
+    # A record must stay on one line, so fold any line breaks inside the
+    # action or error text (e.g. from an exception message) into spaces.
+    action_val = " ".join(action.splitlines())
+    error_val = " ".join(error.splitlines()) if error else ""
+    error_val = error_val or "null"
+    done_val = str(done).lower()
+    print(f"[STEP] step={step} action={action_val} reward={reward:.2f} done={done_val} error={error_val}", flush=True)
+def log_end(success: bool, steps: int, rewards: List[float]) -> None:
+    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
+    print(f"[END] success={str(success).lower()} steps={steps} rewards={rewards_str}", flush=True)
 # ─── Environment Client ───────────────────────────────────────────────────────
 def env_reset(env_url: str, difficulty: str) -> dict:
+    resp = requests.post(f"{env_url}/reset", json={"difficulty": difficulty}, timeout=30)
     resp.raise_for_status()
     return resp.json()
 def env_step(env_url: str, fixed_code: str, explanation: str = None) -> dict:
     payload = {"fixed_code": fixed_code}
     if explanation:
         payload["explanation"] = explanation
+    resp = requests.post(f"{env_url}/step", json=payload, timeout=30)
     resp.raise_for_status()
     return resp.json()
 # ─── LLM Agent ───────────────────────────────────────────────────────────────
+SYSTEM_PROMPT = """You are an expert Python debugging agent.
 You will be given buggy Python code and must fix it.
 For easy tasks: fix the single bug.
 - Do NOT include markdown fences or any text outside the JSON object.
 """
 def call_llm(buggy_code: str, instructions: str, difficulty: str,
              feedback: str = None, attempt: int = 1) -> dict:
     user_content = f"""Task difficulty: {difficulty}
 Instructions: {instructions}
     try:
         response = client.chat.completions.create(
+            model=MODEL_NAME, messages=messages, max_tokens=1000, temperature=0.1,
         )
         content = response.choices[0].message.content.strip()
         if content.startswith("```"):
             lines = content.split("\n")
             content = "\n".join(lines[1:-1]) if lines[-1] == "```" else "\n".join(lines[1:])
         parsed = json.loads(content)
+        return {"fixed_code": parsed.get("fixed_code", ""), "explanation": parsed.get("explanation", None)}
     except json.JSONDecodeError:
         return {"fixed_code": buggy_code, "explanation": None}
     except Exception as e:
+        print(f"# LLM call failed: {e}", file=sys.stderr)
         return {"fixed_code": buggy_code, "explanation": None}
 # ─── Main Episode Loop ────────────────────────────────────────────────────────
+def run_episode(env_url: str, difficulty: str) -> tuple:
     reset_data = env_reset(env_url, difficulty)
     obs = reset_data["observation"]
+    task_id      = obs["task_id"]
+    buggy_code   = obs["buggy_code"]
     instructions = obs["instructions"]
+    log_start(task_id=task_id, env=BENCHMARK, model=MODEL_NAME)
     last_feedback = None
+    rewards: List[float] = []
+    steps_taken = 0
+    success = False
     for attempt in range(1, MAX_STEPS + 1):
+        steps_taken = attempt
         agent_action = call_llm(
+            buggy_code=buggy_code, instructions=instructions,
+            difficulty=difficulty, feedback=last_feedback, attempt=attempt,
         )
+        fixed_code = agent_action["fixed_code"]
+        if not fixed_code or not fixed_code.strip():
+            log_step(step=attempt, action="empty_submission", reward=0.0, done=False, error="empty_code")
+            rewards.append(0.0)
+            continue
+        try:
+            result = env_step(env_url, fixed_code=fixed_code, explanation=agent_action.get("explanation"))
+        except Exception as e:
+            log_step(step=attempt, action="step_failed", reward=0.0, done=False, error=str(e)[:60])
+            rewards.append(0.0)
+            continue
         reward = result.get("reward", 0.0)
+        done   = result.get("done", False)
+        obs_r  = result.get("observation", {})
+        last_feedback = obs_r.get("feedback", "")
+        log_step(step=attempt, action=f"fix_{difficulty}_attempt{attempt}", reward=reward, done=done, error=None)
+        rewards.append(reward)
+        if reward >= 1.0:
+            success = True
         if done:
             break
+    log_end(success=success, steps=steps_taken, rewards=rewards)
+    return success, steps_taken, rewards
 def main():
     parser = argparse.ArgumentParser(description="Code Debug Environment Baseline Agent")
     parser.add_argument("--url", default=ENV_URL, help="Environment base URL")
+    parser.add_argument("--difficulty", default=None, choices=["easy", "medium", "hard", "all"])
     args = parser.parse_args()
     env_url = args.url.rstrip("/")
     try:
         health = requests.get(f"{env_url}/health", timeout=10)
         health.raise_for_status()
+        print(f"# Environment healthy at {env_url}", flush=True)
     except Exception as e:
+        print(f"# Health check failed: {e}", file=sys.stderr)
         sys.exit(1)
+    difficulties = ["easy", "medium", "hard"] if (args.difficulty in ("all", None)) else [args.difficulty]
     all_rewards = []
+    all_successes = []
+    for difficulty in difficulties:
+        success, steps, rewards = run_episode(env_url, difficulty)
+        all_rewards.extend(rewards)
+        all_successes.append(success)
+        time.sleep(0.5)
+    avg = round(sum(all_rewards) / len(all_rewards), 3) if all_rewards else 0.0
+    print(f"# SUMMARY: {sum(all_successes)}/{len(difficulties)} tasks solved | avg_reward={avg}", flush=True)
 if __name__ == "__main__":
+    main()

openenv.yaml CHANGED Viewed

@@ -4,12 +4,21 @@ type: typed
 description: >
   A real-world RL environment where an LLM agent diagnoses and fixes
   buggy Python code across three difficulty levels (easy, medium, hard).
-  Tasks are drawn from real-world domains: data processing, API handlers,
-  and algorithmic functions. Rewards are partial and proportional to how
-  many test cases pass, with bonuses for correct explanations on hard tasks.
 version: 1.0.0
-author: Souravdanyal
 runtime:
   type: docker
@@ -20,29 +29,89 @@ app:
   host: 0.0.0.0
   port: 7860
 tasks:
   - id: easy
-    description: "Fix a single off-by-one or operator bug in a Python function"
     difficulty: easy
-    max_steps: 3
     reward_range: [0.0, 1.0]
   - id: medium
-    description: "Fix two bugs (logic + edge case) so all test cases pass"
     difficulty: medium
-    max_steps: 3
     reward_range: [0.0, 1.0]
   - id: hard
-    description: "Fix an algorithmic bug AND provide a correct explanation"
     difficulty: hard
-    max_steps: 3
     reward_range: [0.0, 1.0]
 reward_range: [0.0, 1.0]
 api:
   reset: /reset
   step: /step
   state: /state
   health: /health

 description: >
   A real-world RL environment where an LLM agent diagnoses and fixes
   buggy Python code across three difficulty levels (easy, medium, hard).
+  Tasks are drawn from real-world domains: data processing, string algorithms,
+  API validation, sorting, dynamic programming, and graph algorithms.
+  Rewards are partial and proportional to test cases passed, with bonuses
+  for correct explanations on hard tasks.
 version: 1.0.0
+author: Souravdanyal
+tags:
+  - code-debugging
+  - python
+  - reinforcement-learning
+  - openenv
+  - llm-agent
+  - software-engineering
 runtime:
   type: docker
   host: 0.0.0.0
   port: 7860
+config:
+  episode_timeout: 300
+  max_steps: 5
 tasks:
   - id: easy
+    description: "Fix a single off-by-one, operator, or return bug in a Python function"
     difficulty: easy
+    max_steps: 5
     reward_range: [0.0, 1.0]
+    grader: deterministic
+    num_tasks: 15
   - id: medium
+    description: "Fix two bugs (logic bug + edge case) so all test cases pass"
     difficulty: medium
+    max_steps: 5
     reward_range: [0.0, 1.0]
+    grader: deterministic
+    num_tasks: 15
   - id: hard
+    description: "Fix an algorithmic bug AND provide a correct explanation of the root cause"
     difficulty: hard
+    max_steps: 5
     reward_range: [0.0, 1.0]
+    grader: deterministic
+    num_tasks: 15
 reward_range: [0.0, 1.0]
+action_space:
+  type: dict
+  description: "Agent submits fixed Python code and optional explanation"
+  fields:
+    fixed_code:
+      type: string
+      required: true
+      description: "Complete corrected Python function as a string. Must be valid Python."
+    explanation:
+      type: string
+      required: false
+      description: "Optional field, but needed for full credit on hard tasks. Explain the bug, its root cause, and why the fix is correct."
+observation_space:
+  type: dict
+  description: "Environment observation returned after reset() and step()"
+  fields:
+    task_id:
+      type: string
+      description: "Unique identifier for the current task instance (e.g. easy_003)"
+    difficulty:
+      type: enum
+      values: [easy, medium, hard]
+      description: "Task difficulty level"
+    buggy_code:
+      type: string
+      description: "The buggy Python function the agent must fix"
+    instructions:
+      type: string
+      description: "Natural language description of what is wrong and what to fix"
+    test_cases_description:
+      type: string
+      description: "Description of what the test cases check"
+    reward:
+      type: float
+      description: "Score from 0.0 to 1.0 for this step (null on reset)"
+    passed_tests:
+      type: integer
+      description: "Number of test cases passed (null on reset)"
+    total_tests:
+      type: integer
+      description: "Total number of test cases (always 3)"
+    feedback:
+      type: string
+      description: "Detailed per-test feedback showing input, expected, and got values"
+    done:
+      type: boolean
+      description: "True when episode is complete (perfect score or max steps reached)"
 api:
   reset: /reset
   step: /step
   state: /state
   health: /health
+  tasks: /tasks

server/environment.py CHANGED Viewed

@@ -29,7 +29,7 @@ GRADERS = {
     "hard": grade_hard,
 }
-MAX_STEPS = 3
 class CodeDebugEnvironment(Environment):
@@ -101,6 +101,41 @@ class CodeDebugEnvironment(Environment):
         self._step_count += 1
         # Grade the submission
         grader = GRADERS[self._difficulty]
         if self._difficulty == "hard":
@@ -144,4 +179,4 @@ class CodeDebugEnvironment(Environment):
             current_reward=self._current_reward,
             best_reward=self._best_reward,
             done=self._done,
-        )

     "hard": grade_hard,
 }
+MAX_STEPS = 5
 class CodeDebugEnvironment(Environment):
         self._step_count += 1
+        # ── Invalid action penalty ──────────────────────────────────────────
+        code = action.fixed_code.strip() if action.fixed_code else ""
+        if not code:
+            done = self._step_count >= MAX_STEPS
+            self._done = done
+            return DebugObservation(
+                task_id=self._current_task["task_id"],
+                difficulty=self._difficulty,
+                buggy_code=self._current_task["buggy_code"],
+                instructions=self._current_task["instructions"],
+                test_cases_description=self._current_task["test_cases_description"],
+                reward=0.0,
+                passed_tests=0,
+                total_tests=len(self._current_task["test_cases"]),
+                feedback="❌ Invalid action: fixed_code is empty. Penalty applied. Submit valid Python code.",
+                done=done,
+            )
+        # Reject obvious non-Python: fewer than 5 characters, or no "def ", "lambda", or "=" anywhere in the text
+        if len(code) < 5 or ("def " not in code and "lambda" not in code and "=" not in code):
+            done = self._step_count >= MAX_STEPS
+            self._done = done
+            return DebugObservation(
+                task_id=self._current_task["task_id"],
+                difficulty=self._difficulty,
+                buggy_code=self._current_task["buggy_code"],
+                instructions=self._current_task["instructions"],
+                test_cases_description=self._current_task["test_cases_description"],
+                reward=0.0,
+                passed_tests=0,
+                total_tests=len(self._current_task["test_cases"]),
+                feedback="❌ Invalid action: submission does not appear to be valid Python. Penalty applied.",
+                done=done,
+            )
         # Grade the submission
         grader = GRADERS[self._difficulty]
         if self._difficulty == "hard":
             current_reward=self._current_reward,
             best_reward=self._best_reward,
             done=self._done,
+        )

spec.md ADDED Viewed

	@@ -0,0 +1,136 @@

+# Code Debug Environment — Specification
+## Overview
+The Code Debug Environment is an OpenEnv-compatible RL environment where an LLM agent diagnoses and fixes buggy Python code across three difficulty levels.
+---
+## API Specification
+### POST /reset
+Start a new episode.
+**Request:**
+```json
+{"difficulty": "easy"}
+```
+**Response:**
+```json
+{
+  "observation": {
+    "task_id": "easy_003",
+    "difficulty": "easy",
+    "buggy_code": "def find_max(nums):\n    return min(nums)",
+    "instructions": "The function has exactly one bug. Fix it.",
+    "test_cases_description": "Finds max value in a list",
+    "reward": null,
+    "passed_tests": null,
+    "total_tests": 3,
+    "feedback": null,
+    "done": false
+  },
+  "reward": 0.0,
+  "done": false
+}
+```
+---
+### POST /step
+Submit a code fix.
+**Request:**
+```json
+{
+  "fixed_code": "def find_max(nums):\n    return max(nums)",
+  "explanation": "Optional; only scored on hard tasks"
+}
+```
+**Response:**
+```json
+{
+  "observation": {
+    "task_id": "easy_003",
+    "reward": 1.0,
+    "passed_tests": 3,
+    "total_tests": 3,
+    "feedback": "Test 1: ✅ Passed\n   Input: [1,2,3]\n   Expected: 3\n   Got: 3",
+    "done": true
+  },
+  "reward": 1.0,
+  "done": true
+}
+```
+---
+### GET /state
+Returns current episode state.
+```json
+{
+  "episode_id": "uuid",
+  "task_id": "easy_003",
+  "difficulty": "easy",
+  "step_count": 1,
+  "max_steps": 5,
+  "current_reward": 1.0,
+  "best_reward": 1.0,
+  "done": true
+}
+```
+---
+### GET /health
+```json
+{"status": "ok", "environment": "code-debug-env", "version": "1.0.0"}
+```
+---
+## Reward Function
+### Easy & Medium
+```
+reward = passed_tests / total_tests
+```
+- 3/3 → 1.00
+- 2/3 → 0.67
+- 1/3 → 0.33
+- 0/3 → 0.00
+### Hard
+```
+reward = 0.7 × test_score + 0.3 × explanation_score
+```
+### Invalid Actions
+- Empty code → reward = 0.0, returned with a feedback message
+- Non-Python code → reward = 0.0, returned with a feedback message
+---
+## Episode Rules
+- Max 5 steps per episode
+- Episode ends when reward = 1.0 OR max steps reached
+- Each step runs fixed_code against 3 deterministic test cases
+- Feedback shows Input, Expected, Got for each test
+---
+## Task Domains
+| Domain | Examples |
+|---|---|
+| List operations | second element, max, flatten |
+| String algorithms | palindrome, reverse, word count |
+| Math | fibonacci, factorial, square root |
+| Sorting | bubble sort, binary search |
+| Data processing | JSON parsing, API validation |
+| Graph algorithms | BFS, cycle detection |
+| Dynamic programming | knapsack, longest subsequence |

tests/test_graders.py ADDED Viewed

	@@ -0,0 +1,69 @@

+# tests/test_graders.py
+# Basic tests to verify all graders work correctly.
+# Run: python -m pytest tests/ -v
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from server.graders.grader_easy import grade_easy
+from server.graders.grader_medium import grade_medium
+from server.graders.grader_hard import grade_hard
+from server.tasks.task_easy import EASY_TASKS
+from server.tasks.task_medium import MEDIUM_TASKS
+from server.tasks.task_hard import HARD_TASKS
+def test_easy_tasks_count():
+    assert len(EASY_TASKS) == 15, f"Expected 15 easy tasks, got {len(EASY_TASKS)}"
+def test_medium_tasks_count():
+    assert len(MEDIUM_TASKS) == 15, f"Expected 15 medium tasks, got {len(MEDIUM_TASKS)}"
+def test_hard_tasks_count():
+    assert len(HARD_TASKS) == 15, f"Expected 15 hard tasks, got {len(HARD_TASKS)}"
+def test_easy_correct_fix_scores_1():
+    for task in EASY_TASKS:
+        reward, passed, total, _, _ = grade_easy(task["fixed_code"], task)
+        assert reward == 1.0, f"{task['task_id']} should score 1.0, got {reward}"
+def test_medium_correct_fix_scores_1():
+    for task in MEDIUM_TASKS:
+        reward, passed, total, _, _ = grade_medium(task["fixed_code"], task)
+        assert reward == 1.0, f"{task['task_id']} should score 1.0, got {reward}"
+def test_hard_correct_fix_scores_high():
+    for task in HARD_TASKS:
+        keywords = task.get("explanation_keywords", [])
+        explanation = " ".join(keywords)
+        reward, passed, total, _, _ = grade_hard(task["fixed_code"], task, explanation)
+        assert reward >= 0.9, f"{task['task_id']} should score >= 0.9, got {reward}"
+def test_reward_range():
+    # Grade each task's buggy code with the grader for its own difficulty
+    # (medium tasks were previously run through grade_easy); whatever the
+    # score is, it must stay inside the [0.0, 1.0] reward range.
+    for grader, tasks in ((grade_easy, EASY_TASKS), (grade_medium, MEDIUM_TASKS)):
+        for task in tasks:
+            reward, _, _, _, _ = grader(task["buggy_code"], task)
+            assert 0.0 <= reward <= 1.0, f"Reward out of range: {reward}"
+def test_empty_code_returns_zero():
+    task = EASY_TASKS[0]
+    reward, passed, total, feedback, _ = grade_easy("", task)
+    assert reward == 0.0
+def test_buggy_code_scores_less_than_1():
+    for task in EASY_TASKS[:5]:
+        reward, _, _, _, _ = grade_easy(task["buggy_code"], task)
+        assert reward < 1.0, f"{task['task_id']} buggy code should not score 1.0"
+if __name__ == "__main__":
+    import pytest
+    pytest.main([__file__, "-v"])