Spaces:
Running
Running
| # server/environment.py | |
| # Core environment: manages episode state, dispatches to task banks and graders. | |
| import random | |
| from uuid import uuid4 | |
| from typing import Optional | |
| from openenv.core.env_server.interfaces import Environment | |
| from openenv.core.env_server.types import State | |
| from models import DebugAction, DebugObservation, DebugState | |
| from server.tasks.task_easy import get_random_easy_task | |
| from server.tasks.task_medium import get_random_medium_task | |
| from server.tasks.task_hard import get_random_hard_task | |
| from server.graders.grader_easy import grade_easy | |
| from server.graders.grader_medium import grade_medium | |
| from server.graders.grader_hard import grade_hard | |
# Difficulty name -> zero-argument callable that supplies a task dict for that level.
TASK_GETTERS = {
    "easy": get_random_easy_task,
    "medium": get_random_medium_task,
    "hard": get_random_hard_task,
}

# Difficulty name -> grader used to score a submission at that level.
GRADERS = {
    "easy": grade_easy,
    "medium": grade_medium,
    "hard": grade_hard,
}

# Number of step() calls after which an episode is forced to end.
MAX_STEPS = 5
class CodeDebugEnvironment(Environment):
    """
    OpenEnv environment for LLM-based code debugging.

    Supports 3 difficulty levels with partial rewards. Each episode loads one
    task from the bank of the selected difficulty; the agent submits fixes
    through ``step()`` and the matching grader scores them. An episode ends on
    a perfect score or after ``MAX_STEPS`` submissions.
    """

    # Tolerance for treating a reward as a perfect score. A reward assembled
    # from float weights can land a hair below 1.0 (e.g. 0.7 + 0.3).
    _PERFECT_TOLERANCE = 1e-9

    def __init__(self):
        self._episode_id: str = str(uuid4())
        self._difficulty: str = "easy"
        # None until reset() loads a task.
        self._current_task: Optional[dict] = None
        self._step_count: int = 0
        # Highest reward obtained so far in the current episode.
        self._best_reward: float = 0.0
        # Reward of the most recent step (0.0 for a rejected submission).
        self._current_reward: float = 0.0
        self._done: bool = False

    def reset(self, difficulty: Optional[str] = None) -> DebugObservation:
        """
        Start a new episode.

        Args:
            difficulty: One of the keys of ``TASK_GETTERS``
                (easy | medium | hard). If omitted or not recognised, a
                difficulty is chosen at random.

        Returns:
            The initial observation for the newly loaded task: no reward,
            no feedback, ``done=False``.
        """
        self._episode_id = str(uuid4())
        self._step_count = 0
        self._best_reward = 0.0
        self._current_reward = 0.0
        self._done = False

        if difficulty and difficulty in TASK_GETTERS:
            self._difficulty = difficulty
        else:
            # Draw from the registry itself so the fallback cannot drift
            # from the set of difficulties that actually have a task bank.
            self._difficulty = random.choice(list(TASK_GETTERS))

        self._current_task = TASK_GETTERS[self._difficulty]()

        return self._task_observation(
            reward=None,
            passed_tests=None,
            feedback=None,
            done=False,
        )

    def _task_observation(
        self, *, reward, passed_tests, feedback, done, total_tests=None
    ) -> DebugObservation:
        """Build an observation for the current task; requires a loaded task."""
        task = self._current_task
        if total_tests is None:
            total_tests = len(task["test_cases"])
        return DebugObservation(
            task_id=task["task_id"],
            difficulty=self._difficulty,
            buggy_code=task["buggy_code"],
            instructions=task["instructions"],
            test_cases_description=task["test_cases_description"],
            reward=reward,
            passed_tests=passed_tests,
            total_tests=total_tests,
            feedback=feedback,
            done=done,
        )

    def step(self, action: DebugAction) -> DebugObservation:
        """
        Grade one submission from the agent.

        The agent submits ``fixed_code`` (and optionally ``explanation``,
        which is forwarded to the grader for hard tasks only).

        Returns:
            An observation carrying the reward, feedback and done flag.
            * If no task has been loaded yet (``reset()`` was never called),
              a terminal observation asking the caller to reset is returned
              and no state is modified.
            * If the episode is already done, a terminal observation with
              the best reward of the episode is returned.
            * Empty or obviously non-Python submissions consume a step and
              score 0.0 without being sent to the grader.
        """
        # Guard: step() before reset() used to crash on a None task.
        if self._current_task is None:
            return DebugObservation(
                task_id="none",
                difficulty=self._difficulty,
                buggy_code="",
                instructions="No active episode. Call reset() to start a new episode.",
                test_cases_description="",
                reward=0.0,
                passed_tests=None,
                total_tests=0,
                feedback="No task loaded. Please call reset() to start a new task.",
                done=True,
            )

        if self._done:
            return DebugObservation(
                task_id=self._current_task["task_id"],
                difficulty=self._difficulty,
                buggy_code=self._current_task["buggy_code"],
                instructions="Episode is already done. Call reset() to start a new episode.",
                test_cases_description="",
                reward=self._best_reward,
                passed_tests=None,
                total_tests=0,
                feedback="Episode ended. Please call reset() to start a new task.",
                done=True,
            )

        self._step_count += 1
        out_of_steps = self._step_count >= MAX_STEPS

        # -- Invalid action penalty ------------------------------------------
        # NOTE(review): the leading "β" in the two feedback strings below looks
        # like a mis-encoded symbol - confirm the intended character.
        code = action.fixed_code.strip() if action.fixed_code else ""
        rejection = None
        if not code:
            rejection = "β Invalid action: fixed_code is empty. Penalty applied. Submit valid Python code."
        elif len(code) < 5 or (
            "def " not in code and "lambda" not in code and "=" not in code
        ):
            # Cheap heuristic for obvious non-Python: very short, or contains
            # neither a definition, a lambda, nor an assignment.
            rejection = "β Invalid action: submission does not appear to be valid Python. Penalty applied."

        if rejection is not None:
            # Record the zero so state() does not keep reporting the reward
            # of an earlier, valid step.
            self._current_reward = 0.0
            self._done = out_of_steps
            return self._task_observation(
                reward=0.0,
                passed_tests=0,
                feedback=rejection,
                done=out_of_steps,
            )

        # -- Grade the submission --------------------------------------------
        grader = GRADERS[self._difficulty]
        if self._difficulty == "hard":
            graded = grader(action.fixed_code, self._current_task, action.explanation)
        else:
            graded = grader(action.fixed_code, self._current_task)
        reward, passed, total, feedback, _ = graded

        self._current_reward = reward
        self._best_reward = max(self._best_reward, reward)

        # Episode ends on a perfect score or when the step budget is spent.
        # Compare with a tolerance rather than `== 1.0` so float rounding in
        # a grader cannot keep a perfect submission from ending the episode.
        is_perfect = reward >= 1.0 - self._PERFECT_TOLERANCE
        self._done = is_perfect or out_of_steps

        return self._task_observation(
            reward=reward,
            passed_tests=passed,
            total_tests=total,
            feedback=feedback,
            done=self._done,
        )

    # NOTE(review): the OpenEnv Environment base class may declare `state` as
    # a property - confirm whether this should carry @property. Kept as a
    # plain method here so existing `env.state()` callers keep working.
    def state(self) -> DebugState:
        """Return current episode metadata."""
        task_id = self._current_task["task_id"] if self._current_task else "none"
        return DebugState(
            episode_id=self._episode_id,
            step_count=self._step_count,
            task_id=task_id,
            difficulty=self._difficulty,
            max_steps=MAX_STEPS,
            current_reward=self._current_reward,
            best_reward=self._best_reward,
            done=self._done,
        )