# Hugging Face Space: Souravdanyal/code-debug-env (status: Sleeping)
# File size: 5,110 bytes
# Revision: 2ce1061

# server/environment.py
# Core environment: manages episode state, dispatches to task banks and graders.

import random
from uuid import uuid4
from typing import Optional

from openenv.core.env_server.interfaces import Environment
from openenv.core.env_server.types import State

from models import DebugAction, DebugObservation, DebugState
from server.tasks.task_easy import get_random_easy_task
from server.tasks.task_medium import get_random_medium_task
from server.tasks.task_hard import get_random_hard_task
from server.graders.grader_easy import grade_easy
from server.graders.grader_medium import grade_medium
from server.graders.grader_hard import grade_hard


# Dispatch tables keyed by difficulty name. Both tables share the same keys,
# so a difficulty selected from one can be used to index the other.

# difficulty -> zero-argument callable that returns a task.
TASK_GETTERS = dict(
    easy=get_random_easy_task,
    medium=get_random_medium_task,
    hard=get_random_hard_task,
)

# difficulty -> grading function for a submission at that difficulty.
GRADERS = dict(
    easy=grade_easy,
    medium=grade_medium,
    hard=grade_hard,
)

# Upper bound on the number of submissions allowed in a single episode.
MAX_STEPS = 3


class CodeDebugEnvironment(Environment):
    """
    OpenEnv environment for LLM-based code debugging.
    Supports 3 difficulty levels with partial rewards.

    An episode starts with reset(), which loads one task from the bank for
    the chosen difficulty. Each step() grades one submission with the grader
    registered for that difficulty. The episode ends when a submission earns
    a reward of 1.0 or after MAX_STEPS submissions, whichever comes first.
    """

    def __init__(self):
        """Initialise an idle environment; no task is loaded until reset()."""
        self._episode_id: str = str(uuid4())
        self._difficulty: str = "easy"
        self._current_task: Optional[dict] = None
        self._step_count: int = 0
        self._best_reward: float = 0.0
        self._current_reward: float = 0.0
        self._done: bool = False

    def reset(self, difficulty: Optional[str] = None) -> DebugObservation:
        """
        Start a new episode. Optionally specify difficulty: easy | medium | hard.

        If difficulty is missing, or is not a key of TASK_GETTERS, one of the
        registered difficulties is picked at random.

        Returns the initial observation for the freshly loaded task, with
        reward, passed_tests and feedback unset and done=False.
        """
        self._episode_id = str(uuid4())
        self._step_count = 0
        self._best_reward = 0.0
        self._current_reward = 0.0
        self._done = False

        # Validate difficulty; unknown values fall back to a random choice.
        if difficulty and difficulty in TASK_GETTERS:
            self._difficulty = difficulty
        else:
            # Draw from the registry itself so the fallback cannot drift out
            # of sync with the difficulties that actually have a task getter.
            self._difficulty = random.choice(list(TASK_GETTERS))

        # Load a task
        self._current_task = TASK_GETTERS[self._difficulty]()

        return self._task_observation(
            reward=None,
            passed_tests=None,
            total_tests=len(self._current_task["test_cases"]),
            feedback=None,
            done=False,
        )

    def step(self, action: DebugAction) -> DebugObservation:
        """
        Agent submits fixed_code (and optionally explanation for hard tasks).
        Returns observation with reward, feedback, and done flag.

        If no episode is active the submission is not graded and the step
        counter is left untouched:
          * before the first reset(), the observation has reward=None and
            asks the caller to call reset();
          * after the episode has ended, the observation reports the best
            reward achieved in that episode.
        In both cases done=True is returned.
        """
        if self._current_task is None:
            # step() before reset(): there is nothing to grade against.
            # Report it the same way as a finished episode instead of
            # failing with a TypeError on the missing task.
            return self._inactive_observation(
                instructions="No active episode. Call reset() to start a new episode.",
                feedback="No task loaded. Please call reset() to start a new task.",
                reward=None,
            )

        if self._done:
            return self._inactive_observation(
                instructions="Episode is already done. Call reset() to start a new episode.",
                feedback="Episode ended. Please call reset() to start a new task.",
                reward=self._best_reward,
            )

        self._step_count += 1

        # Grade the submission. Only the hard grader receives the explanation.
        grader = GRADERS[self._difficulty]
        if self._difficulty == "hard":
            reward, passed, total, feedback, _ = grader(
                action.fixed_code, self._current_task, action.explanation
            )
        else:
            reward, passed, total, feedback, _ = grader(
                action.fixed_code, self._current_task
            )

        self._current_reward = reward
        self._best_reward = max(self._best_reward, reward)

        # Episode ends if: perfect score OR max steps reached
        # NOTE(review): exact float comparison; presumably the graders return
        # exactly 1.0 when every check passes - confirm against the graders.
        done = (reward == 1.0) or (self._step_count >= MAX_STEPS)
        self._done = done

        return self._task_observation(
            reward=reward,
            passed_tests=passed,
            total_tests=total,
            feedback=feedback,
            done=done,
        )

    @property
    def state(self) -> DebugState:
        """Return current episode metadata."""
        return DebugState(
            episode_id=self._episode_id,
            step_count=self._step_count,
            task_id=self._current_task["task_id"] if self._current_task else "none",
            difficulty=self._difficulty,
            max_steps=MAX_STEPS,
            current_reward=self._current_reward,
            best_reward=self._best_reward,
            done=self._done,
        )

    def _task_observation(
        self,
        *,
        reward: Optional[float],
        passed_tests: Optional[int],
        total_tests: int,
        feedback: Optional[str],
        done: bool,
    ) -> DebugObservation:
        """Build an observation describing the currently loaded task."""
        task = self._current_task
        return DebugObservation(
            task_id=task["task_id"],
            difficulty=self._difficulty,
            buggy_code=task["buggy_code"],
            instructions=task["instructions"],
            test_cases_description=task["test_cases_description"],
            reward=reward,
            passed_tests=passed_tests,
            total_tests=total_tests,
            feedback=feedback,
            done=done,
        )

    def _inactive_observation(
        self,
        instructions: str,
        feedback: str,
        reward: Optional[float],
    ) -> DebugObservation:
        """Build the done=True observation returned when nothing can be graded."""
        task = self._current_task
        return DebugObservation(
            # Placeholders are used when no task has ever been loaded.
            task_id=task["task_id"] if task else "none",
            difficulty=self._difficulty,
            buggy_code=task["buggy_code"] if task else "",
            instructions=instructions,
            test_cases_description="",
            reward=reward,
            passed_tests=None,
            total_tests=0,
            feedback=feedback,
            done=True,
        )