Spaces:

rak2315
/

ml-debug-env

Running

File size: 8,988 Bytes

70a9d5e
 
 
 
 
 
 
 
 
 
 
 
 
49aa3ca
70a9d5e
6d9a8b2
49aa3ca
70a9d5e
 
 
4108ae8
 
 
6d9a8b2
 
70a9d5e
 
49aa3ca
70a9d5e
49aa3ca
 
 
1099086
 
49aa3ca
 
 
 
 
 
 
 
 
 
 
 
70a9d5e
 
 
 
6d9a8b2
49aa3ca
 
 
 
 
 
 
 
 
 
 
 
 
6d9a8b2
 
49aa3ca
 
6d9a8b2
 
49aa3ca
70a9d5e
 
 
 
 
 
4108ae8
70a9d5e
 
 
 
 
 
 
 
49aa3ca
 
70a9d5e
4108ae8
49aa3ca
70a9d5e
 
 
 
 
 
 
 
49aa3ca
 
 
70a9d5e
1099086
 
 
70a9d5e
 
 
 
 
49aa3ca
 
70a9d5e
 
1099086
 
 
 
 
 
70a9d5e
 
49aa3ca
 
 
70a9d5e
6d9a8b2
49aa3ca
 
 
 
 
 
70a9d5e
 
49aa3ca
70a9d5e
 
 
 
 
 
 
 
 
1099086
 
 
 
 
 
 
70a9d5e
 
49aa3ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70a9d5e
49aa3ca
 
 
 
 
70a9d5e
 
49aa3ca
 
 
70a9d5e
 
 
49aa3ca
 
 
 
 
70a9d5e
49aa3ca
 
 
 
70a9d5e
 
 
49aa3ca
 
 
70a9d5e
6d9a8b2
49aa3ca
 
 
 
 
 
70a9d5e
49aa3ca
 
70a9d5e
 
 
 
 
 
 
 
 
 
 
49aa3ca
 
 
 
 
 
 
70a9d5e
49aa3ca
70a9d5e

import sys
import os
from uuid import uuid4
from typing import Optional

# Put the parent of this file's directory at the front of sys.path.
# Presumably this is what lets the bare-name imports below (models,
# bug_generator, grader, ...) resolve — TODO confirm against the repo layout.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from openenv.core.env_server.interfaces import Environment
from openenv.core.env_server.types import State

from models import DebugAction, DebugObservation, DebugState
from bug_generator import (
    get_scenario,
    execute_tool,
    BugScenario,
    ALL_TASKS,
    AVAILABLE_TOOLS,
    TASK_SHAPE_MISMATCH,
    TASK_TRAINING_COLLAPSE,
    TASK_DATA_LEAKAGE,
    TASK_WRONG_DEVICE,
    TASK_GRADIENT_NOT_ZEROED,
    TASK_MISSING_EVAL_MODE,
    TASK_COMPOUND_SHAPE_DEVICE,
    TASK_COMPOUND_LEAKAGE_EVAL,
)
from grader import grade, GradeResult
from adversarial_scheduler import AdversarialScheduler

# Total step budget per episode; each inspect or fix action consumes one step.
MAX_STEPS = 5
# A fix whose final (multiplier-adjusted, capped) score reaches this value
# ends the episode.
SUCCESS_THRESHOLD = 0.95

# Module-level session store — shared across all instances.
# Maps episode_id -> {"scenario": ..., "state": ...}. reset() writes an entry;
# step() reads the most recently inserted one when the instance it runs on has
# no scenario of its own.
_SESSION_STORE: dict = {}

def _efficiency_multiplier(steps_used: int, total_steps: int) -> float:
    """
    Reward agents that fix bugs efficiently.
    steps_used = number of steps taken when fix was submitted (1-indexed).
    """
    if steps_used <= 2:
        return 1.2
    elif steps_used <= 3:
        return 1.1
    else:
        return 1.0


class MlDebugEnvEnvironment(Environment):
    """
    ML Debugging Environment — 8 tasks, easy → expert.
    Partially observable: agent sees only a minimal alert on reset().
    Must use tool calls (inspect actions) to gather information before fixing.

    Episode structure:
      - reset() → minimal alert, available tools, step budget
      - step(action_type="inspect", tool_name=X) → tool output (costs 1 step)
      - step(action_type="fix", bug_type=X, ...) → grader score (costs 1 step)
      - Max 5 steps total across all inspect + fix actions
      - An episode ends when the step budget is used up or a fix reaches
        SUCCESS_THRESHOLD. Further step() calls on a finished episode return a
        terminal observation (done=True, reward=0.0) and change nothing.

    Efficiency bonus (final score is always capped at 0.99):
      - Fix correct in ≤2 total steps → score × 1.2
      - Fix correct in ≤3 total steps → score × 1.1
      - Fix in 4-5 steps → score × 1.0

    Single-bug tasks (6):
      shape_mismatch, training_collapse, wrong_device,
      gradient_not_zeroed, data_leakage, missing_eval_mode

    Compound tasks — TWO bugs per script (2):
      compound_shape_device, compound_leakage_eval
    """

    SUPPORTS_CONCURRENT_SESSIONS = True

    # Upper bound on entries kept in the module-level _SESSION_STORE. Without a
    # bound the store gains one entry per reset() for the life of the process.
    _MAX_STORED_SESSIONS = 1024

    def __init__(self, task_id: Optional[str] = None):
        """
        Create an environment with no active episode.

        Args:
            task_id: Optional task to pin every episode to. When None, reset()
                uses its own task_id argument or asks the scheduler.
        """
        super().__init__()
        self._task_id: Optional[str] = task_id
        self._current_scenario: Optional[BugScenario] = None
        # Placeholder state so the `state` property works before reset().
        self._state = DebugState(
            episode_id=None,
            step_count=0,
            task_id="",
            max_steps=MAX_STEPS,
            current_score=0.0,
            attempts=0,
            tools_used=[],
            fix_submitted=False,
        )
        # Number of episodes started on this instance (incremented by reset()).
        self._episode_count = 0
        self._scheduler = AdversarialScheduler(ALL_TASKS)

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        task_id: Optional[str] = None,
        **kwargs,
    ) -> DebugObservation:
        """
        Start a new episode and return the initial observation.

        Args:
            seed: Scenario seed. When None, the scheduler supplies one for the
                chosen task.
            episode_id: Identifier for the episode. When falsy, a fresh UUID4
                string is generated.
            task_id: Task to run. Takes precedence over the task given to the
                constructor, which takes precedence over the scheduler's pick.
            **kwargs: Accepted and ignored.

        Returns:
            Observation carrying only the alert, the tool list and the full
            step budget; all result fields are None and done is False.
        """
        active_task = task_id or self._task_id or self._scheduler.next_task()
        effective_seed = seed if seed is not None else self._scheduler.next_seed(active_task)
        scenario = get_scenario(active_task, seed=effective_seed)

        eid = episode_id or str(uuid4())
        state = DebugState(
            episode_id=eid,
            step_count=0,
            task_id=active_task,
            max_steps=MAX_STEPS,
            current_score=0.0,
            attempts=0,
            tools_used=[],
            fix_submitted=False,
        )

        self._current_scenario = scenario
        self._state = state
        self._episode_id = eid
        self._episode_count += 1

        # Drop any existing entry first: re-assigning an existing key keeps its
        # old insertion position, which would stop this session from being the
        # "most recent" one that step() falls back to.
        _SESSION_STORE.pop(eid, None)
        _SESSION_STORE[eid] = {"scenario": scenario, "state": state}
        # Evict the oldest sessions so the shared store cannot grow forever.
        while len(_SESSION_STORE) > self._MAX_STORED_SESSIONS:
            _SESSION_STORE.pop(next(iter(_SESSION_STORE)), None)

        return DebugObservation(
            task_id=active_task,
            alert=scenario.alert,
            available_tools=AVAILABLE_TOOLS,
            step_budget=MAX_STEPS,
            step_number=0,
            num_bugs=scenario.num_bugs,
            action_type=None,
            tool_name=None,
            tool_result=None,
            grader_score=None,
            grader_feedback=None,
            execution_result=None,
            done=False,
            reward=None,
            efficiency_multiplier=None,
        )

    def step(
        self,
        action: DebugAction,
        timeout_s: Optional[float] = None,
        **kwargs,
    ) -> DebugObservation:
        """
        Apply one inspect or fix action to the current episode.

        Args:
            action: The action to apply; its action_type must be "inspect" or
                "fix".
            timeout_s: Accepted and ignored.
            **kwargs: Accepted and ignored.

        Returns:
            The observation produced by the action. If the episode has already
            ended, a terminal observation is returned instead and no step is
            consumed.

        Raises:
            RuntimeError: No episode is active on this instance and the shared
                session store is empty.
            ValueError: action.action_type is neither "inspect" nor "fix". The
                step counter is left untouched.
        """
        if self._current_scenario is None:
            if not _SESSION_STORE:
                raise RuntimeError("Call reset() before step().")
            # Recover from the session store using the most recent session.
            # NOTE(review): with several concurrent clients this can attach to
            # another client's episode; step() receives no episode id to
            # disambiguate.
            latest = next(reversed(_SESSION_STORE.values()))
            self._current_scenario = latest["scenario"]
            self._state = latest["state"]

        # Validate before touching the counter so a bad action costs nothing.
        if action.action_type not in ("inspect", "fix"):
            raise ValueError(f"Unknown action_type: '{action.action_type}'. Must be 'inspect' or 'fix'.")

        # Never run past the end of an episode: that would push the budget
        # negative and re-record the same episode with the scheduler.
        if self._episode_finished():
            return self._finished_observation(action)

        self._state.step_count += 1
        steps_remaining = MAX_STEPS - self._state.step_count

        if action.action_type == "inspect":
            return self._handle_inspect(action, steps_remaining)
        return self._handle_fix(action, steps_remaining)

    def _episode_finished(self) -> bool:
        """Return True once the budget is exhausted or a fix reached the success threshold."""
        return (
            self._state.step_count >= MAX_STEPS
            or self._state.current_score >= SUCCESS_THRESHOLD
        )

    def _finished_observation(self, action: DebugAction) -> DebugObservation:
        """Build the terminal observation returned for steps taken after the episode ended."""
        return DebugObservation(
            task_id=self._state.task_id,
            alert=self._current_scenario.alert,
            available_tools=AVAILABLE_TOOLS,
            step_budget=max(MAX_STEPS - self._state.step_count, 0),
            step_number=self._state.step_count,
            num_bugs=self._current_scenario.num_bugs,
            action_type=action.action_type,
            tool_name=None,
            tool_result="Episode already finished. Call reset() to start a new episode.",
            grader_score=None,
            grader_feedback=None,
            execution_result=None,
            done=True,
            reward=0.0,
            efficiency_multiplier=None,
        )

    def _handle_inspect(self, action: DebugAction, steps_remaining: int) -> DebugObservation:
        """
        Run the requested tool against the current scenario.

        An unknown tool name still consumes the step; its observation carries
        an explanatory message as tool_result and the name is not added to
        tools_used. Inspect actions always yield reward 0.0.

        Args:
            action: The inspect action; a missing tool_name is treated as "".
            steps_remaining: Budget left after this step, reported as
                step_budget.
        """
        tool_name = action.tool_name or ""
        if tool_name not in AVAILABLE_TOOLS:
            tool_result = (
                f"Unknown tool: '{tool_name}'. "
                f"Available tools: {AVAILABLE_TOOLS}"
            )
        else:
            tool_result = execute_tool(tool_name, self._current_scenario)
            self._state.tools_used.append(tool_name)

        # NOTE(review): when an inspect uses up the budget, the episode ends
        # without the scheduler being told — confirm whether that is intended.
        done = self._state.step_count >= MAX_STEPS

        return DebugObservation(
            task_id=self._state.task_id,
            alert=self._current_scenario.alert,
            available_tools=AVAILABLE_TOOLS,
            step_budget=steps_remaining,
            step_number=self._state.step_count,
            num_bugs=self._current_scenario.num_bugs,
            action_type="inspect",
            tool_name=tool_name,
            tool_result=tool_result,
            grader_score=None,
            grader_feedback=None,
            execution_result=None,
            done=done,
            reward=0.0,
            efficiency_multiplier=None,
        )

    def _handle_fix(self, action: DebugAction, steps_remaining: int) -> DebugObservation:
        """
        Grade a submitted fix and apply the efficiency bonus.

        The grader score is multiplied by the efficiency multiplier for the
        current step count and capped at 0.99. The state keeps the best final
        score seen so far. The episode ends when the final score reaches
        SUCCESS_THRESHOLD or the budget is used up; on ending, the final score
        of this attempt is recorded with the scheduler.

        Args:
            action: The fix action; missing fields default to bug_type "other"
                and empty diagnosis / fixed_code strings.
            steps_remaining: Budget left after this step, reported as
                step_budget.
        """
        self._state.attempts += 1
        self._state.fix_submitted = True

        bug_type = action.bug_type or "other"
        diagnosis = action.diagnosis or ""
        fixed_code = action.fixed_code or ""

        result: GradeResult = grade(
            action_bug_type=bug_type,
            action_diagnosis=diagnosis,
            fixed_code=fixed_code,
            scenario=self._current_scenario,
        )

        multiplier = _efficiency_multiplier(self._state.step_count, MAX_STEPS)
        final_score = min(result.score * multiplier, 0.99)

        # Track the best attempt of the episode, not merely the latest one.
        if final_score > self._state.current_score:
            self._state.current_score = final_score

        done = final_score >= SUCCESS_THRESHOLD or self._state.step_count >= MAX_STEPS

        if done:
            self._scheduler.record(self._state.task_id, final_score)

        return DebugObservation(
            task_id=self._state.task_id,
            alert=self._current_scenario.alert,
            available_tools=AVAILABLE_TOOLS,
            step_budget=steps_remaining,
            step_number=self._state.step_count,
            num_bugs=self._current_scenario.num_bugs,
            action_type="fix",
            tool_name=None,
            tool_result=None,
            grader_score=final_score,
            grader_feedback=result.feedback,
            execution_result=result.execution_output,
            done=done,
            reward=final_score,
            efficiency_multiplier=multiplier,
        )

    @property
    def state(self) -> DebugState:
        """Current episode state (the placeholder state before the first reset())."""
        return self._state

    def get_metadata(self):
        """Return the static name, description, version and author of this environment."""
        from openenv.core.env_server.types import EnvironmentMetadata
        return EnvironmentMetadata(
            name="ML Debugging Environment",
            description=(
                "Partially observable RL environment where agents debug broken PyTorch training scripts. "
                "Agent sees only a minimal failure alert on reset — no code, no traceback. "
                "Must use tool calls (run_code, get_traceback, inspect_gradients, print_shapes, view_source) "
                "to investigate before submitting a fix. "
                "5 steps total per episode. Efficiency bonus: fix in ≤2 steps → ×1.2 reward. "
                "8 tasks: six single-bug (easy→hard), two compound double-bug tasks (expert). "
                "Execution-based grading in subprocess."
            ),
            version="4.0.0",
            author="ml-debug-env",
        )