Yaswanth-Bolla's picture
please please please
499adbd
"""
Task registry and evaluation graders.
Each task maps to a scenario. The grader is oracle-independent —
it takes only List[StepRecord] and returns a float in [0, 1].
"""
from __future__ import annotations
from typing import Dict, List, Type
from .models import StepRecord
from .scenarios.base import BaseScenario
from .scenarios.easy_memory_leak import MemoryLeakScenario
from .scenarios.medium_cascading_failure import CascadingFailureScenario
from .scenarios.hard_distributed_deadlock import DistributedDeadlockScenario
# ------------------------------------------------------------------
# Registry
# ------------------------------------------------------------------
# Maps each public task name to the scenario class that implements it.
# Insertion order here is the order in which task names are listed below.
TASK_REGISTRY: Dict[str, Type[BaseScenario]] = {
    "memory_leak": MemoryLeakScenario,
    "cascading_failure": CascadingFailureScenario,
    "distributed_deadlock": DistributedDeadlockScenario,
}

# Snapshot of the registered task names, taken once at import time.
# Iterating a dict yields its keys, so no explicit .keys() call is needed.
TASK_NAMES = list(TASK_REGISTRY)
def get_scenario(task_name: str) -> BaseScenario:
    """Build a fresh scenario object for the given task name.

    Args:
        task_name: Key to look up in ``TASK_REGISTRY``.

    Returns:
        A new instance of the registered scenario class, constructed
        with no arguments.

    Raises:
        ValueError: If ``task_name`` is not a key of ``TASK_REGISTRY``.
            The message lists the names held in ``TASK_NAMES``.
    """
    scenario_cls = TASK_REGISTRY.get(task_name)
    if scenario_cls is not None:
        return scenario_cls()
    # Lookup missed: report the offending name together with the valid ones.
    raise ValueError(f"Unknown task: {task_name}. Available: {TASK_NAMES}")
def grade_trajectory(task_name: str, trajectory: List[StepRecord]) -> float:
    """Score a recorded trajectory against the named task.

    This is the standalone evaluation entry point: it instantiates a new
    scenario for every call and hands the trajectory to that scenario's
    ``grade`` method, so nothing is carried over between calls here.

    Args:
        task_name: Name of the task; resolved through ``get_scenario``.
        trajectory: The step records to be graded.

    Returns:
        The scenario's score converted to ``float``.

    Raises:
        ValueError: Propagated from ``get_scenario`` when ``task_name`` is
            not registered.
    """
    # No clamping is applied here: per the original note, scenario.grade()
    # performs its own [0.01, 0.99] adjustment.
    return float(get_scenario(task_name).grade(trajectory))