Spaces:
Sleeping
Sleeping
Commit ·
fe39ff6
1
Parent(s): 4dc3d0a
Unify benchmark scoring with OpenEnv episode reward
Browse files
- src/benchmark.py +6 -20
- tests/test_benchmark_integration.py +7 -0
src/benchmark.py
CHANGED
|
@@ -8,10 +8,7 @@ from typing import Any
|
|
| 8 |
|
| 9 |
from src.models import Action, DispatchAction
|
| 10 |
from src.openenv_environment import OpenEnvEnvironment
|
| 11 |
-
from src.tasks.mass_casualty import MassCasualtyGrader
|
| 12 |
-
from src.tasks.multi_incident import MultiIncidentGrader
|
| 13 |
-
from src.tasks.shift_surge import ShiftSurgeGrader
|
| 14 |
-
from src.tasks.single_incident import SingleIncidentGrader
|
| 15 |
from src.tasks.registry import TaskRegistry
|
| 16 |
|
| 17 |
|
|
@@ -23,27 +20,14 @@ def list_tasks() -> list[dict[str, Any]]:
|
|
| 23 |
]
|
| 24 |
|
| 25 |
|
| 26 |
-
def _get_grader(task_id: str) -> Any:
|
| 27 |
-
if task_id == "single_incident":
|
| 28 |
-
return SingleIncidentGrader()
|
| 29 |
-
elif task_id == "multi_incident":
|
| 30 |
-
return MultiIncidentGrader()
|
| 31 |
-
elif task_id == "mass_casualty":
|
| 32 |
-
return MassCasualtyGrader()
|
| 33 |
-
elif task_id == "shift_surge":
|
| 34 |
-
return ShiftSurgeGrader()
|
| 35 |
-
raise KeyError(f"Unknown task: {task_id}")
|
| 36 |
-
|
| 37 |
-
|
| 38 |
async def _run_episode_async(task_id: str, seed: int) -> tuple[float, list[float]]:
|
| 39 |
env = OpenEnvEnvironment(task_id=task_id, seed=seed)
|
| 40 |
rewards: list[float] = []
|
| 41 |
final_state = None
|
| 42 |
|
| 43 |
try:
|
| 44 |
-
|
| 45 |
final_state = env.state()
|
| 46 |
-
rewards.append(obs.score)
|
| 47 |
|
| 48 |
rng = random.Random(seed)
|
| 49 |
for _ in range(1000):
|
|
@@ -73,7 +57,6 @@ async def _run_episode_async(task_id: str, seed: int) -> tuple[float, list[float
|
|
| 73 |
finally:
|
| 74 |
env.close()
|
| 75 |
|
| 76 |
-
grader = _get_grader(task_id)
|
| 77 |
if final_state is None:
|
| 78 |
from src.models import State
|
| 79 |
|
|
@@ -87,7 +70,10 @@ async def _run_episode_async(task_id: str, seed: int) -> tuple[float, list[float
|
|
| 87 |
metadata={},
|
| 88 |
)
|
| 89 |
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
|
| 93 |
def run_task(task_id: str, seed: int) -> dict[str, Any]:
|
|
|
|
| 8 |
|
| 9 |
from src.models import Action, DispatchAction
|
| 10 |
from src.openenv_environment import OpenEnvEnvironment
|
| 11 |
+
from src.rewards import TaskGrader
|
|
|
|
|
|
|
|
|
|
| 12 |
from src.tasks.registry import TaskRegistry
|
| 13 |
|
| 14 |
|
|
|
|
| 20 |
]
|
| 21 |
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
async def _run_episode_async(task_id: str, seed: int) -> tuple[float, list[float]]:
|
| 24 |
env = OpenEnvEnvironment(task_id=task_id, seed=seed)
|
| 25 |
rewards: list[float] = []
|
| 26 |
final_state = None
|
| 27 |
|
| 28 |
try:
|
| 29 |
+
await env.reset()
|
| 30 |
final_state = env.state()
|
|
|
|
| 31 |
|
| 32 |
rng = random.Random(seed)
|
| 33 |
for _ in range(1000):
|
|
|
|
| 57 |
finally:
|
| 58 |
env.close()
|
| 59 |
|
|
|
|
| 60 |
if final_state is None:
|
| 61 |
from src.models import State
|
| 62 |
|
|
|
|
| 70 |
metadata={},
|
| 71 |
)
|
| 72 |
|
| 73 |
+
# Score episodes the same way as the OpenEnv evaluation path:
|
| 74 |
+
# a normalized aggregate of per-step rewards.
|
| 75 |
+
final_score = TaskGrader().grade_episode(rewards, task_id=task_id)
|
| 76 |
+
return final_score, rewards
|
| 77 |
|
| 78 |
|
| 79 |
def run_task(task_id: str, seed: int) -> dict[str, Any]:
|
tests/test_benchmark_integration.py
CHANGED
|
@@ -16,6 +16,13 @@ def test_run_task_score_in_range() -> None:
|
|
| 16 |
result = run_task("single_incident", seed=42)
|
| 17 |
assert 0.0 <= result["score"] <= 1.0
|
| 18 |
assert result["task_id"] == "single_incident"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
def test_run_all_scores_in_range() -> None:
|
|
|
|
| 16 |
result = run_task("single_incident", seed=42)
|
| 17 |
assert 0.0 <= result["score"] <= 1.0
|
| 18 |
assert result["task_id"] == "single_incident"
|
| 19 |
+
# Benchmark scoring must match the OpenEnv evaluation path: mean step reward.
|
| 20 |
+
rewards = result["rewards"]
|
| 21 |
+
if rewards:
|
| 22 |
+
expected = sum(rewards) / len(rewards)
|
| 23 |
+
else:
|
| 24 |
+
expected = 0.0
|
| 25 |
+
assert abs(result["score"] - expected) < 1e-9
|
| 26 |
|
| 27 |
|
| 28 |
def test_run_all_scores_in_range() -> None:
|