garvitsachdeva committed on
Commit
fe39ff6
·
1 Parent(s): 4dc3d0a

Unify benchmark scoring with OpenEnv episode reward

Browse files
src/benchmark.py CHANGED
@@ -8,10 +8,7 @@ from typing import Any
8
 
9
  from src.models import Action, DispatchAction
10
  from src.openenv_environment import OpenEnvEnvironment
11
- from src.tasks.mass_casualty import MassCasualtyGrader
12
- from src.tasks.multi_incident import MultiIncidentGrader
13
- from src.tasks.shift_surge import ShiftSurgeGrader
14
- from src.tasks.single_incident import SingleIncidentGrader
15
  from src.tasks.registry import TaskRegistry
16
 
17
 
@@ -23,27 +20,14 @@ def list_tasks() -> list[dict[str, Any]]:
23
  ]
24
 
25
 
26
- def _get_grader(task_id: str) -> Any:
27
- if task_id == "single_incident":
28
- return SingleIncidentGrader()
29
- elif task_id == "multi_incident":
30
- return MultiIncidentGrader()
31
- elif task_id == "mass_casualty":
32
- return MassCasualtyGrader()
33
- elif task_id == "shift_surge":
34
- return ShiftSurgeGrader()
35
- raise KeyError(f"Unknown task: {task_id}")
36
-
37
-
38
  async def _run_episode_async(task_id: str, seed: int) -> tuple[float, list[float]]:
39
  env = OpenEnvEnvironment(task_id=task_id, seed=seed)
40
  rewards: list[float] = []
41
  final_state = None
42
 
43
  try:
44
- obs = await env.reset()
45
  final_state = env.state()
46
- rewards.append(obs.score)
47
 
48
  rng = random.Random(seed)
49
  for _ in range(1000):
@@ -73,7 +57,6 @@ async def _run_episode_async(task_id: str, seed: int) -> tuple[float, list[float
73
  finally:
74
  env.close()
75
 
76
- grader = _get_grader(task_id)
77
  if final_state is None:
78
  from src.models import State
79
 
@@ -87,7 +70,10 @@ async def _run_episode_async(task_id: str, seed: int) -> tuple[float, list[float
87
  metadata={},
88
  )
89
 
90
- return grader.grade(final_state, rewards), rewards
 
 
 
91
 
92
 
93
  def run_task(task_id: str, seed: int) -> dict[str, Any]:
 
8
 
9
  from src.models import Action, DispatchAction
10
  from src.openenv_environment import OpenEnvEnvironment
11
+ from src.rewards import TaskGrader
 
 
 
12
  from src.tasks.registry import TaskRegistry
13
 
14
 
 
20
  ]
21
 
22
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  async def _run_episode_async(task_id: str, seed: int) -> tuple[float, list[float]]:
24
  env = OpenEnvEnvironment(task_id=task_id, seed=seed)
25
  rewards: list[float] = []
26
  final_state = None
27
 
28
  try:
29
+ await env.reset()
30
  final_state = env.state()
 
31
 
32
  rng = random.Random(seed)
33
  for _ in range(1000):
 
57
  finally:
58
  env.close()
59
 
 
60
  if final_state is None:
61
  from src.models import State
62
 
 
70
  metadata={},
71
  )
72
 
73
+ # Score episodes the same way as the OpenEnv evaluation path:
74
+ # a normalized aggregate of per-step rewards.
75
+ final_score = TaskGrader().grade_episode(rewards, task_id=task_id)
76
+ return final_score, rewards
77
 
78
 
79
  def run_task(task_id: str, seed: int) -> dict[str, Any]:
tests/test_benchmark_integration.py CHANGED
@@ -16,6 +16,13 @@ def test_run_task_score_in_range() -> None:
16
  result = run_task("single_incident", seed=42)
17
  assert 0.0 <= result["score"] <= 1.0
18
  assert result["task_id"] == "single_incident"
 
 
 
 
 
 
 
19
 
20
 
21
  def test_run_all_scores_in_range() -> None:
 
16
  result = run_task("single_incident", seed=42)
17
  assert 0.0 <= result["score"] <= 1.0
18
  assert result["task_id"] == "single_incident"
19
+ # Benchmark scoring must match the OpenEnv evaluation path: mean step reward.
20
+ rewards = result["rewards"]
21
+ if rewards:
22
+ expected = sum(rewards) / len(rewards)
23
+ else:
24
+ expected = 0.0
25
+ assert abs(result["score"] - expected) < 1e-9
26
 
27
 
28
  def test_run_all_scores_in_range() -> None: