"""Integration tests for benchmark assembly and score ranges.""" from __future__ import annotations import asyncio from src.models import Action, DispatchAction from src.benchmark import list_tasks, run_all, run_task from src.openenv_environment import OpenEnvEnvironment def test_list_tasks_has_four() -> None: tasks = list_tasks() assert len(tasks) == 4 ids = {t["task_id"] for t in tasks} assert ids == {"single_incident", "multi_incident", "mass_casualty", "shift_surge"} def test_run_task_score_in_range() -> None: result = run_task("single_incident", seed=42) assert 0.0 <= result["score"] <= 1.0 assert result["task_id"] == "single_incident" def test_benchmark_and_openenv_use_same_episode_grader(monkeypatch) -> None: from src.tasks.single_incident import SingleIncidentGrader expected_score = 0.777 monkeypatch.setattr(SingleIncidentGrader, "grade", lambda self, state, rewards: expected_score) # Benchmark path. result = run_task("single_incident", seed=42) assert abs(result["score"] - expected_score) < 1e-9 # OpenEnv path. env = OpenEnvEnvironment(task_id="single_incident", seed=42) asyncio.run(env.reset()) obs, reward, done = asyncio.run( env.step( Action( action_type=DispatchAction.DISPATCH, unit_id="MED-1", incident_id="INC-001", ) ) ) assert isinstance(reward, float) assert isinstance(done, bool) assert abs(float(obs.score) - expected_score) < 1e-9 env.close() def test_run_all_scores_in_range() -> None: scores = run_all() assert set(scores.keys()) == {"single_incident", "multi_incident", "mass_casualty", "shift_surge"} assert all(0.0 <= s <= 1.0 for s in scores.values())