"""Tests for named scenario creation and the scenario benchmark runner."""

from __future__ import annotations

from trenches_env.benchmark_runner import ScenarioBenchmarkRunner
from trenches_env.env import FogOfWarDiplomacyEnv
from trenches_env.models import BenchmarkRunRequest
from trenches_env.source_ingestion import SourceHarvester


def build_offline_env() -> FogOfWarDiplomacyEnv:
    """Return an environment whose harvester is built with ``auto_start=False``.

    NOTE(review): presumably this keeps the harvester from starting any
    background work during tests -- confirm against ``SourceHarvester``.
    """
    harvester = SourceHarvester(auto_start=False)
    return FogOfWarDiplomacyEnv(source_harvester=harvester)


def test_named_scenario_applies_distinct_world_and_episode_metadata() -> None:
    """The ``shipping_crisis`` scenario stamps its metadata and world values."""
    session = build_offline_env().create_session(
        seed=11,
        scenario_id="shipping_crisis",
    )

    # Episode-level metadata must identify the requested scenario.
    episode = session.episode
    assert episode.scenario_id == "shipping_crisis"
    assert episode.scenario_name == "Shipping Crisis"
    assert "shipping" in episode.scenario_tags

    # World-level values must sit on the expected side of each threshold.
    world = session.world
    assert world.tension_level >= 64.0
    assert world.oil_pressure >= 78.0
    assert world.actor_state["gulf"]["shipping_continuity"] < 78.0

    # At least one active event has to mention tankers or shipping.
    summaries = [event.summary.lower() for event in world.active_events]
    assert any("tankers" in text or "shipping" in text for text in summaries)


def test_scenario_creation_is_deterministic_for_fixed_seed() -> None:
    """Two sessions created with the same seed and scenario are identical."""
    env = build_offline_env()
    session_kwargs = {"seed": 7, "scenario_id": "border_flareup"}
    baseline = env.create_session(**session_kwargs)
    repeat = env.create_session(**session_kwargs)

    assert baseline.world.model_dump() == repeat.world.model_dump()

    baseline_tension = baseline.observations["israel"].perceived_tension
    repeat_tension = repeat.observations["israel"].perceived_tension
    assert baseline_tension == repeat_tension


def test_benchmark_runner_returns_scorecards_for_each_agent() -> None:
    """A two-scenario benchmark run yields scorecards for all six agents."""
    request = BenchmarkRunRequest(
        scenario_ids=["shipping_crisis", "coalition_fracture"],
        seed=13,
        steps_per_scenario=4,
    )
    benchmark = ScenarioBenchmarkRunner(env_factory=build_offline_env)
    result = benchmark.run(request)

    assert result.scenario_count == 2

    expected_agents = {"us", "israel", "iran", "hezbollah", "gulf", "oversight"}
    assert set(result.aggregate_mean_total_rewards) == expected_agents

    # Results are indexed in the order the scenario ids were requested above.
    shipping_cards = result.results[0].scorecards
    assert shipping_cards["gulf"].final_state["shipping_continuity"] < 78.0

    fracture_cards = result.results[1].scorecards
    assert fracture_cards["israel"].final_state["us_resupply_confidence"] < 75.0
    assert fracture_cards["us"].dominant_action is not None


def test_benchmark_runner_is_deterministic_for_fixed_seed() -> None:
    """Running the same benchmark request twice dumps to equal payloads."""
    benchmark = ScenarioBenchmarkRunner(env_factory=build_offline_env)
    request = BenchmarkRunRequest(
        scenario_ids=["corridor_interdiction"],
        seed=5,
        steps_per_scenario=4,
    )

    baseline = benchmark.run(request)
    repeat = benchmark.run(request)

    assert baseline.model_dump() == repeat.model_dump()