# Spaces: Melikshah / dc_ops_env (Running)
# File size: 21,381 Bytes
# aedaf74

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Integration tests: full episode playback, config loading, cross-facility.

Validates:
  - Known-good action sequences resolve each scenario
  - Reward signals are well-behaved across full episodes
  - YAML config loading produces valid, runnable environments
  - Different facility sizes work correctly
  - Episode metrics (PUE, temps, rewards) are in expected ranges
"""

from __future__ import annotations

import time
from pathlib import Path

import pytest

from dc_ops_env.config import (
    BUILTIN_CONFIGS,
    DatacenterConfig,
    load_datacenter_config,
    make_default_datacenter_config,
)
from dc_ops_env.models import DcOpsAction, DcOpsObservation
from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
from dc_ops_env.scenarios.registry import registered_scenario_ids


# ===========================================================================
# Config Loading Tests
# ===========================================================================
class TestConfigLoading:
    """Validate YAML config loading and the built-in facility configs.

    Covers loading by built-in name and by explicit path, the missing-file
    error, agreement between the YAML default and
    ``make_default_datacenter_config()``, and the expected dimensions of the
    "small" and "large" facilities.
    """

    def test_builtin_configs_exist(self) -> None:
        """Every entry in ``BUILTIN_CONFIGS`` should point at an existing file."""
        # An empty registry would skip the loop below and pass vacuously.
        assert BUILTIN_CONFIGS, "No built-in configs registered"
        for name, path in BUILTIN_CONFIGS.items():
            assert path.exists(), f"Built-in config '{name}' not found at {path}"

    @pytest.mark.parametrize("config_name", ["default", "small", "large"])
    def test_load_builtin(self, config_name: str) -> None:
        """Each built-in config should load into a populated ``DatacenterConfig``."""
        cfg = load_datacenter_config(config_name)
        assert isinstance(cfg, DatacenterConfig)
        assert len(cfg.zones) > 0
        # Every zone must carry at least one rack and one CRAC unit.
        for zone in cfg.zones:
            assert len(zone.racks) > 0
            assert len(zone.crac_units) > 0

    def test_load_by_path(self) -> None:
        """Loading by explicit path should yield the default facility."""
        path = BUILTIN_CONFIGS["default"]
        cfg = load_datacenter_config(path)
        assert cfg.name == "DC-OPS Default Facility"

    def test_load_nonexistent_raises(self) -> None:
        """Loading a missing file should raise ``FileNotFoundError``."""
        with pytest.raises(FileNotFoundError):
            load_datacenter_config("/nonexistent/path.yaml")

    def test_default_yaml_matches_programmatic(self) -> None:
        """YAML default config should match ``make_default_datacenter_config()``.

        Compares the name, zone count, outside temperature, and the total
        number of racks and CRAC units across all zones.
        """
        yaml_cfg = load_datacenter_config("default")
        prog_cfg = make_default_datacenter_config()

        assert yaml_cfg.name == prog_cfg.name
        assert len(yaml_cfg.zones) == len(prog_cfg.zones)
        assert yaml_cfg.outside_temp_c == prog_cfg.outside_temp_c

        # Same number of racks and CRACs
        yaml_racks = sum(len(z.racks) for z in yaml_cfg.zones)
        prog_racks = sum(len(z.racks) for z in prog_cfg.zones)
        assert yaml_racks == prog_racks

        yaml_cracs = sum(len(z.crac_units) for z in yaml_cfg.zones)
        prog_cracs = sum(len(z.crac_units) for z in prog_cfg.zones)
        assert yaml_cracs == prog_cracs

    def test_small_facility_dimensions(self) -> None:
        """Small facility: 1 zone, 10 racks, 80 kW total IT load, 1 UPS."""
        cfg = load_datacenter_config("small")
        assert len(cfg.zones) == 1
        total_racks = sum(len(z.racks) for z in cfg.zones)
        assert total_racks == 10
        total_it = sum(r.it_load_kw for z in cfg.zones for r in z.racks)
        assert total_it == pytest.approx(80.0)
        assert len(cfg.power.ups_units) == 1

    def test_large_facility_dimensions(self) -> None:
        """Large facility: 4 zones, 60 racks, 600 kW total IT load, 4 UPS units."""
        cfg = load_datacenter_config("large")
        assert len(cfg.zones) == 4
        total_racks = sum(len(z.racks) for z in cfg.zones)
        assert total_racks == 60
        total_it = sum(r.it_load_kw for z in cfg.zones for r in z.racks)
        assert total_it == pytest.approx(600.0)
        assert len(cfg.power.ups_units) == 4

    def test_large_facility_has_h1_zone(self) -> None:
        """Large facility should include exactly one H1 zone with 20 kW racks."""
        cfg = load_datacenter_config("large")
        h1_zones = [z for z in cfg.zones if z.ashrae_class == "H1"]
        assert len(h1_zones) == 1
        # H1 zone should have higher per-rack load. Compare with a tolerance,
        # like the other float checks in this class, rather than exact ``==``.
        for rack in h1_zones[0].racks:
            assert rack.it_load_kw == pytest.approx(20.0)


# ===========================================================================
# Config-to-Environment Tests
# ===========================================================================
class TestConfigToEnvironment:
    """Check that each loaded facility config can drive a ``DcOpsEnvironment``."""

    @pytest.mark.parametrize("config_name", ["default", "small", "large"])
    def test_env_runs_with_config(self, config_name: str) -> None:
        """Reset with the config, then take one ``check_status`` step."""
        facility = load_datacenter_config(config_name)
        simulator = DcOpsEnvironment()

        # A fresh episode must hand back an observation that is not finished.
        first_obs = simulator.reset(config=facility)
        assert isinstance(first_obs, DcOpsObservation)
        assert first_obs.done is False

        # Stepping must also produce an observation of the expected type.
        status_action = DcOpsAction(command="check_status")
        next_obs = simulator.step(status_action)
        assert isinstance(next_obs, DcOpsObservation)

    def test_small_facility_pue(self) -> None:
        """PUE reported on reset for the small facility lies in (1.1, 2.5)."""
        simulator_config = load_datacenter_config("small")
        simulator = DcOpsEnvironment()
        reported_pue = simulator.reset(config=simulator_config).metadata["pue"]
        assert 1.1 < reported_pue < 2.5, f"PUE {reported_pue} out of realistic range"

    def test_large_facility_total_load(self) -> None:
        """Total IT load reported on reset for the large facility is ~600 kW (1%)."""
        simulator_config = load_datacenter_config("large")
        simulator = DcOpsEnvironment()
        reported_load = simulator.reset(config=simulator_config).metadata[
            "total_it_load_kw"
        ]
        assert reported_load == pytest.approx(600.0, rel=0.01)


# ===========================================================================
# Full Episode Playback: Thermal Scenarios
# ===========================================================================
class TestEpisodePlaybackThermal:
    """Full episode playback with known-good action sequences for thermal scenarios."""

    @staticmethod
    def _wait(env: DcOpsEnvironment, max_steps: int) -> DcOpsObservation:
        """Issue up to ``max_steps`` ``wait`` commands and return the last observation.

        Stops early as soon as an observation reports ``done``. ``max_steps``
        must be at least 1 so that there is an observation to return.
        """
        for _ in range(max_steps):
            obs = env.step(DcOpsAction(command="wait"))
            if obs.done:
                break
        return obs

    def test_a1_optimal_episode(self) -> None:
        """A1 (Cooling Setpoint Optimization): raise setpoints to reduce PUE.

        Optimal sequence: check_status → raise each CRAC setpoint → wait for convergence.
        The final PUE must be lower than the PUE reported on reset.
        """
        env = DcOpsEnvironment()
        obs = env.reset(scenario="A1")
        pue_initial = obs.metadata["pue"]

        # 1. Check status first (procedure bonus)
        obs = env.step(DcOpsAction(command="check_status"))
        assert not obs.done

        # 2. Raise setpoints on all 4 CRACs from 15°C → 24°C (aggressive)
        for crac_id in ("CRAC-1", "CRAC-2", "CRAC-3", "CRAC-4"):
            obs = env.step(DcOpsAction(command=f"adjust_setpoint {crac_id} 24"))

        # 3. Wait for temps to converge
        obs = self._wait(env, 5)

        pue_final = obs.metadata["pue"]
        # PUE should have improved (lower is better)
        assert pue_final < pue_initial, (
            f"PUE should improve: {pue_initial:.2f} → {pue_final:.2f}"
        )

    def test_a2_optimal_episode(self) -> None:
        """A2 (Thermal Event Response): diagnose CRAC-3, compensate with remaining units.

        Optimal: diagnose → increase fan speeds on survivors → adjust setpoints.
        """
        env = DcOpsEnvironment()
        obs = env.reset(scenario="A2")

        # 1. Diagnose the failed CRAC. A single case-insensitive check covers
        #    both "COMPRESSOR" and "compressor" spellings in the report.
        obs = env.step(DcOpsAction(command="diagnose CRAC-3"))
        assert "compressor" in obs.action_result.lower(), (
            f"Diagnosis should mention the compressor: {obs.action_result!r}"
        )

        survivors = ("CRAC-1", "CRAC-2", "CRAC-4")

        # 2. Increase fan speed on remaining CRACs
        for crac_id in survivors:
            obs = env.step(DcOpsAction(command=f"set_fan_speed {crac_id} 100"))

        # 3. Lower setpoints slightly on surviving units to compensate
        for crac_id in survivors:
            obs = env.step(DcOpsAction(command=f"adjust_setpoint {crac_id} 16"))

        # 4. Wait for stabilization
        obs = self._wait(env, 8)

        # Should resolve or be close — temps within recommended for 2+ steps
        # Even if not fully resolved, reward should be reasonable
        assert obs.metadata["cumulative_reward"] > -5.0

    def test_a4_episode_with_load_shedding(self) -> None:
        """A4 (CRAC Failure Cascade): diagnose both, compensate, shed load.

        This is the hardest thermal scenario — two CRACs down.
        """
        env = DcOpsEnvironment()
        obs = env.reset(scenario="A4")

        # 1. Diagnose both failed units
        obs = env.step(DcOpsAction(command="diagnose CRAC-1"))
        obs = env.step(DcOpsAction(command="diagnose CRAC-3"))

        # 2. Max out surviving CRACs
        obs = env.step(DcOpsAction(command="set_fan_speed CRAC-2 100"))
        obs = env.step(DcOpsAction(command="set_fan_speed CRAC-4 100"))
        obs = env.step(DcOpsAction(command="adjust_setpoint CRAC-2 15"))
        obs = env.step(DcOpsAction(command="adjust_setpoint CRAC-4 15"))

        # 3. Shed load on hottest racks
        for rack_id in ("A-01", "A-02", "B-01", "B-02"):
            obs = env.step(DcOpsAction(command=f"set_rack_load {rack_id} 4"))

        # 4. Wait and monitor
        obs = self._wait(env, 10)

        # Hard scenario — may not fully resolve, but should make progress
        assert obs.metadata["cumulative_reward"] > -10.0


# ===========================================================================
# Full Episode Playback: Power Scenarios
# ===========================================================================
class TestEpisodePlaybackPower:
    """Replay scripted action sequences against the power scenarios (B1, B3, B4)."""

    def test_b1_optimal_episode(self) -> None:
        """B1 (UPS Alarm Response): diagnose UPS-1, then acknowledge the alarm.

        The episode must still be open after the diagnosis and must be done
        after the acknowledgement, with a final-step reward above 0.5.
        """
        sim = DcOpsEnvironment()
        sim.reset(scenario="B1")

        # Step 1: diagnosing the UPS alone must not finish the episode.
        after_diagnosis = sim.step(DcOpsAction(command="diagnose UPS-1"))
        assert not after_diagnosis.done

        # Step 2: acknowledging the alarm is expected to resolve the scenario.
        final_obs = sim.step(DcOpsAction(command="acknowledge_alarm"))
        assert final_obs.done, "B1 should resolve after diagnose + acknowledge"

        # Speed bonus: (10 - 2) / 10 = 0.8
        assert final_obs.reward > 0.5, "Should have significant speed bonus"

    def test_b3_optimal_episode(self) -> None:
        """B3 (Generator Test Protocol): walk through the scripted test sequence.

        check_status → start_generator → wait → diagnose GEN-1 →
        stop_generator → wait → acknowledge_alarm.
        """
        sim = DcOpsEnvironment()
        sim.reset(scenario="B3")

        # (command, episode must still be open afterwards?)
        # The two waits cover generator start-up (30s game time per step,
        # gen startup ~17s) and cooldown; they carry no ``done`` check.
        protocol = (
            ("check_status", True),
            ("start_generator", True),
            ("wait", False),
            ("diagnose GEN-1", True),
            ("stop_generator", True),
            ("wait", False),
        )
        for command, must_stay_open in protocol:
            step_obs = sim.step(DcOpsAction(command=command))
            if must_stay_open:
                assert not step_obs.done

        closing_obs = sim.step(DcOpsAction(command="acknowledge_alarm"))
        assert closing_obs.done, "B3 should resolve after full protocol"

    def test_b4_episode_with_load_shedding(self) -> None:
        """B4 (Power Failure Cascade): diagnose, shed load, then wait.

        Generator starts automatically on utility loss. The scripted agent
        diagnoses both UPS units, sheds load on two racks, checks the
        generator, and then waits for up to 14 steps.
        """
        sim = DcOpsEnvironment()
        sim.reset(scenario="B4")

        opening_moves = (
            # Understand the situation.
            "diagnose UPS-1",
            "diagnose UPS-2",
            # Shed non-critical load to extend battery.
            "set_rack_load A-01 4",
            "set_rack_load B-01 4",
            # Check generator status.
            "diagnose GEN-1",
        )
        for command in opening_moves:
            sim.step(DcOpsAction(command=command))

        # Wait for the generator to come online and stabilize.
        remaining_waits = 14
        while remaining_waits > 0:
            last_obs = sim.step(DcOpsAction(command="wait"))
            remaining_waits -= 1
            if last_obs.done:
                break

        # B4 is hard — may or may not resolve, but should make progress
        assert last_obs.metadata["cumulative_reward"] > -10.0


# ===========================================================================
# Reward Signal Quality
# ===========================================================================
class TestRewardSignalQuality:
    """Sanity checks on the reward signal over scripted episodes."""

    def test_rewards_bounded_per_step(self) -> None:
        """Each step reward in an all-``wait`` A2 episode stays within [-2, 2]."""
        sim = DcOpsEnvironment()
        sim.reset(scenario="A2")

        steps_left = 15
        while steps_left > 0:
            steps_left -= 1
            step_obs = sim.step(DcOpsAction(command="wait"))
            # Base reward is [-1, 1], speed bonus can add up to 1.0
            assert -2.0 <= step_obs.reward <= 2.0, (
                f"Reward {step_obs.reward} out of bounds"
            )
            if step_obs.done:
                break

    def test_good_actions_beat_bad_actions(self) -> None:
        """On B1, diagnose + acknowledge must out-score an all-``wait`` episode."""
        sim = DcOpsEnvironment()

        # Good episode: diagnose then fix
        sim.reset(scenario="B1")
        sim.step(DcOpsAction(command="diagnose UPS-1"))
        good_final = sim.step(DcOpsAction(command="acknowledge_alarm"))
        good_total = good_final.metadata["cumulative_reward"]

        # Bad episode: just wait (same environment instance, fresh reset)
        sim.reset(scenario="B1")
        for _ in range(10):
            bad_final = sim.step(DcOpsAction(command="wait"))
            if bad_final.done:
                break
        bad_total = bad_final.metadata["cumulative_reward"]

        assert good_total > bad_total, (
            f"Good ({good_total:.2f}) should beat bad ({bad_total:.2f})"
        )

    def test_procedure_bonus_visible(self) -> None:
        """Starting A1 with ``check_status`` should out-score starting with ``wait``.

        Both episodes are three steps long and share the last two actions
        (``adjust_setpoint CRAC-1 22`` then ``wait``); only the first action
        differs.
        """
        sim = DcOpsEnvironment()
        totals = {}

        for label, first_command in (
            ("with", "check_status"),  # follows procedure
            ("without", "wait"),  # skips the status check
        ):
            sim.reset(scenario="A1")
            sim.step(DcOpsAction(command=first_command))
            sim.step(DcOpsAction(command="adjust_setpoint CRAC-1 22"))
            last_obs = sim.step(DcOpsAction(command="wait"))
            totals[label] = last_obs.metadata["cumulative_reward"]

        score_with = totals["with"]
        score_without = totals["without"]
        assert score_with > score_without, (
            f"Procedure bonus not visible: with={score_with:.3f} vs without={score_without:.3f}"
        )

    @pytest.mark.parametrize("scenario_id", registered_scenario_ids())
    def test_no_nan_rewards(self, scenario_id: str) -> None:
        """Rewards from up to five ``check_status`` steps are neither NaN nor infinite."""
        import math

        sim = DcOpsEnvironment()
        sim.reset(scenario=scenario_id)

        for _ in range(5):
            step_obs = sim.step(DcOpsAction(command="check_status"))
            reward = step_obs.reward
            assert not math.isnan(reward), f"NaN reward in {scenario_id}"
            assert not math.isinf(reward), f"Inf reward in {scenario_id}"
            if step_obs.done:
                break


# ===========================================================================
# Cross-Facility Scenario Tests
# ===========================================================================
class TestCrossFacility:
    """Exercise the environment against non-default facility configs."""

    def test_scenario_with_small_facility(self) -> None:
        """Basic commands should work on the small facility.

        The environment is reset with the small config and a step budget of
        5, without naming a scenario.
        """
        small_cfg = load_datacenter_config("small")
        sim = DcOpsEnvironment()
        start_obs = sim.reset(config=small_cfg, step_budget=5)
        assert start_obs.done is False

        # The status report is expected to mention "status" (any casing).
        status_text = sim.step(DcOpsAction(command="check_status")).action_result
        assert "status" in status_text.lower()

        # Diagnosing CRAC-1 should return a diagnostic report.
        diagnosis_text = sim.step(DcOpsAction(command="diagnose CRAC-1")).action_result
        assert "Diagnostic Report" in diagnosis_text

    def test_large_facility_steady_state(self) -> None:
        """Large facility metrics on reset: PUE in (1.1, 3.0), positive loads."""
        large_cfg = load_datacenter_config("large")
        sim = DcOpsEnvironment()
        metrics = sim.reset(config=large_cfg, step_budget=10).metadata

        facility_pue = metrics["pue"]
        assert 1.1 < facility_pue < 3.0, (
            f"Large facility PUE {facility_pue} unrealistic"
        )

        cooling_kw = metrics["total_cooling_power_kw"]
        it_kw = metrics["total_it_load_kw"]
        assert cooling_kw > 0
        assert it_kw > 0


# ===========================================================================
# Episode Metrics & Physics Consistency
# ===========================================================================
class TestEpisodeMetrics:
    """Physics sanity checks on the metrics reported during an episode."""

    def test_pue_always_above_one(self) -> None:
        """PUE must never drop below 1.0 during an all-``wait`` A1 episode."""
        sim = DcOpsEnvironment()
        sim.reset(scenario="A1")

        steps_left = 10
        while steps_left > 0:
            steps_left -= 1
            step_obs = sim.step(DcOpsAction(command="wait"))
            assert step_obs.metadata["pue"] >= 1.0
            if step_obs.done:
                break

    def test_higher_load_raises_temperature(self) -> None:
        """Raising load on three zone-A racks should raise the cold-aisle temperature."""
        sim = DcOpsEnvironment()
        baseline_obs = sim.reset()
        temp_before = baseline_obs.metadata["zones"]["zone_a"]["cold_aisle_temp_c"]

        # Significantly increase multiple racks' load
        for rack_id in ("A-01", "A-02", "A-03"):
            sim.step(DcOpsAction(command=f"set_rack_load {rack_id} 15"))

        # Give the thermal model seven steps to respond; no early exit here.
        for _ in range(7):
            latest_obs = sim.step(DcOpsAction(command="wait"))

        temp_after = latest_obs.metadata["zones"]["zone_a"]["cold_aisle_temp_c"]
        assert temp_after > temp_before, (
            f"Temp should rise with more load: {temp_before:.1f} → {temp_after:.1f}"
        )

    def test_sim_time_monotonically_increases(self) -> None:
        """``sim_time_s`` must strictly increase on each of five ``wait`` steps."""
        sim = DcOpsEnvironment()
        last_seen = sim.reset().metadata["sim_time_s"]

        for _ in range(5):
            current = sim.step(DcOpsAction(command="wait")).metadata["sim_time_s"]
            assert current > last_seen
            last_seen = current


# ===========================================================================
# Performance Tests
# ===========================================================================
class TestIntegrationPerformance:
    """Validate wall-clock performance across facility sizes and scenarios."""

    @pytest.mark.parametrize("config_name", ["default", "small", "large"])
    def test_episode_completes_fast(self, config_name: str) -> None:
        """A reset plus ten ``wait`` steps should take under 10 seconds.

        Config loading and environment construction happen before the timer
        starts and are not part of the measurement.
        """
        cfg = load_datacenter_config(config_name)
        env = DcOpsEnvironment()

        start = time.perf_counter()
        env.reset(config=cfg, step_budget=10)
        for _ in range(10):
            env.step(DcOpsAction(command="wait"))
        elapsed = time.perf_counter() - start

        assert elapsed < 10.0, (
            f"{config_name} facility 10-step episode took {elapsed:.2f}s, should be <10s"
        )

    def test_all_scenarios_full_episode_under_10s(self) -> None:
        """Running every registered scenario with ``wait`` steps should be fast.

        Each scenario is stepped up to 20 times or until it reports ``done``.
        """
        # Resolve the scenario list once so the loop and the failure message
        # are guaranteed to describe the same set.
        scenario_ids = list(registered_scenario_ids())
        env = DcOpsEnvironment()
        total_start = time.perf_counter()

        for sid in scenario_ids:
            env.reset(scenario=sid)
            for _ in range(20):  # Max budget across scenarios
                obs = env.step(DcOpsAction(command="wait"))
                if obs.done:
                    break

        total_elapsed = time.perf_counter() - total_start
        # NOTE(review): the test name says "under_10s" but the enforced limit
        # is 15s — confirm which one is intended.
        assert total_elapsed < 15.0, (
            f"All {len(scenario_ids)} scenarios took {total_elapsed:.2f}s, should be <15s"
        )