Spaces:

Melikshah
/

dc_ops_env

Running

App Files Files Community

Melikshah committed on Apr 7

Commit

aedaf74

verified ·

1 Parent(s): ddcbd24

Upload folder using huggingface_hub

Browse files

Files changed (45) hide show

Dockerfile +82 -0
README.md +494 -5
__init__.py +16 -0
actions/__init__.py +11 -0
actions/parser.py +356 -0
client.py +69 -0
config.py +549 -0
data/datacenter_configs/default.yaml +101 -0
data/datacenter_configs/large_facility.yaml +157 -0
data/datacenter_configs/small_facility.yaml +67 -0
models.py +83 -0
openenv.yaml +7 -0
openenv_dc_ops_env.egg-info/PKG-INFO +9 -0
openenv_dc_ops_env.egg-info/SOURCES.txt +20 -0
openenv_dc_ops_env.egg-info/dependency_links.txt +1 -0
openenv_dc_ops_env.egg-info/entry_points.txt +2 -0
openenv_dc_ops_env.egg-info/requires.txt +5 -0
openenv_dc_ops_env.egg-info/top_level.txt +1 -0
pyproject.toml +50 -0
rendering/__init__.py +11 -0
rendering/dashboard.py +262 -0
rewards/__init__.py +23 -0
rewards/reward_function.py +428 -0
scenarios/__init__.py +31 -0
scenarios/base.py +195 -0
scenarios/power_scenarios.py +496 -0
scenarios/registry.py +81 -0
scenarios/thermal_scenarios.py +443 -0
server/__init__.py +11 -0
server/app.py +101 -0
server/dc_ops_env_environment.py +532 -0
server/requirements.txt +6 -0
server/static/index.html +911 -0
simulation/__init__.py +49 -0
simulation/power.py +668 -0
simulation/thermal.py +515 -0
simulation/types.py +598 -0
tests/__init__.py +0 -0
tests/test_environment.py +439 -0
tests/test_integration.py +535 -0
tests/test_power.py +743 -0
tests/test_rewards.py +650 -0
tests/test_scenarios.py +415 -0
tests/test_thermal.py +499 -0
uv.lock +0 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,82 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# Multi-stage build using openenv-base
+# This Dockerfile is flexible and works for both:
+# - In-repo environments (with local OpenEnv sources)
+# - Standalone environments (with openenv from PyPI/Git)
+# The build script (openenv build) handles context detection and sets appropriate build args.
+ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+FROM ${BASE_IMAGE} AS builder
+WORKDIR /app
+# Ensure git is available (required for installing dependencies from VCS)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends git && \
+    rm -rf /var/lib/apt/lists/*
+# Build argument to control whether we're building standalone or in-repo
+ARG BUILD_MODE=in-repo
+ARG ENV_NAME=dc_ops_env
+# Copy environment code (always at root of build context)
+COPY . /app/env
+# For in-repo builds, openenv is already vendored in the build context
+# For standalone builds, openenv will be installed via pyproject.toml
+WORKDIR /app/env
+# Ensure uv is available (for local builds where base image lacks it)
+RUN if ! command -v uv >/dev/null 2>&1; then \
+        curl -LsSf https://astral.sh/uv/install.sh | sh && \
+        mv /root/.local/bin/uv /usr/local/bin/uv && \
+        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
+    fi
+# Install dependencies using uv sync
+# If uv.lock exists, use it; otherwise resolve on the fly
+# Pass 1: --no-install-project syncs only the dependencies, not the project itself.
+# The uv cache directory is mounted as a BuildKit cache so downloads persist across builds.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-install-project --no-editable; \
+    else \
+        uv sync --no-install-project --no-editable; \
+    fi
+# Pass 2: same sync without --no-install-project, so the project itself is now
+# installed into the virtual environment as well (non-editable).
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-editable; \
+    else \
+        uv sync --no-editable; \
+    fi
+# Final runtime stage
+FROM ${BASE_IMAGE}
+WORKDIR /app
+# Copy the virtual environment from builder
+COPY --from=builder /app/env/.venv /app/.venv
+# Copy the environment code — directory name MUST match package name
+# so that relative imports (from ..config, from ..simulation, etc.) resolve correctly
+COPY --from=builder /app/env /app/dc_ops_env
+# Set PATH to use the virtual environment
+ENV PATH="/app/.venv/bin:$PATH"
+# Set PYTHONPATH so dc_ops_env is discoverable as a proper package
+ENV PYTHONPATH="/app:$PYTHONPATH"
+# Health check
+HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+# Run the FastAPI server using the venv's Python to ensure correct dependencies
+# Fully-qualified module path ensures dc_ops_env is the top-level package
+ENV ENABLE_WEB_INTERFACE=true
+CMD ["/app/.venv/bin/python", "-m", "uvicorn", "dc_ops_env.server.app:app", "--host", "0.0.0.0", "--port", "8000"]

README.md CHANGED Viewed

@@ -1,10 +1,499 @@
 ---
-title: Dc Ops Env
-emoji: 🌍
-colorFrom: red
-colorTo: indigo
 sdk: docker
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: DC-Ops Environment Server
+emoji: 🖥️
+colorFrom: blue
+colorTo: green
 sdk: docker
 pinned: false
+app_port: 8000
+base_path: /web
+tags:
+  - openenv
+  - reinforcement-learning
+  - datacenter
+  - simulation
 ---
+# DC-Ops Environment
+A physics-based datacenter operations environment for training LLM agents, built on Meta's [OpenEnv](https://github.com/meta-pytorch/OpenEnv) framework.
+The agent reads a text-based NOC dashboard and issues natural-language operator commands — exactly as a human datacenter operator would.
+## Quick Start
+### Prerequisites
+- Python 3.10+
+- [uv](https://docs.astral.sh/uv/) (recommended) or pip
+- Docker (for containerized deployment)
+### Install & Run Locally
+```bash
+# Clone the repository
+git clone <repo-url>
+cd dc_ops_env
+# Install dependencies
+uv sync
+# Run the test suite (256 tests, <10s)
+uv run pytest tests/ -v
+# Start the server
+uv run server
+```
+The server starts at `http://localhost:8000` with:
+- **Web UI** → `http://localhost:8000/web`
+- **API docs** → `http://localhost:8000/docs`
+- **Health check** → `http://localhost:8000/health`
+- **WebSocket** → `ws://localhost:8000/ws`
+### Run with Docker
+```bash
+# Build the image
+docker build -t dc-ops:latest -f server/Dockerfile .
+# Run the container
+docker run -d -p 8000:8000 dc-ops:latest
+# Verify it's running
+curl http://localhost:8000/health
+```
+---
+## OpenEnv Integration
+DC-Ops is a fully compliant [OpenEnv](https://github.com/meta-pytorch/OpenEnv) environment. OpenEnv provides:
+- **MCP tool-based interactions** for LLM agents (WebSocket `/ws`)
+- **HTTP orchestration layer** for training pipelines (`/reset`, `/step`, `/state`)
+- **HuggingFace Spaces deployment** via `openenv push`
+- **TRL/GRPO integration** for RL training with `GRPOTrainer`
+### Action & Observation Models
+**DcOpsAction** — the agent's command:
+```python
+class DcOpsAction(Action):
+    command: str    # e.g., "diagnose CRAC-3", "adjust_setpoint CRAC-1 20"
+    reasoning: str  # Optional chain-of-thought
+```
+**DcOpsObservation** — what the agent sees:
+```python
+class DcOpsObservation(Observation):
+    dashboard: str           # Text-rendered monitoring dashboard
+    available_actions: list  # Valid commands the agent can issue
+    alert: str               # Current active alert message
+    scenario_type: str       # "thermal", "power", etc.
+    steps_remaining: int     # Steps left in episode budget
+    action_result: str       # Feedback from last action
+```
+### Available Commands
+| Command | Format | Description |
+|---------|--------|-------------|
+| `diagnose` | `diagnose <unit_id>` | Inspect a CRAC or UPS unit and report its status and faults |
+| `adjust_setpoint` | `adjust_setpoint <crac_id> <temp_c>` | Change CRAC supply air setpoint |
+| `set_fan_speed` | `set_fan_speed <crac_id> <pct>` | Set CRAC fan speed (0-100%) |
+| `set_rack_load` | `set_rack_load <rack_id> <kw>` | Adjust rack IT load (migrate workload) |
+| `start_crac` | `start_crac <crac_id>` | Start a standby CRAC unit |
+| `stop_crac` | `stop_crac <crac_id>` | Put a CRAC into standby |
+| `start_generator` | `start_generator` | Manually start the diesel generator |
+| `stop_generator` | `stop_generator` | Initiate generator cooldown |
+| `set_ups_mode` | `set_ups_mode <ups_id> <mode>` | Set UPS mode (eco/double_conversion/line_interactive/bypass) |
+| `refuel_generator` | `refuel_generator [liters]` | Refuel (default: full tank) |
+| `acknowledge_alarm` | `acknowledge_alarm` | Acknowledge current alert |
+| `check_status` | `check_status` | Request full status report |
+| `escalate` | `escalate` | Escalate to senior engineer |
+| `wait` | `wait` | Take no action this step |
+---
+## Using the Client
+### Programmatic Usage (Python)
+```python
+from dc_ops_env import DcOpsAction, DcOpsEnv
+# Connect to a running server
+async with DcOpsEnv(base_url="http://localhost:8000") as env:
+    # Reset with a specific scenario
+    result = await env.reset(scenario="A2")
+    print(result.observation.dashboard)
+    # Agent loop
+    while not result.done:
+        result = await env.step(
+            DcOpsAction(
+                command="diagnose CRAC-3",
+                reasoning="CRAC-3 shows compressor failure, need to investigate"
+            )
+        )
+        print(f"Reward: {result.reward}")
+        print(result.observation.dashboard)
+```
+### From Docker Image
+```python
+from dc_ops_env import DcOpsAction, DcOpsEnv
+# Start environment from Docker (auto-manages container lifecycle)
+env = DcOpsEnv.from_docker_image("dc-ops:latest")
+try:
+    result = env.reset(scenario="A2")
+    for _ in range(15):
+        result = env.step(DcOpsAction(command="check_status"))
+        if result.done:
+            break
+finally:
+    env.close()
+```
+### Concurrent Sessions
+The server supports multiple concurrent WebSocket sessions for parallel training:
+```python
+# In server/app.py — adjust max_concurrent_envs
+app = create_app(
+    DcOpsEnvironment,
+    DcOpsAction,
+    DcOpsObservation,
+    max_concurrent_envs=16,  # Scale up for parallel RL
+)
+```
+```python
+from concurrent.futures import ThreadPoolExecutor
+from dc_ops_env import DcOpsAction, DcOpsEnv
+def run_episode(scenario_id: str):
+    with DcOpsEnv(base_url="http://localhost:8000") as env:
+        result = env.reset(scenario=scenario_id)
+        total_reward = 0.0
+        while not result.done:
+            result = env.step(DcOpsAction(command="check_status"))
+            total_reward += result.reward
+        return scenario_id, total_reward
+# Run 8 episodes concurrently
+scenarios = ["A1", "A2", "A4", "B1", "B3", "B4", "A2", "B4"]
+with ThreadPoolExecutor(max_workers=8) as executor:
+    results = list(executor.map(run_episode, scenarios))
+```
+---
+## Scenarios
+6 operational scenarios across 3 difficulty levels:
+| ID | Scenario | Difficulty | Type | Fault |
+|----|----------|------------|------|-------|
+| A1 | Cooling Setpoint Optimization | Easy | Thermal | CRACs at 15°C (wasteful) |
+| A2 | Thermal Event Response | Medium | Thermal | CRAC-3 compressor failure |
+| A4 | CRAC Failure Cascade | Hard | Thermal | CRAC-1 compressor + CRAC-3 fan |
+| B1 | UPS Alarm Response | Medium | Power | UPS transferred to battery |
+| B3 | Generator Test Protocol | Easy | Power | None (routine test) |
+| B4 | Power Failure Cascade | Hard | Power | Utility loss + extended gen warmup |
+Reset with a specific scenario:
+```python
+result = env.reset(scenario="A2")           # By ID
+result = env.reset(random_scenario=True)    # Random
+result = env.reset(random_scenario=True, difficulty="hard")  # Random hard
+```
+---
+## Configuration
+### Built-in Facility Configs
+Three YAML configurations are included:
+| Config | Zones | Racks | IT Load | CRACs | Use Case |
+|--------|-------|-------|---------|-------|----------|
+| `default` | 2 | 20 | 160 kW | 4 × 70 kW | Standard facility |
+| `small` | 1 | 10 | 80 kW | 2 × 70 kW | Edge / branch office |
+| `large` | 4 | 60 | 600 kW | 8 × 100 kW | Multi-zone + GPU (H1) |
+```python
+from dc_ops_env.config import load_datacenter_config
+# Load a built-in config
+config = load_datacenter_config("small")
+# Load a custom YAML file
+config = load_datacenter_config("/path/to/my_datacenter.yaml")
+# Use with environment
+result = env.reset(scenario="A2", config=config)
+```
+### Custom YAML Configuration
+Create your own datacenter layout:
+```yaml
+name: "My Custom Facility"
+outside_temp_c: 35.0
+outside_humidity_rh: 0.40
+simulation_dt_s: 1.0
+zones:
+  - zone_id: zone_a
+    containment_type: cold_aisle
+    recirculation_factor: 0.08
+    air_volume_m3: 500.0
+    envelope_r_kw: 0.02
+    initial_cold_aisle_temp_c: 20.0
+    ashrae_class: A2
+    racks:
+      - { rack_id: A-01, row: A, position: 1, it_load_kw: 8.0,
+          num_servers_2u: 20, server_thermal_mass_jk: 11100.0,
+          airflow_cfm_per_kw: 160.0 }
+      # ... more racks
+    crac_units:
+      - { unit_id: CRAC-1, rated_capacity_kw: 70.0,
+          rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03,
+          max_airflow_cfm: 12000.0, fan_rated_power_kw: 5.0,
+          cop_rated: 3.5, initial_setpoint_c: 18.0,
+          initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
+power:
+  utility_voltage_v: 480.0
+  utility_available: true
+  ups_units:
+    - { unit_id: UPS-1, rated_capacity_kw: 500.0,
+        loss_c0: 0.013, loss_c1: 0.006, loss_c2: 0.011,
+        battery_capacity_kwh: 8.3, battery_discharge_efficiency: 0.90,
+        battery_aging_factor: 0.85, recharge_rate_kw: 5.0,
+        initial_mode: double_conversion }
+  pdus:
+    - { pdu_id: PDU-A-01, voltage_ll_v: 208.0,
+        max_current_per_phase_a: 24.0, num_phases: 3,
+        efficiency: 0.98, continuous_derating: 0.80 }
+  generator:
+    gen_id: GEN-1
+    rated_capacity_kw: 750.0
+    start_delay_s: 4.0
+    crank_time_s: 5.0
+    warmup_time_s: 8.0
+    fuel_tank_liters: 2000.0
+    consumption_lph_full: 180.0
+    cooldown_time_s: 300.0
+  ats:
+    ats_id: ATS-1
+    transfer_time_ms: 100.0
+    retransfer_delay_s: 300.0
+```
+See [data/datacenter_configs/](data/datacenter_configs/) for complete examples.
+---
+## TRL / GRPO Training Integration
+DC-Ops integrates directly with HuggingFace TRL's `GRPOTrainer` via the OpenEnv `environment_factory` pattern:
+```python
+from trl import GRPOTrainer, GRPOConfig
+from dc_ops_env import DcOpsAction, DcOpsEnv
+def dc_ops_environment_factory():
+    """Factory that returns a DC-Ops environment instance."""
+    env = DcOpsEnv(base_url="http://localhost:8000")
+    return env
+config = GRPOConfig(
+    model_name_or_path="your-base-model",
+    # ... training hyperparameters
+)
+trainer = GRPOTrainer(
+    config=config,
+    environments=dc_ops_environment_factory,
+    # ... other args
+)
+trainer.train()
+```
+For multi-environment parallel training, run multiple servers or increase `max_concurrent_envs` and spawn concurrent clients.
+---
+## Deploy to HuggingFace Spaces
+### Using OpenEnv CLI
+The simplest way to deploy:
+```bash
+# From the dc_ops_env/ directory (where openenv.yaml is located)
+cd dc_ops_env
+# Login to HuggingFace (if not already)
+huggingface-cli login
+# Push to HuggingFace Spaces
+openenv push
+# Or with options
+openenv push --repo-id your-username/dc-ops-env --private
+openenv push --namespace your-org
+```
+### What Gets Deployed
+The `openenv push` command:
+1. Validates the `openenv.yaml` manifest
+2. Builds a Docker Space on HuggingFace
+3. Uploads all environment code
+Your deployed Space will be available at:
+`https://huggingface.co/spaces/<repo-id>`
+The Space includes:
+- **Web Interface** at `/web` — Interactive scenario browser and dashboard viewer
+- **API Documentation** at `/docs` — Full OpenAPI/Swagger interface
+- **Health Check** at `/health` — Container health monitoring
+- **WebSocket** at `/ws` — Persistent session endpoint for agent connections
+### Connecting to a Deployed Space
+```python
+from dc_ops_env import DcOpsAction, DcOpsEnv
+# Connect to your HuggingFace Space
+space_url = "https://your-username-dc-ops-env.hf.space"
+async with DcOpsEnv(base_url=space_url) as env:
+    result = await env.reset(scenario="A2")
+    print(result.observation.dashboard)
+```
+### CLI Options
+| Option | Description |
+|--------|-------------|
+| `--directory`, `-d` | Directory containing the OpenEnv environment (default: current) |
+| `--repo-id`, `-r` | Repository ID `username/repo-name` (default: from openenv.yaml) |
+| `--base-image`, `-b` | Override base Docker image |
+| `--private` | Deploy as a private Space |
+| `--namespace` | HuggingFace namespace (user or org) |
+---
+## Development
+### Running Tests
+```bash
+# All tests (256 tests)
+uv run pytest tests/ -v
+# Specific test modules
+uv run pytest tests/test_thermal.py -v      # Thermal physics
+uv run pytest tests/test_power.py -v        # Power systems
+uv run pytest tests/test_environment.py -v  # Environment tests
+uv run pytest tests/test_rewards.py -v      # Reward function
+uv run pytest tests/test_scenarios.py -v    # Scenario framework
+uv run pytest tests/test_integration.py -v  # End-to-end episodes
+# With coverage
+uv run pytest tests/ --cov=dc_ops_env --cov-report=term-missing
+```
+### Direct Environment Testing (No Server)
+Test the environment logic without the HTTP/WebSocket layer:
+```python
+from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
+from dc_ops_env.models import DcOpsAction
+env = DcOpsEnvironment()
+obs = env.reset(scenario="A2")
+print(obs.dashboard)
+obs = env.step(DcOpsAction(command="diagnose CRAC-3"))
+print(f"Reward: {obs.reward}")
+print(obs.dashboard)
+```
+### Running the Server Locally
+```bash
+# Via entry point (recommended)
+uv run server
+# With custom port
+uv run server --port 8001
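+# Via uvicorn directly (with auto-reload for development)
+# Run from the parent directory of dc_ops_env/ so that dc_ops_env is the
+# top-level package and its relative imports resolve
+uvicorn dc_ops_env.server.app:app --reload --host 0.0.0.0 --port 8000
+# Production (multi-worker)
+uvicorn dc_ops_env.server.app:app --host 0.0.0.0 --port 8000 --workers 4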
+```
+---
+## Project Structure
+```
+dc_ops_env/
+├── openenv.yaml                    # OpenEnv manifest
+├── pyproject.toml                  # Dependencies and metadata
+├── README.md                       # This file (HF Space README)
+├── __init__.py                     # Exports: DcOpsEnv, DcOpsAction, DcOpsObservation
+├── config.py                       # Physical constants, ASHRAE limits, YAML loader
+├── models.py                       # Pydantic Action/Observation models
+├── client.py                       # DcOpsEnv (EnvClient subclass)
+├── simulation/
+│   ├── thermal.py                  # RC thermal network (zones, racks, CRACs)
+│   ├── power.py                    # UPS, PDU, generator, ATS models
+│   └── types.py                    # Runtime state dataclasses
+├── scenarios/
+│   ├── base.py                     # Abstract Scenario + ProcedureRule
+│   ├── registry.py                 # Scenario registration and selection
+│   ├── thermal_scenarios.py        # A1, A2, A4
+│   └── power_scenarios.py          # B1, B3, B4
+├── rewards/
+│   └── reward_function.py          # 6-component composite reward
+├── rendering/
+│   └── dashboard.py                # State → text dashboard
+├── actions/
+│   └── parser.py                   # Deterministic command parser
+├── server/
+│   ├── dc_ops_env_environment.py   # OpenEnv Environment implementation
+│   ├── app.py                      # FastAPI application
+│   └── Dockerfile                  # Container image
+├── data/
+│   └── datacenter_configs/         # YAML facility definitions
+│       ├── default.yaml            # 2 zones, 20 racks, 160 kW
+│       ├── small_facility.yaml     # 1 zone, 10 racks, 80 kW
+│       └── large_facility.yaml     # 4 zones, 60 racks, 600 kW
+└── tests/                          # 256 tests across 6 modules
+    ├── test_thermal.py
+    ├── test_power.py
+    ├── test_environment.py
+    ├── test_rewards.py
+    ├── test_scenarios.py
+    └── test_integration.py
+```
+## License
+BSD-style license. See [LICENSE](../LICENSE) for details.

__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Dc Ops Env Environment."""
+from .client import DcOpsEnv
+from .models import DcOpsAction, DcOpsObservation
+__all__ = [
+    "DcOpsAction",
+    "DcOpsObservation",
+    "DcOpsEnv",
+]

actions/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Action parsing for the DC-Ops environment."""
+from .parser import parse_command
+__all__ = ["parse_command"]

actions/parser.py ADDED Viewed

	@@ -0,0 +1,356 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Deterministic action parser for operator commands.
+Parses natural-language commands from the LLM agent into simulation mutations.
+Uses regex matching for speed and testability — no LLM-in-the-loop.
+Command format: command_name [target] [value]
+"""
+from __future__ import annotations
+import re
+from dataclasses import dataclass
+from typing import Any
+from ..simulation.thermal import ThermalSimulation
+from ..simulation.power import PowerSimulation
+from ..simulation.types import (
+    CRACFaultType,
+    CRACStatus,
+    UPSMode,
+)
+@dataclass
+class CommandResult:
+    """Result of parsing and executing a command.
+    Returned by parse_command and by every handler in this module.
+    """
+    # True when the command was recognized and applied; False on any rejection.
+    success: bool
+    # Human-readable feedback text describing what happened.
+    message: str
+    # Name of the matched command (e.g. "diagnose"); "unknown" when nothing
+    # matched, and left empty for an empty command string.
+    command_name: str = ""
+    # Unit identifier the command addressed; empty for commands without a target.
+    target: str = ""
+# ---------------------------------------------------------------------------
+# Available commands for the agent
+# ---------------------------------------------------------------------------
+# Help lines describing each command: "<usage> — <description>".
+# NOTE(review): the set_ups_mode entry lists eco/double_conversion/bypass, but
+# _handle_set_ups_mode also accepts "line_interactive" — consider listing it.
+# NOTE(review): the diagnose entry mentions PDU, but _handle_diagnose only
+# searches CRAC and UPS units — confirm whether PDU support is intended.
+AVAILABLE_ACTIONS: list[str] = [
+    "diagnose <unit_id>           — Inspect a CRAC/UPS/PDU for faults",
+    "adjust_setpoint <crac_id> <temp_c> — Change CRAC supply air setpoint",
+    "set_fan_speed <crac_id> <pct>  — Set CRAC fan speed (0-100%)",
+    "set_rack_load <rack_id> <kw>   — Adjust rack IT load (migrate workload)",
+    "start_crac <crac_id>          — Start a standby CRAC unit",
+    "stop_crac <crac_id>           — Put a CRAC into standby",
+    "start_generator               — Manually start the diesel generator",
+    "stop_generator                — Initiate generator cooldown",
+    "set_ups_mode <ups_id> <mode>  — Set UPS mode (eco/double_conversion/bypass)",
+    "refuel_generator [liters]     — Refuel (default: full tank)",
+    "acknowledge_alarm             — Acknowledge current alert",
+    "check_status                  — Request full status report",
+    "escalate                      — Escalate to senior engineer",
+    "wait                          — Take no action this step",
+]
+def parse_command(
+    command: str,
+    thermal_sim: ThermalSimulation,
+    power_sim: PowerSimulation | None = None,
+) -> CommandResult:
+    """Parse and execute an operator command.
+    The command is stripped of surrounding whitespace and matched against each
+    entry of _COMMAND_TABLE in order; the first matching handler is executed.
+    Args:
+        command: Raw command string from the agent.
+        thermal_sim: Thermal simulation to mutate.
+        power_sim: Power simulation to mutate (optional).
+    Returns:
+        CommandResult with success status and feedback message. An empty
+        command yields a failure with no command_name; an unmatched command
+        yields a failure with command_name "unknown".
+    """
+    cmd = command.strip()
+    if not cmd:
+        return CommandResult(False, "Empty command. Use 'check_status' or see available actions.")
+    # Try each handler in order
+    for pattern, handler in _COMMAND_TABLE:
+        # re.match anchors at the start of the string; matching is case-insensitive.
+        # NOTE(review): re.match rejects a flags argument when given a pre-compiled
+        # pattern, so table entries are presumably plain strings — confirm.
+        match = re.match(pattern, cmd, re.IGNORECASE)
+        if match:
+            return handler(match, thermal_sim, power_sim)
+    return CommandResult(
+        False,
+        f"Unknown command: '{cmd}'. Use 'check_status' for available actions.",
+        command_name="unknown",
+    )
+# ---------------------------------------------------------------------------
+# Command handlers
+# ---------------------------------------------------------------------------
+def _handle_diagnose(
+    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
+) -> CommandResult:
+    """Inspect a unit for faults and report status.
+    Searches CRAC units in every thermal zone first, then UPS units when a
+    power simulation is supplied. Unit IDs are compared case-insensitively.
+    No other unit types are searched here. Returns a failure result when no
+    unit with the requested ID is found.
+    """
+    target = match.group(1)
+    # Check CRACs
+    for zone in thermal.state.zones:
+        for crac in zone.crac_units:
+            if crac.unit_id.lower() == target.lower():
+                lines = [
+                    f"=== Diagnostic Report: {crac.unit_id} ===",
+                    f"Status: {crac.status.value}",
+                    f"Fault: {crac.fault_type.value}",
+                    f"Setpoint: {crac.setpoint_c:.1f}°C",
+                    f"Supply Temp: {crac.supply_temp_c:.1f}°C",
+                    f"Fan Speed: {crac.fan_speed_pct:.0f}%",
+                    f"Airflow: {crac.current_airflow_m3s:.3f} m³/s",
+                ]
+                # Append a verdict line depending on whether a fault is active.
+                if crac.fault_type != CRACFaultType.NONE:
+                    lines.append(f">> FAULT DETECTED: {crac.fault_type.value}")
+                    lines.append(">> Recommended: repair or replace component")
+                else:
+                    lines.append(">> No faults detected. Unit operating normally.")
+                return CommandResult(True, "\n".join(lines), "diagnose", target)
+    # Check UPS
+    if power:
+        for ups in power.state.ups_units:
+            if ups.unit_id.lower() == target.lower():
+                # Fractions are scaled by 100 for display as percentages.
+                lines = [
+                    f"=== Diagnostic Report: {ups.unit_id} ===",
+                    f"Mode: {ups.mode.value}",
+                    f"Load: {ups.load_fraction * 100:.1f}%",
+                    f"Efficiency: {ups.efficiency * 100:.1f}%",
+                    f"Battery SOC: {ups.battery_soc * 100:.0f}%",
+                    f"Output: {ups.output_power_kw:.1f} kW",
+                    f"Losses: {ups.heat_output_kw:.1f} kW",
+                ]
+                return CommandResult(True, "\n".join(lines), "diagnose", target)
+    return CommandResult(False, f"Unit '{target}' not found.", "diagnose", target)
+def _handle_adjust_setpoint(
+    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
+) -> CommandResult:
+    """Change a CRAC's supply-air setpoint.
+    Group 1 of the match is the CRAC ID and group 2 the temperature in °C.
+    Values outside 10-35 °C are rejected before the simulation is touched.
+    A falsy return from thermal.set_crac_setpoint is reported as
+    "CRAC not found". The power argument is unused.
+    """
+    target = match.group(1)
+    try:
+        value = float(match.group(2))
+    except (ValueError, IndexError):
+        return CommandResult(False, "Invalid temperature value.", "adjust_setpoint", target)
+    # Guard rail: refuse setpoints outside the accepted 10-35 °C window.
+    if value < 10.0 or value > 35.0:
+        return CommandResult(
+            False,
+            f"Setpoint {value:.1f}°C out of safe range (10-35°C).",
+            "adjust_setpoint", target,
+        )
+    if thermal.set_crac_setpoint(target, value):
+        return CommandResult(
+            True,
+            f"Setpoint for {target} adjusted to {value:.1f}°C. "
+            "Supply temp will converge over ~30 seconds.",
+            "adjust_setpoint", target,
+        )
+    return CommandResult(False, f"CRAC '{target}' not found.", "adjust_setpoint", target)
+def _handle_set_fan_speed(
+    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
+) -> CommandResult:
+    """Set a CRAC's fan speed as a percentage.
+    Group 1 of the match is the CRAC ID and group 2 the speed. Values outside
+    0-100 are rejected. A falsy return from thermal.set_crac_fan_speed is
+    reported as "CRAC not found". The power argument is unused.
+    """
+    target = match.group(1)
+    try:
+        value = float(match.group(2))
+    except (ValueError, IndexError):
+        return CommandResult(False, "Invalid fan speed value.", "set_fan_speed", target)
+    # Reject percentages outside the 0-100 range.
+    if value < 0 or value > 100:
+        return CommandResult(
+            False, f"Fan speed {value:.0f}% out of range (0-100%).",
+            "set_fan_speed", target,
+        )
+    if thermal.set_crac_fan_speed(target, value):
+        return CommandResult(
+            True,
+            f"Fan speed for {target} set to {value:.0f}%.",
+            "set_fan_speed", target,
+        )
+    return CommandResult(False, f"CRAC '{target}' not found.", "set_fan_speed", target)
+def _handle_set_rack_load(
+    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
+) -> CommandResult:
+    """Set a rack's IT load in kW.
+    Group 1 of the match is the rack ID and group 2 the load. Values outside
+    0-30 kW are rejected. A falsy return from thermal.set_rack_load is
+    reported as "Rack not found". The power argument is unused.
+    """
+    target = match.group(1)
+    try:
+        value = float(match.group(2))
+    except (ValueError, IndexError):
+        return CommandResult(False, "Invalid load value.", "set_rack_load", target)
+    # Reject loads outside the accepted 0-30 kW range.
+    if value < 0 or value > 30:
+        return CommandResult(
+            False, f"Load {value:.1f} kW out of range (0-30 kW).",
+            "set_rack_load", target,
+        )
+    if thermal.set_rack_load(target, value):
+        return CommandResult(
+            True,
+            f"IT load for rack {target} set to {value:.1f} kW.",
+            "set_rack_load", target,
+        )
+    return CommandResult(False, f"Rack '{target}' not found.", "set_rack_load", target)
+def _handle_start_crac(
+    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
+) -> CommandResult:
+    """Switch a CRAC unit to RUNNING.
+    Looks the unit up by case-insensitive ID across all zones. Fails if the
+    unit is already running, has an active fault, or does not exist. On
+    success the unit's status is set directly on the state object.
+    """
+    target = match.group(1)
+    for zone in thermal.state.zones:
+        for crac in zone.crac_units:
+            if crac.unit_id.lower() == target.lower():
+                if crac.status == CRACStatus.RUNNING:
+                    return CommandResult(False, f"{target} is already running.", "start_crac", target)
+                # A faulted unit may not be started until the fault is cleared.
+                if crac.fault_type != CRACFaultType.NONE:
+                    return CommandResult(
+                        False,
+                        f"{target} has an active fault ({crac.fault_type.value}). "
+                        "Clear the fault before starting.",
+                        "start_crac", target,
+                    )
+                crac.status = CRACStatus.RUNNING
+                return CommandResult(True, f"{target} started.", "start_crac", target)
+    return CommandResult(False, f"CRAC '{target}' not found.", "start_crac", target)
+def _handle_stop_crac(
+    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
+) -> CommandResult:
+    """Switch a CRAC unit to STANDBY.
+    Looks the unit up by case-insensitive ID across all zones. Fails if the
+    unit is already in standby or does not exist. On success the unit's
+    status is set directly on the state object.
+    """
+    target = match.group(1)
+    for zone in thermal.state.zones:
+        for crac in zone.crac_units:
+            if crac.unit_id.lower() == target.lower():
+                if crac.status == CRACStatus.STANDBY:
+                    return CommandResult(False, f"{target} is already in standby.", "stop_crac", target)
+                crac.status = CRACStatus.STANDBY
+                return CommandResult(True, f"{target} placed in standby.", "stop_crac", target)
+    return CommandResult(False, f"CRAC '{target}' not found.", "stop_crac", target)
+def _handle_start_generator(
+    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
+) -> CommandResult:
+    """Request a generator start on the power simulation.
+    Fails when no power simulation is supplied. Otherwise calls
+    power.start_generator() and reports success; any return value of that
+    call is ignored here.
+    """
+    if power is None:
+        return CommandResult(False, "Power subsystem not available.", "start_generator")
+    power.start_generator()
+    return CommandResult(True, "Generator start sequence initiated.", "start_generator")
+def _handle_stop_generator(
+    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
+) -> CommandResult:
+    """Request a generator stop on the power simulation.
+    Fails when no power simulation is supplied. Otherwise calls
+    power.stop_generator() and reports success; any return value of that
+    call is ignored here.
+    """
+    if power is None:
+        return CommandResult(False, "Power subsystem not available.", "stop_generator")
+    power.stop_generator()
+    return CommandResult(True, "Generator cooldown initiated.", "stop_generator")
+def _handle_set_ups_mode(
+    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
+) -> CommandResult:
+    """Change a UPS unit's operating mode.
+    Group 1 of the match is the UPS ID and group 2 the mode name, which is
+    lower-cased and stripped before lookup. Fails when no power simulation is
+    supplied, when the mode name is not recognized, or when
+    power.set_ups_mode returns a falsy value (reported as "UPS not found").
+    """
+    if power is None:
+        return CommandResult(False, "Power subsystem not available.", "set_ups_mode")
+    target = match.group(1)
+    mode_str = match.group(2).lower().strip()
+    # Accepted mode names and the enum member each one selects.
+    mode_map = {
+        "double_conversion": UPSMode.DOUBLE_CONVERSION,
+        "eco": UPSMode.ECO,
+        "line_interactive": UPSMode.LINE_INTERACTIVE,
+        "bypass": UPSMode.BYPASS,
+    }
+    mode = mode_map.get(mode_str)
+    if mode is None:
+        valid = ", ".join(mode_map.keys())
+        return CommandResult(False, f"Unknown UPS mode '{mode_str}'. Valid: {valid}", "set_ups_mode", target)
+    if power.set_ups_mode(target, mode):
+        return CommandResult(True, f"{target} set to {mode_str} mode.", "set_ups_mode", target)
+    return CommandResult(False, f"UPS '{target}' not found.", "set_ups_mode", target)
def _handle_refuel_generator(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Handle ``refuel_generator [liters]``.

    Group 1 of ``match`` holds the optional amount. When it is empty,
    ``power.refuel_generator()`` is called without arguments; otherwise the
    text is parsed as a float and passed through. ``thermal`` is not used.

    Returns:
        A failed result when ``power`` is None or the amount cannot be parsed
        as a float; a successful result otherwise.
    """
    action = "refuel_generator"
    if power is None:
        return CommandResult(False, "Power subsystem not available.", action)

    amount_text = match.group(1)
    if not amount_text:
        # No amount given: the no-argument call is reported as a full refill.
        power.refuel_generator()
        return CommandResult(True, "Generator refueled to full tank.", action)

    try:
        liters = float(amount_text)
    except ValueError:
        # The pattern admits strings such as "." or "1.2.3" that are not floats.
        return CommandResult(False, "Invalid liters value.", action)

    power.refuel_generator(liters)
    return CommandResult(True, f"Added {liters:.0f}L to generator.", action)
def _handle_acknowledge_alarm(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Handle ``acknowledge_alarm``: always succeeds and touches no simulation state."""
    message = "Alarm acknowledged."
    return CommandResult(True, message, "acknowledge_alarm")
def _handle_check_status(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Handle ``check_status``: always succeeds and touches no simulation state."""
    message = "Full status displayed in dashboard."
    return CommandResult(True, message, "check_status")
def _handle_escalate(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Handle ``escalate``: return a successful result carrying the escalation notice.

    The handler itself changes no state; any follow-up is left to the caller.
    """
    message = "Incident escalated to senior datacenter engineer. Episode ending."
    return CommandResult(True, message, "escalate")
def _handle_wait(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Handle ``wait``: an explicit no-op that always succeeds."""
    message = "Waiting. No action taken."
    return CommandResult(True, message, "wait")
+# ---------------------------------------------------------------------------
+# Command table: (regex_pattern, handler_function)
+# Order matters — first match wins.
+# ---------------------------------------------------------------------------
# Each entry pairs a regex source string with the handler called for it.
# The annotation also admits pre-compiled patterns, but every entry below is
# a plain string.
# NOTE(review): how the patterns are applied (match vs. search, flags) is
# decided by the code that walks this table, which is not shown here.
_COMMAND_TABLE: list[tuple[re.Pattern | str, Any]] = [
    # One whitespace-delimited token after the command name.
    (r"diagnose\s+(\S+)", _handle_diagnose),
    # Token followed by a number made of digits and dots (no sign, no exponent).
    (r"adjust_setpoint\s+(\S+)\s+([\d.]+)", _handle_adjust_setpoint),
    (r"set_fan_speed\s+(\S+)\s+([\d.]+)", _handle_set_fan_speed),
    # Two command names share one handler.
    (r"(?:set_rack_load|migrate_workload)\s+(\S+)\s+([\d.]+)", _handle_set_rack_load),
    (r"start_crac\s+(\S+)", _handle_start_crac),
    (r"stop_crac\s+(\S+)", _handle_stop_crac),
    # Argument-less commands end on a word boundary.
    (r"start_generator\b", _handle_start_generator),
    (r"stop_generator\b", _handle_stop_generator),
    # Group 1: UPS identifier, group 2: mode name.
    (r"set_ups_mode\s+(\S+)\s+(\S+)", _handle_set_ups_mode),
    # Group 1 is the optional amount; it is the empty string when omitted.
    (r"refuel_generator\s*([\d.]*)", _handle_refuel_generator),
    (r"acknowledge_alarm\b", _handle_acknowledge_alarm),
    (r"check_status\b", _handle_check_status),
    (r"escalate\b", _handle_escalate),
    (r"wait\b", _handle_wait),
]

client.py ADDED Viewed

	@@ -0,0 +1,69 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""DC-Ops Environment Client."""
+from typing import Dict
+from openenv.core import EnvClient
+from openenv.core.client_types import StepResult
+from openenv.core.env_server.types import State
+from .models import DcOpsAction, DcOpsObservation
class DcOpsEnv(
    EnvClient[DcOpsAction, DcOpsObservation, State]
):
    """
    Client for the DC-Ops Environment.

    Connects to the environment server over WebSocket and provides
    reset/step/state methods for interacting with the datacenter simulation.
    This class only supplies the payload conversions used by the base client.

    Example:
        >>> async with DcOpsEnv(base_url="http://localhost:8000") as client:
        ...     result = await client.reset()
        ...     print(result.observation.dashboard)
        ...
        ...     result = await client.step(DcOpsAction(command="diagnose CRAC-1"))
        ...     print(result.observation.dashboard)
    """

    def _step_payload(self, action: DcOpsAction) -> Dict:
        """Build the JSON payload for a step message.

        ``reasoning`` is included only when it is truthy.
        """
        body = {"command": action.command}
        reasoning = action.reasoning
        if not reasoning:
            return body
        body["reasoning"] = reasoning
        return body

    def _parse_result(self, payload: Dict) -> StepResult[DcOpsObservation]:
        """Turn a server response into ``StepResult[DcOpsObservation]``.

        ``done`` and ``reward`` are read from the top level of ``payload``;
        everything else comes from its ``"observation"`` mapping, with empty
        defaults for missing keys.
        """
        obs = payload.get("observation", {})
        reward = payload.get("reward")
        done = payload.get("done", False)

        parsed = DcOpsObservation(
            dashboard=obs.get("dashboard", ""),
            available_actions=obs.get("available_actions", []),
            alert=obs.get("alert", ""),
            scenario_type=obs.get("scenario_type", ""),
            steps_remaining=obs.get("steps_remaining", 0),
            action_result=obs.get("action_result", ""),
            done=done,
            reward=reward,
            metadata=obs.get("metadata", {}),
        )
        return StepResult(observation=parsed, reward=reward, done=done)

    def _parse_state(self, payload: Dict) -> State:
        """Turn a server response into a ``State`` (episode id and step count)."""
        episode_id = payload.get("episode_id")
        step_count = payload.get("step_count", 0)
        return State(episode_id=episode_id, step_count=step_count)

config.py ADDED Viewed

	@@ -0,0 +1,549 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Physical constants, ASHRAE thermal guidelines, and unit conversion utilities.
+All internal simulation values use SI units:
+  - Temperature: °C (Celsius)
+  - Power/Heat: W (Watts)
+  - Energy: J (Joules)
+  - Airflow: m³/s
+  - Thermal capacitance: J/K
+  - Thermal resistance: K/W
+  - Time: s (seconds)
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Union
+# ---------------------------------------------------------------------------
+# Air properties (dry air at standard conditions: ~20 °C, 101.325 kPa)
+# ---------------------------------------------------------------------------
AIR_DENSITY_KG_M3 = 1.2
AIR_SPECIFIC_HEAT_J_KGK = 1005.0
# Volumetric heat capacity of air, rho * cp, about 1206 J/(m³·K).
AIR_RHO_CP = AIR_DENSITY_KG_M3 * AIR_SPECIFIC_HEAT_J_KGK

# ---------------------------------------------------------------------------
# Unit conversion helpers
# ---------------------------------------------------------------------------
# Airflow: cubic feet per minute <-> cubic metres per second.
CFM_TO_M3S = 4.71947e-4
M3S_TO_CFM = 1.0 / CFM_TO_M3S  # about 2118.88

# Cooling capacity: tons of refrigeration <-> kW thermal.
TONS_TO_KW = 3.517
KW_TO_TONS = 1.0 / TONS_TO_KW

# Heat rate: BTU/hr <-> W.
BTU_HR_TO_W = 0.29307107
W_TO_BTU_HR = 1.0 / BTU_HR_TO_W  # about 3.412


def fahrenheit_to_celsius(f: float) -> float:
    """Convert a temperature from °F to °C."""
    above_freezing = f - 32.0
    return above_freezing * 5.0 / 9.0


def celsius_to_fahrenheit(c: float) -> float:
    """Convert a temperature from °C to °F."""
    scaled = c * 9.0 / 5.0
    return scaled + 32.0


def cfm_to_m3s(cfm: float) -> float:
    """Convert an airflow from CFM to m³/s."""
    return CFM_TO_M3S * cfm


def m3s_to_cfm(m3s: float) -> float:
    """Convert an airflow from m³/s to CFM."""
    return M3S_TO_CFM * m3s
+# ---------------------------------------------------------------------------
+# ASHRAE TC 9.9 Thermal Guidelines, 5th Edition (2021)
+#
+# Each class defines recommended and allowable operating envelopes for
+# server inlet temperatures and humidity.
+# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class ASHRAEClass:
    """Immutable thermal envelope for one ASHRAE equipment class.

    Attributes:
        name: Class label, e.g. "A1".
        recommended_min_c: Lower bound of the recommended inlet range (°C).
        recommended_max_c: Upper bound of the recommended inlet range (°C).
        allowable_min_c: Lower bound of the allowable inlet range (°C).
        allowable_max_c: Upper bound of the allowable inlet range (°C).
        max_dew_point_c: Maximum dew point (°C).
        max_rh: Maximum relative humidity as a fraction (0.80 means 80%).
        description: Free-text note on the kind of equipment covered.
    """

    name: str
    recommended_min_c: float
    recommended_max_c: float
    allowable_min_c: float
    allowable_max_c: float
    max_dew_point_c: float
    max_rh: float
    description: str = ""


# Classes A1-A4 share one recommended range; only H1 below is narrower.
_A_CLASS_RECOMMENDED = {"recommended_min_c": 18.0, "recommended_max_c": 27.0}

ASHRAE_A1 = ASHRAEClass(
    name="A1",
    description="Enterprise servers, storage",
    allowable_min_c=15.0,
    allowable_max_c=32.0,
    max_dew_point_c=17.0,
    max_rh=0.80,
    **_A_CLASS_RECOMMENDED,
)

ASHRAE_A2 = ASHRAEClass(
    name="A2",
    description="Volume servers",
    allowable_min_c=10.0,
    allowable_max_c=35.0,
    max_dew_point_c=21.0,
    max_rh=0.80,
    **_A_CLASS_RECOMMENDED,
)

ASHRAE_A3 = ASHRAEClass(
    name="A3",
    description="Extended temperature range",
    allowable_min_c=5.0,
    allowable_max_c=40.0,
    max_dew_point_c=24.0,
    max_rh=0.85,
    **_A_CLASS_RECOMMENDED,
)

ASHRAE_A4 = ASHRAEClass(
    name="A4",
    description="Maximum temperature flexibility",
    allowable_min_c=5.0,
    allowable_max_c=45.0,
    max_dew_point_c=24.0,
    max_rh=0.90,
    **_A_CLASS_RECOMMENDED,
)

ASHRAE_H1 = ASHRAEClass(
    name="H1",
    description="High-density / AI / HPC",
    recommended_min_c=18.0,
    recommended_max_c=22.0,
    allowable_min_c=5.0,
    allowable_max_c=25.0,
    max_dew_point_c=17.0,
    max_rh=0.80,
)

# Lookup by class label, in the order A1, A2, A3, A4, H1.
ASHRAE_CLASSES: dict[str, ASHRAEClass] = {
    envelope.name: envelope
    for envelope in (ASHRAE_A1, ASHRAE_A2, ASHRAE_A3, ASHRAE_A4, ASHRAE_H1)
}

# Lower humidity boundary, common to all classes:
# the higher of a -12 °C dew point or 8% RH.
ASHRAE_MIN_DEW_POINT_C = -12.0
ASHRAE_MIN_RH = 0.08

# Limits on how fast the temperature may change.
ASHRAE_RATE_LIMIT_SOLID_STATE_C_PER_HR = 20.0    # max °C per hour
ASHRAE_RATE_LIMIT_SOLID_STATE_C_PER_15MIN = 5.0  # max °C per 15 minutes

# Sensor accuracy (°C).
ASHRAE_SENSOR_ACCURACY_STANDARD_C = 0.5
ASHRAE_SENSOR_ACCURACY_HIGH_DENSITY_C = 0.3
+# ---------------------------------------------------------------------------
+# Default datacenter configuration
+# ---------------------------------------------------------------------------
@dataclass
class CRACConfig:
    """Static parameters for one CRAC/CRAH cooling unit.

    Attributes:
        unit_id: Identifier of the unit.
        rated_capacity_kw: Nominal cooling capacity at rated conditions (kW).
        rated_return_temp_c: Return-air temperature at which the capacity is
            rated (°C).
        capacity_slope_per_c: Fractional capacity increase per °C of return
            air above the rated return temperature.
        max_airflow_cfm: Airflow at 100% fan speed (CFM).
        fan_rated_power_kw: Fan power at 100% speed (kW).
        cop_rated: Coefficient of performance at design conditions.
        cop_degradation_per_c: Fractional COP decrease per °C of outside
            temperature above 35 °C.
        initial_setpoint_c: Default supply-air setpoint (°C).
        initial_fan_speed_pct: Default fan speed (%).
        supply_temp_lag_s: Time constant for the supply temperature to reach
            the setpoint (s).
    """

    unit_id: str = "CRAC-1"
    rated_capacity_kw: float = 70.0
    rated_return_temp_c: float = 24.0
    capacity_slope_per_c: float = 0.03
    max_airflow_cfm: float = 12000.0
    fan_rated_power_kw: float = 5.0
    cop_rated: float = 3.5
    cop_degradation_per_c: float = 0.04
    initial_setpoint_c: float = 18.0
    initial_fan_speed_pct: float = 100.0
    supply_temp_lag_s: float = 30.0
@dataclass
class RackConfig:
    """Static parameters for one server rack.

    Attributes:
        rack_id: Identifier of the rack.
        row: Row label the rack sits in.
        position: Position of the rack within its row.
        it_load_kw: IT power draw (kW).
        num_servers_2u: Number of 2U servers in the rack.
        server_thermal_mass_jk: Thermal mass per 2U server (J/K); the default
            of 11.1 kJ/K is described as measured experimentally.
        airflow_cfm_per_kw: Server fan airflow per kW of IT load (CFM/kW).
    """

    rack_id: str = "A-01"
    row: str = "A"
    position: int = 1
    it_load_kw: float = 8.0
    num_servers_2u: int = 20
    server_thermal_mass_jk: float = 11100.0
    airflow_cfm_per_kw: float = 160.0
@dataclass
class ZoneConfig:
    """Static parameters for one thermal zone (a section of the datacenter).

    Attributes:
        zone_id: Identifier of the zone.
        racks: Racks in the zone; a fresh empty list per instance by default.
        crac_units: Cooling units serving the zone; a fresh empty list per
            instance by default.
        containment_type: One of "cold_aisle", "hot_aisle" or "none".
        recirculation_factor: 0 means perfect containment, 0.3 means none.
        air_volume_m3: Air volume of the zone (m³).
        envelope_r_kw: Thermal resistance to the outside (K/W).
        initial_cold_aisle_temp_c: Starting cold-aisle temperature (°C).
        initial_humidity_rh: Starting relative humidity as a fraction.
        ashrae_class: Label of the ASHRAE class applied to the zone.
    """

    zone_id: str = "zone_a"
    racks: list[RackConfig] = field(default_factory=list)
    crac_units: list[CRACConfig] = field(default_factory=list)
    containment_type: str = "cold_aisle"
    recirculation_factor: float = 0.08
    air_volume_m3: float = 500.0
    envelope_r_kw: float = 0.02
    initial_cold_aisle_temp_c: float = 20.0
    initial_humidity_rh: float = 0.45
    ashrae_class: str = "A2"
+# ---------------------------------------------------------------------------
+# Power distribution configuration
+# ---------------------------------------------------------------------------
@dataclass
class UPSConfig:
    """Static parameters for one UPS unit.

    Efficiency model (quadratic loss):
        η(x) = x / (x + c_0 + c_1·x + c_2·x²)
    where x is the load fraction (0 to 1) and the coefficients are fractions
    of rated capacity.

    The default coefficients are attributed to APC White Paper 108 (modern
    double-conversion):
        c_0 = 0.013  no-load losses (transformers, logic boards)
        c_1 = 0.006  proportional losses (conduction)
        c_2 = 0.011  square-law losses (I²R in conductors)

    Attributes:
        unit_id: Identifier of the unit.
        rated_capacity_kw: Rated capacity (kW).
        loss_c0: No-load loss coefficient.
        loss_c1: Proportional loss coefficient.
        loss_c2: Square-law loss coefficient.
        battery_capacity_kwh: Battery capacity (kWh).
        battery_discharge_efficiency: Discharge efficiency as a fraction.
        battery_aging_factor: End-of-life derating factor.
        battery_temp_c: Battery room temperature (°C).
        recharge_rate_kw: Maximum recharge rate (kW).
        initial_mode: One of "double_conversion", "line_interactive", "eco"
            or "bypass".

    NOTE(review): the battery default was described as "~10 min at full load",
    but 8.3 kWh at the 500 kW default rating is about one minute
    (8.3 / 500 h). Confirm which load the figure refers to. The recharge rate
    was described as "~10x discharge time".
    """

    unit_id: str = "UPS-1"
    rated_capacity_kw: float = 500.0

    # Quadratic loss model
    loss_c0: float = 0.013
    loss_c1: float = 0.006
    loss_c2: float = 0.011

    # Battery
    battery_capacity_kwh: float = 8.3
    battery_discharge_efficiency: float = 0.90
    battery_aging_factor: float = 0.85
    battery_temp_c: float = 25.0
    recharge_rate_kw: float = 5.0

    # Operating mode at start
    initial_mode: str = "double_conversion"
@dataclass
class PDUConfig:
    """Static parameters for one three-phase PDU.

    The defaults follow the US layout: 208 V line-to-line / 120 V
    line-to-neutral at 24 A per phase, giving a nameplate of
    √3 × 208 × 24 ≈ 8,646 W, or 6,917 W after the 80% NEC continuous derating.
    The European counterpart would be 400 V / 230 V at 32 A per phase
    (√3 × 400 × 32 ≈ 22,170 W).

    Attributes:
        pdu_id: Identifier of the PDU.
        voltage_ll_v: Line-to-line voltage (V).
        max_current_per_phase_a: Maximum current per phase (A).
        num_phases: Number of phases.
        breaker_rating_a: Per-branch circuit breaker rating (A).
        num_outlets: Number of outlets.
        efficiency: Transformer efficiency (0.98 means 2% losses).
        continuous_derating: Derating factor for continuous loads (the NEC
            80% rule).
    """

    pdu_id: str = "PDU-A1"
    voltage_ll_v: float = 208.0
    max_current_per_phase_a: float = 24.0
    num_phases: int = 3
    breaker_rating_a: float = 20.0
    num_outlets: int = 48
    efficiency: float = 0.98
    continuous_derating: float = 0.80
@dataclass
class GeneratorConfig:
    """Static parameters for a diesel standby generator.

    Startup sequence (NFPA 110 Type 10):
        start delay -> cranking -> warm-up -> ready to accept load,
        10-20 seconds in total.

    Attributes:
        gen_id: Identifier of the generator.
        rated_capacity_kw: Rated capacity (kW).
        start_delay_s: Programmed delay before cranking (s).
        crank_time_s: Engine cranking duration (s).
        warmup_time_s: Warm-up time before load acceptance (s).
        fuel_tank_liters: Fuel tank size (L).
        consumption_lph_full: Fuel consumption at full load (L/h).
        cooldown_time_s: Unloaded cool-down time (s); the default is 5 minutes.
    """

    gen_id: str = "GEN-1"
    rated_capacity_kw: float = 750.0

    # Startup timing
    start_delay_s: float = 4.0
    crank_time_s: float = 5.0
    warmup_time_s: float = 8.0

    # Fuel
    fuel_tank_liters: float = 2000.0
    consumption_lph_full: float = 180.0

    # Cool-down
    cooldown_time_s: float = 300.0
@dataclass
class ATSConfig:
    """Static parameters for an Automatic Transfer Switch.

    Attributes:
        ats_id: Identifier of the switch.
        transfer_time_ms: Mechanical transfer time (ms).
        retransfer_delay_s: Wait before transferring back to utility (s).
    """

    ats_id: str = "ATS-1"
    transfer_time_ms: float = 100.0
    retransfer_delay_s: float = 300.0
@dataclass
class PowerConfig:
    """Power infrastructure of a datacenter, gathered in one place.

    Attributes:
        ups_units: UPS units; a fresh empty list per instance by default.
        pdus: PDUs; a fresh empty list per instance by default.
        generator: Standby generator; a default ``GeneratorConfig`` per
            instance.
        ats: Transfer switch; a default ``ATSConfig`` per instance.
        utility_voltage_v: Main utility feed voltage (V).
        utility_available: Whether the utility feed is available.
    """

    ups_units: list[UPSConfig] = field(default_factory=list)
    pdus: list[PDUConfig] = field(default_factory=list)
    generator: GeneratorConfig = field(default_factory=GeneratorConfig)
    ats: ATSConfig = field(default_factory=ATSConfig)
    utility_voltage_v: float = 480.0
    utility_available: bool = True
@dataclass
class DatacenterConfig:
    """Top-level configuration for a whole datacenter.

    Attributes:
        name: Display name of the facility.
        zones: Thermal zones; a fresh empty list per instance by default.
        power: Power infrastructure; a default ``PowerConfig`` per instance.
        outside_temp_c: Outside temperature (°C).
        outside_humidity_rh: Outside relative humidity as a fraction.
        lighting_w_per_m2: Lighting load per floor area (W/m²); 10 W/m² is
            described as typical.
        floor_area_m2: Floor area (m²).
        simulation_dt_s: Integration timestep (s).
        ups_loss_fraction: Kept for backward compatibility with the Phase 1
            thermal simulation.
        pdu_loss_fraction: Kept for backward compatibility with the Phase 1
            thermal simulation.
    """

    name: str = "DC-OPS Facility"
    zones: list[ZoneConfig] = field(default_factory=list)
    power: PowerConfig = field(default_factory=PowerConfig)
    outside_temp_c: float = 35.0
    outside_humidity_rh: float = 0.40
    lighting_w_per_m2: float = 10.0
    floor_area_m2: float = 500.0
    simulation_dt_s: float = 1.0

    # Legacy loss fractions (Phase 1 thermal sim compatibility)
    ups_loss_fraction: float = 0.05
    pdu_loss_fraction: float = 0.02
def make_default_datacenter_config() -> DatacenterConfig:
    """Build the default facility in code: 2 zones, 10 racks each, 4 CRACs total.

    Every rack draws 8 kW, so the total IT load is 160 kW.

    Power infrastructure:
      - 2× UPS (500 kW each)
      - 20× PDUs, one per rack, named ``PDU-<rack_id>`` with default settings
      - 1× diesel generator (750 kW)
      - 1× ATS

    Returns:
        A freshly constructed ``DatacenterConfig``; nothing is shared between
        calls.
    """
    zone_a = ZoneConfig(
        zone_id="zone_a",
        racks=[
            RackConfig(rack_id=f"A-{i:02d}", row="A", position=i, it_load_kw=8.0)
            for i in range(1, 11)
        ],
        crac_units=[CRACConfig(unit_id="CRAC-1"), CRACConfig(unit_id="CRAC-2")],
        air_volume_m3=600.0,
    )
    zone_b = ZoneConfig(
        zone_id="zone_b",
        racks=[
            RackConfig(rack_id=f"B-{i:02d}", row="B", position=i, it_load_kw=8.0)
            for i in range(1, 11)
        ],
        crac_units=[CRACConfig(unit_id="CRAC-3"), CRACConfig(unit_id="CRAC-4")],
        air_volume_m3=600.0,
    )

    # One PDU per rack, zone A racks first and then zone B.
    every_rack = [*zone_a.racks, *zone_b.racks]
    power_infra = PowerConfig(
        ups_units=[
            UPSConfig(unit_id="UPS-1", rated_capacity_kw=500.0),
            UPSConfig(unit_id="UPS-2", rated_capacity_kw=500.0),
        ],
        pdus=[PDUConfig(pdu_id=f"PDU-{rack.rack_id}") for rack in every_rack],
        generator=GeneratorConfig(gen_id="GEN-1", rated_capacity_kw=750.0),
        ats=ATSConfig(ats_id="ATS-1"),
    )

    return DatacenterConfig(
        name="DC-OPS Default Facility",
        zones=[zone_a, zone_b],
        power=power_infra,
        outside_temp_c=35.0,
        outside_humidity_rh=0.40,
        floor_area_m2=1200.0,
    )
+# ---------------------------------------------------------------------------
+# YAML config loader
+# ---------------------------------------------------------------------------
+_CONFIG_DIR = Path(__file__).parent / "data" / "datacenter_configs"
+# Built-in config names (resolved relative to this package)
+BUILTIN_CONFIGS: dict[str, Path] = {
+    "default": _CONFIG_DIR / "default.yaml",
+    "small": _CONFIG_DIR / "small_facility.yaml",
+    "large": _CONFIG_DIR / "large_facility.yaml",
+}
def load_datacenter_config(source: Union[str, Path]) -> DatacenterConfig:
    """Load a DatacenterConfig from a YAML file or built-in name.

    Args:
        source: Either a built-in name ("default", "small", "large"),
                or a path to a YAML file.

    Returns:
        Fully constructed DatacenterConfig.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
        ValueError: If the file does not contain a YAML mapping at the top
            level (for example, an empty file).

    Examples:
        config = load_datacenter_config("small")
        config = load_datacenter_config("/path/to/custom.yaml")
    """
    # Resolve source to a file path; only plain strings can be built-in names.
    if isinstance(source, str) and source in BUILTIN_CONFIGS:
        path = BUILTIN_CONFIGS[source]
    else:
        path = Path(source)
    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")

    # Imported lazily, and only once there is a file to parse, so a missing
    # path is reported as such even where PyYAML is not installed.
    import yaml

    # The bundled configs contain non-ASCII characters in comments, so the
    # encoding is pinned instead of depending on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # safe_load returns None for an empty document and may return a list or
    # scalar; fail here with a clear message rather than deep in the converters.
    if not isinstance(data, dict):
        raise ValueError(
            f"Config file must contain a YAML mapping at the top level: {path}"
        )
    return _dict_to_datacenter_config(data)
def _dict_to_datacenter_config(data: dict) -> DatacenterConfig:
    """Convert a raw YAML dict into a DatacenterConfig.

    Missing scalar keys fall back to the same values as the
    ``DatacenterConfig`` field defaults. ``zones`` and ``power`` may be
    absent or empty.

    Args:
        data: Top-level mapping parsed from a config file.

    Returns:
        The populated ``DatacenterConfig``.
    """
    # A key written with no value (``zones:``) loads as None, not as a missing
    # key, so a ``.get`` default would not apply; ``or`` covers both cases.
    zones = [_dict_to_zone_config(z) for z in data.get("zones") or []]
    power = _dict_to_power_config(data.get("power") or {})
    return DatacenterConfig(
        name=data.get("name", "DC-OPS Facility"),
        zones=zones,
        power=power,
        outside_temp_c=data.get("outside_temp_c", 35.0),
        outside_humidity_rh=data.get("outside_humidity_rh", 0.40),
        lighting_w_per_m2=data.get("lighting_w_per_m2", 10.0),
        floor_area_m2=data.get("floor_area_m2", 500.0),
        simulation_dt_s=data.get("simulation_dt_s", 1.0),
        ups_loss_fraction=data.get("ups_loss_fraction", 0.05),
        pdu_loss_fraction=data.get("pdu_loss_fraction", 0.02),
    )
def _dict_to_zone_config(data: dict) -> ZoneConfig:
    """Convert a raw dict into a ZoneConfig.

    Missing scalar keys fall back to the same values as the ``ZoneConfig``
    field defaults. ``racks`` and ``crac_units`` may be absent or empty.

    Args:
        data: Mapping for one entry of the ``zones`` list.

    Returns:
        The populated ``ZoneConfig``.
    """
    # A key written with no value loads as None rather than as a missing key;
    # ``or []`` treats both the same way.
    racks = [_dict_to_rack_config(r) for r in data.get("racks") or []]
    cracs = [_dict_to_crac_config(c) for c in data.get("crac_units") or []]
    return ZoneConfig(
        zone_id=data.get("zone_id", "zone_a"),
        racks=racks,
        crac_units=cracs,
        containment_type=data.get("containment_type", "cold_aisle"),
        recirculation_factor=data.get("recirculation_factor", 0.08),
        air_volume_m3=data.get("air_volume_m3", 500.0),
        envelope_r_kw=data.get("envelope_r_kw", 0.02),
        initial_cold_aisle_temp_c=data.get("initial_cold_aisle_temp_c", 20.0),
        initial_humidity_rh=data.get("initial_humidity_rh", 0.45),
        ashrae_class=data.get("ashrae_class", "A2"),
    )
def _dict_to_rack_config(data: dict) -> RackConfig:
    """Convert a raw dict into a RackConfig.

    Each field is read from ``data`` by name; a missing key gets the fallback
    listed below, which matches the ``RackConfig`` field default.
    """
    fallbacks = {
        "rack_id": "A-01",
        "row": "A",
        "position": 1,
        "it_load_kw": 8.0,
        "num_servers_2u": 20,
        "server_thermal_mass_jk": 11100.0,
        "airflow_cfm_per_kw": 160.0,
    }
    values = {key: data.get(key, fallback) for key, fallback in fallbacks.items()}
    return RackConfig(**values)
def _dict_to_crac_config(data: dict) -> CRACConfig:
    """Convert a raw dict into a CRACConfig.

    Each field is read from ``data`` by name; a missing key gets the fallback
    listed below, which matches the ``CRACConfig`` field default.
    """
    fallbacks = {
        "unit_id": "CRAC-1",
        "rated_capacity_kw": 70.0,
        "rated_return_temp_c": 24.0,
        "capacity_slope_per_c": 0.03,
        "max_airflow_cfm": 12000.0,
        "fan_rated_power_kw": 5.0,
        "cop_rated": 3.5,
        "cop_degradation_per_c": 0.04,
        "initial_setpoint_c": 18.0,
        "initial_fan_speed_pct": 100.0,
        "supply_temp_lag_s": 30.0,
    }
    values = {key: data.get(key, fallback) for key, fallback in fallbacks.items()}
    return CRACConfig(**values)
def _dict_to_power_config(data: dict) -> PowerConfig:
    """Convert a raw dict into a PowerConfig.

    Missing scalar keys fall back to the same values as the corresponding
    dataclass field defaults. The ``ups_units``, ``pdus``, ``generator`` and
    ``ats`` sections may be absent or empty.

    Args:
        data: Mapping found under the ``power`` key of a config file.

    Returns:
        The populated ``PowerConfig``.
    """
    # A section written with no value (``generator:``) loads as None rather
    # than as a missing key, so ``or`` is used instead of a ``.get`` default.
    ups = [_dict_to_ups_config(u) for u in data.get("ups_units") or []]
    pdus = [_dict_to_pdu_config(p) for p in data.get("pdus") or []]
    gen_data = data.get("generator") or {}
    ats_data = data.get("ats") or {}
    return PowerConfig(
        ups_units=ups,
        pdus=pdus,
        generator=GeneratorConfig(
            gen_id=gen_data.get("gen_id", "GEN-1"),
            rated_capacity_kw=gen_data.get("rated_capacity_kw", 750.0),
            start_delay_s=gen_data.get("start_delay_s", 4.0),
            crank_time_s=gen_data.get("crank_time_s", 5.0),
            warmup_time_s=gen_data.get("warmup_time_s", 8.0),
            fuel_tank_liters=gen_data.get("fuel_tank_liters", 2000.0),
            consumption_lph_full=gen_data.get("consumption_lph_full", 180.0),
            cooldown_time_s=gen_data.get("cooldown_time_s", 300.0),
        ),
        ats=ATSConfig(
            ats_id=ats_data.get("ats_id", "ATS-1"),
            transfer_time_ms=ats_data.get("transfer_time_ms", 100.0),
            retransfer_delay_s=ats_data.get("retransfer_delay_s", 300.0),
        ),
        utility_voltage_v=data.get("utility_voltage_v", 480.0),
        utility_available=data.get("utility_available", True),
    )
def _dict_to_ups_config(data: dict) -> UPSConfig:
    """Convert a raw dict into a UPSConfig.

    Each field is read from ``data`` by name; a missing key gets the fallback
    listed below, which matches the ``UPSConfig`` field default.
    """
    fallbacks = {
        "unit_id": "UPS-1",
        "rated_capacity_kw": 500.0,
        "loss_c0": 0.013,
        "loss_c1": 0.006,
        "loss_c2": 0.011,
        "battery_capacity_kwh": 8.3,
        "battery_discharge_efficiency": 0.90,
        "battery_aging_factor": 0.85,
        "battery_temp_c": 25.0,
        "recharge_rate_kw": 5.0,
        "initial_mode": "double_conversion",
    }
    values = {key: data.get(key, fallback) for key, fallback in fallbacks.items()}
    return UPSConfig(**values)
def _dict_to_pdu_config(data: dict) -> PDUConfig:
    """Convert a raw dict into a PDUConfig.

    Each field is read from ``data`` by name; a missing key gets the fallback
    listed below, which matches the ``PDUConfig`` field default.
    """
    fallbacks = {
        "pdu_id": "PDU-A1",
        "voltage_ll_v": 208.0,
        "max_current_per_phase_a": 24.0,
        "num_phases": 3,
        "breaker_rating_a": 20.0,
        "num_outlets": 48,
        "efficiency": 0.98,
        "continuous_derating": 0.80,
    }
    values = {key: data.get(key, fallback) for key, fallback in fallbacks.items()}
    return PDUConfig(**values)

data/datacenter_configs/default.yaml ADDED Viewed

	@@ -0,0 +1,101 @@

+# DC-OPS Default Facility Configuration
+# 2 zones, 20 racks (10/zone), 160 kW total IT load
+# N+1 cooling (4 CRACs × 70 kW for 160 kW IT)
+# N+1 power (2 UPS × 500 kW, 1 generator 750 kW)
+# Location: Phoenix, AZ (hot climate)
+name: "DC-OPS Default Facility"
+outside_temp_c: 35.0
+outside_humidity_rh: 0.40
+lighting_w_per_m2: 10.0
+floor_area_m2: 1200.0
+simulation_dt_s: 1.0
+zones:
+  - zone_id: zone_a
+    containment_type: cold_aisle
+    recirculation_factor: 0.08
+    air_volume_m3: 600.0
+    envelope_r_kw: 0.02
+    initial_cold_aisle_temp_c: 20.0
+    initial_humidity_rh: 0.45
+    ashrae_class: A2
+    racks:
+      - { rack_id: A-01, row: A, position: 1, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-02, row: A, position: 2, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-03, row: A, position: 3, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-04, row: A, position: 4, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-05, row: A, position: 5, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-06, row: A, position: 6, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-07, row: A, position: 7, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-08, row: A, position: 8, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-09, row: A, position: 9, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-10, row: A, position: 10, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+    crac_units:
+      - { unit_id: CRAC-1, rated_capacity_kw: 70.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 12000.0, fan_rated_power_kw: 5.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
+      - { unit_id: CRAC-2, rated_capacity_kw: 70.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 12000.0, fan_rated_power_kw: 5.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
+  - zone_id: zone_b
+    containment_type: cold_aisle
+    recirculation_factor: 0.08
+    air_volume_m3: 600.0
+    envelope_r_kw: 0.02
+    initial_cold_aisle_temp_c: 20.0
+    initial_humidity_rh: 0.45
+    ashrae_class: A2
+    racks:
+      - { rack_id: B-01, row: B, position: 1, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-02, row: B, position: 2, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-03, row: B, position: 3, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-04, row: B, position: 4, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-05, row: B, position: 5, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-06, row: B, position: 6, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-07, row: B, position: 7, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-08, row: B, position: 8, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-09, row: B, position: 9, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-10, row: B, position: 10, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+    crac_units:
+      - { unit_id: CRAC-3, rated_capacity_kw: 70.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 12000.0, fan_rated_power_kw: 5.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
+      - { unit_id: CRAC-4, rated_capacity_kw: 70.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 12000.0, fan_rated_power_kw: 5.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
+power:
+  utility_voltage_v: 480.0
+  utility_available: true
+  ups_units:
+    - { unit_id: UPS-1, rated_capacity_kw: 500.0, loss_c0: 0.013, loss_c1: 0.006, loss_c2: 0.011, battery_capacity_kwh: 8.3, battery_discharge_efficiency: 0.90, battery_aging_factor: 0.85, battery_temp_c: 25.0, recharge_rate_kw: 5.0, initial_mode: double_conversion }
+    - { unit_id: UPS-2, rated_capacity_kw: 500.0, loss_c0: 0.013, loss_c1: 0.006, loss_c2: 0.011, battery_capacity_kwh: 8.3, battery_discharge_efficiency: 0.90, battery_aging_factor: 0.85, battery_temp_c: 25.0, recharge_rate_kw: 5.0, initial_mode: double_conversion }
+  pdus:
+    - { pdu_id: PDU-A-01, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-02, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-03, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-04, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-05, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-06, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-07, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-08, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-09, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-10, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-B-01, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-B-02, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-B-03, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-B-04, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-B-05, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-B-06, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-B-07, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-B-08, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-B-09, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-B-10, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+  generator:
+    gen_id: GEN-1
+    rated_capacity_kw: 750.0
+    start_delay_s: 4.0
+    crank_time_s: 5.0
+    warmup_time_s: 8.0
+    fuel_tank_liters: 2000.0
+    consumption_lph_full: 180.0
+    cooldown_time_s: 300.0
+  ats:
+    ats_id: ATS-1
+    transfer_time_ms: 100.0
+    retransfer_delay_s: 300.0

data/datacenter_configs/large_facility.yaml ADDED Viewed

	@@ -0,0 +1,157 @@

+# Large Facility Configuration
+# 4 zones, 60 racks (20 each in zones A/B, 10 each in zones C/D), 600 kW total IT load
+# N+1 cooling (8 CRACs × 100 kW, 2 per zone, for 600 kW IT)
+# 2N power (4 UPS × 500 kW), 1 generator × 1500 kW
+# Mixed ASHRAE classes: A2 standard + H1 high-density
+# Use case: enterprise datacenter with GPU/HPC section
+name: "DC-OPS Large Facility"
+outside_temp_c: 35.0
+outside_humidity_rh: 0.35
+lighting_w_per_m2: 10.0
+floor_area_m2: 4000.0
+simulation_dt_s: 1.0
+zones:
+  # Standard-density zones (A2 class, 8 kW/rack)
+  - zone_id: zone_a
+    containment_type: cold_aisle
+    recirculation_factor: 0.06       # Excellent containment
+    air_volume_m3: 800.0
+    envelope_r_kw: 0.015
+    initial_cold_aisle_temp_c: 20.0
+    initial_humidity_rh: 0.45
+    ashrae_class: A2
+    racks:
+      - { rack_id: A-01, row: A, position: 1, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-02, row: A, position: 2, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-03, row: A, position: 3, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-04, row: A, position: 4, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-05, row: A, position: 5, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-06, row: A, position: 6, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-07, row: A, position: 7, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-08, row: A, position: 8, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-09, row: A, position: 9, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-10, row: A, position: 10, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-11, row: A, position: 11, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-12, row: A, position: 12, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-13, row: A, position: 13, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-14, row: A, position: 14, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-15, row: A, position: 15, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-16, row: A, position: 16, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-17, row: A, position: 17, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-18, row: A, position: 18, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-19, row: A, position: 19, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-20, row: A, position: 20, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+    crac_units:
+      - { unit_id: CRAC-1, rated_capacity_kw: 100.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 15000.0, fan_rated_power_kw: 7.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
+      - { unit_id: CRAC-2, rated_capacity_kw: 100.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 15000.0, fan_rated_power_kw: 7.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
+  - zone_id: zone_b
+    containment_type: cold_aisle
+    recirculation_factor: 0.06
+    air_volume_m3: 800.0
+    envelope_r_kw: 0.015
+    initial_cold_aisle_temp_c: 20.0
+    initial_humidity_rh: 0.45
+    ashrae_class: A2
+    racks:
+      - { rack_id: B-01, row: B, position: 1, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-02, row: B, position: 2, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-03, row: B, position: 3, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-04, row: B, position: 4, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-05, row: B, position: 5, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-06, row: B, position: 6, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-07, row: B, position: 7, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-08, row: B, position: 8, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-09, row: B, position: 9, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-10, row: B, position: 10, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-11, row: B, position: 11, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-12, row: B, position: 12, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-13, row: B, position: 13, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-14, row: B, position: 14, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-15, row: B, position: 15, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-16, row: B, position: 16, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-17, row: B, position: 17, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-18, row: B, position: 18, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-19, row: B, position: 19, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: B-20, row: B, position: 20, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+    crac_units:
+      - { unit_id: CRAC-3, rated_capacity_kw: 100.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 15000.0, fan_rated_power_kw: 7.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
+      - { unit_id: CRAC-4, rated_capacity_kw: 100.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 15000.0, fan_rated_power_kw: 7.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
+  # High-density GPU zone (H1 class, 20 kW/rack)
+  - zone_id: zone_c
+    containment_type: hot_aisle
+    recirculation_factor: 0.05       # Hot aisle containment — tighter
+    air_volume_m3: 800.0
+    envelope_r_kw: 0.015
+    initial_cold_aisle_temp_c: 20.0
+    initial_humidity_rh: 0.45
+    ashrae_class: H1               # High-density class: 18-22°C recommended, 25°C allowable max
+    racks:
+      - { rack_id: C-01, row: C, position: 1, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
+      - { rack_id: C-02, row: C, position: 2, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
+      - { rack_id: C-03, row: C, position: 3, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
+      - { rack_id: C-04, row: C, position: 4, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
+      - { rack_id: C-05, row: C, position: 5, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
+      - { rack_id: C-06, row: C, position: 6, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
+      - { rack_id: C-07, row: C, position: 7, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
+      - { rack_id: C-08, row: C, position: 8, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
+      - { rack_id: C-09, row: C, position: 9, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
+      - { rack_id: C-10, row: C, position: 10, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
+    crac_units:
+      - { unit_id: CRAC-5, rated_capacity_kw: 100.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 15000.0, fan_rated_power_kw: 7.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 17.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
+      - { unit_id: CRAC-6, rated_capacity_kw: 100.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 15000.0, fan_rated_power_kw: 7.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 17.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
+  # Standard-density zone D
+  - zone_id: zone_d
+    containment_type: cold_aisle
+    recirculation_factor: 0.06
+    air_volume_m3: 800.0
+    envelope_r_kw: 0.015
+    initial_cold_aisle_temp_c: 20.0
+    initial_humidity_rh: 0.45
+    ashrae_class: A2
+    racks:
+      - { rack_id: D-01, row: D, position: 1, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: D-02, row: D, position: 2, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: D-03, row: D, position: 3, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: D-04, row: D, position: 4, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: D-05, row: D, position: 5, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: D-06, row: D, position: 6, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: D-07, row: D, position: 7, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: D-08, row: D, position: 8, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: D-09, row: D, position: 9, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: D-10, row: D, position: 10, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+    crac_units:
+      - { unit_id: CRAC-7, rated_capacity_kw: 100.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 15000.0, fan_rated_power_kw: 7.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
+      - { unit_id: CRAC-8, rated_capacity_kw: 100.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 15000.0, fan_rated_power_kw: 7.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
+power:
+  utility_voltage_v: 480.0
+  utility_available: true
+  ups_units:
+    - { unit_id: UPS-1, rated_capacity_kw: 500.0, loss_c0: 0.013, loss_c1: 0.006, loss_c2: 0.011, battery_capacity_kwh: 12.0, battery_discharge_efficiency: 0.90, battery_aging_factor: 0.85, battery_temp_c: 25.0, recharge_rate_kw: 8.0, initial_mode: double_conversion }
+    - { unit_id: UPS-2, rated_capacity_kw: 500.0, loss_c0: 0.013, loss_c1: 0.006, loss_c2: 0.011, battery_capacity_kwh: 12.0, battery_discharge_efficiency: 0.90, battery_aging_factor: 0.85, battery_temp_c: 25.0, recharge_rate_kw: 8.0, initial_mode: double_conversion }
+    - { unit_id: UPS-3, rated_capacity_kw: 500.0, loss_c0: 0.013, loss_c1: 0.006, loss_c2: 0.011, battery_capacity_kwh: 12.0, battery_discharge_efficiency: 0.90, battery_aging_factor: 0.85, battery_temp_c: 25.0, recharge_rate_kw: 8.0, initial_mode: double_conversion }
+    - { unit_id: UPS-4, rated_capacity_kw: 500.0, loss_c0: 0.013, loss_c1: 0.006, loss_c2: 0.011, battery_capacity_kwh: 12.0, battery_discharge_efficiency: 0.90, battery_aging_factor: 0.85, battery_temp_c: 25.0, recharge_rate_kw: 8.0, initial_mode: double_conversion }
+  pdus:
+    - { pdu_id: PDU-A-01, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-B-01, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-C-01, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-D-01, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+  generator:
+    gen_id: GEN-1
+    rated_capacity_kw: 1500.0
+    start_delay_s: 4.0
+    crank_time_s: 5.0
+    warmup_time_s: 8.0
+    fuel_tank_liters: 5000.0
+    consumption_lph_full: 400.0
+    cooldown_time_s: 300.0
+  ats:
+    ats_id: ATS-1
+    transfer_time_ms: 100.0
+    retransfer_delay_s: 300.0

data/datacenter_configs/small_facility.yaml ADDED Viewed

	@@ -0,0 +1,67 @@

+# Small Facility Configuration
+# 1 zone, 10 racks, 80 kW total IT load
+# N+1 cooling (2 CRACs × 70 kW for 80 kW IT)
+# Single UPS, smaller generator
+# Use case: edge datacenter, branch office
+name: "DC-OPS Small Facility"
+outside_temp_c: 30.0
+outside_humidity_rh: 0.50
+lighting_w_per_m2: 10.0
+floor_area_m2: 300.0
+simulation_dt_s: 1.0
+zones:
+  - zone_id: zone_a
+    containment_type: cold_aisle
+    recirculation_factor: 0.10       # Slightly less tight containment
+    air_volume_m3: 300.0
+    envelope_r_kw: 0.03             # Less insulation than large facility
+    initial_cold_aisle_temp_c: 20.0
+    initial_humidity_rh: 0.45
+    ashrae_class: A2
+    racks:
+      - { rack_id: A-01, row: A, position: 1, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-02, row: A, position: 2, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-03, row: A, position: 3, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-04, row: A, position: 4, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-05, row: A, position: 5, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-06, row: A, position: 6, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-07, row: A, position: 7, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-08, row: A, position: 8, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-09, row: A, position: 9, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+      - { rack_id: A-10, row: A, position: 10, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
+    crac_units:
+      - { unit_id: CRAC-1, rated_capacity_kw: 70.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 12000.0, fan_rated_power_kw: 5.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
+      - { unit_id: CRAC-2, rated_capacity_kw: 70.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 12000.0, fan_rated_power_kw: 5.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
+power:
+  utility_voltage_v: 480.0
+  utility_available: true
+  ups_units:
+    - { unit_id: UPS-1, rated_capacity_kw: 200.0, loss_c0: 0.013, loss_c1: 0.006, loss_c2: 0.011, battery_capacity_kwh: 5.0, battery_discharge_efficiency: 0.90, battery_aging_factor: 0.85, battery_temp_c: 25.0, recharge_rate_kw: 3.0, initial_mode: double_conversion }
+  pdus:
+    - { pdu_id: PDU-A-01, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-02, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-03, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-04, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-05, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-06, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-07, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-08, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-09, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+    - { pdu_id: PDU-A-10, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
+  generator:
+    gen_id: GEN-1
+    rated_capacity_kw: 300.0
+    start_delay_s: 4.0
+    crank_time_s: 5.0
+    warmup_time_s: 8.0
+    fuel_tank_liters: 1000.0
+    consumption_lph_full: 80.0
+    cooldown_time_s: 300.0
+  ats:
+    ats_id: ATS-1
+    transfer_time_ms: 100.0
+    retransfer_delay_s: 300.0

models.py ADDED Viewed

	@@ -0,0 +1,83 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Pydantic models for the DC-Ops Environment.
+Action: Natural-language operator commands (e.g., "adjust_setpoint CRAC-1 20").
+Observation: Text dashboard + structured metadata for the LLM agent.
+These use OpenEnv's Action/Observation base classes which enforce
+`extra="forbid"` — only declared fields are allowed.
+"""
+from __future__ import annotations
+from typing import Any, Dict, List
+from openenv.core.env_server.types import Action, Observation
+from pydantic import Field
+class DcOpsAction(Action):
+    """Operator command issued by the LLM agent.
+    The agent reads the dashboard observation and responds with a command string.
+    Commands follow the format: `command_name [target] [value]`
+    Examples:
+        - "diagnose CRAC-3"
+        - "adjust_setpoint CRAC-1 20"
+        - "increase_fan_speed CRAC-2 80"
+        - "start_generator"
+        - "acknowledge_alarm"
+        - "escalate"
+    """
+    command: str = Field(
+        ...,
+        description="Operator command (e.g., 'diagnose CRAC-3', 'adjust_setpoint CRAC-1 20')",
+    )
+    reasoning: str = Field(
+        default="",
+        description="Optional chain-of-thought reasoning from the agent",
+    )
+class DcOpsObservation(Observation):
+    """Text-based monitoring dashboard observation.
+    The 'dashboard' field contains the full text rendering of the current
+    datacenter state — formatted like a real operator's monitoring screen.
+    This is the primary field the LLM agent reads.
+    Structured data is available in the inherited 'metadata' dict.
+    """
+    dashboard: str = Field(
+        default="",
+        description="Text-rendered monitoring dashboard",
+    )
+    available_actions: List[str] = Field(
+        default_factory=list,
+        description="Valid commands the agent can issue",
+    )
+    alert: str = Field(
+        default="",
+        description="Current active alert message, if any",
+    )
+    scenario_type: str = Field(
+        default="",
+        description="Type of scenario (thermal, power, network, incident)",
+    )
+    steps_remaining: int = Field(
+        default=0,
+        description="Steps left in episode budget",
+    )
+    action_result: str = Field(
+        default="",
+        description="Feedback from the last action (success/error message)",
+    )

openenv.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+spec_version: 1
+name: dc_ops_env
+type: space
+runtime: fastapi
+app: server.app:app
+port: 8000

openenv_dc_ops_env.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,9 @@

+Metadata-Version: 2.4
+Name: openenv-dc_ops_env
+Version: 0.1.0
+Summary: Dc Ops Env environment for OpenEnv
+Requires-Python: >=3.10
+Requires-Dist: openenv-core[core]>=0.2.1
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.0.0; extra == "dev"

openenv_dc_ops_env.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,20 @@

+README.md
+__init__.py
+client.py
+models.py
+pyproject.toml
+openenv_dc_ops_env.egg-info/PKG-INFO
+openenv_dc_ops_env.egg-info/SOURCES.txt
+openenv_dc_ops_env.egg-info/dependency_links.txt
+openenv_dc_ops_env.egg-info/entry_points.txt
+openenv_dc_ops_env.egg-info/requires.txt
+openenv_dc_ops_env.egg-info/top_level.txt
+server/__init__.py
+server/app.py
+server/dc_ops_env_environment.py
+tests/test_environment.py
+tests/test_integration.py
+tests/test_power.py
+tests/test_rewards.py
+tests/test_scenarios.py
+tests/test_thermal.py

openenv_dc_ops_env.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

openenv_dc_ops_env.egg-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [console_scripts]
2	+ server = dc_ops_env.server.app:main

openenv_dc_ops_env.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+openenv-core[core]>=0.2.1
+[dev]
+pytest>=8.0.0
+pytest-cov>=4.0.0

openenv_dc_ops_env.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

pyproject.toml ADDED Viewed

	@@ -0,0 +1,50 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+[build-system]
+requires = ["setuptools>=45", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "openenv-dc_ops_env"
+version = "0.1.0"
+description = "Dc Ops Env environment for OpenEnv"
+requires-python = ">=3.10"
+dependencies = [
+    # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
+    # install from github
+    # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
+    "openenv-core[core]>=0.2.1",
+    # Environment-specific dependencies
+    # Add all dependencies needed for your environment here
+    # Examples:
+    # "numpy>=1.19.0",
+    # "torch>=2.0.0",
+    # "gymnasium>=0.29.0",
+    # "openspiel>=1.0.0",
+    # "smolagents>=1.22.0,<2",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-cov>=4.0.0",
+]
+[project.scripts]
+# Server entry point - enables running via: uv run --project . server
+# or: python -m dc_ops_env.server.app
+server = "dc_ops_env.server.app:main"
+[tool.setuptools]
+include-package-data = true
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["dc_ops_env*"]
+[tool.setuptools.package-dir]
+dc_ops_env = "."

rendering/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Dashboard rendering for the DC-Ops environment."""
+from .dashboard import render_dashboard
+__all__ = ["render_dashboard"]

rendering/dashboard.py ADDED Viewed

	@@ -0,0 +1,262 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Renders simulation state into a text-based monitoring dashboard.
+The dashboard mimics what a real datacenter operator would see on their
+NOC (Network Operations Center) screens. It is the primary observation
+for the LLM agent.
+"""
+from __future__ import annotations
+from ..config import ASHRAE_CLASSES, m3s_to_cfm
+from ..simulation.types import (
+    CRACFaultType,
+    CRACState,
+    CRACStatus,
+    DatacenterState,
+    PowerState,
+    ZoneState,
+)
+def render_dashboard(
+    state: DatacenterState,
+    *,
+    alert: str = "",
+    step: int = 0,
+    max_steps: int = 15,
+    scenario_type: str = "",
+) -> str:
+    """Render the full monitoring dashboard as a text string.
+    Args:
+        state: Current datacenter simulation state.
+        alert: Active alert message to display prominently.
+        step: Current step number in the episode.
+        max_steps: Maximum steps in the episode.
+        scenario_type: Type of scenario being run.
+    Returns:
+        Multi-line string formatted as a monitoring dashboard.
+    """
+    w = 68  # Inner width of the dashboard frame
+    lines: list[str] = []
+    def hline(char: str = "═") -> str:
+        return f"╠{char * w}╣"
+    def row(text: str) -> str:
+        return f"║ {text:<{w - 2}} ║"
+    # Header
+    lines.append(f"╔{'═' * w}╗")
+    title = "DC-OPS MONITORING DASHBOARD"
+    lines.append(f"║{title:^{w}}║")
+    sim_min = state.sim_time_s / 60.0
+    status_line = f"Sim Time: {sim_min:.1f} min    Step: {step}/{max_steps}"
+    if scenario_type:
+        status_line += f"    [{scenario_type}]"
+    lines.append(row(status_line))
+    # Alert section
+    if alert:
+        lines.append(hline())
+        # Split long alerts across lines
+        alert_prefix = "!! ALERT: "
+        remaining = w - 2 - len(alert_prefix)
+        if len(alert) <= remaining:
+            lines.append(row(f"{alert_prefix}{alert}"))
+        else:
+            lines.append(row(f"{alert_prefix}{alert[:remaining]}"))
+            # Continuation lines
+            for i in range(remaining, len(alert), w - 4):
+                lines.append(row(f"  {alert[i:i + w - 4]}"))
+    # Cooling Units
+    lines.append(hline())
+    lines.append(row("COOLING UNITS"))
+    lines.append(row(f"{'Unit':<10} {'Status':<12} {'Setpoint':>8} {'Supply':>8} {'Fan%':>5} {'CFM':>7} {'kW':>6}"))
+    lines.append(row("-" * (w - 2)))
+    for zone in state.zones:
+        for crac in zone.crac_units:
+            lines.append(row(_format_crac_row(crac, state.outside_temp_c, zone.hot_aisle_temp_c)))
+    # Zone Temperatures
+    lines.append(hline())
+    lines.append(row("ZONE TEMPERATURES"))
+    lines.append(row(f"{'Zone':<8} {'Cold Aisle':>10} {'Hot Aisle':>10} {'Max Inlet':>10} {'IT Load':>8} {'Class':>6}"))
+    lines.append(row("-" * (w - 2)))
+    for zone in state.zones:
+        lines.append(row(_format_zone_row(zone)))
+    # Rack Detail (per zone, show max-temp racks)
+    lines.append(hline())
+    lines.append(row("RACK TEMPERATURES (top 5 hottest)"))
+    lines.append(row(f"{'Rack':<8} {'Inlet':>8} {'Outlet':>8} {'Load kW':>8} {'CFM':>7}"))
+    lines.append(row("-" * (w - 2)))
+    # Collect all racks, sort by inlet temp descending
+    all_racks = []
+    for zone in state.zones:
+        all_racks.extend(zone.racks)
+    all_racks.sort(key=lambda r: r.inlet_temp_c, reverse=True)
+    for rack in all_racks[:5]:
+        cfm = m3s_to_cfm(rack.airflow_m3s)
+        lines.append(row(
+            f"{rack.rack_id:<8} {rack.inlet_temp_c:>7.1f}°C {rack.outlet_temp_c:>7.1f}°C "
+            f"{rack.it_load_kw:>7.1f} {cfm:>7.0f}"
+        ))
+    # Power Section
+    lines.append(hline())
+    lines.append(row("POWER"))
+    p_it = state.total_it_load_kw
+    p_cooling = state.total_cooling_power_kw
+    pue = state.pue
+    lines.append(row(
+        f"IT Load: {p_it:.1f} kW | Cooling: {p_cooling:.1f} kW | PUE: {pue:.2f}"
+    ))
+    if state.power is not None:
+        lines.append(row(_format_power_section(state.power)))
+        lines.append(row(_format_ups_summary(state.power)))
+    else:
+        lines.append(row("UPS: N/A | Generator: N/A"))
+    # Environment
+    lines.append(hline())
+    lines.append(row("ENVIRONMENT"))
+    lines.append(row(
+        f"Outside: {state.outside_temp_c:.1f}°C | "
+        f"Humidity: {state.outside_humidity_rh * 100:.0f}% RH"
+    ))
+    # Footer
+    lines.append(f"╚{'═' * w}╝")
+    return "\n".join(lines)
+def _format_crac_row(crac: CRACState, outside_temp_c: float, hot_aisle_temp_c: float) -> str:
+    """Format a single CRAC row for the dashboard."""
+    # Status display
+    if crac.status == CRACStatus.FAULT:
+        fault_label = crac.fault_type.value.upper() if crac.fault_type != CRACFaultType.NONE else "FAULT"
+        status_str = f"!! {fault_label}"
+    elif crac.status == CRACStatus.MAINTENANCE:
+        status_str = "MAINT"
+    elif crac.status == CRACStatus.STANDBY:
+        status_str = "STANDBY"
+    else:
+        status_str = "RUNNING"
+    # Supply temp display
+    if crac.status != CRACStatus.RUNNING:
+        supply_str = "---"
+    else:
+        supply_str = f"{crac.supply_temp_c:.1f}°C"
+    # CFM
+    cfm = m3s_to_cfm(crac.current_airflow_m3s)
+    # Power consumption
+    q_cool = crac.compute_cooling_output_kw(hot_aisle_temp_c)
+    p_kw = crac.compute_power_consumption_kw(q_cool, outside_temp_c)
+    return (
+        f"{crac.unit_id:<10} {status_str:<12} {crac.setpoint_c:>7.1f}°C "
+        f"{supply_str:>8} {crac.fan_speed_pct:>5.0f} {cfm:>7.0f} {p_kw:>6.1f}"
+    )
+def _format_zone_row(zone: ZoneState) -> str:
+    """Format a single zone row for the dashboard."""
+    ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
+    max_inlet = zone.max_inlet_temp_c
+    # Mark if exceeding ASHRAE recommended
+    inlet_marker = ""
+    if ashrae and max_inlet > ashrae.recommended_max_c:
+        inlet_marker = "*"
+    if ashrae and max_inlet > ashrae.allowable_max_c:
+        inlet_marker = "!!"
+    return (
+        f"{zone.zone_id:<8} {zone.cold_aisle_temp_c:>9.1f}°C "
+        f"{zone.hot_aisle_temp_c:>9.1f}°C {max_inlet:>8.1f}°C{inlet_marker:<2}"
+        f"{zone.total_it_load_kw:>7.1f} {zone.ashrae_class:>6}"
+    )
+def _format_power_section(power: PowerState) -> str:
+    """Format power source status line."""
+    parts: list[str] = []
+    # Utility / generator status
+    if power.utility_available:
+        parts.append("Utility: NORMAL")
+    else:
+        parts.append("Utility: DOWN")
+    from ..simulation.types import GeneratorState as GS
+    gen = power.generator
+    if gen.state == GS.OFF:
+        parts.append("Gen: OFF")
+    elif gen.state == GS.LOADED:
+        fuel_hrs = gen.fuel_remaining_hours
+        fuel_str = f"{fuel_hrs:.1f}h" if fuel_hrs < 100 else ">100h"
+        parts.append(f"Gen: LOADED {gen.load_fraction * 100:.0f}% (fuel: {fuel_str})")
+    elif gen.state in (GS.START_DELAY, GS.CRANKING, GS.WARMING):
+        parts.append(f"Gen: STARTING ({gen.state.value})")
+    elif gen.state == GS.READY:
+        parts.append("Gen: READY")
+    elif gen.state == GS.COOLDOWN:
+        parts.append("Gen: COOLDOWN")
+    # ATS position
+    from ..simulation.types import ATSPosition
+    ats = power.ats
+    if ats.position == ATSPosition.UTILITY:
+        parts.append("ATS: UTILITY")
+    elif ats.position == ATSPosition.GENERATOR:
+        parts.append("ATS: GENERATOR")
+    elif ats.position == ATSPosition.TRANSFERRING:
+        parts.append("ATS: TRANSFERRING")
+    return " | ".join(parts)
+def _format_ups_summary(power: PowerState) -> str:
+    """Format UPS status summary line."""
+    if not power.ups_units:
+        return "UPS: N/A"
+    parts: list[str] = []
+    for ups in power.ups_units:
+        soc_pct = ups.battery_soc * 100
+        mode_str = ups.mode.value.upper().replace("_", " ")
+        load_pct = ups.load_fraction * 100
+        eta_pct = ups.efficiency * 100
+        if ups.mode.value == "on_battery":
+            time_str = ""
+            if ups.battery_time_remaining_s < float("inf"):
+                mins = ups.battery_time_remaining_s / 60.0
+                time_str = f" {mins:.0f}min"
+            parts.append(f"{ups.unit_id}: BATTERY {soc_pct:.0f}%{time_str}")
+        elif ups.mode.value == "fault":
+            parts.append(f"{ups.unit_id}: FAULT")
+        else:
+            parts.append(f"{ups.unit_id}: {mode_str} {load_pct:.0f}% η{eta_pct:.0f}%")
+    return "UPS: " + " | ".join(parts)

rewards/__init__.py ADDED Viewed

	@@ -0,0 +1,23 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Reward system for DC-Ops environment."""
+from .reward_function import (
+    RewardComponents,
+    RewardFunction,
+    RewardWeights,
+    WEIGHT_PROFILES,
+    softplus,
+)
+# Public API of the rewards package: exactly the names imported from
+# .reward_function above.
+__all__ = [
+    "RewardComponents",
+    "RewardFunction",
+    "RewardWeights",
+    "WEIGHT_PROFILES",
+    "softplus",
+]

rewards/reward_function.py ADDED Viewed

	@@ -0,0 +1,428 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Multi-objective reward function for DC-Ops environment.
+Research-informed design:
+  - Softplus barrier functions for safety constraints
+    (Google/DeepMind 2017, ICLR 2025 DC Cooling)
+  - Delta-based progress rewards for credit assignment
+    (process reward model literature)
+  - Normalized components in [-1, 1] via tanh
+  - Scenario-type-aware weight profiles
+All components are bounded to [-1, 1]. Total reward is the weighted sum,
+clamped to [-1, 1].
+"""
+from __future__ import annotations
+import math
+from dataclasses import dataclass, field
+from typing import Optional
+from ..config import ASHRAE_CLASSES
+from ..simulation.thermal import ThermalSimulation
+from ..simulation.power import PowerSimulation
+from ..simulation.types import UPSMode
+from ..actions.parser import CommandResult
+from ..scenarios.base import ScenarioResult
+# ---------------------------------------------------------------------------
+# Numerically stable softplus
+# ---------------------------------------------------------------------------
+def softplus(x: float) -> float:
+    """Numerically stable softplus: ln(1 + exp(x)).
+    - x > 20:  returns x        (avoids exp overflow)
+    - x < -20: returns 0.0      (avoids underflow noise)
+    """
+    if x > 20.0:
+        return x
+    if x < -20.0:
+        return 0.0
+    return math.log1p(math.exp(x))
+# ---------------------------------------------------------------------------
+# Reward components dataclass
+# ---------------------------------------------------------------------------
+@dataclass
+class RewardComponents:
+    """Individual reward components for logging and analysis.
+    RewardFunction.compute() fills every field except `speed_bonus`, which
+    it leaves at its default. `total` is the weighted sum of the six
+    weighted components, clamped to [-1, 1].
+    """
+    thermal_safety: float = 0.0      # From RewardFunction._thermal_safety
+    power_safety: float = 0.0        # From RewardFunction._power_safety
+    efficiency: float = 0.0          # From RewardFunction._efficiency
+    scenario_progress: float = 0.0   # From RewardFunction._scenario_progress
+    procedure: float = 0.0           # From RewardFunction._procedure
+    action_quality: float = 0.0      # From RewardFunction._action_quality
+    speed_bonus: float = 0.0         # Not set by compute(); has no weight in RewardWeights
+    total: float = 0.0               # Weighted sum, clamped to [-1, 1]
+# ---------------------------------------------------------------------------
+# Weight profiles
+# ---------------------------------------------------------------------------
+@dataclass
+class RewardWeights:
+    """Weights for reward components. Should sum to 1.0.
+    Each field multiplies the RewardComponents field of the same name in
+    RewardFunction.compute(). The sum is not validated here; the final
+    total is clamped to [-1, 1] regardless. The defaults below sum to 1.0.
+    """
+    thermal_safety: float = 0.30
+    power_safety: float = 0.10
+    efficiency: float = 0.15
+    scenario_progress: float = 0.25
+    procedure: float = 0.15
+    action_quality: float = 0.05
+# Weight presets keyed by scenario type. RewardFunction.__init__ looks the
+# scenario type up here and falls back to "default" for any other value.
+# Each preset's weights sum to 1.0.
+WEIGHT_PROFILES: dict[str, RewardWeights] = {
+    # Thermal scenarios: thermal safety and scenario progress weigh most.
+    "thermal": RewardWeights(
+        thermal_safety=0.30,
+        power_safety=0.05,
+        efficiency=0.10,
+        scenario_progress=0.30,
+        procedure=0.20,
+        action_quality=0.05,
+    ),
+    # Power scenarios: power safety, progress and procedure weigh most.
+    "power": RewardWeights(
+        thermal_safety=0.10,
+        power_safety=0.25,
+        efficiency=0.05,
+        scenario_progress=0.30,
+        procedure=0.25,
+        action_quality=0.05,
+    ),
+    # Fallback preset: scenario progress and procedure carry zero weight.
+    "default": RewardWeights(
+        thermal_safety=0.30,
+        power_safety=0.15,
+        efficiency=0.25,
+        scenario_progress=0.0,
+        procedure=0.0,
+        action_quality=0.30,
+    ),
+}
+# ---------------------------------------------------------------------------
+# Softplus barrier constants
+# ---------------------------------------------------------------------------
+# Thermal barriers (read by RewardFunction._thermal_safety)
+_ALPHA_RECOMMENDED = 2.0   # °C transition width of the softplus at the recommended limit
+_ALPHA_ALLOWABLE = 1.5     # °C transition width of the softplus at the allowable limit
+_ALLOWABLE_WEIGHT = 3.0    # Multiplier on the allowable-limit barrier: violations 3x worse
+_THERMAL_NORM = 8.0        # Divisor inside tanh; calibrated so T=40°C (A2) → R≈-0.97
+# Thermal safety positive baseline — small reward for being well within limits
+# Based on DCRL-Green (ICLR 2025): agents learn faster with a positive signal
+# for maintaining safe state, not just penalties for violations.
+_SAFE_MARGIN_C = 3.0       # °C below recommended max to qualify as "safe"
+_SAFE_BASELINE = 0.1       # Small positive reward when all zones safe
+# Power barriers (read by RewardFunction._power_safety)
+_SOC_THRESHOLD = 0.5       # Battery SOC (fraction); concern increases below 50%
+_SOC_ALPHA = 0.15          # Softplus width in SOC units; small value = sharp transition
+_UPS_FAULT_PENALTY = 5.0   # Fixed penalty added for each UPS in FAULT mode
+_POWER_NORM = 4.0          # Divisor inside tanh for the summed power penalty
+# Efficiency (read by RewardFunction._efficiency)
+_PUE_NORM = 2.0            # PUE sensitivity: PUE=3.0 → R≈-0.76
+# Action quality: command names exempt from the repeated-command penalty
+_REPEAT_WHITELIST = frozenset({"wait", "check_status"})
+# ---------------------------------------------------------------------------
+# Main reward function
+# ---------------------------------------------------------------------------
+class RewardFunction:
+    """Composable, research-informed reward function for DC operations.
+    Usage:
+        rf = RewardFunction(scenario_type="thermal")
+        rf.reset()   # Call at episode start
+        # Each step:
+        components = rf.compute(thermal_sim, power_sim, cmd_result,
+                                action_command, action_history, scenario_result)
+        reward = components.total
+    """
+    def __init__(
+        self,
+        scenario_type: str = "default",
+        weights: Optional[RewardWeights] = None,
+    ) -> None:
+        self._scenario_type = scenario_type
+        self._weights = weights or WEIGHT_PROFILES.get(
+            scenario_type, WEIGHT_PROFILES["default"]
+        )
+        self._prev_progress: float = 0.0
+    def reset(self) -> None:
+        """Reset state between episodes."""
+        self._prev_progress = 0.0
+    def compute(
+        self,
+        thermal_sim: ThermalSimulation,
+        power_sim: Optional[PowerSimulation],
+        cmd_result: CommandResult,
+        action_command: str,
+        action_history: list[str],
+        scenario_result: Optional[ScenarioResult],
+    ) -> RewardComponents:
+        """Compute all reward components and weighted total.
+        Returns RewardComponents with per-component values and total.
+        Total is clamped to [-1, 1].
+        """
+        r_thermal = self._thermal_safety(thermal_sim)
+        r_power = self._power_safety(power_sim)
+        r_efficiency = self._efficiency(thermal_sim, power_sim)
+        r_progress = self._scenario_progress(scenario_result)
+        r_procedure = self._procedure(scenario_result)
+        r_action = self._action_quality(
+            cmd_result, action_command, action_history, thermal_sim, power_sim,
+        )
+        w = self._weights
+        total = (
+            w.thermal_safety * r_thermal
+            + w.power_safety * r_power
+            + w.efficiency * r_efficiency
+            + w.scenario_progress * r_progress
+            + w.procedure * r_procedure
+            + w.action_quality * r_action
+        )
+        total = max(-1.0, min(1.0, total))
+        return RewardComponents(
+            thermal_safety=r_thermal,
+            power_safety=r_power,
+            efficiency=r_efficiency,
+            scenario_progress=r_progress,
+            procedure=r_procedure,
+            action_quality=r_action,
+            total=total,
+        )
+    # -------------------------------------------------------------------
+    # Component implementations
+    # -------------------------------------------------------------------
+    @staticmethod
+    def _thermal_safety(thermal_sim: ThermalSimulation) -> float:
+        """ASHRAE compliance via dual softplus barriers.
+        Returns value in [-1, _SAFE_BASELINE].
+        Two barriers per zone: recommended (gentle) and allowable (steep).
+        Averaged across zones so the signal is independent of zone count.
+        Positive baseline (+0.1) when ALL zones are well within safe range
+        (>= _SAFE_MARGIN_C below recommended max). This provides gradient
+        signal for maintaining good state, not just avoiding violations.
+        (Informed by DCRL-Green, ICLR 2025.)
+        """
+        zones = thermal_sim.state.zones
+        if not zones:
+            return 0.0
+        n_zones = len(zones)
+        penalty = 0.0
+        all_safe = True
+        for zone in zones:
+            ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
+            if not ashrae:
+                continue
+            t = zone.max_inlet_temp_c
+            rec_max = ashrae.recommended_max_c
+            allow_max = ashrae.allowable_max_c
+            # Check if zone is well within safe range
+            if t > rec_max - _SAFE_MARGIN_C:
+                all_safe = False
+            # Soft barrier at recommended limit
+            penalty += softplus((t - rec_max) / _ALPHA_RECOMMENDED) / n_zones
+            # Harder barrier at allowable limit
+            penalty += (
+                _ALLOWABLE_WEIGHT
+                * softplus((t - allow_max) / _ALPHA_ALLOWABLE)
+                / n_zones
+            )
+        if penalty < 1e-6 and all_safe:
+            return _SAFE_BASELINE
+        return -math.tanh(penalty / _THERMAL_NORM)
+    @staticmethod
+    def _power_safety(power_sim: Optional[PowerSimulation]) -> float:
+        """UPS battery and fault condition penalty.
+        Returns value in [-1, 0].
+        Penalty compounds across multiple failing UPS units.
+        """
+        if power_sim is None:
+            return 0.0
+        penalty = 0.0
+        for ups in power_sim.state.ups_units:
+            if ups.mode == UPSMode.ON_BATTERY:
+                penalty += softplus((_SOC_THRESHOLD - ups.battery_soc) / _SOC_ALPHA)
+            elif ups.mode == UPSMode.FAULT:
+                penalty += _UPS_FAULT_PENALTY
+        return -math.tanh(penalty / _POWER_NORM)
+    @staticmethod
+    def _efficiency(
+        thermal_sim: ThermalSimulation,
+        power_sim: Optional[PowerSimulation],
+    ) -> float:
+        """PUE-based energy efficiency penalty.
+        Returns value in [-1, 0].
+        PUE 1.0 (ideal) → 0, PUE 2.0 → -0.46, PUE 3.0 → -0.76.
+        During power emergencies (UPS on battery), efficiency is suppressed
+        to zero — the agent should not be penalized for load shedding that
+        increases PUE but correctly preserves battery life.
+        """
+        # Suppress efficiency signal during power emergencies
+        if power_sim is not None:
+            for ups in power_sim.state.ups_units:
+                if ups.mode in (UPSMode.ON_BATTERY, UPSMode.FAULT):
+                    return 0.0
+        pue = thermal_sim.state.pue
+        return -math.tanh((pue - 1.0) / _PUE_NORM)
+    def _scenario_progress(self, scenario_result: Optional[ScenarioResult]) -> float:
+        """Delta-based progress toward scenario resolution.
+        Returns value in [-1, 1].
+        Rewards the CHANGE in progress — gives credit to the action that
+        actually caused forward progress.
+        """
+        if scenario_result is None:
+            return 0.0
+        current = scenario_result.progress
+        delta = current - self._prev_progress
+        self._prev_progress = current
+        return max(-1.0, min(1.0, delta))
+    @staticmethod
+    def _procedure(scenario_result: Optional[ScenarioResult]) -> float:
+        """Procedural correctness from scenario rules.
+        Returns value in [-1, 1].
+        """
+        if scenario_result is None:
+            return 0.0
+        return max(-1.0, min(1.0, scenario_result.procedure_reward))
+    @staticmethod
+    def _action_quality(
+        cmd_result: CommandResult,
+        action_command: str,
+        action_history: list[str],
+        thermal_sim: ThermalSimulation,
+        power_sim: Optional[PowerSimulation],
+    ) -> float:
+        """Action quality assessment.
+        Returns value in [-1, 1].
+        Considers: validity, repetition, action type, urgency context.
+        """
+        if not cmd_result.success:
+            return -0.5
+        cmd_lower = action_command.strip().lower()
+        name = cmd_result.command_name
+        # Check for exact repeated command — but whitelist commands that
+        # are legitimately repeatable (wait, check_status).
+        if name not in _REPEAT_WHITELIST:
+            prior = (
+                [h.strip().lower() for h in action_history[:-1]]
+                if len(action_history) > 1
+                else []
+            )
+            if cmd_lower in prior:
+                return -0.2
+        # "wait" quality depends on whether there's an active concern
+        if name == "wait":
+            if _has_active_concern(thermal_sim, power_sim):
+                # Waiting during a power event where we're waiting for
+                # generator startup is acceptable — check if generator
+                # is in startup sequence.
+                if power_sim is not None and _generator_starting(power_sim):
+                    return 0.1  # Waiting for gen to warm up is reasonable
+                return -0.2  # Waiting during a thermal problem
+            return 0.0  # Nothing wrong, waiting is fine
+        # Information-gathering actions are valuable
+        if name in ("diagnose", "check_status"):
+            return 0.3
+        # Active interventions
+        if name in (
+            "adjust_setpoint", "set_fan_speed", "set_rack_load",
+            "migrate_workload", "start_generator", "stop_generator",
+            "set_ups_mode", "start_crac", "stop_crac", "refuel_generator",
+        ):
+            return 0.2
+        # Administrative
+        if name == "acknowledge_alarm":
+            return 0.1
+        # Escalation — handled solely by scenario procedure rules now,
+        # no extra penalty here. The environment no longer double-penalizes.
+        if name == "escalate":
+            return -0.1
+        return 0.1  # Other valid commands
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _has_active_concern(
+    thermal_sim: ThermalSimulation,
+    power_sim: Optional[PowerSimulation],
+) -> bool:
+    """Check if there is an active thermal or power concern."""
+    for zone in thermal_sim.state.zones:
+        ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
+        if ashrae and zone.max_inlet_temp_c > ashrae.recommended_max_c:
+            return True
+    if power_sim:
+        for ups in power_sim.state.ups_units:
+            if ups.mode == UPSMode.ON_BATTERY:
+                return True
+    return False
+def _generator_starting(power_sim: PowerSimulation) -> bool:
+    """Check if the generator is in a startup sequence (agent should wait)."""
+    from ..simulation.types import GeneratorState
+    return power_sim.state.generator.state in (
+        GeneratorState.START_DELAY,
+        GeneratorState.CRANKING,
+        GeneratorState.WARMING,
+    )

scenarios/__init__.py ADDED Viewed

	@@ -0,0 +1,31 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Datacenter operation scenarios."""
+from .base import ProcedureRule, Scenario, ScenarioResult
+from .registry import (
+    get_scenario,
+    list_scenarios,
+    random_scenario,
+    register_scenario,
+    registered_scenario_ids,
+)
+# Import scenario modules to trigger registration
+from . import thermal_scenarios  # noqa: F401
+from . import power_scenarios    # noqa: F401
+# Public API of the scenarios package: the base types and registry helpers
+# imported above. The scenario modules themselves are imported only for
+# their registration side effect and are not re-exported.
+__all__ = [
+    "ProcedureRule",
+    "Scenario",
+    "ScenarioResult",
+    "get_scenario",
+    "list_scenarios",
+    "random_scenario",
+    "register_scenario",
+    "registered_scenario_ids",
+]

scenarios/base.py ADDED Viewed

	@@ -0,0 +1,195 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Abstract base class for datacenter operation scenarios.
+A Scenario defines:
+  - Initial datacenter configuration overrides
+  - Fault injection (what goes wrong)
+  - Available actions for the agent
+  - Resolution criteria (how to "win")
+  - Scenario-specific reward shaping
+  - Procedural correctness rules (diagnose before repair, etc.)
+"""
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any
+from ..config import DatacenterConfig
+from ..simulation.thermal import ThermalSimulation
+from ..simulation.power import PowerSimulation
+@dataclass
+class ProcedureRule:
+    """A procedural correctness rule for reward shaping.
+    Scenario.check_procedure() compares only the lowercased first word of
+    each command, so command names here should be lowercase command words
+    without arguments.
+    Attributes:
+        required_before: Commands that must appear before `trigger_command`.
+            An empty list is always satisfied, so the bonus always applies.
+        trigger_command: The command this rule applies to.
+        bonus: Reward bonus if required_before was satisfied.
+        penalty: Reward penalty if trigger_command issued without required_before.
+        description: Human-readable explanation.
+    """
+    required_before: list[str]
+    trigger_command: str
+    bonus: float = 0.3
+    penalty: float = -0.2
+    description: str = ""
+@dataclass
+class ScenarioResult:
+    """Outcome of checking scenario state after a step.
+    Returned by Scenario.evaluate_step(). All fields default to a neutral
+    "nothing happened" value.
+    Attributes:
+        resolved: True if the incident is successfully resolved.
+        resolution_message: Human-readable message on resolution.
+        scenario_reward: Legacy scenario-specific reward (kept for compat).
+        procedure_reward: Procedural correctness reward from check_procedure().
+        progress: Normalized [0, 1] progress toward resolution.
+            Used by the delta-based reward function for credit assignment.
+        info: Additional scenario-specific data for logging.
+    """
+    resolved: bool = False
+    resolution_message: str = ""
+    scenario_reward: float = 0.0
+    procedure_reward: float = 0.0
+    progress: float = 0.0
+    # default_factory gives every result its own dict instead of a shared one.
+    info: dict[str, Any] = field(default_factory=dict)
+class Scenario(ABC):
+    """Abstract base class for datacenter operation scenarios.
+    Lifecycle:
+      1. Environment calls `configure(config)` to get modified DatacenterConfig
+      2. Environment calls `inject_fault(thermal_sim, power_sim)` after warmup
+      3. Each step, environment calls `evaluate_step(...)` for reward + resolution
+      4. Environment uses `alert_message`, `step_budget`, etc. for episode control
+    Subclasses must implement all abstract methods/properties.
+    """
+    @abstractmethod
+    def configure(self, base_config: DatacenterConfig) -> DatacenterConfig:
+        """Optionally modify the datacenter configuration for this scenario.
+        Override to change rack loads, outside temperature, number of CRACs, etc.
+        Return the base_config unchanged if no modifications needed.
+        """
+    def reset_state(self) -> None:
+        """Reset mutable episode state between episodes.
+        Called by the environment at the start of each episode, before
+        configure() / inject_fault(). Subclasses with mutable state
+        (counters, flags) MUST override this and reset them.
+        """
+    @abstractmethod
+    def inject_fault(
+        self,
+        thermal_sim: ThermalSimulation,
+        power_sim: PowerSimulation | None,
+    ) -> None:
+        """Inject the fault or initial condition into the running simulation.
+        Called after warmup, so the datacenter is at quasi-steady-state.
+        """
+    @abstractmethod
+    def evaluate_step(
+        self,
+        thermal_sim: ThermalSimulation,
+        power_sim: PowerSimulation | None,
+        action_command: str,
+        action_history: list[str],
+        step: int,
+    ) -> ScenarioResult:
+        """Evaluate the current state after a step.
+        Returns ScenarioResult with:
+          - resolved: True if the incident is successfully resolved
+          - scenario_reward: Scenario-specific reward component
+          - procedure_reward: Procedural correctness reward
+        """
+    @property
+    @abstractmethod
+    def scenario_id(self) -> str:
+        """Unique identifier, e.g. 'A1', 'B4'."""
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Human-readable scenario name."""
+    @property
+    @abstractmethod
+    def scenario_type(self) -> str:
+        """Category: 'thermal', 'power', 'network', 'incident'."""
+    @property
+    @abstractmethod
+    def difficulty(self) -> str:
+        """'easy', 'medium', 'hard'."""
+    @property
+    @abstractmethod
+    def step_budget(self) -> int:
+        """Maximum steps allowed for this scenario."""
+    @property
+    @abstractmethod
+    def alert_message(self) -> str:
+        """Initial alert shown to the agent."""
+    @property
+    def game_time_per_step_s(self) -> float:
+        """Simulation time per agent step. Override for faster/slower scenarios."""
+        return 60.0
+    @property
+    def procedure_rules(self) -> list[ProcedureRule]:
+        """Procedural correctness rules. Override to define scenario-specific rules."""
+        return []
+    @property
+    def available_actions(self) -> list[str] | None:
+        """Override to restrict available actions. None = all actions available."""
+        return None
+    def check_procedure(self, action_command: str, action_history: list[str]) -> float:
+        """Check procedural correctness of the current action against history.
+        Returns reward bonus/penalty based on whether required prerequisites
+        were satisfied before the current action.
+        """
+        if not self.procedure_rules:
+            return 0.0
+        # Extract just the command name (first word)
+        cmd_name = action_command.strip().split()[0].lower() if action_command.strip() else ""
+        history_cmds = [h.strip().split()[0].lower() for h in action_history[:-1] if h.strip()]
+        reward = 0.0
+        for rule in self.procedure_rules:
+            if cmd_name == rule.trigger_command:
+                # Check if all required_before commands appeared in history
+                all_satisfied = all(
+                    any(req == h for h in history_cmds)
+                    for req in rule.required_before
+                )
+                if all_satisfied:
+                    reward += rule.bonus
+                else:
+                    reward += rule.penalty
+        return reward

scenarios/power_scenarios.py ADDED Viewed

	@@ -0,0 +1,496 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Power operation scenarios (Category B).
+B1: UPS Alarm Response (Medium)
+    - UPS switches to battery after utility micro-outage
+    - Agent must verify UPS status, check battery, ensure generator readiness
+B3: Generator Test Protocol (Easy)
+    - Monthly generator test — agent must follow proper procedure
+    - Start generator, verify output, run loaded test, cooldown, shutdown
+B4: Power Failure Cascade (Hard)
+    - Full utility loss + generator fails to start
+    - Agent must manage UPS battery time, shed load, troubleshoot generator
+"""
+from __future__ import annotations
+from ..config import ASHRAE_CLASSES, DatacenterConfig
+from ..simulation.thermal import ThermalSimulation
+from ..simulation.power import PowerSimulation
+from ..simulation.types import GeneratorState, UPSMode
+from .base import ProcedureRule, Scenario, ScenarioResult
+from .registry import register_scenario
+# ===========================================================================
+# B1: UPS Alarm Response (Medium)
+# ===========================================================================
+@register_scenario
+class UPSAlarmResponse(Scenario):
+    """Agent responds to UPS switching to battery.
+    Scenario: A brief utility dip caused UPS to transfer to battery.
+    Utility has been restored, but the agent should:
+      1. Check UPS status (diagnose UPS-1)
+      2. Verify battery SOC
+      3. Verify generator is in standby and ready
+      4. Verify ATS is back on utility
+      5. Acknowledge the alarm
+    Resolution: Agent diagnoses UPS AND acknowledges alarm.
+    The system will self-recover, but proper procedure matters.
+    """
+    _BATTERY_DRAIN_SECONDS = 30  # Brief outage duration
+    def __init__(self) -> None:
+        """Start with neither milestone (UPS diagnosed, alarm acknowledged) reached."""
+        super().__init__()
+        self._diagnosed_ups = False
+        self._acknowledged = False
+    def reset_state(self) -> None:
+        """Clear both milestone flags so a new episode starts from scratch."""
+        self._diagnosed_ups = False
+        self._acknowledged = False
+    def configure(self, base_config: DatacenterConfig) -> DatacenterConfig:
+        """Return `base_config` unchanged; this scenario needs no config overrides."""
+        return base_config
+    def inject_fault(
+        self,
+        thermal_sim: ThermalSimulation,
+        power_sim: PowerSimulation | None,
+    ) -> None:
+        if power_sim is None:
+            return
+        # Simulate a brief utility outage that has already ended
+        # Drain some battery to show it was on battery
+        for ups in power_sim.state.ups_units:
+            ups.battery_soc = 0.85  # ~15% used during brief outage
+            ups.mode = UPSMode.DOUBLE_CONVERSION  # Already back on utility
+    def evaluate_step(
+        self,
+        thermal_sim: ThermalSimulation,
+        power_sim: PowerSimulation | None,
+        action_command: str,
+        action_history: list[str],
+        step: int,
+    ) -> ScenarioResult:
+        cmd = action_command.strip().lower()
+        # Track diagnosis
+        if cmd.startswith("diagnose") and "ups" in cmd:
+            self._diagnosed_ups = True
+        if cmd.startswith("acknowledge"):
+            self._acknowledged = True
+        resolved = self._diagnosed_ups and self._acknowledged
+        # Reward for proper investigation
+        scenario_reward = 0.0
+        if self._diagnosed_ups:
+            scenario_reward += 0.3
+        if self._acknowledged:
+            scenario_reward += 0.2
+        procedure_reward = self.check_procedure(action_command, action_history)
+        # Progress: 50% for diagnose, 50% for acknowledge
+        progress = 0.0
+        if self._diagnosed_ups:
+            progress += 0.5
+        if self._acknowledged:
+            progress += 0.5
+        return ScenarioResult(
+            resolved=resolved,
+            resolution_message="UPS alarm properly investigated and acknowledged." if resolved else "",
+            scenario_reward=scenario_reward,
+            procedure_reward=procedure_reward,
+            progress=progress,
+            info={
+                "diagnosed_ups": self._diagnosed_ups,
+                "acknowledged": self._acknowledged,
+            },
+        )
+    @property
+    def scenario_id(self) -> str:
+        return "B1"
+    @property
+    def name(self) -> str:
+        return "UPS Alarm Response"
+    @property
+    def scenario_type(self) -> str:
+        return "power"
+    @property
+    def difficulty(self) -> str:
+        return "medium"
+    @property
+    def step_budget(self) -> int:
+        return 10
+    @property
+    def alert_message(self) -> str:
+        return (
+            "WARNING: UPS-1 transferred to battery at 14:23:05. "
+            "Utility restored at 14:23:35. Battery SOC: 85%. "
+            "Verify system status and acknowledge."
+        )
+    @property
+    def procedure_rules(self) -> list[ProcedureRule]:
+        return [
+            ProcedureRule(
+                required_before=["diagnose"],
+                trigger_command="acknowledge_alarm",
+                bonus=0.3,
+                penalty=-0.2,
+                description="Diagnose UPS before acknowledging alarm",
+            ),
+        ]
+# ===========================================================================
+# B3: Generator Test Protocol (Easy)
+# ===========================================================================
+@register_scenario
+class GeneratorTestProtocol(Scenario):
+    """Agent must follow proper monthly generator test procedure.
+    Correct sequence:
+      1. check_status — Review current system state
+      2. start_generator — Initiate startup
+      3. diagnose GEN-1 — Verify engine started and output is stable
+      4. stop_generator — Initiate cooldown
+      5. acknowledge_alarm — Log test completion
+    Resolution: Generator successfully started, verified, and shut down.
+    """
+    def __init__(self) -> None:
+        super().__init__()
+        self._started = False
+        self._verified = False
+        self._stopped = False
+        self._completed = False
+    def reset_state(self) -> None:
+        self._started = False
+        self._verified = False
+        self._stopped = False
+        self._completed = False
+    def configure(self, base_config: DatacenterConfig) -> DatacenterConfig:
+        return base_config
+    def inject_fault(
+        self,
+        thermal_sim: ThermalSimulation,
+        power_sim: PowerSimulation | None,
+    ) -> None:
+        # No fault — this is a routine test procedure
+        pass
+    def evaluate_step(
+        self,
+        thermal_sim: ThermalSimulation,
+        power_sim: PowerSimulation | None,
+        action_command: str,
+        action_history: list[str],
+        step: int,
+    ) -> ScenarioResult:
+        cmd = action_command.strip().lower()
+        if cmd.startswith("start_generator"):
+            self._started = True
+        if self._started and cmd.startswith("diagnose") and "gen" in cmd:
+            self._verified = True
+        if cmd.startswith("stop_generator"):
+            if self._started and self._verified:
+                self._stopped = True
+        if cmd.startswith("acknowledge") and self._stopped:
+            self._completed = True
+        # Check generator state
+        gen_running = False
+        if power_sim:
+            gen_running = power_sim.state.generator.state in (
+                GeneratorState.READY, GeneratorState.LOADED,
+                GeneratorState.WARMING, GeneratorState.CRANKING,
+                GeneratorState.START_DELAY,
+            )
+        resolved = self._completed
+        scenario_reward = 0.0
+        if self._started:
+            scenario_reward += 0.1
+        if self._verified:
+            scenario_reward += 0.2
+        if self._stopped:
+            scenario_reward += 0.2
+        if self._completed:
+            scenario_reward += 0.3
+        procedure_reward = self.check_procedure(action_command, action_history)
+        # Progress: 25% per protocol step
+        progress = 0.0
+        if self._started:
+            progress += 0.25
+        if self._verified:
+            progress += 0.25
+        if self._stopped:
+            progress += 0.25
+        if self._completed:
+            progress += 0.25
+        return ScenarioResult(
+            resolved=resolved,
+            resolution_message="Generator test protocol completed successfully." if resolved else "",
+            scenario_reward=scenario_reward,
+            procedure_reward=procedure_reward,
+            progress=progress,
+            info={
+                "started": self._started,
+                "verified": self._verified,
+                "stopped": self._stopped,
+                "completed": self._completed,
+                "gen_running": gen_running,
+            },
+        )
+    @property
+    def scenario_id(self) -> str:
+        return "B3"
+    @property
+    def name(self) -> str:
+        return "Generator Test Protocol"
+    @property
+    def scenario_type(self) -> str:
+        return "power"
+    @property
+    def difficulty(self) -> str:
+        return "easy"
+    @property
+    def step_budget(self) -> int:
+        return 10
+    @property
+    def alert_message(self) -> str:
+        return (
+            "SCHEDULED: Monthly generator test due. "
+            "Follow standard test protocol: start, verify, loaded test, shutdown."
+        )
+    @property
+    def procedure_rules(self) -> list[ProcedureRule]:
+        return [
+            ProcedureRule(
+                required_before=["check_status"],
+                trigger_command="start_generator",
+                bonus=0.2,
+                penalty=-0.1,
+                description="Check system status before starting generator",
+            ),
+            ProcedureRule(
+                required_before=["start_generator"],
+                trigger_command="stop_generator",
+                bonus=0.2,
+                penalty=-0.3,
+                description="Must start generator before stopping it",
+            ),
+        ]
+    @property
+    def game_time_per_step_s(self) -> float:
+        # Generator startup is ~17s, so 30s per step lets agent observe transitions
+        return 30.0
+# ===========================================================================
+# B4: Power Failure Cascade (Hard)
+# ===========================================================================
+@register_scenario
+class PowerFailureCascade(Scenario):
+    """Full utility loss with degraded generator response.
+    Scenario: Utility power fails. Generator starts but takes longer than
+    usual (warm-up extended). UPS batteries are bridging the gap.
+    Meanwhile, battery SOC is dropping.
+    The agent must:
+      1. Diagnose UPS status and battery levels
+      2. Verify generator startup sequence
+      3. Shed non-critical IT load to extend battery life
+      4. Monitor temperatures (no cooling compressors during transfer)
+      5. Stabilize once generator is online
+    Resolution: Generator loaded AND all temps within allowable limits
+    AND UPS battery SOC stabilized (charging or >20%).
+    """
+    _CONSECUTIVE_STABLE_STEPS = 2
+    def __init__(self) -> None:
+        super().__init__()
+        self._stable_count = 0
+    def reset_state(self) -> None:
+        self._stable_count = 0
+    def configure(self, base_config: DatacenterConfig) -> DatacenterConfig:
+        # Extend generator warmup to make it more challenging
+        base_config.power.generator.warmup_time_s = 15.0  # Longer than default 8s
+        return base_config
+    def inject_fault(
+        self,
+        thermal_sim: ThermalSimulation,
+        power_sim: PowerSimulation | None,
+    ) -> None:
+        if power_sim is None:
+            return
+        power_sim.set_utility_available(False)
+    def evaluate_step(
+        self,
+        thermal_sim: ThermalSimulation,
+        power_sim: PowerSimulation | None,
+        action_command: str,
+        action_history: list[str],
+        step: int,
+    ) -> ScenarioResult:
+        dc = thermal_sim.state
+        # Check temperatures
+        all_within_allowable = True
+        max_over = 0.0
+        for zone in dc.zones:
+            ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
+            if not ashrae:
+                continue
+            if zone.max_inlet_temp_c > ashrae.allowable_max_c:
+                all_within_allowable = False
+                max_over = max(max_over, zone.max_inlet_temp_c - ashrae.allowable_max_c)
+        # Check power recovery
+        gen_loaded = False
+        battery_ok = True
+        if power_sim:
+            gen_loaded = power_sim.state.generator.state == GeneratorState.LOADED
+            for ups in power_sim.state.ups_units:
+                if ups.battery_soc < 0.10:
+                    battery_ok = False
+        stable = all_within_allowable and gen_loaded and battery_ok
+        if stable:
+            self._stable_count += 1
+        else:
+            self._stable_count = 0
+        resolved = self._stable_count >= self._CONSECUTIVE_STABLE_STEPS
+        # Reward shaping
+        scenario_reward = 0.0
+        # Penalty for temperature overshoot
+        if max_over > 0:
+            scenario_reward -= max_over * 1.5
+        # Reward for generator online
+        if gen_loaded:
+            scenario_reward += 0.3
+        # Penalty for low battery
+        if power_sim:
+            min_soc = min(u.battery_soc for u in power_sim.state.ups_units) if power_sim.state.ups_units else 1.0
+            if min_soc < 0.20:
+                scenario_reward -= (0.20 - min_soc) * 5.0
+        procedure_reward = self.check_procedure(action_command, action_history)
+        # Progress: partial credit per condition, full credit for stability
+        conditions_met = sum([gen_loaded, all_within_allowable, battery_ok])
+        if conditions_met == 3:
+            progress = 0.5 + 0.5 * min(1.0, self._stable_count / self._CONSECUTIVE_STABLE_STEPS)
+        else:
+            progress = (conditions_met / 3.0) * 0.5
+        return ScenarioResult(
+            resolved=resolved,
+            resolution_message="Power failure resolved. Generator online, temps stable." if resolved else "",
+            scenario_reward=scenario_reward,
+            procedure_reward=procedure_reward,
+            progress=progress,
+            info={
+                "max_overshoot_c": max_over,
+                "gen_loaded": gen_loaded,
+                "battery_ok": battery_ok,
+                "stable_count": self._stable_count,
+            },
+        )
+    @property
+    def scenario_id(self) -> str:
+        return "B4"
+    @property
+    def name(self) -> str:
+        return "Power Failure Cascade"
+    @property
+    def scenario_type(self) -> str:
+        return "power"
+    @property
+    def difficulty(self) -> str:
+        return "hard"
+    @property
+    def step_budget(self) -> int:
+        return 20
+    @property
+    def alert_message(self) -> str:
+        return (
+            "CRITICAL: Utility power lost. UPS on battery. "
+            "Generator startup in progress. "
+            "Battery SOC declining. Immediate action required."
+        )
+    @property
+    def procedure_rules(self) -> list[ProcedureRule]:
+        return [
+            ProcedureRule(
+                required_before=["diagnose"],
+                trigger_command="set_rack_load",
+                bonus=0.3,
+                penalty=-0.1,
+                description="Diagnose before shedding load",
+            ),
+            ProcedureRule(
+                required_before=[],
+                trigger_command="escalate",
+                bonus=0.0,
+                penalty=-0.5,
+                description="Escalation during power cascade is heavily penalized",
+            ),
+        ]
+    @property
+    def game_time_per_step_s(self) -> float:
+        # Fast progression — every second counts with battery draining
+        return 15.0

scenarios/registry.py ADDED Viewed

	@@ -0,0 +1,81 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Scenario registry for selecting scenarios by ID, type, or difficulty.
+"""
+from __future__ import annotations
+import random
+from typing import Optional
+from .base import Scenario
+# Global registry: scenario_id → Scenario class
+_REGISTRY: dict[str, type[Scenario]] = {}
+def register_scenario(cls: type[Scenario]) -> type[Scenario]:
+    """Class decorator to register a scenario.
+    Usage:
+        @register_scenario
+        class MyCoolScenario(Scenario):
+            ...
+    """
+    # Instantiate temporarily to read scenario_id
+    instance = cls()
+    _REGISTRY[instance.scenario_id] = cls
+    return cls
+def get_scenario(scenario_id: str) -> Scenario:
+    """Get a scenario by its ID (e.g. 'A1', 'B4')."""
+    cls = _REGISTRY.get(scenario_id)
+    if cls is None:
+        available = ", ".join(sorted(_REGISTRY.keys()))
+        raise KeyError(f"Unknown scenario '{scenario_id}'. Available: {available}")
+    return cls()
+def list_scenarios(
+    *,
+    scenario_type: Optional[str] = None,
+    difficulty: Optional[str] = None,
+) -> list[Scenario]:
+    """List registered scenarios, optionally filtered by type or difficulty."""
+    result = []
+    for cls in _REGISTRY.values():
+        instance = cls()
+        if scenario_type and instance.scenario_type != scenario_type:
+            continue
+        if difficulty and instance.difficulty != difficulty:
+            continue
+        result.append(instance)
+    return result
+def random_scenario(
+    *,
+    scenario_type: Optional[str] = None,
+    difficulty: Optional[str] = None,
+    seed: Optional[int] = None,
+) -> Scenario:
+    """Pick a random scenario from the registry, optionally filtered."""
+    candidates = list_scenarios(scenario_type=scenario_type, difficulty=difficulty)
+    if not candidates:
+        raise ValueError(
+            f"No scenarios match type={scenario_type!r}, difficulty={difficulty!r}"
+        )
+    rng = random.Random(seed)
+    return rng.choice(candidates)
+def registered_scenario_ids() -> list[str]:
+    """Return all registered scenario IDs in sorted order."""
+    return sorted(_REGISTRY.keys())

scenarios/thermal_scenarios.py ADDED Viewed

	@@ -0,0 +1,443 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Thermal operation scenarios (Category A).
+A1: Cooling Setpoint Optimization (Easy)
+    - PUE is high because setpoints are too low
+    - Agent must raise setpoints to improve efficiency without violating ASHRAE
+A2: Thermal Event Response (Medium)
+    - Single CRAC failure causes temperature rise
+    - Agent must diagnose, compensate, and stabilize
+A4: CRAC Failure Cascade (Hard)
+    - Two CRACs fail in quick succession
+    - Agent must triage, redistribute cooling, migrate workload
+"""
+from __future__ import annotations
+from ..config import ASHRAE_CLASSES, DatacenterConfig
+from ..simulation.thermal import ThermalSimulation
+from ..simulation.power import PowerSimulation
+from ..simulation.types import CRACFaultType
+from .base import ProcedureRule, Scenario, ScenarioResult
+from .registry import register_scenario
+# ===========================================================================
+# A1: Cooling Setpoint Optimization (Easy)
+# ===========================================================================
+@register_scenario
+class CoolingSetpointOptimization(Scenario):
+    """Agent must optimize CRAC setpoints to reduce PUE.
+    Initial condition: All CRACs at 15°C setpoint (overly aggressive cooling).
+    This wastes energy — PUE is unnecessarily high.
+    Goal: Raise setpoints closer to ASHRAE recommended range (18-27°C for A2)
+    while keeping all inlet temps within recommended limits.
+    Resolution: PUE drops below target AND all temps within recommended range.
+    """
+    _PUE_TARGET = 1.6  # Achievable PUE with proper setpoints
+    def configure(self, base_config: DatacenterConfig) -> DatacenterConfig:
+        # Set all CRACs to 15°C (too cold, wasting energy)
+        for zone_cfg in base_config.zones:
+            for crac_cfg in zone_cfg.crac_units:
+                crac_cfg.initial_setpoint_c = 15.0
+        return base_config
+    def inject_fault(
+        self,
+        thermal_sim: ThermalSimulation,
+        power_sim: PowerSimulation | None,
+    ) -> None:
+        # No fault — this is an optimization scenario
+        # The "problem" is already baked into the config (low setpoints)
+        pass
+    def evaluate_step(
+        self,
+        thermal_sim: ThermalSimulation,
+        power_sim: PowerSimulation | None,
+        action_command: str,
+        action_history: list[str],
+        step: int,
+    ) -> ScenarioResult:
+        dc = thermal_sim.state
+        pue = dc.pue
+        # Check all zones within recommended
+        all_within_recommended = True
+        for zone in dc.zones:
+            ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
+            if ashrae and zone.max_inlet_temp_c > ashrae.recommended_max_c:
+                all_within_recommended = False
+                break
+        # Reward: improvement toward target PUE
+        # Baseline PUE at 15°C setpoints is ~2.0+, target is ~1.6
+        pue_reward = max(0, 2.0 - pue)  # Higher is better as PUE drops
+        resolved = pue < self._PUE_TARGET and all_within_recommended
+        procedure_reward = self.check_procedure(action_command, action_history)
+        # Progress: PUE improvement toward target + temperature compliance
+        pue_progress = max(0.0, min(1.0, (2.0 - pue) / (2.0 - self._PUE_TARGET)))
+        temp_factor = 1.0 if all_within_recommended else 0.0
+        progress = 0.7 * pue_progress + 0.3 * temp_factor
+        return ScenarioResult(
+            resolved=resolved,
+            resolution_message="PUE optimized within target range." if resolved else "",
+            scenario_reward=pue_reward * 0.5,
+            procedure_reward=procedure_reward,
+            progress=progress,
+            info={"pue": pue, "target_pue": self._PUE_TARGET},
+        )
+    @property
+    def scenario_id(self) -> str:
+        return "A1"
+    @property
+    def name(self) -> str:
+        return "Cooling Setpoint Optimization"
+    @property
+    def scenario_type(self) -> str:
+        return "thermal"
+    @property
+    def difficulty(self) -> str:
+        return "easy"
+    @property
+    def step_budget(self) -> int:
+        return 10
+    @property
+    def alert_message(self) -> str:
+        return (
+            "NOTICE: PUE exceeds 1.8 — cooling setpoints may be suboptimal. "
+            "Review CRAC setpoints and adjust for energy efficiency."
+        )
+    @property
+    def procedure_rules(self) -> list[ProcedureRule]:
+        return [
+            ProcedureRule(
+                required_before=["check_status"],
+                trigger_command="adjust_setpoint",
+                bonus=0.2,
+                penalty=-0.1,
+                description="Check status before adjusting setpoints",
+            ),
+        ]
+# ===========================================================================
+# A2: Thermal Event Response (Medium)
+# ===========================================================================
+@register_scenario
+class ThermalEventResponse(Scenario):
+    """Agent must respond to a single CRAC compressor failure.
+    A CRAC unit suffers a compressor failure, reducing cooling capacity.
+    With N+1 provisioning the remaining CRACs can handle the load,
+    but the agent should:
+      1. Diagnose the failed unit
+      2. Increase fan speeds or adjust setpoints on remaining CRACs
+      3. Optionally reduce load on hottest racks
+    Resolution: All inlet temps within recommended range for 2+ consecutive steps.
+    """
+    _FAILED_UNIT = "CRAC-3"
+    _CONSECUTIVE_STABLE_STEPS = 2
+    def __init__(self) -> None:
+        super().__init__()
+        self._stable_count = 0
+    def reset_state(self) -> None:
+        self._stable_count = 0
+    def configure(self, base_config: DatacenterConfig) -> DatacenterConfig:
+        return base_config  # Default config is fine
+    def inject_fault(
+        self,
+        thermal_sim: ThermalSimulation,
+        power_sim: PowerSimulation | None,
+    ) -> None:
+        thermal_sim.inject_crac_fault(self._FAILED_UNIT, CRACFaultType.COMPRESSOR)
+    def evaluate_step(
+        self,
+        thermal_sim: ThermalSimulation,
+        power_sim: PowerSimulation | None,
+        action_command: str,
+        action_history: list[str],
+        step: int,
+    ) -> ScenarioResult:
+        dc = thermal_sim.state
+        # Check if all zones within recommended
+        all_within_recommended = True
+        max_over = 0.0
+        for zone in dc.zones:
+            ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
+            if not ashrae:
+                continue
+            if zone.max_inlet_temp_c > ashrae.recommended_max_c:
+                all_within_recommended = False
+                max_over = max(max_over, zone.max_inlet_temp_c - ashrae.recommended_max_c)
+        if all_within_recommended:
+            self._stable_count += 1
+        else:
+            self._stable_count = 0
+        resolved = self._stable_count >= self._CONSECUTIVE_STABLE_STEPS
+        # Scenario reward: penalty proportional to temperature overshoot
+        scenario_reward = -max_over * 0.5 if max_over > 0 else 0.1
+        procedure_reward = self.check_procedure(action_command, action_history)
+        # Progress: partial credit for being close, full credit for stability
+        if all_within_recommended:
+            progress = 0.5 + 0.5 * min(1.0, self._stable_count / self._CONSECUTIVE_STABLE_STEPS)
+        else:
+            progress = max(0.0, 0.4 / (1.0 + max_over))
+        return ScenarioResult(
+            resolved=resolved,
+            resolution_message="Thermal event stabilized. All zones within recommended range." if resolved else "",
+            scenario_reward=scenario_reward,
+            procedure_reward=procedure_reward,
+            progress=progress,
+            info={"max_overshoot_c": max_over, "stable_count": self._stable_count},
+        )
+    @property
+    def scenario_id(self) -> str:
+        return "A2"
+    @property
+    def name(self) -> str:
+        return "Thermal Event Response"
+    @property
+    def scenario_type(self) -> str:
+        return "thermal"
+    @property
+    def difficulty(self) -> str:
+        return "medium"
+    @property
+    def step_budget(self) -> int:
+        return 15
+    @property
+    def alert_message(self) -> str:
+        return (
+            f"CRITICAL: {self._FAILED_UNIT} compressor failure detected. "
+            "Zone B temperatures rising. Investigate and stabilize."
+        )
+    @property
+    def procedure_rules(self) -> list[ProcedureRule]:
+        return [
+            ProcedureRule(
+                required_before=["diagnose"],
+                trigger_command="adjust_setpoint",
+                bonus=0.3,
+                penalty=-0.2,
+                description="Diagnose the fault before adjusting setpoints",
+            ),
+            ProcedureRule(
+                required_before=["diagnose"],
+                trigger_command="set_fan_speed",
+                bonus=0.3,
+                penalty=-0.2,
+                description="Diagnose the fault before adjusting fan speed",
+            ),
+            ProcedureRule(
+                required_before=[],
+                trigger_command="escalate",
+                bonus=0.0,
+                penalty=-0.3,
+                description="Escalated without attempting diagnosis or fix",
+            ),
+        ]
+# ===========================================================================
+# A4: CRAC Failure Cascade (Hard)
+# ===========================================================================
+@register_scenario
+class CRACFailureCascade(Scenario):
+    """Two CRACs fail, overwhelming remaining cooling capacity.
+    CRAC-1 has a compressor failure and CRAC-3 has a fan failure.
+    With only 2 of 4 CRACs operational, cooling is severely degraded.
+    The agent must:
+      1. Diagnose both failures
+      2. Aggressively compensate (max fan speeds, lower setpoints on survivors)
+      3. Reduce IT load on hottest racks (workload migration)
+      4. Monitor and stabilize before thermal runaway
+    Resolution: All inlet temps below allowable max for 2+ steps.
+    """
+    _FAILED_UNITS = [
+        ("CRAC-1", CRACFaultType.COMPRESSOR),
+        ("CRAC-3", CRACFaultType.FAN),
+    ]
+    _CONSECUTIVE_STABLE_STEPS = 2
+    def __init__(self) -> None:
+        super().__init__()
+        self._stable_count = 0
+    def reset_state(self) -> None:
+        self._stable_count = 0
+    def configure(self, base_config: DatacenterConfig) -> DatacenterConfig:
+        return base_config
+    def inject_fault(
+        self,
+        thermal_sim: ThermalSimulation,
+        power_sim: PowerSimulation | None,
+    ) -> None:
+        for unit_id, fault_type in self._FAILED_UNITS:
+            thermal_sim.inject_crac_fault(unit_id, fault_type)
+    def evaluate_step(
+        self,
+        thermal_sim: ThermalSimulation,
+        power_sim: PowerSimulation | None,
+        action_command: str,
+        action_history: list[str],
+        step: int,
+    ) -> ScenarioResult:
+        dc = thermal_sim.state
+        all_within_allowable = True
+        max_over = 0.0
+        for zone in dc.zones:
+            ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
+            if not ashrae:
+                continue
+            if zone.max_inlet_temp_c > ashrae.allowable_max_c:
+                all_within_allowable = False
+                max_over = max(max_over, zone.max_inlet_temp_c - ashrae.allowable_max_c)
+        if all_within_allowable:
+            self._stable_count += 1
+        else:
+            self._stable_count = 0
+        resolved = self._stable_count >= self._CONSECUTIVE_STABLE_STEPS
+        # Heavy penalty for being over allowable
+        scenario_reward = -max_over * 2.0 if max_over > 0 else 0.2
+        procedure_reward = self.check_procedure(action_command, action_history)
+        # Bonus for diagnosing both units
+        diagnosed_units = set()
+        for h in action_history:
+            parts = h.strip().split()
+            if len(parts) >= 2 and parts[0].lower() == "diagnose":
+                diagnosed_units.add(parts[1].upper())
+        if "CRAC-1" in diagnosed_units and "CRAC-3" in diagnosed_units:
+            procedure_reward += 0.2  # Bonus for thorough diagnosis
+        # Progress: partial credit for being close, full credit for stability
+        if all_within_allowable:
+            progress = 0.5 + 0.5 * min(1.0, self._stable_count / self._CONSECUTIVE_STABLE_STEPS)
+        else:
+            progress = max(0.0, 0.4 / (1.0 + max_over))
+        return ScenarioResult(
+            resolved=resolved,
+            resolution_message="CRAC cascade stabilized. Temps within allowable range." if resolved else "",
+            scenario_reward=scenario_reward,
+            procedure_reward=procedure_reward,
+            progress=progress,
+            info={
+                "max_overshoot_c": max_over,
+                "stable_count": self._stable_count,
+                "diagnosed_units": list(diagnosed_units),
+            },
+        )
+    @property
+    def scenario_id(self) -> str:
+        return "A4"
+    @property
+    def name(self) -> str:
+        return "CRAC Failure Cascade"
+    @property
+    def scenario_type(self) -> str:
+        return "thermal"
+    @property
+    def difficulty(self) -> str:
+        return "hard"
+    @property
+    def step_budget(self) -> int:
+        return 20
+    @property
+    def alert_message(self) -> str:
+        return (
+            "CRITICAL: Multiple CRAC failures detected. "
+            "CRAC-1 compressor fault, CRAC-3 fan fault. "
+            "Temperatures rising rapidly. Immediate action required."
+        )
+    @property
+    def procedure_rules(self) -> list[ProcedureRule]:
+        return [
+            ProcedureRule(
+                required_before=["diagnose"],
+                trigger_command="adjust_setpoint",
+                bonus=0.2,
+                penalty=-0.3,
+                description="Diagnose before adjusting setpoints during cascade",
+            ),
+            ProcedureRule(
+                required_before=["diagnose"],
+                trigger_command="set_fan_speed",
+                bonus=0.2,
+                penalty=-0.3,
+                description="Diagnose before adjusting fan speed during cascade",
+            ),
+            ProcedureRule(
+                required_before=["diagnose"],
+                trigger_command="set_rack_load",
+                bonus=0.3,
+                penalty=-0.1,
+                description="Diagnose before migrating workloads",
+            ),
+        ]
+    @property
+    def game_time_per_step_s(self) -> float:
+        # Faster time progression — cascade is urgent
+        return 30.0

server/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Dc Ops Env environment server components."""
+from .dc_ops_env_environment import DcOpsEnvironment
+__all__ = ["DcOpsEnvironment"]

server/app.py ADDED Viewed

	@@ -0,0 +1,101 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+FastAPI application for the Dc Ops Env Environment.
+This module creates an HTTP server that exposes the DcOpsEnvironment
+over HTTP and WebSocket endpoints, compatible with EnvClient.
+Endpoints:
+    - POST /reset: Reset the environment
+    - POST /step: Execute an action
+    - GET /state: Get current environment state
+    - GET /schema: Get action/observation schemas
+    - WS /ws: WebSocket endpoint for persistent sessions
+Usage:
+    # Development (with auto-reload):
+    uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
+    # Production:
+    uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
+    # Or run directly:
+    python -m server.app
+"""
+from pathlib import Path
+from fastapi.responses import FileResponse
+from fastapi.staticfiles import StaticFiles
+try:
+    from openenv.core.env_server.http_server import create_app
+except Exception as e:  # pragma: no cover
+    raise ImportError(
+        "openenv is required for the web interface. Install dependencies with '\n    uv sync\n'"
+    ) from e
+try:
+    from ..models import DcOpsAction, DcOpsObservation
+    from .dc_ops_env_environment import DcOpsEnvironment
+except ModuleNotFoundError:
+    from models import DcOpsAction, DcOpsObservation
+    from server.dc_ops_env_environment import DcOpsEnvironment
+# Create the app with web interface and README integration
+app = create_app(
+    DcOpsEnvironment,
+    DcOpsAction,
+    DcOpsObservation,
+    env_name="dc_ops_env",
+    max_concurrent_envs=4,  # increase this number to allow more concurrent WebSocket sessions
+)
+# Mount custom DC-Ops dashboard UI at /web
+# The console assets live in the ``static`` directory next to this module.
+_STATIC_DIR = Path(__file__).parent / "static"
+@app.get("/web")
+async def web_ui():
+    """Serve the DC-Ops operations console."""
+    return FileResponse(_STATIC_DIR / "index.html", media_type="text/html")
+app.mount("/static", StaticFiles(directory=str(_STATIC_DIR)), name="static")
+def main(host: str = "0.0.0.0", port: int = 8000):
+    """
+    Entry point for direct execution via uv run or python -m.
+    This function enables running the server without Docker:
+        uv run --project . server
+        uv run --project . server --port 8001
+        python -m dc_ops_env.server.app
+    Args:
+        host: Host address to bind to (default: "0.0.0.0")
+        port: Port number to listen on (default: 8000)
+    For production deployments, consider using uvicorn directly with
+    multiple workers:
+        uvicorn dc_ops_env.server.app:app --workers 4
+    """
+    # NOTE(review): main() never reads sys.argv; command-line flags are parsed
+    # only by the ``__main__`` block of this module. A caller that invokes
+    # main() with no arguments therefore always gets the defaults — confirm
+    # that the ``server --port 8001`` example above behaves as documented.
+    # Local import: uvicorn is only needed when the server is started here.
+    import uvicorn
+    uvicorn.run(app, host=host, port=port)
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--port", type=int, default=8000)
+    args = parser.parse_args()
+    main(port=args.port)

server/dc_ops_env_environment.py ADDED Viewed

	@@ -0,0 +1,532 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+DC-Ops Environment Implementation.
+Wires the thermal and power simulations into OpenEnv's Environment interface.
+Each step:
+  1. Parse the agent's command
+  2. Apply mutations to simulation state
+  3. Advance simulation by game-time dt (default 60s)
+  4. Render dashboard observation
+  5. Compute reward (via multi-objective RewardFunction)
+  6. Check termination conditions
+"""
+from __future__ import annotations
+from typing import Any, Optional
+from uuid import uuid4
+from openenv.core.env_server.interfaces import Environment
+from openenv.core.env_server.types import State
+try:
+    from ..config import (
+        ASHRAE_CLASSES,
+        DatacenterConfig,
+        PowerConfig,
+        make_default_datacenter_config,
+        load_datacenter_config,
+    )
+    from ..models import DcOpsAction, DcOpsObservation
+    from ..actions.parser import AVAILABLE_ACTIONS, CommandResult, parse_command
+    from ..rendering.dashboard import render_dashboard
+    from ..simulation.thermal import ThermalAlarm, ThermalSimulation
+    from ..simulation.power import PowerAlarm, PowerSimulation
+    from ..scenarios.base import Scenario, ScenarioResult
+    from ..scenarios.registry import get_scenario, random_scenario
+    from ..rewards.reward_function import RewardFunction
+except ImportError:
+    from config import (
+        ASHRAE_CLASSES,
+        DatacenterConfig,
+        PowerConfig,
+        make_default_datacenter_config,
+        load_datacenter_config,
+    )
+    from models import DcOpsAction, DcOpsObservation
+    from actions.parser import AVAILABLE_ACTIONS, CommandResult, parse_command
+    from rendering.dashboard import render_dashboard
+    from simulation.thermal import ThermalAlarm, ThermalSimulation
+    from simulation.power import PowerAlarm, PowerSimulation
+    from scenarios.base import Scenario, ScenarioResult
+    from scenarios.registry import get_scenario, random_scenario
+    from rewards.reward_function import RewardFunction
+# Default episode configuration
+# Step budget and game time are used when neither a scenario nor reset() kwargs
+# supply a value; the sim dt is replaced by config.simulation_dt_s on reset().
+DEFAULT_STEP_BUDGET = 15
+DEFAULT_GAME_TIME_PER_STEP_S = 60.0  # 1 minute of sim time per agent step
+DEFAULT_SIM_DT_S = 1.0               # Physics integration timestep
+class DcOpsEnvironment(Environment):
+    """Datacenter operations environment for LLM-based RL agents.
+    The agent observes a text-based monitoring dashboard and issues
+    natural-language operator commands. The environment simulates
+    physics-based thermal and power dynamics.
+    Episode flow:
+      1. reset() initializes the datacenter and optionally injects a fault
+      2. step() parses the command, advances simulation, returns dashboard
+      3. Episode ends on: budget exhaustion, critical failure, escalation, or resolution
+    """
+    SUPPORTS_CONCURRENT_SESSIONS: bool = True
+    def __init__(self) -> None:
+        """Create an environment with no simulation loaded.
+        The simulations, config, scenario and reward function stay ``None``
+        until reset() builds them; the remaining fields hold episode defaults.
+        """
+        self._state = State(episode_id=str(uuid4()), step_count=0)
+        # Populated by reset()
+        self._thermal_sim: ThermalSimulation | None = None
+        self._power_sim: PowerSimulation | None = None
+        self._config: DatacenterConfig | None = None
+        self._scenario: Scenario | None = None
+        self._reward_fn: RewardFunction | None = None
+        # Episode parameters (overridden by scenario / kwargs in reset())
+        self._step_budget: int = DEFAULT_STEP_BUDGET
+        self._game_time_per_step_s: float = DEFAULT_GAME_TIME_PER_STEP_S
+        self._sim_dt_s: float = DEFAULT_SIM_DT_S
+        self._alert: str = ""
+        self._scenario_type: str = ""
+        # Per-episode bookkeeping
+        self._done: bool = False
+        self._cumulative_reward: float = 0.0
+        self._action_history: list[str] = []
+        self._escalated: bool = False
+    def reset(
+        self,
+        seed: Optional[int] = None,
+        episode_id: Optional[str] = None,
+        **kwargs: Any,
+    ) -> DcOpsObservation:
+        """Reset the environment and return initial observation.
+        Args:
+            seed: Passed to random_scenario() when a random scenario is
+                requested; not otherwise used in this method.
+            episode_id: Identifier for the new episode; a fresh UUID4 string
+                is generated when omitted or empty.
+        Kwargs:
+            scenario (str | Scenario): Scenario ID (e.g. 'A1') or Scenario instance.
+                If provided, overrides config/alert/step_budget/scenario_type.
+                If not provided, uses raw kwargs (backward compatible).
+            random_scenario (bool): When truthy and ``scenario`` is not given,
+                pick a scenario via random_scenario(), filtered by the
+                ``scenario_type`` and ``difficulty`` kwargs.
+            difficulty: Filter forwarded to random_scenario().
+            config (DatacenterConfig): Custom datacenter configuration.
+            config_name (str): Built-in config name ("default", "small", "large").
+                Used when config is not provided (e.g. from WebSocket/HTTP JSON).
+            step_budget (int): Max steps for the episode.
+            game_time_per_step_s (float): Simulation time per step.
+            scenario_type (str): Scenario category label.
+            alert (str): Initial alert message.
+            fault_injection (dict): Fault to inject, e.g.
+                {"type": "crac_fault", "unit_id": "CRAC-3", "fault": "compressor"}
+                Only consulted when no scenario is active.
+        Returns:
+            The initial DcOpsObservation built after warmup / fault injection.
+        """
+        # Episode state
+        self._state = State(
+            episode_id=episode_id or str(uuid4()),
+            step_count=0,
+        )
+        self._done = False
+        self._cumulative_reward = 0.0
+        self._action_history = []
+        self._escalated = False
+        # Resolve scenario
+        scenario_arg = kwargs.get("scenario")
+        if isinstance(scenario_arg, str):
+            self._scenario = get_scenario(scenario_arg)
+        elif isinstance(scenario_arg, Scenario):
+            self._scenario = scenario_arg
+        elif scenario_arg is None and kwargs.get("random_scenario"):
+            self._scenario = random_scenario(
+                scenario_type=kwargs.get("scenario_type"),
+                difficulty=kwargs.get("difficulty"),
+                seed=seed,
+            )
+        else:
+            # No scenario requested, or an unsupported ``scenario`` value
+            self._scenario = None
+        # Reset scenario mutable state (counters, flags) for episode reuse
+        if self._scenario:
+            self._scenario.reset_state()
+        # Configuration — scenario can modify the base config
+        # Support config_name (string) from JSON APIs, or config (DatacenterConfig) from Python
+        config_arg = kwargs.get("config")
+        config_name = kwargs.get("config_name")
+        if isinstance(config_arg, DatacenterConfig):
+            self._config = config_arg
+        elif config_name and isinstance(config_name, str) and config_name != "default":
+            self._config = load_datacenter_config(config_name)
+        else:
+            self._config = make_default_datacenter_config()
+        if self._scenario:
+            self._config = self._scenario.configure(self._config)
+        # Episode parameters — scenario provides defaults, kwargs can override
+        if self._scenario:
+            self._step_budget = kwargs.get("step_budget", self._scenario.step_budget)
+            self._game_time_per_step_s = kwargs.get(
+                "game_time_per_step_s", self._scenario.game_time_per_step_s
+            )
+            self._scenario_type = kwargs.get("scenario_type", self._scenario.scenario_type)
+            self._alert = kwargs.get("alert", self._scenario.alert_message)
+        else:
+            self._step_budget = kwargs.get("step_budget", DEFAULT_STEP_BUDGET)
+            self._game_time_per_step_s = kwargs.get("game_time_per_step_s", DEFAULT_GAME_TIME_PER_STEP_S)
+            self._scenario_type = kwargs.get("scenario_type", "")
+            self._alert = kwargs.get("alert", "")
+        # The physics timestep always comes from the (possibly scenario-modified) config
+        self._sim_dt_s = self._config.simulation_dt_s
+        # Initialize reward function with scenario-type-aware weights
+        self._reward_fn = RewardFunction(scenario_type=self._scenario_type)
+        # Initialize simulations
+        self._thermal_sim = ThermalSimulation(self._config)
+        # Initialize power sim if config has power infrastructure
+        if self._config.power and self._config.power.ups_units:
+            it_load = self._thermal_sim.state.total_it_load_kw
+            self._power_sim = PowerSimulation(self._config.power, it_load_kw=it_load)
+            # Wire power state into datacenter state
+            self._thermal_sim.state.power = self._power_sim.state
+        else:
+            self._power_sim = None
+        # Apply fault injection — scenario or raw kwargs
+        if self._scenario:
+            # Warmup FIRST, then inject fault (so DC is at steady-state)
+            self._warmup_simulation()
+            self._scenario.inject_fault(self._thermal_sim, self._power_sim)
+        else:
+            # Raw-kwargs path: the fault is applied BEFORE warmup, so the
+            # warmup period runs with the fault already active.
+            fault = kwargs.get("fault_injection")
+            if fault:
+                self._apply_fault_injection(fault)
+            self._warmup_simulation()
+        # Render initial observation
+        return self._make_observation(action_result="Environment initialized. Awaiting your command.")
+    def step(
+        self,
+        action: DcOpsAction,
+        timeout_s: Optional[float] = None,
+        **kwargs: Any,
+    ) -> DcOpsObservation:
+        """Execute one agent step.
+        1. Parse and execute the command
+        2. Advance simulation by game_time_per_step_s
+        3. Check for alarms and termination
+        4. Compute reward via RewardFunction
+        5. Return observation
+        Args:
+            action: Agent action; its ``command`` string is parsed and executed.
+            timeout_s: Not referenced in this method.
+            **kwargs: Not referenced in this method.
+        Returns:
+            The DcOpsObservation after the step, carrying this step's reward.
+            If the episode has already ended, a zero-reward observation asking
+            the caller to reset() is returned and nothing else changes.
+        """
+        # NOTE(review): assumes reset() has been called — the simulations and
+        # reward function are None until then.
+        if self._done:
+            return self._make_observation(
+                action_result="Episode already ended. Call reset().",
+                reward=0.0,
+            )
+        self._state.step_count += 1
+        self._action_history.append(action.command)
+        # 1. Parse and execute command
+        cmd_result = parse_command(
+            action.command,
+            self._thermal_sim,
+            self._power_sim,
+        )
+        # Handle escalation — ends the episode immediately; the simulation is
+        # not advanced and termination checks are skipped on this path
+        if cmd_result.command_name == "escalate":
+            self._escalated = True
+            self._done = True
+            # Evaluate scenario for procedure penalties
+            scenario_result: ScenarioResult | None = None
+            if self._scenario:
+                scenario_result = self._scenario.evaluate_step(
+                    self._thermal_sim, self._power_sim,
+                    action.command, self._action_history,
+                    self._state.step_count,
+                )
+            # Compute base reward components — escalation penalty is handled
+            # by scenario procedure rules + action_quality, not doubled here
+            components = self._reward_fn.compute(
+                self._thermal_sim, self._power_sim, cmd_result,
+                action.command, self._action_history, scenario_result,
+            )
+            reward = components.total
+            self._cumulative_reward += reward
+            return self._make_observation(
+                action_result=cmd_result.message,
+                reward=reward,
+            )
+        # 2. Advance simulation
+        thermal_alarms, power_alarms = self._advance_simulation()
+        # 3. Build alert from alarms
+        self._update_alert(thermal_alarms, power_alarms)
+        # 4. Evaluate scenario (before reward, so progress is available)
+        scenario_result = None
+        if self._scenario:
+            scenario_result = self._scenario.evaluate_step(
+                self._thermal_sim, self._power_sim,
+                action.command, self._action_history,
+                self._state.step_count,
+            )
+        # 5. Compute reward via RewardFunction
+        components = self._reward_fn.compute(
+            self._thermal_sim, self._power_sim, cmd_result,
+            action.command, self._action_history, scenario_result,
+        )
+        reward = components.total
+        self._cumulative_reward += reward
+        # 6. Check termination
+        self._check_termination(thermal_alarms, power_alarms)
+        # 6b. Scenario resolution — skipped when step 6 already ended the
+        # episode (budget exhausted or critical failure), so no speed bonus
+        # or resolution message is applied in that case
+        if scenario_result and scenario_result.resolved and not self._done:
+            self._done = True
+            # Speed bonus: fraction of budget remaining
+            speed_bonus = (self._step_budget - self._state.step_count) / self._step_budget
+            reward += speed_bonus
+            self._cumulative_reward += speed_bonus
+            if scenario_result.resolution_message:
+                self._alert = scenario_result.resolution_message
+        return self._make_observation(
+            action_result=cmd_result.message,
+            reward=reward,
+        )
+    @property
+    def state(self) -> State:
+        """Return the current episode State (episode_id and step_count)."""
+        return self._state
+    # -------------------------------------------------------------------
+    # Internal methods
+    # -------------------------------------------------------------------
+    def _warmup_simulation(self, warmup_steps: int = 120) -> None:
+        """Run simulation for a brief warmup to reach quasi-steady-state."""
+        for _ in range(warmup_steps):
+            self._thermal_sim.step(self._sim_dt_s)
+            if self._power_sim:
+                it_load = self._thermal_sim.state.total_it_load_kw
+                self._power_sim.step(self._sim_dt_s, it_load)
+    def _advance_simulation(self) -> tuple[list[ThermalAlarm], list[PowerAlarm]]:
+        """Advance simulation by game_time_per_step_s seconds."""
+        n_substeps = int(self._game_time_per_step_s / self._sim_dt_s)
+        all_thermal_alarms: list[ThermalAlarm] = []
+        all_power_alarms: list[PowerAlarm] = []
+        for _ in range(n_substeps):
+            # Thermal step
+            thermal_result = self._thermal_sim.step(self._sim_dt_s)
+            all_thermal_alarms.extend(thermal_result.alarms)
+            # Power step
+            if self._power_sim:
+                it_load = self._thermal_sim.state.total_it_load_kw
+                power_result = self._power_sim.step(self._sim_dt_s, it_load)
+                all_power_alarms.extend(power_result.alarms)
+        # Deduplicate alarms by type (keep most recent)
+        thermal_alarms = _dedupe_alarms_by_type(all_thermal_alarms)
+        power_alarms = _dedupe_alarms_by_type(all_power_alarms)
+        return thermal_alarms, power_alarms
+    def _update_alert(
+        self,
+        thermal_alarms: list[ThermalAlarm],
+        power_alarms: list[PowerAlarm],
+    ) -> None:
+        """Update the active alert string from current alarms."""
+        critical_messages: list[str] = []
+        for alarm in thermal_alarms:
+            if alarm.severity == "critical":
+                critical_messages.append(alarm.message)
+        for alarm in power_alarms:
+            if alarm.severity == "critical":
+                critical_messages.append(alarm.message)
+        if critical_messages:
+            self._alert = " | ".join(critical_messages[:3])  # Limit to 3 alerts
+        else:
+            # Check for warnings
+            warnings = []
+            for alarm in thermal_alarms:
+                if alarm.severity == "warning":
+                    warnings.append(alarm.message)
+            for alarm in power_alarms:
+                if alarm.severity == "warning":
+                    warnings.append(alarm.message)
+            if warnings:
+                self._alert = warnings[0]
+            else:
+                self._alert = ""
+    def _check_termination(
+        self,
+        thermal_alarms: list[ThermalAlarm],
+        power_alarms: list[PowerAlarm],
+    ) -> None:
+        """Check if episode should end."""
+        # Step budget exhausted
+        if self._state.step_count >= self._step_budget:
+            self._done = True
+            return
+        # Critical thermal failure: any rack above allowable max
+        for zone in self._thermal_sim.state.zones:
+            ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
+            if not ashrae:
+                continue
+            if zone.max_inlet_temp_c > ashrae.allowable_max_c + 5.0:
+                self._done = True
+                self._alert = (
+                    f"CRITICAL: Zone {zone.zone_id} inlet temp "
+                    f"{zone.max_inlet_temp_c:.1f}°C exceeds allowable max "
+                    f"{ashrae.allowable_max_c:.1f}°C by >5°C. Emergency shutdown."
+                )
+                return
+        # UPS battery exhausted
+        if self._power_sim:
+            for ups in self._power_sim.state.ups_units:
+                if ups.mode.value == "fault" and ups.battery_soc <= 0:
+                    self._done = True
+                    self._alert = f"CRITICAL: {ups.unit_id} battery exhausted. Unprotected load."
+                    return
+    def _apply_fault_injection(self, fault: dict) -> None:
+        """Apply a fault injection to the simulation.
+        Supported fault types:
+          - crac_fault: {"type": "crac_fault", "unit_id": "CRAC-3", "fault": "compressor"}
+          - utility_loss: {"type": "utility_loss"}
+          - ups_fault: {"type": "ups_fault", "unit_id": "UPS-1"}
+          - rack_load_change: {"type": "rack_load_change", "rack_id": "A-01", "load_kw": 15.0}
+          - outside_temp: {"type": "outside_temp", "temp_c": 42.0}
+        """
+        fault_type = fault.get("type", "")
+        if fault_type == "crac_fault":
+            from ..simulation.types import CRACFaultType
+            unit_id = fault.get("unit_id", "")
+            fault_name = fault.get("fault", "compressor")
+            try:
+                ft = CRACFaultType(fault_name)
+            except ValueError:
+                ft = CRACFaultType.COMPRESSOR
+            self._thermal_sim.inject_crac_fault(unit_id, ft)
+        elif fault_type == "utility_loss":
+            if self._power_sim:
+                self._power_sim.set_utility_available(False)
+        elif fault_type == "ups_fault":
+            if self._power_sim:
+                unit_id = fault.get("unit_id", "")
+                self._power_sim.inject_ups_fault(unit_id)
+        elif fault_type == "rack_load_change":
+            rack_id = fault.get("rack_id", "")
+            load_kw = fault.get("load_kw", 8.0)
+            self._thermal_sim.set_rack_load(rack_id, load_kw)
+        elif fault_type == "outside_temp":
+            temp_c = fault.get("temp_c", 35.0)
+            self._thermal_sim.set_outside_temp(temp_c)
+    def _make_observation(
+        self,
+        action_result: str = "",
+        reward: float = 0.0,
+    ) -> DcOpsObservation:
+        """Build the observation to return to the agent."""
+        dashboard = render_dashboard(
+            self._thermal_sim.state,
+            alert=self._alert,
+            step=self._state.step_count,
+            max_steps=self._step_budget,
+            scenario_type=self._scenario_type,
+        )
+        steps_remaining = max(0, self._step_budget - self._state.step_count)
+        # Build metadata with structured data
+        dc_state = self._thermal_sim.state
+        metadata = {
+            "sim_time_s": dc_state.sim_time_s,
+            "total_it_load_kw": dc_state.total_it_load_kw,
+            "total_cooling_power_kw": dc_state.total_cooling_power_kw,
+            "pue": dc_state.pue,
+            "outside_temp_c": dc_state.outside_temp_c,
+            "cumulative_reward": self._cumulative_reward,
+            "zones": {},
+        }
+        for zone in dc_state.zones:
+            metadata["zones"][zone.zone_id] = {
+                "cold_aisle_temp_c": zone.cold_aisle_temp_c,
+                "hot_aisle_temp_c": zone.hot_aisle_temp_c,
+                "max_inlet_temp_c": zone.max_inlet_temp_c,
+                "total_it_load_kw": zone.total_it_load_kw,
+            }
+        if self._power_sim:
+            power = self._power_sim.state
+            metadata["power"] = {
+                "utility_available": power.utility_available,
+                "on_generator": power.on_generator,
+                "total_ups_loss_kw": power.total_ups_loss_kw,
+                "total_pdu_loss_kw": power.total_pdu_loss_kw,
+            }
+            for ups in power.ups_units:
+                metadata["power"][ups.unit_id] = {
+                    "mode": ups.mode.value,
+                    "battery_soc": ups.battery_soc,
+                    "load_fraction": ups.load_fraction,
+                    "efficiency": ups.efficiency,
+                }
+        if self._scenario:
+            metadata["scenario"] = {
+                "id": self._scenario.scenario_id,
+                "name": self._scenario.name,
+                "difficulty": self._scenario.difficulty,
+            }
+        # Use scenario-specific actions if defined, otherwise all actions
+        actions = AVAILABLE_ACTIONS
+        if self._scenario and self._scenario.available_actions is not None:
+            actions = self._scenario.available_actions
+        return DcOpsObservation(
+            dashboard=dashboard,
+            available_actions=actions,
+            alert=self._alert,
+            scenario_type=self._scenario_type,
+            steps_remaining=steps_remaining,
+            action_result=action_result,
+            done=self._done,
+            reward=reward,
+            metadata=metadata,
+        )
+def _dedupe_alarms_by_type(alarms: list) -> list:
+    """Keep only the last alarm of each (component, alarm_type) pair."""
+    seen: dict[tuple[str, str], Any] = {}
+    for alarm in alarms:
+        key = (getattr(alarm, "component", ""), getattr(alarm, "alarm_type", ""))
+        seen[key] = alarm
+    return list(seen.values())

server/requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+openenv[core]>=0.2.0
+fastapi>=0.115.0
+uvicorn>=0.24.0

server/static/index.html ADDED Viewed

	@@ -0,0 +1,911 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>DC-Ops | Datacenter Operations Console</title>
+<style>
+*,*::before,*::after{box-sizing:border-box;margin:0;padding:0}
+:root{
+  --bg:#0a0e17;--bg-card:#111827;--bg-card-hover:#1a2332;
+  --border:#1e2d3d;--border-active:#3b82f6;
+  --text:#e2e8f0;--text-dim:#94a3b8;--text-muted:#64748b;
+  --accent:#3b82f6;--accent-hover:#2563eb;
+  --green:#22c55e;--green-dim:#166534;
+  --red:#ef4444;--red-dim:#991b1b;
+  --yellow:#eab308;--yellow-dim:#854d0e;
+  --orange:#f97316;
+  --cyan:#06b6d4;
+  --terminal-bg:#0d1117;
+  --font-mono:'JetBrains Mono','Fira Code','SF Mono','Cascadia Code',Consolas,monospace;
+  --font-sans:'Inter',-apple-system,BlinkMacSystemFont,'Segoe UI',Roboto,sans-serif;
+  --radius:8px;--radius-lg:12px;
+}
+html{font-size:14px}
+body{background:var(--bg);color:var(--text);font-family:var(--font-sans);min-height:100vh;overflow-x:hidden}
+/* Layout */
+.app{display:grid;grid-template-rows:auto 1fr;height:100vh}
+.header{background:var(--bg-card);border-bottom:1px solid var(--border);padding:0.75rem 1.5rem;display:flex;align-items:center;justify-content:space-between;gap:1rem;flex-wrap:wrap}
+.header-left{display:flex;align-items:center;gap:0.75rem}
+.logo{font-size:1.25rem;font-weight:700;letter-spacing:-0.02em}
+.logo span{color:var(--accent)}
+.status-badge{display:inline-flex;align-items:center;gap:0.375rem;padding:0.25rem 0.75rem;border-radius:999px;font-size:0.75rem;font-weight:500}
+.status-badge.connected{background:var(--green-dim);color:var(--green)}
+.status-badge.disconnected{background:var(--red-dim);color:var(--red)}
+.status-badge.loading{background:var(--yellow-dim);color:var(--yellow)}
+.status-dot{width:6px;height:6px;border-radius:50%;background:currentColor}
+.status-badge.connected .status-dot{animation:pulse 2s infinite}
+@keyframes pulse{0%,100%{opacity:1}50%{opacity:0.5}}
+.main{display:grid;grid-template-columns:280px 1fr 300px;gap:0;overflow:hidden}
+/* Sidebar - Scenario Browser */
+.sidebar{background:var(--bg-card);border-right:1px solid var(--border);display:flex;flex-direction:column;overflow:hidden}
+.sidebar-header{padding:0.875rem 1rem;border-bottom:1px solid var(--border);font-weight:600;font-size:0.8rem;text-transform:uppercase;letter-spacing:0.05em;color:var(--text-dim)}
+.scenario-list{flex:1;overflow-y:auto;padding:0.5rem}
+.scenario-group{margin-bottom:0.75rem}
+.scenario-group-title{padding:0.375rem 0.75rem;font-size:0.65rem;font-weight:600;text-transform:uppercase;letter-spacing:0.1em;color:var(--text-muted)}
+.scenario-card{padding:0.625rem 0.75rem;margin:0.25rem 0;border-radius:var(--radius);cursor:pointer;transition:all 0.15s;border:1px solid transparent}
+.scenario-card:hover{background:var(--bg-card-hover);border-color:var(--border)}
+.scenario-card.active{background:rgba(59,130,246,0.08);border-color:var(--accent)}
+.scenario-card .sc-header{display:flex;align-items:center;justify-content:space-between;margin-bottom:0.25rem}
+.scenario-card .sc-id{font-weight:700;font-family:var(--font-mono);font-size:0.8rem;color:var(--accent)}
+.scenario-card .sc-diff{font-size:0.6rem;font-weight:700;padding:0.1rem 0.5rem;border-radius:999px;text-transform:uppercase;letter-spacing:0.05em}
+.sc-diff.easy{background:var(--green-dim);color:var(--green)}
+.sc-diff.medium{background:var(--yellow-dim);color:var(--yellow)}
+.sc-diff.hard{background:var(--red-dim);color:var(--red)}
+.scenario-card .sc-name{font-size:0.78rem;font-weight:500;color:var(--text);margin-bottom:0.125rem}
+.scenario-card .sc-desc{font-size:0.68rem;color:var(--text-muted);line-height:1.4}
+.sidebar-actions{padding:0.75rem;border-top:1px solid var(--border);display:flex;flex-direction:column;gap:0.5rem}
+.config-select{width:100%;padding:0.5rem 0.625rem;background:var(--terminal-bg);border:1px solid var(--border);border-radius:var(--radius);color:var(--text);font-size:0.78rem;font-family:var(--font-sans);appearance:none;background-image:url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' viewBox='0 0 12 12'%3E%3Cpath fill='%2394a3b8' d='M3 5l3 3 3-3'/%3E%3C/svg%3E");background-repeat:no-repeat;background-position:right 0.5rem center;padding-right:1.5rem}
+.config-select:focus{outline:none;border-color:var(--accent)}
+.btn{padding:0.625rem 1rem;border-radius:var(--radius);border:none;cursor:pointer;font-weight:600;font-size:0.8rem;transition:all 0.15s;text-align:center;font-family:var(--font-sans);display:flex;align-items:center;justify-content:center;gap:0.5rem}
+.btn-primary{background:var(--accent);color:white}
+.btn-primary:hover:not(:disabled){background:var(--accent-hover)}
+.btn-primary:disabled{opacity:0.5;cursor:not-allowed}
+.btn-danger{background:var(--red-dim);color:var(--red);border:1px solid rgba(239,68,68,0.2)}
+.btn-danger:hover{background:var(--red);color:white}
+.btn-outline{background:transparent;color:var(--text-dim);border:1px solid var(--border)}
+.btn-outline:hover{border-color:var(--text-dim);color:var(--text)}
+/* Center Panel - Dashboard */
+.center{display:flex;flex-direction:column;overflow:hidden;min-width:0}
+.dashboard-container{flex:1;overflow-y:auto;padding:1rem}
+.dashboard-box{background:var(--terminal-bg);border:1px solid var(--border);border-radius:var(--radius-lg);overflow:hidden}
+.dashboard-title-bar{display:flex;align-items:center;justify-content:space-between;padding:0.5rem 1rem;background:rgba(255,255,255,0.03);border-bottom:1px solid var(--border)}
+.dashboard-title-bar .dots{display:flex;gap:6px}
+.dashboard-title-bar .dots span{width:10px;height:10px;border-radius:50%}
+.dashboard-title-bar .dots span:nth-child(1){background:#ef4444}
+.dashboard-title-bar .dots span:nth-child(2){background:#eab308}
+.dashboard-title-bar .dots span:nth-child(3){background:#22c55e}
+.dashboard-title-bar .title{font-size:0.72rem;color:var(--text-muted);font-family:var(--font-mono)}
+.dashboard-output{padding:1rem;font-family:var(--font-mono);font-size:0.75rem;line-height:1.2;white-space:pre;overflow-x:auto;min-height:200px;color:var(--green)}
+/* Action result box */
+.action-result{margin-top:0.75rem;background:var(--terminal-bg);border:1px solid var(--border);border-radius:var(--radius);padding:0.625rem 0.875rem;font-family:var(--font-mono);font-size:0.75rem;max-height:100px;overflow-y:auto;transition:all 0.2s}
+.action-result.error{color:var(--red);border-color:rgba(239,68,68,0.3)}
+.action-result.success{color:var(--cyan);border-color:rgba(6,182,212,0.3)}
+/* Welcome screen */
+.welcome{display:flex;flex-direction:column;align-items:center;justify-content:center;text-align:center;padding:3rem 2rem;min-height:300px;white-space:normal}
+.welcome h2{font-size:1.4rem;color:var(--text);margin-bottom:0.75rem;font-weight:700}
+.welcome p{max-width:380px;line-height:1.7;font-size:0.85rem;color:var(--text-dim)}
+.welcome .hint{margin-top:1.5rem;display:flex;align-items:center;gap:0.5rem;color:var(--accent);font-size:0.8rem;opacity:0.7}
+.welcome .hint svg{width:20px;height:20px}
+/* Command Bar */
+.command-bar{padding:0.75rem 1rem;border-top:1px solid var(--border);background:var(--bg-card)}
+.command-input-group{display:flex;gap:0.5rem}
+.command-input{flex:1;padding:0.625rem 0.875rem;background:var(--terminal-bg);border:1px solid var(--border);border-radius:var(--radius);color:var(--text);font-family:var(--font-mono);font-size:0.8rem;min-width:0}
+.command-input:focus{outline:none;border-color:var(--accent);box-shadow:0 0 0 3px rgba(59,130,246,0.12)}
+.command-input::placeholder{color:var(--text-muted)}
+.command-input:disabled{opacity:0.4}
+.quick-actions{display:flex;gap:0.375rem;margin-top:0.5rem;flex-wrap:wrap}
+.quick-btn{padding:0.2rem 0.5rem;background:rgba(255,255,255,0.03);border:1px solid var(--border);border-radius:999px;color:var(--text-dim);font-size:0.68rem;cursor:pointer;font-family:var(--font-mono);transition:all 0.15s;white-space:nowrap}
+.quick-btn:hover:not(:disabled){border-color:var(--accent);color:var(--accent)}
+.quick-btn:disabled{opacity:0.3;cursor:not-allowed}
+/* Right Panel - Metrics */
+.right-panel{background:var(--bg-card);border-left:1px solid var(--border);display:flex;flex-direction:column;overflow-y:auto}
+.panel-section{padding:0.875rem;border-bottom:1px solid var(--border)}
+.panel-section-title{font-size:0.65rem;font-weight:600;text-transform:uppercase;letter-spacing:0.1em;color:var(--text-muted);margin-bottom:0.625rem}
+/* Metrics grid */
+.metrics-grid{display:grid;grid-template-columns:1fr 1fr;gap:0.375rem}
+.metric{background:var(--terminal-bg);padding:0.5rem 0.625rem;border-radius:var(--radius);border:1px solid var(--border)}
+.metric-label{font-size:0.6rem;color:var(--text-muted);margin-bottom:0.2rem;text-transform:uppercase;letter-spacing:0.05em}
+.metric-value{font-size:1rem;font-weight:700;font-family:var(--font-mono)}
+.metric-value.good{color:var(--green)}
+.metric-value.warn{color:var(--yellow)}
+.metric-value.danger{color:var(--red)}
+.metric-value.neutral{color:var(--text)}
+/* Episode info */
+.episode-info{display:flex;flex-direction:column;gap:0.375rem}
+.episode-row{display:flex;justify-content:space-between;align-items:center;font-size:0.78rem}
+.episode-row .label{color:var(--text-muted)}
+.episode-row .value{font-family:var(--font-mono);font-weight:600}
+/* Progress bar */
+.progress-bar{height:5px;background:var(--terminal-bg);border-radius:999px;overflow:hidden;border:1px solid var(--border)}
+.progress-fill{height:100%;border-radius:999px;transition:width 0.3s;background:var(--accent)}
+.progress-fill.low{background:var(--green)}
+.progress-fill.mid{background:var(--yellow)}
+.progress-fill.high{background:var(--red)}
+/* Power status */
+.power-row{display:flex;justify-content:space-between;align-items:center;padding:0.375rem 0.5rem;background:var(--terminal-bg);border-radius:4px;font-size:0.72rem;font-family:var(--font-mono);border:1px solid var(--border);margin-bottom:0.375rem}
+.power-row .pw-label{color:var(--text-dim)}
+.power-row .pw-val{font-weight:600}
+.power-row .pw-val.ok{color:var(--green)}
+.power-row .pw-val.warn{color:var(--yellow)}
+.power-row .pw-val.bad{color:var(--red)}
+/* Reward history */
+.reward-history{display:flex;flex-direction:column;gap:0.25rem;max-height:180px;overflow-y:auto}
+.reward-entry{display:flex;justify-content:space-between;align-items:center;padding:0.3rem 0.5rem;background:var(--terminal-bg);border-radius:4px;font-size:0.7rem;font-family:var(--font-mono)}
+.reward-entry .step{color:var(--text-muted);width:24px;flex-shrink:0}
+.reward-entry .cmd{color:var(--text-dim);flex:1;margin:0 0.5rem;overflow:hidden;text-overflow:ellipsis;white-space:nowrap}
+.reward-entry .rew{font-weight:700;flex-shrink:0}
+.reward-entry .rew.pos{color:var(--green)}
+.reward-entry .rew.neg{color:var(--red)}
+.reward-entry .rew.zero{color:var(--text-muted)}
+/* Zone temps bar chart */
+.zone-bars{display:flex;flex-direction:column;gap:0.375rem}
+.zone-bar-row{display:flex;align-items:center;gap:0.5rem;font-size:0.72rem}
+.zone-bar-label{width:44px;color:var(--text-dim);font-family:var(--font-mono);flex-shrink:0}
+.zone-bar-track{flex:1;height:14px;background:var(--terminal-bg);border-radius:3px;position:relative;overflow:hidden;border:1px solid var(--border)}
+.zone-bar-fill{height:100%;border-radius:2px;transition:width 0.3s}
+.zone-bar-fill.safe{background:linear-gradient(90deg,var(--green-dim),var(--green))}
+.zone-bar-fill.warning{background:linear-gradient(90deg,var(--yellow-dim),var(--yellow))}
+.zone-bar-fill.critical{background:linear-gradient(90deg,var(--red-dim),var(--red))}
+.zone-bar-value{width:52px;text-align:right;font-family:var(--font-mono);font-weight:600;flex-shrink:0}
+/* Episode done banner */
+.episode-done-banner{padding:0.625rem 1rem;text-align:center;font-weight:600;font-size:0.8rem;border-radius:var(--radius);margin-bottom:0.75rem;display:none}
+.episode-done-banner.show{display:block}
+.episode-done-banner.resolved{background:var(--green-dim);color:var(--green);border:1px solid rgba(34,197,94,0.3)}
+.episode-done-banner.failed{background:var(--red-dim);color:var(--red);border:1px solid rgba(239,68,68,0.3)}
+.episode-done-banner.timeout{background:var(--yellow-dim);color:var(--yellow);border:1px solid rgba(234,179,8,0.3)}
+/* No data placeholder */
+.no-data{font-size:0.75rem;color:var(--text-muted);text-align:center;padding:0.75rem 0.5rem}
+/* Spinner */
+.spinner{display:inline-block;width:14px;height:14px;border:2px solid rgba(255,255,255,0.2);border-top-color:currentColor;border-radius:50%;animation:spin 0.5s linear infinite}
+@keyframes spin{to{transform:rotate(360deg)}}
+/* Scrollbar */
+::-webkit-scrollbar{width:5px;height:5px}
+::-webkit-scrollbar-track{background:transparent}
+::-webkit-scrollbar-thumb{background:var(--border);border-radius:3px}
+::-webkit-scrollbar-thumb:hover{background:var(--text-muted)}
+/* Responsive - Tablet */
+@media(max-width:1100px){
+  .main{grid-template-columns:1fr;grid-template-rows:auto 1fr auto}
+  .sidebar{border-right:none;border-bottom:1px solid var(--border);max-height:none}
+  .sidebar.collapsed{display:none}
+  .right-panel{border-left:none;border-top:1px solid var(--border);max-height:none}
+  .right-panel.collapsed{display:none}
+  .sidebar-header{display:none}
+  .scenario-list{display:flex;overflow-x:auto;overflow-y:hidden;padding:0.5rem;gap:0.5rem}
+  .scenario-group{display:flex;gap:0.5rem;margin:0;flex-shrink:0}
+  .scenario-group-title{writing-mode:vertical-lr;padding:0.5rem 0.25rem;font-size:0.6rem}
+  .scenario-card{min-width:160px;flex-shrink:0}
+  .sidebar-actions{flex-direction:row}
+  .config-select{width:auto;flex:1}
+  .right-panel .panel-section{padding:0.625rem 0.75rem}
+  .metrics-grid{grid-template-columns:repeat(4,1fr)}
+}
+/* Responsive - Mobile */
+@media(max-width:640px){
+  .header{padding:0.5rem 0.75rem}
+  .logo{font-size:1rem}
+  .dashboard-output{font-size:0.62rem;padding:0.5rem;line-height:1.2}
+  .metrics-grid{grid-template-columns:1fr 1fr}
+  html{font-size:13px}
+  .command-input{font-size:0.75rem}
+  .scenario-card{min-width:140px}
+  .sidebar-actions{flex-direction:column}
+}
+/* Toggle buttons for mobile */
+.mobile-toggles{display:none;gap:0.5rem}
+@media(max-width:1100px){.mobile-toggles{display:flex}}
+.toggle-btn{padding:0.25rem 0.625rem;background:transparent;border:1px solid var(--border);border-radius:var(--radius);color:var(--text-dim);font-size:0.7rem;cursor:pointer;font-family:var(--font-sans);transition:all 0.15s}
+.toggle-btn.active{border-color:var(--accent);color:var(--accent);background:rgba(59,130,246,0.08)}
+</style>
+</head>
+<body>
+<div class="app">
+  <!-- Header -->
+  <header class="header">
+    <div class="header-left">
+      <div class="logo">DC<span>-Ops</span></div>
+      <div id="statusBadge" class="status-badge disconnected">
+        <span class="status-dot"></span>
+        <span id="statusText">Disconnected</span>
+      </div>
+    </div>
+    <div class="mobile-toggles">
+      <button class="toggle-btn active" id="toggleScenarios" onclick="togglePanel('sidebar')">Scenarios</button>
+      <button class="toggle-btn active" id="toggleMetrics" onclick="togglePanel('right-panel')">Metrics</button>
+    </div>
+  </header>
+  <!-- Main Layout -->
+  <div class="main">
+    <!-- Left: Scenario Browser -->
+    <aside class="sidebar" id="sidebar">
+      <div class="sidebar-header">Scenario Browser</div>
+      <div class="scenario-list" id="scenarioList">
+        <div class="scenario-group">
+          <div class="scenario-group-title">Thermal</div>
+          <div class="scenario-card" data-id="A1" onclick="selectScenario('A1')">
+            <div class="sc-header">
+              <span class="sc-id">A1</span>
+              <span class="sc-diff easy">Easy</span>
+            </div>
+            <div class="sc-name">Cooling Setpoint Optimization</div>
+            <div class="sc-desc">CRACs overcooling at 15°C. Optimize for efficiency while staying in ASHRAE range.</div>
+          </div>
+          <div class="scenario-card" data-id="A2" onclick="selectScenario('A2')">
+            <div class="sc-header">
+              <span class="sc-id">A2</span>
+              <span class="sc-diff medium">Medium</span>
+            </div>
+            <div class="sc-name">Thermal Event Response</div>
+            <div class="sc-desc">CRAC-3 compressor failure. Diagnose and stabilize all zones.</div>
+          </div>
+          <div class="scenario-card" data-id="A4" onclick="selectScenario('A4')">
+            <div class="sc-header">
+              <span class="sc-id">A4</span>
+              <span class="sc-diff hard">Hard</span>
+            </div>
+            <div class="sc-name">CRAC Failure Cascade</div>
+            <div class="sc-desc">CRAC-1 compressor + CRAC-3 fan failure. Manage cascading thermal event.</div>
+          </div>
+        </div>
+        <div class="scenario-group">
+          <div class="scenario-group-title">Power</div>
+          <div class="scenario-card" data-id="B1" onclick="selectScenario('B1')">
+            <div class="sc-header">
+              <span class="sc-id">B1</span>
+              <span class="sc-diff medium">Medium</span>
+            </div>
+            <div class="sc-name">UPS Alarm Response</div>
+            <div class="sc-desc">UPS transferred to battery after utility event. Diagnose and acknowledge.</div>
+          </div>
+          <div class="scenario-card" data-id="B3" onclick="selectScenario('B3')">
+            <div class="sc-header">
+              <span class="sc-id">B3</span>
+              <span class="sc-diff easy">Easy</span>
+            </div>
+            <div class="sc-name">Generator Test Protocol</div>
+            <div class="sc-desc">Routine monthly generator test. Follow 5-step protocol correctly.</div>
+          </div>
+          <div class="scenario-card" data-id="B4" onclick="selectScenario('B4')">
+            <div class="sc-header">
+              <span class="sc-id">B4</span>
+              <span class="sc-diff hard">Hard</span>
+            </div>
+            <div class="sc-name">Power Failure Cascade</div>
+            <div class="sc-desc">Utility loss + extended generator warmup. Manage battery and thermal.</div>
+          </div>
+        </div>
+      </div>
+      <div class="sidebar-actions">
+        <select id="configSelect" class="config-select">
+          <option value="default">Default Facility (2 zones, 160 kW)</option>
+          <option value="small">Small Facility (1 zone, 80 kW)</option>
+          <option value="large">Large Facility (4 zones, 600 kW)</option>
+        </select>
+        <button id="startBtn" class="btn btn-primary" onclick="startEpisode()" disabled>
+          Select a Scenario
+        </button>
+        <button id="resetBtn" class="btn btn-outline" onclick="resetEpisode()" style="display:none">
+          Reset Episode
+        </button>
+      </div>
+    </aside>
+    <!-- Center: Dashboard Display -->
+    <div class="center">
+      <div class="dashboard-container" id="dashboardContainer">
+        <div id="doneBanner" class="episode-done-banner"></div>
+        <div class="dashboard-box">
+          <div class="dashboard-title-bar">
+            <div class="dots"><span></span><span></span><span></span></div>
+            <div class="title" id="terminalTitle">dc-ops-console</div>
+          </div>
+          <div class="dashboard-output" id="dashboardOutput"><div class="welcome">
+<h2>DC-Ops Operations Console</h2>
+<p>Select a scenario from the panel to begin a datacenter operations episode. Issue commands and monitor the facility in real-time.</p>
+<div class="hint">
+<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M19 12H5M12 19l-7-7 7-7"/></svg>
+Pick a scenario to start
+</div>
+</div></div>
+        </div>
+        <div id="actionResult" class="action-result" style="display:none"></div>
+      </div>
+      <!-- Command Bar -->
+      <div class="command-bar">
+        <div class="command-input-group">
+          <input type="text" id="commandInput" class="command-input"
+                 placeholder="Enter command (e.g., diagnose CRAC-3)"
+                 disabled autocomplete="off"
+                 onkeydown="if(event.key==='Enter'&&!event.shiftKey)sendCommand()">
+          <button id="sendBtn" class="btn btn-primary" onclick="sendCommand()" disabled>Send</button>
+        </div>
+        <div class="quick-actions" id="quickActions">
+          <button class="quick-btn" disabled onclick="quickCmd('check_status')">check_status</button>
+          <button class="quick-btn" disabled onclick="quickCmd('diagnose CRAC-1')">diagnose CRAC-1</button>
+          <button class="quick-btn" disabled onclick="quickCmd('diagnose CRAC-3')">diagnose CRAC-3</button>
+          <button class="quick-btn" disabled onclick="quickCmd('acknowledge_alarm')">ack_alarm</button>
+          <button class="quick-btn" disabled onclick="quickCmd('start_generator')">start_gen</button>
+          <button class="quick-btn" disabled onclick="quickCmd('wait')">wait</button>
+          <button class="quick-btn" disabled onclick="quickCmd('escalate')">escalate</button>
+        </div>
+      </div>
+    </div>
+    <!-- Right: Metrics Panel -->
+    <aside class="right-panel" id="right-panel">
+      <div class="panel-section">
+        <div class="panel-section-title">Episode</div>
+        <div class="episode-info">
+          <div class="episode-row">
+            <span class="label">Scenario</span>
+            <span class="value" id="metaScenario">--</span>
+          </div>
+          <div class="episode-row">
+            <span class="label">Step</span>
+            <span class="value"><span id="metaStep">0</span> / <span id="metaMaxSteps">--</span></span>
+          </div>
+          <div class="progress-bar">
+            <div class="progress-fill" id="stepProgress" style="width:0%"></div>
+          </div>
+          <div class="episode-row">
+            <span class="label">Total Reward</span>
+            <span class="value" id="metaCumReward" style="color:var(--text)">0.00</span>
+          </div>
+        </div>
+      </div>
+      <div class="panel-section">
+        <div class="panel-section-title">Key Metrics</div>
+        <div class="metrics-grid">
+          <div class="metric">
+            <div class="metric-label">PUE</div>
+            <div class="metric-value neutral" id="metricPUE">--</div>
+          </div>
+          <div class="metric">
+            <div class="metric-label">IT Load</div>
+            <div class="metric-value neutral" id="metricIT">--</div>
+          </div>
+          <div class="metric">
+            <div class="metric-label">Cooling</div>
+            <div class="metric-value neutral" id="metricCooling">--</div>
+          </div>
+          <div class="metric">
+            <div class="metric-label">Outside</div>
+            <div class="metric-value neutral" id="metricOutside">--</div>
+          </div>
+        </div>
+      </div>
+      <div class="panel-section">
+        <div class="panel-section-title">Zone Temperatures</div>
+        <div class="zone-bars" id="zoneBars">
+          <div class="no-data">No data</div>
+        </div>
+      </div>
+      <div class="panel-section">
+        <div class="panel-section-title">Power</div>
+        <div id="powerInfo">
+          <div class="no-data">No data</div>
+        </div>
+      </div>
+      <div class="panel-section">
+        <div class="panel-section-title">Reward History</div>
+        <div class="reward-history" id="rewardHistory">
+          <div class="no-data">No steps yet</div>
+        </div>
+      </div>
+    </aside>
+  </div>
+</div>
+<script>
+// ─── State ───────────────────────────────────────────────────────────
+let selectedScenario = null; // id of the scenario card picked in the sidebar (a key of SCENARIOS), or null
+let episodeActive = false; // true from a successful reset until the episode is done, reset, or the socket closes
+let stepCount = 0; // number of step commands answered without error in the current episode
+let maxSteps = 0; // recomputed on every response as steps_remaining + stepCount
+let cumulativeReward = 0; // running sum of rewards, shown as "Total Reward"
+let rewardEntries = []; // {step, cmd, reward} records rendered in the reward history list
+let isProcessing = false; // re-entrancy guard shared by startEpisode and sendCommand
+let ws = null; // current WebSocket, or null when no connection is held
+let pendingResolve = null; // resolver of the single in-flight WS request; null when nothing is awaited
+const BASE_URL = window.location.origin; // base for the HTTP /health probe
+// ─── Scenario metadata ──────────────────────────────────────────────
+const SCENARIOS = { // keyed by scenario id; this script only reads `.name`
+  A1: { name: 'Cooling Setpoint Optimization', type: 'thermal', diff: 'Easy' },
+  A2: { name: 'Thermal Event Response', type: 'thermal', diff: 'Medium' },
+  A4: { name: 'CRAC Failure Cascade', type: 'thermal', diff: 'Hard' },
+  B1: { name: 'UPS Alarm Response', type: 'power', diff: 'Medium' },
+  B3: { name: 'Generator Test Protocol', type: 'power', diff: 'Easy' },
+  B4: { name: 'Power Failure Cascade', type: 'power', diff: 'Hard' },
+};
+// ─── WebSocket connection ────────────────────────────────────────────
+function connectWebSocket() {
+  return new Promise((resolve, reject) => {
+    const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
+    const wsUrl = `${wsProtocol}//${window.location.host}/ws`;
+    ws = new WebSocket(wsUrl);
+    ws.onopen = () => {
+      setStatus('connected');
+      resolve();
+    };
+    ws.onmessage = (event) => {
+      const msg = JSON.parse(event.data);
+      if (pendingResolve) {
+        const resolver = pendingResolve;
+        pendingResolve = null;
+        resolver(msg);
+      }
+    };
+    ws.onerror = (err) => {
+      setStatus('disconnected');
+      reject(new Error('WebSocket connection failed'));
+    };
+    ws.onclose = () => {
+      setStatus('disconnected');
+      ws = null;
+      if (episodeActive) {
+        episodeActive = false;
+        setControlsEnabled(false);
+        showActionResult('WebSocket disconnected. Reset to reconnect.', 'error');
+      }
+    };
+  });
+}
+function wsSend(message) {
+  return new Promise((resolve, reject) => {
+    if (!ws || ws.readyState !== WebSocket.OPEN) {
+      reject(new Error('WebSocket not connected'));
+      return;
+    }
+    pendingResolve = resolve;
+    ws.send(JSON.stringify(message));
+    // Timeout after 30s
+    setTimeout(() => {
+      if (pendingResolve === resolve) {
+        pendingResolve = null;
+        reject(new Error('WebSocket request timed out'));
+      }
+    }, 30000);
+  });
+}
+function closeWebSocket() {
+  if (ws) {
+    ws.close();
+    ws = null;
+  }
+  pendingResolve = null;
+}
+// ─── UI helpers ──────────────────────────────────────────────────────
+function selectScenario(id) {
+  if (episodeActive) return;
+  selectedScenario = id;
+  document.querySelectorAll('.scenario-card').forEach(c => c.classList.remove('active'));
+  const card = document.querySelector(`.scenario-card[data-id="${id}"]`);
+  if (card) card.classList.add('active');
+  const btn = document.getElementById('startBtn');
+  btn.disabled = false;
+  btn.textContent = `Start ${id}: ${SCENARIOS[id].name}`;
+}
+function togglePanel(id) {
+  const panel = document.getElementById(id);
+  panel.classList.toggle('collapsed');
+  const btnId = id === 'sidebar' ? 'toggleScenarios' : 'toggleMetrics';
+  document.getElementById(btnId).classList.toggle('active');
+}
+function setControlsEnabled(enabled) {
+  document.getElementById('commandInput').disabled = !enabled;
+  document.getElementById('sendBtn').disabled = !enabled;
+  document.querySelectorAll('.quick-btn').forEach(b => b.disabled = !enabled);
+}
+function quickCmd(cmd) {
+  if (!episodeActive || isProcessing) return;
+  document.getElementById('commandInput').value = cmd;
+  sendCommand();
+}
+function showActionResult(msg, type) {
+  const el = document.getElementById('actionResult');
+  el.style.display = 'block';
+  el.textContent = msg;
+  el.className = 'action-result ' + type;
+}
+function setStatus(state) {
+  const badge = document.getElementById('statusBadge');
+  const text = document.getElementById('statusText');
+  badge.className = 'status-badge ' + state;
+  text.textContent = state === 'connected' ? 'Connected' :
+                     state === 'loading' ? 'Loading...' : 'Disconnected';
+}
+// ─── Dashboard text parsing ──────────────────────────────────────────
+function parseDashboard(dashboard) {
+  const metrics = {};
+  // PUE
+  const pueMatch = dashboard.match(/PUE:\s+([\d.]+)/);
+  if (pueMatch) metrics.pue = parseFloat(pueMatch[1]);
+  // IT Load
+  const itMatch = dashboard.match(/IT Load:\s+([\d.]+)\s*kW/);
+  if (itMatch) metrics.itLoad = parseFloat(itMatch[1]);
+  // Cooling
+  const coolMatch = dashboard.match(/Cooling:\s+([\d.]+)\s*kW/);
+  if (coolMatch) metrics.cooling = parseFloat(coolMatch[1]);
+  // Outside temp
+  const outMatch = dashboard.match(/Outside:\s+([\d.]+)°C/);
+  if (outMatch) metrics.outside = parseFloat(outMatch[1]);
+  // Zone temperatures
+  metrics.zones = [];
+  const zoneRegex = /(zone_\w+)\s+([\d.]+)°C\s+([\d.]+)°C\s+([\d.]+)°C/g;
+  let zm;
+  while ((zm = zoneRegex.exec(dashboard)) !== null) {
+    metrics.zones.push({
+      id: zm[1],
+      cold: parseFloat(zm[2]),
+      hot: parseFloat(zm[3]),
+      inlet: parseFloat(zm[4])
+    });
+  }
+  // Power info
+  const utilMatch = dashboard.match(/Utility:\s+(\w+)/);
+  if (utilMatch) metrics.utility = utilMatch[1];
+  const genMatch = dashboard.match(/Gen:\s+([^\n|]+)/);
+  if (genMatch) metrics.generator = genMatch[1].trim();
+  const atsMatch = dashboard.match(/ATS:\s+(\w+)/);
+  if (atsMatch) metrics.ats = atsMatch[1];
+  const upsMatch = dashboard.match(/UPS:\s+(.+)/);
+  if (upsMatch) metrics.ups = upsMatch[1].trim();
+  return metrics;
+}
+// ─── Start episode ───────────────────────────────────────────────────
+async function startEpisode() {
+  if (!selectedScenario || isProcessing) return;
+  isProcessing = true;
+  const btn = document.getElementById('startBtn');
+  btn.disabled = true;
+  btn.innerHTML = '<span class="spinner"></span> Starting...';
+  setStatus('loading');
+  try {
+    // Close any existing WebSocket connection
+    closeWebSocket();
+    // Open a fresh WebSocket session (each WS gets its own env instance)
+    await connectWebSocket();
+    // Send reset via WebSocket
+    const resetData = { scenario: selectedScenario };
+    const configName = document.getElementById('configSelect').value;
+    if (configName && configName !== 'default') {
+      resetData.config_name = configName;
+    }
+    const resp = await wsSend({ type: 'reset', data: resetData });
+    if (resp.type === 'error') {
+      throw new Error(resp.data?.message || 'Reset failed');
+    }
+    // resp: { type: "observation", data: { observation: {...}, reward: float, done: bool } }
+    episodeActive = true;
+    stepCount = 0;
+    cumulativeReward = 0;
+    rewardEntries = [];
+    processResponse(resp.data);
+    setControlsEnabled(true);
+    document.getElementById('startBtn').style.display = 'none';
+    document.getElementById('resetBtn').style.display = 'block';
+    document.getElementById('doneBanner').classList.remove('show');
+    document.getElementById('rewardHistory').innerHTML = '<div class="no-data">No steps yet</div>';
+    document.getElementById('actionResult').style.display = 'none';
+    const info = SCENARIOS[selectedScenario];
+    document.getElementById('metaScenario').textContent = `${selectedScenario} - ${info.name}`;
+  } catch (e) {
+    setStatus('disconnected');
+    showActionResult('Failed to start: ' + e.message, 'error');
+    btn.disabled = false;
+    btn.textContent = `Start ${selectedScenario}: ${SCENARIOS[selectedScenario].name}`;
+    closeWebSocket();
+  } finally {
+    isProcessing = false;
+  }
+}
+// ─── Reset episode ───────────────────────────────────────────────────
+function resetEpisode() {
+  episodeActive = false;
+  setControlsEnabled(false);
+  closeWebSocket();
+  document.getElementById('startBtn').style.display = 'block';
+  document.getElementById('startBtn').disabled = false;
+  document.getElementById('startBtn').textContent =
+    selectedScenario ? `Start ${selectedScenario}: ${SCENARIOS[selectedScenario].name}` : 'Select a Scenario';
+  document.getElementById('resetBtn').style.display = 'none';
+  document.getElementById('doneBanner').classList.remove('show');
+  document.getElementById('terminalTitle').textContent = 'dc-ops-console';
+  setStatus('disconnected');
+}
+// ─── Send command ────────────────────────────────────────────────────
+async function sendCommand() {
+  const input = document.getElementById('commandInput');
+  const cmd = input.value.trim();
+  if (!cmd || !episodeActive || isProcessing) return;
+  input.value = '';
+  isProcessing = true;
+  setControlsEnabled(false);
+  const sendBtn = document.getElementById('sendBtn');
+  sendBtn.disabled = false;
+  sendBtn.innerHTML = '<span class="spinner"></span>';
+  try {
+    // WebSocket step: { type: "step", data: { command: "...", reasoning: "" } }
+    const resp = await wsSend({
+      type: 'step',
+      data: { command: cmd, reasoning: '' }
+    });
+    if (resp.type === 'error') {
+      throw new Error(resp.data?.message || 'Step failed');
+    }
+    stepCount++;
+    processResponse(resp.data, cmd);
+  } catch (e) {
+    showActionResult('Error: ' + e.message, 'error');
+  } finally {
+    isProcessing = false;
+    sendBtn.textContent = 'Send';
+    if (episodeActive) {
+      setControlsEnabled(true);
+      input.focus();
+    }
+  }
+}
+// ─── Process API response ────────────────────────────────────────────
+function processResponse(data, command = null) {
+  // Response format: { observation: {...}, reward: float|null, done: bool }
+  const obs = data.observation || {};
+  const reward = data.reward || 0;
+  const done = data.done || false;
+  // ── Dashboard display ──
+  const dashEl = document.getElementById('dashboardOutput');
+  const dashboard = obs.dashboard || '';
+  if (dashboard) {
+    dashEl.textContent = dashboard;
+  }
+  // Auto-scroll dashboard to bottom
+  const container = document.getElementById('dashboardContainer');
+  container.scrollTop = container.scrollHeight;
+  // ── Action result ──
+  if (obs.action_result && command) {
+    const isErr = /error|invalid|unknown|unrecognized|fail/i.test(obs.action_result);
+    showActionResult(obs.action_result, isErr ? 'error' : 'success');
+  }
+  // ── Parse metrics from dashboard text ──
+  const metrics = parseDashboard(dashboard);
+  // ── Steps ──
+  const stepsRemaining = obs.steps_remaining || 0;
+  maxSteps = stepsRemaining + stepCount;
+  document.getElementById('metaStep').textContent = stepCount;
+  document.getElementById('metaMaxSteps').textContent = maxSteps;
+  const pct = maxSteps > 0 ? (stepCount / maxSteps) * 100 : 0;
+  const progEl = document.getElementById('stepProgress');
+  progEl.style.width = pct + '%';
+  progEl.className = 'progress-fill ' + (pct < 50 ? 'low' : pct < 80 ? 'mid' : 'high');
+  // ── Cumulative reward ──
+  cumulativeReward += reward;
+  const cumEl = document.getElementById('metaCumReward');
+  cumEl.textContent = cumulativeReward.toFixed(2);
+  cumEl.style.color = cumulativeReward > 0 ? 'var(--green)' : cumulativeReward < -0.5 ? 'var(--red)' : 'var(--text)';
+  // ── Key metrics from parsed dashboard ──
+  if (metrics.pue !== undefined) {
+    const el = document.getElementById('metricPUE');
+    el.textContent = metrics.pue.toFixed(2);
+    el.className = 'metric-value ' + (metrics.pue < 1.5 ? 'good' : metrics.pue < 1.8 ? 'warn' : 'danger');
+  }
+  if (metrics.itLoad !== undefined) {
+    document.getElementById('metricIT').textContent = metrics.itLoad.toFixed(0) + ' kW';
+    document.getElementById('metricIT').className = 'metric-value neutral';
+  }
+  if (metrics.cooling !== undefined) {
+    document.getElementById('metricCooling').textContent = metrics.cooling.toFixed(0) + ' kW';
+    document.getElementById('metricCooling').className = 'metric-value neutral';
+  }
+  if (metrics.outside !== undefined) {
+    document.getElementById('metricOutside').textContent = metrics.outside.toFixed(0) + '°C';
+    document.getElementById('metricOutside').className = 'metric-value neutral';
+  }
+  // ── Zone temperature bars ──
+  if (metrics.zones && metrics.zones.length > 0) {
+    updateZoneBars(metrics.zones);
+  }
+  // ── Power info ──
+  updatePowerInfo(metrics);
+  // ── Reward history ──
+  if (command) {
+    rewardEntries.push({ step: stepCount, cmd: command, reward: reward });
+    updateRewardHistory();
+  }
+  // ── Terminal title ──
+  document.getElementById('terminalTitle').textContent =
+    `dc-ops — ${selectedScenario} — step ${stepCount}/${maxSteps}`;
+  // ── Episode done ──
+  if (done) {
+    episodeActive = false;
+    setControlsEnabled(false);
+    const banner = document.getElementById('doneBanner');
+    banner.classList.add('show');
+    const alert = obs.alert || '';
+    if (alert.toLowerCase().includes('resolved') || alert.toLowerCase().includes('success') ||
+        alert.toLowerCase().includes('complete')) {
+      banner.className = 'episode-done-banner show resolved';
+      banner.textContent = 'Scenario Resolved Successfully';
+    } else if (alert.toLowerCase().includes('critical') || alert.toLowerCase().includes('emergency') ||
+               alert.toLowerCase().includes('shutdown')) {
+      banner.className = 'episode-done-banner show failed';
+      banner.textContent = 'Episode Ended — Critical Failure';
+    } else {
+      banner.className = 'episode-done-banner show timeout';
+      banner.textContent = `Episode Ended — ${stepCount >= maxSteps ? 'Budget exhausted' : 'Terminated'}`;
+    }
+  }
+}
+// ─── Zone bars ───────────────────────────────────────────────────────
+function updateZoneBars(zones) {
+  const container = document.getElementById('zoneBars');
+  container.innerHTML = '';
+  for (const z of zones) {
+    const temp = z.inlet;
+    const pct = Math.max(0, Math.min(100, ((temp - 15) / 30) * 100));
+    const cls = temp <= 27 ? 'safe' : temp <= 35 ? 'warning' : 'critical';
+    const colorVar = cls === 'safe' ? '--green' : cls === 'warning' ? '--yellow' : '--red';
+    const label = z.id.replace('zone_', '').toUpperCase();
+    const row = document.createElement('div');
+    row.className = 'zone-bar-row';
+    row.innerHTML = `
+      <span class="zone-bar-label">${label}</span>
+      <div class="zone-bar-track">
+        <div class="zone-bar-fill ${cls}" style="width:${pct}%"></div>
+      </div>
+      <span class="zone-bar-value" style="color:var(${colorVar})">${temp.toFixed(1)}°C</span>`;
+    container.appendChild(row);
+  }
+}
+// ─── Power info ──────────────────────────────────────────────────────
+function updatePowerInfo(metrics) {
+  const container = document.getElementById('powerInfo');
+  let html = '';
+  if (metrics.utility) {
+    const cls = metrics.utility === 'NORMAL' ? 'ok' : 'bad';
+    html += `<div class="power-row"><span class="pw-label">Utility</span><span class="pw-val ${cls}">${metrics.utility}</span></div>`;
+  }
+  if (metrics.generator) {
+    const cls = metrics.generator.startsWith('OFF') ? 'ok' :
+                metrics.generator.startsWith('LOADED') ? 'warn' : 'warn';
+    html += `<div class="power-row"><span class="pw-label">Generator</span><span class="pw-val ${cls}">${metrics.generator}</span></div>`;
+  }
+  if (metrics.ats) {
+    const cls = metrics.ats === 'UTILITY' ? 'ok' : 'warn';
+    html += `<div class="power-row"><span class="pw-label">ATS</span><span class="pw-val ${cls}">${metrics.ats}</span></div>`;
+  }
+  if (metrics.ups) {
+    const parts = metrics.ups.split('|').map(s => s.trim()).filter(Boolean);
+    for (const p of parts) {
+      const hasBattery = /BATTERY/i.test(p);
+      const hasFault = /FAULT/i.test(p);
+      const cls = hasFault ? 'bad' : hasBattery ? 'warn' : 'ok';
+      html += `<div class="power-row"><span class="pw-label">UPS</span><span class="pw-val ${cls}">${p}</span></div>`;
+    }
+  }
+  container.innerHTML = html || '<div class="no-data">No data</div>';
+}
+// ─── Reward history ──────────────────────────────────────────────────
+function updateRewardHistory() {
+  const container = document.getElementById('rewardHistory');
+  container.innerHTML = '';
+  for (let i = rewardEntries.length - 1; i >= 0; i--) {
+    const e = rewardEntries[i];
+    const cls = e.reward > 0.005 ? 'pos' : e.reward < -0.005 ? 'neg' : 'zero';
+    const sign = e.reward >= 0 ? '+' : '';
+    const div = document.createElement('div');
+    div.className = 'reward-entry';
+    div.innerHTML = `
+      <span class="step">${e.step}</span>
+      <span class="cmd" title="${e.cmd}">${e.cmd}</span>
+      <span class="rew ${cls}">${sign}${e.reward.toFixed(3)}</span>`;
+    container.appendChild(div);
+  }
+}
+// ─── Health check ────────────────────────────────────────────────────
+async function checkHealth() {
+  try {
+    const resp = await fetch(`${BASE_URL}/health`);
+    if (resp.ok) setStatus('connected');
+    else setStatus('disconnected');
+  } catch (e) {
+    setStatus('disconnected');
+  }
+}
+// ─── Init ────────────────────────────────────────────────────────────
+checkHealth();
+</script>
+</body>
+</html>

simulation/__init__.py ADDED Viewed

	@@ -0,0 +1,49 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Datacenter simulation engine."""
+from .power import PowerAlarm, PowerSimulation, PowerStepResult
+from .thermal import ThermalAlarm, ThermalSimulation, ThermalStepResult
+from .types import (
+    ATSPosition,
+    ATSState,
+    CRACFaultType,
+    CRACState,
+    CRACStatus,
+    DatacenterState,
+    GeneratorState,
+    GensetState,
+    PDUState,
+    PowerState,
+    RackState,
+    UPSMode,
+    UPSState,
+    ZoneState,
+)
+# Public API of the simulation package: every name imported above is re-exported
+# (power and thermal engine classes first, then the shared state/enum types).
+__all__ = [
+    "PowerAlarm",
+    "PowerSimulation",
+    "PowerStepResult",
+    "ThermalAlarm",
+    "ThermalSimulation",
+    "ThermalStepResult",
+    "ATSPosition",
+    "ATSState",
+    "CRACFaultType",
+    "CRACState",
+    "CRACStatus",
+    "DatacenterState",
+    "GeneratorState",
+    "GensetState",
+    "PDUState",
+    "PowerState",
+    "RackState",
+    "UPSMode",
+    "UPSState",
+    "ZoneState",
+]

simulation/power.py ADDED Viewed

	@@ -0,0 +1,668 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Power subsystem simulation: UPS, PDU, Generator, ATS.
+Models the electrical power chain from utility/generator through UPS and PDU
+to IT loads. Tracks efficiency losses, battery state-of-charge, generator
+fuel consumption, and automatic transfer switching.
+Physics references:
+  - UPS quadratic loss model: APC White Paper 108
+  - PDU three-phase power: P = √3 × V_LL × I_L × PF
+  - Generator fuel: linear with load fraction + 10% idle
+  - ATS transfer: mechanical switch timing (50-200 ms)
+"""
+from __future__ import annotations
+import math
+from dataclasses import dataclass, field
+from ..config import (
+    ATSConfig,
+    GeneratorConfig,
+    PDUConfig,
+    PowerConfig,
+    UPSConfig,
+)
+from .types import (
+    ATSPosition,
+    ATSState,
+    GeneratorState,
+    GensetState,
+    PDUState,
+    PowerState,
+    UPSMode,
+    UPSState,
+)
+# ---------------------------------------------------------------------------
+# Power step result
+# ---------------------------------------------------------------------------
+@dataclass
+class PowerAlarm:
+    """A power subsystem alarm.
+    Instances are appended to the per-step alarm list by PowerSimulation and
+    returned to the caller through PowerStepResult.alarms.
+    """
+    component: str        # e.g. "UPS-1", "PDU-A1", "GEN-1", "ATS-1"
+    alarm_type: str       # e.g. "on_battery", "low_battery", "overload", "fuel_low"
+    severity: str         # "warning", "critical"
+    message: str          # Human-readable description of the event
+    value: float = 0.0    # Relevant numeric value (SOC, load%, fuel level, etc.)
+@dataclass
+class PowerStepResult:
+    """Result of a single power simulation step.
+    Built by PowerSimulation.step() from the losses it computed and from the
+    power state after the step.
+    """
+    total_ups_loss_kw: float = 0.0            # Sum of the losses returned by every UPS unit
+    total_pdu_loss_kw: float = 0.0            # Sum of the losses returned by every PDU
+    total_power_overhead_kw: float = 0.0      # UPS losses + PDU losses
+    generator_output_kw: float = 0.0          # Generator output_power_kw after the step
+    generator_fuel_remaining_liters: float = 0.0  # Generator fuel_level_liters after the step
+    utility_available: bool = True            # Utility availability flag of the power state
+    on_generator: bool = False                # PowerState.on_generator after the step
+    power_available: bool = True              # PowerState.power_available, read after the ATS/generator updates
+    alarms: list[PowerAlarm] = field(default_factory=list)  # Alarms raised during this step
+# ---------------------------------------------------------------------------
+# Power simulation
+# ---------------------------------------------------------------------------
+class PowerSimulation:
+    """Simulates the datacenter power distribution chain.
+    Power flow:
+        Utility/Generator → ATS → UPS(es) → PDU(s) → IT Load
+    Each step():
+      1. ATS: detect utility loss/restoration, manage transfer
+      2. Generator: state machine (off → start_delay → cranking → warming → ready → loaded)
+      3. UPS: compute efficiency, manage battery SOC
+      4. PDU: compute losses, check phase currents
+    """
+    def __init__(self, power_config: PowerConfig, it_load_kw: float = 160.0) -> None:
+        self._config = power_config
+        self._state = self._init_state(power_config)
+        self._it_load_kw = it_load_kw
+    @property
+    def state(self) -> PowerState:
+        """The live PowerState owned by this simulation (not a copy)."""
+        return self._state
+    @staticmethod
+    def _init_state(config: PowerConfig) -> PowerState:
+        """Initialize power state from configuration."""
+        ups_units = []
+        for uc in config.ups_units:
+            ups = UPSState(
+                unit_id=uc.unit_id,
+                mode=UPSMode(uc.initial_mode),
+                rated_capacity_kw=uc.rated_capacity_kw,
+                loss_c0=uc.loss_c0,
+                loss_c1=uc.loss_c1,
+                loss_c2=uc.loss_c2,
+                battery_capacity_kwh=uc.battery_capacity_kwh,
+                battery_discharge_efficiency=uc.battery_discharge_efficiency,
+                battery_aging_factor=uc.battery_aging_factor,
+                recharge_rate_kw=uc.recharge_rate_kw,
+                battery_soc=1.0,
+            )
+            ups_units.append(ups)
+        pdus = []
+        for pc in config.pdus:
+            pdu = PDUState(
+                pdu_id=pc.pdu_id,
+                voltage_ll_v=pc.voltage_ll_v,
+                max_current_per_phase_a=pc.max_current_per_phase_a,
+                num_phases=pc.num_phases,
+                breaker_rating_a=pc.breaker_rating_a,
+                efficiency=pc.efficiency,
+                continuous_derating=pc.continuous_derating,
+            )
+            pdus.append(pdu)
+        gen_cfg = config.generator
+        generator = GensetState(
+            gen_id=gen_cfg.gen_id,
+            rated_capacity_kw=gen_cfg.rated_capacity_kw,
+            start_delay_s=gen_cfg.start_delay_s,
+            crank_time_s=gen_cfg.crank_time_s,
+            warmup_time_s=gen_cfg.warmup_time_s,
+            cooldown_time_s=gen_cfg.cooldown_time_s,
+            fuel_tank_liters=gen_cfg.fuel_tank_liters,
+            fuel_level_liters=gen_cfg.fuel_tank_liters,
+            consumption_lph_full=gen_cfg.consumption_lph_full,
+        )
+        ats_cfg = config.ats
+        ats = ATSState(
+            ats_id=ats_cfg.ats_id,
+            transfer_time_ms=ats_cfg.transfer_time_ms,
+            retransfer_delay_s=ats_cfg.retransfer_delay_s,
+        )
+        return PowerState(
+            ups_units=ups_units,
+            pdus=pdus,
+            generator=generator,
+            ats=ats,
+            utility_available=config.utility_available,
+            utility_voltage_v=config.utility_voltage_v,
+        )
+    def step(self, dt_s: float, it_load_kw: float) -> PowerStepResult:
+        """Advance the power simulation by dt_s seconds.
+        Args:
+            dt_s: Timestep in seconds.
+            it_load_kw: Total IT power demand in kW.
+        Returns:
+            PowerStepResult with losses, alarms, and status.
+        """
+        self._it_load_kw = it_load_kw
+        alarms: list[PowerAlarm] = []
+        # 1. ATS logic: detect utility state changes
+        self._step_ats(dt_s, alarms)
+        # 2. Generator state machine
+        self._step_generator(dt_s, alarms)
+        # 3. Determine if load-side power is available
+        power_available = self._state.power_available
+        # 4. UPS: efficiency, battery, losses
+        total_ups_loss = self._step_ups_units(dt_s, it_load_kw, alarms)
+        # 5. PDU: losses, phase currents
+        total_pdu_loss = self._step_pdus(it_load_kw, alarms)
+        return PowerStepResult(
+            total_ups_loss_kw=total_ups_loss,
+            total_pdu_loss_kw=total_pdu_loss,
+            total_power_overhead_kw=total_ups_loss + total_pdu_loss,
+            generator_output_kw=self._state.generator.output_power_kw,
+            generator_fuel_remaining_liters=self._state.generator.fuel_level_liters,
+            utility_available=self._state.utility_available,
+            on_generator=self._state.on_generator,
+            power_available=power_available,
+            alarms=alarms,
+        )
+    # -------------------------------------------------------------------
+    # ATS
+    # -------------------------------------------------------------------
+    def _step_ats(self, dt_s: float, alarms: list[PowerAlarm]) -> None:
+        """Advance the automatic transfer switch (ATS) by ``dt_s`` seconds.
+        Transitions handled:
+          - UTILITY → TRANSFERRING when utility is lost (also requests a generator start)
+          - TRANSFERRING → UTILITY or GENERATOR once the transfer time has elapsed
+          - GENERATOR → TRANSFERRING after utility has been back for the retransfer delay
+        Args:
+            dt_s: Timestep in seconds.
+            alarms: List that alarms raised during this step are appended to.
+        """
+        ats = self._state.ats
+        gen = self._state.generator
+        utility_ok = self._state.utility_available
+        if ats.position == ATSPosition.UTILITY:
+            if not utility_ok:
+                # Utility lost — initiate transfer to generator
+                ats.position = ATSPosition.TRANSFERRING
+                ats.transfer_elapsed_ms = 0.0
+                ats.retransfer_timer_s = 0.0
+                # Start generator if not already running
+                if gen.state == GeneratorState.OFF:
+                    gen.state = GeneratorState.START_DELAY
+                    gen.state_elapsed_s = 0.0
+                alarms.append(PowerAlarm(
+                    component=ats.ats_id,
+                    alarm_type="utility_lost",
+                    severity="critical",
+                    message="Utility power lost, initiating transfer to generator",
+                ))
+        elif ats.position == ATSPosition.TRANSFERRING:
+            # The transfer timer is kept in milliseconds; dt_s is in seconds.
+            ats.transfer_elapsed_ms += dt_s * 1000.0
+            if ats.transfer_elapsed_ms >= ats.transfer_time_ms:
+                # Transfer complete
+                if utility_ok:
+                    # Utility came back during transfer — go back to utility
+                    ats.position = ATSPosition.UTILITY
+                    ats.transfer_elapsed_ms = 0.0
+                elif gen.is_available:
+                    ats.position = ATSPosition.GENERATOR
+                    ats.transfer_elapsed_ms = 0.0
+                    alarms.append(PowerAlarm(
+                        component=ats.ats_id,
+                        alarm_type="on_generator",
+                        severity="warning",
+                        message="Load transferred to generator",
+                    ))
+                # else: stay transferring until generator is ready
+                # NOTE(review): this branch never requests a generator start, so if
+                # the generator is OFF here the ATS appears to wait until utility
+                # returns or start_generator() is called — confirm this is intended.
+        elif ats.position == ATSPosition.GENERATOR:
+            if utility_ok:
+                # Utility restored — wait retransfer delay before switching back
+                ats.retransfer_timer_s += dt_s
+                if ats.retransfer_timer_s >= ats.retransfer_delay_s:
+                    ats.position = ATSPosition.TRANSFERRING
+                    ats.transfer_elapsed_ms = 0.0
+                    alarms.append(PowerAlarm(
+                        component=ats.ats_id,
+                        alarm_type="retransfer",
+                        severity="warning",
+                        message="Utility restored, initiating retransfer",
+                    ))
+            else:
+                # Utility dropped again: the stability delay starts over.
+                ats.retransfer_timer_s = 0.0
+    # -------------------------------------------------------------------
+    # Generator
+    # -------------------------------------------------------------------
+    def _step_generator(self, dt_s: float, alarms: list[PowerAlarm]) -> None:
+        """Advance generator state machine."""
+        gen = self._state.generator
+        if gen.state == GeneratorState.OFF:
+            gen.output_power_kw = 0.0
+            gen.load_fraction = 0.0
+            gen.fuel_consumption_lph = 0.0
+            return
+        gen.state_elapsed_s += dt_s
+        if gen.state == GeneratorState.START_DELAY:
+            if gen.state_elapsed_s >= gen.start_delay_s:
+                gen.state = GeneratorState.CRANKING
+                gen.state_elapsed_s = 0.0
+        elif gen.state == GeneratorState.CRANKING:
+            if gen.state_elapsed_s >= gen.crank_time_s:
+                gen.state = GeneratorState.WARMING
+                gen.state_elapsed_s = 0.0
+                alarms.append(PowerAlarm(
+                    component=gen.gen_id,
+                    alarm_type="engine_started",
+                    severity="warning",
+                    message="Generator engine started, warming up",
+                ))
+        elif gen.state == GeneratorState.WARMING:
+            # Idle fuel consumption during warmup
+            gen.fuel_consumption_lph = gen.consumption_lph_full * 0.1
+            self._consume_fuel(gen, dt_s)
+            if gen.state_elapsed_s >= gen.warmup_time_s:
+                gen.state = GeneratorState.READY
+                gen.state_elapsed_s = 0.0
+                alarms.append(PowerAlarm(
+                    component=gen.gen_id,
+                    alarm_type="ready",
+                    severity="warning",
+                    message="Generator ready to accept load",
+                ))
+        elif gen.state == GeneratorState.READY:
+            gen.fuel_consumption_lph = gen.consumption_lph_full * 0.1
+            self._consume_fuel(gen, dt_s)
+            # If ATS has switched to generator, transition to loaded
+            if self._state.ats.position == ATSPosition.GENERATOR:
+                gen.state = GeneratorState.LOADED
+                gen.state_elapsed_s = 0.0
+        elif gen.state == GeneratorState.LOADED:
+            gen.load_fraction = min(self._it_load_kw / gen.rated_capacity_kw, 1.0)
+            gen.output_power_kw = min(self._it_load_kw, gen.rated_capacity_kw)
+            gen.fuel_consumption_lph = gen.compute_fuel_consumption_lph()
+            self._consume_fuel(gen, dt_s)
+            # Check fuel level
+            if gen.fuel_level_liters <= 0:
+                gen.fuel_level_liters = 0.0
+                gen.state = GeneratorState.OFF
+                gen.output_power_kw = 0.0
+                alarms.append(PowerAlarm(
+                    component=gen.gen_id,
+                    alarm_type="fuel_exhausted",
+                    severity="critical",
+                    message="Generator fuel exhausted — engine shutdown",
+                ))
+            elif gen.fuel_remaining_hours < 2.0:
+                alarms.append(PowerAlarm(
+                    component=gen.gen_id,
+                    alarm_type="fuel_low",
+                    severity="warning",
+                    message=f"Generator fuel low: {gen.fuel_level_liters:.0f}L "
+                            f"(~{gen.fuel_remaining_hours:.1f}h remaining)",
+                    value=gen.fuel_level_liters,
+                ))
+            # If utility is back and ATS has switched away, go to cooldown
+            if self._state.ats.position != ATSPosition.GENERATOR:
+                gen.state = GeneratorState.COOLDOWN
+                gen.state_elapsed_s = 0.0
+                gen.output_power_kw = 0.0
+                gen.load_fraction = 0.0
+        elif gen.state == GeneratorState.COOLDOWN:
+            gen.output_power_kw = 0.0
+            gen.load_fraction = 0.0
+            gen.fuel_consumption_lph = gen.consumption_lph_full * 0.1
+            self._consume_fuel(gen, dt_s)
+            if gen.state_elapsed_s >= gen.cooldown_time_s:
+                gen.state = GeneratorState.OFF
+                gen.state_elapsed_s = 0.0
+                gen.fuel_consumption_lph = 0.0
+                alarms.append(PowerAlarm(
+                    component=gen.gen_id,
+                    alarm_type="shutdown",
+                    severity="warning",
+                    message="Generator cooldown complete, engine off",
+                ))
+    @staticmethod
+    def _consume_fuel(gen: GensetState, dt_s: float) -> None:
+        """Consume fuel for the given timestep."""
+        if gen.fuel_consumption_lph > 0:
+            consumed = gen.fuel_consumption_lph * dt_s / 3600.0  # hours → seconds
+            gen.fuel_level_liters = max(0.0, gen.fuel_level_liters - consumed)
+    # -------------------------------------------------------------------
+    # UPS
+    # -------------------------------------------------------------------
+    def _step_ups_units(
+        self, dt_s: float, it_load_kw: float, alarms: list[PowerAlarm]
+    ) -> float:
+        """Step all UPS units and return total UPS losses in kW."""
+        if not self._state.ups_units:
+            return 0.0
+        # Distribute IT load evenly across UPS units
+        load_per_ups = it_load_kw / len(self._state.ups_units)
+        total_loss = 0.0
+        for ups in self._state.ups_units:
+            loss = self._step_single_ups(ups, dt_s, load_per_ups, alarms)
+            total_loss += loss
+        return total_loss
+    def _step_single_ups(
+        self,
+        ups: UPSState,
+        dt_s: float,
+        load_kw: float,
+        alarms: list[PowerAlarm],
+    ) -> float:
+        """Step a single UPS unit. Returns loss in kW.
+        Sequence: record the load, handle FAULT/BYPASS early exits, switch
+        to/from battery depending on the input source, compute efficiency and
+        losses for the resulting mode, then discharge or recharge the battery.
+        Args:
+            ups: UPS state to update in place.
+            dt_s: Timestep in seconds.
+            load_kw: Output load assigned to this unit in kW.
+            alarms: List that alarms raised during this step are appended to.
+        """
+        ups.output_power_kw = load_kw
+        ups.load_fraction = load_kw / ups.rated_capacity_kw if ups.rated_capacity_kw > 0 else 0.0
+        utility_ok = self._state.utility_available
+        ats_ok = self._state.ats.load_powered
+        # Mode transitions
+        if ups.mode == UPSMode.FAULT:
+            # Fault state: no output, no charging
+            # NOTE(review): output_power_kw and load_fraction were set from
+            # load_kw above and are not reset here — confirm that is intended.
+            ups.efficiency = 0.0
+            ups.heat_output_kw = 0.0
+            ups.input_power_kw = 0.0
+            ups.battery_power_kw = 0.0
+            return 0.0
+        if ups.mode == UPSMode.BYPASS:
+            # Bypass: no UPS processing, minimal losses
+            ups.efficiency = 1.0
+            ups.heat_output_kw = 0.0
+            ups.input_power_kw = load_kw
+            ups.battery_power_kw = 0.0
+            return 0.0
+        # Check if we need to switch to battery
+        # NOTE(review): source_ok requires utility_available, so it stays False
+        # while the load is fed from the generator — confirm that UPS units are
+        # meant to remain ON_BATTERY for the whole utility outage.
+        source_ok = utility_ok and ats_ok
+        if ups.mode == UPSMode.ON_BATTERY:
+            if source_ok:
+                # Source restored — switch back to normal mode
+                # (always DOUBLE_CONVERSION, whatever the mode was before the outage)
+                ups.mode = UPSMode.DOUBLE_CONVERSION
+                alarms.append(PowerAlarm(
+                    component=ups.unit_id,
+                    alarm_type="utility_restored",
+                    severity="warning",
+                    message=f"UPS {ups.unit_id} back on utility power",
+                ))
+        elif not source_ok and ups.mode in (
+            UPSMode.DOUBLE_CONVERSION, UPSMode.LINE_INTERACTIVE, UPSMode.ECO
+        ):
+            ups.mode = UPSMode.ON_BATTERY
+            alarms.append(PowerAlarm(
+                component=ups.unit_id,
+                alarm_type="on_battery",
+                severity="critical",
+                message=f"UPS {ups.unit_id} switched to battery",
+                value=ups.battery_soc,
+            ))
+        # Compute efficiency based on mode
+        if ups.mode == UPSMode.ECO:
+            # Eco mode: ~99% efficiency (minimal processing)
+            ups.efficiency = 0.99
+        elif ups.mode == UPSMode.LINE_INTERACTIVE:
+            # Line interactive: ~97% (some processing)
+            ups.efficiency = min(0.97, ups.compute_efficiency() + 0.03)
+        else:
+            # Double conversion or on_battery: full quadratic model
+            ups.efficiency = ups.compute_efficiency()
+        # Compute losses
+        if ups.efficiency > 0:
+            # loss = input - output, with input = output / efficiency
+            ups_loss = load_kw * (1.0 / ups.efficiency - 1.0)
+        else:
+            # Efficiency unusable: fall back to rated capacity × loss_c0
+            ups_loss = ups.rated_capacity_kw * ups.loss_c0
+        ups.heat_output_kw = ups_loss
+        ups.input_power_kw = load_kw + ups_loss
+        # Battery management
+        if ups.mode == UPSMode.ON_BATTERY:
+            # Discharging: SOC decreases
+            # P_discharge = P_output / η_discharge (battery must supply more than output)
+            p_discharge = load_kw / ups.battery_discharge_efficiency if ups.battery_discharge_efficiency > 0 else load_kw
+            ups.battery_power_kw = p_discharge
+            energy_used_kwh = p_discharge * dt_s / 3600.0
+            # Usable capacity is the nameplate capacity scaled by the aging factor
+            effective_capacity = ups.battery_capacity_kwh * ups.battery_aging_factor
+            if effective_capacity > 0:
+                ups.battery_soc -= energy_used_kwh / effective_capacity
+            ups.battery_soc = max(0.0, ups.battery_soc)
+            ups.battery_time_remaining_s = ups.compute_battery_time_remaining_s()
+            ups.input_power_kw = 0.0  # Not drawing from mains
+            # Battery alarms
+            if ups.battery_soc <= 0.0:
+                # An empty battery puts the unit into FAULT; it stays there until
+                # something outside this method changes the mode.
+                ups.mode = UPSMode.FAULT
+                alarms.append(PowerAlarm(
+                    component=ups.unit_id,
+                    alarm_type="battery_exhausted",
+                    severity="critical",
+                    message=f"UPS {ups.unit_id} battery exhausted — load unprotected",
+                ))
+            elif ups.battery_soc < 0.10:
+                alarms.append(PowerAlarm(
+                    component=ups.unit_id,
+                    alarm_type="battery_critical",
+                    severity="critical",
+                    message=f"UPS {ups.unit_id} battery critical: {ups.battery_soc*100:.0f}%",
+                    value=ups.battery_soc,
+                ))
+            elif ups.battery_soc < 0.25:
+                alarms.append(PowerAlarm(
+                    component=ups.unit_id,
+                    alarm_type="battery_low",
+                    severity="warning",
+                    message=f"UPS {ups.unit_id} battery low: {ups.battery_soc*100:.0f}%",
+                    value=ups.battery_soc,
+                ))
+        else:
+            # On mains — charge battery if not full
+            ups.battery_power_kw = 0.0
+            ups.battery_time_remaining_s = float("inf")
+            if ups.battery_soc < 1.0:
+                # Charge power is capped at 10% of the rated capacity
+                charge_kw = min(ups.recharge_rate_kw, ups.rated_capacity_kw * 0.1)
+                energy_charged_kwh = charge_kw * dt_s / 3600.0
+                effective_capacity = ups.battery_capacity_kwh * ups.battery_aging_factor
+                if effective_capacity > 0:
+                    ups.battery_soc += energy_charged_kwh / effective_capacity
+                ups.battery_soc = min(1.0, ups.battery_soc)
+                ups.battery_power_kw = -charge_kw  # Negative = charging
+                ups.input_power_kw += charge_kw  # Charging draws additional power
+        # Overload alarm
+        if ups.load_fraction > 1.0:
+            alarms.append(PowerAlarm(
+                component=ups.unit_id,
+                alarm_type="overload",
+                severity="critical",
+                message=f"UPS {ups.unit_id} overloaded at {ups.load_fraction*100:.0f}%",
+                value=ups.load_fraction,
+            ))
+        return ups_loss
+    # -------------------------------------------------------------------
+    # PDU
+    # -------------------------------------------------------------------
+    def _step_pdus(
+        self, it_load_kw: float, alarms: list[PowerAlarm]
+    ) -> float:
+        """Step all PDUs and return total PDU losses in kW."""
+        if not self._state.pdus:
+            return 0.0
+        # Distribute IT load evenly across PDUs
+        load_per_pdu = it_load_kw / len(self._state.pdus)
+        total_loss = 0.0
+        for pdu in self._state.pdus:
+            loss = self._step_single_pdu(pdu, load_per_pdu, alarms)
+            total_loss += loss
+        return total_loss
+    def _step_single_pdu(
+        self,
+        pdu: PDUState,
+        load_kw: float,
+        alarms: list[PowerAlarm],
+    ) -> float:
+        """Step a single PDU. Returns loss in kW."""
+        pdu.output_power_kw = load_kw
+        pdu.input_power_kw = load_kw / pdu.efficiency if pdu.efficiency > 0 else load_kw
+        pdu_loss = pdu.input_power_kw - pdu.output_power_kw
+        pdu.heat_output_kw = pdu_loss
+        # Compute per-phase currents (assume balanced load across phases)
+        # P = √3 × V_LL × I_L × PF (assume PF = 1.0 for IT loads with PFC)
+        if pdu.voltage_ll_v > 0:
+            total_current = (load_kw * 1000.0) / (math.sqrt(3) * pdu.voltage_ll_v)
+            per_phase = total_current / pdu.num_phases if pdu.num_phases > 0 else total_current
+            pdu.phase_currents_a = [per_phase] * pdu.num_phases
+        else:
+            pdu.phase_currents_a = [0.0] * pdu.num_phases
+        # Load fraction of derated capacity
+        derated = pdu.derated_capacity_kw
+        pdu.load_fraction = load_kw / derated if derated > 0 else 0.0
+        # Phase imbalance (0 for balanced load — will be nonzero when
+        # individual rack loads are modeled)
+        pdu.phase_imbalance_pct = pdu.compute_phase_imbalance()
+        # Check overload
+        max_phase_current = max(pdu.phase_currents_a) if pdu.phase_currents_a else 0.0
+        if max_phase_current > pdu.max_current_per_phase_a:
+            pdu.overload = True
+            alarms.append(PowerAlarm(
+                component=pdu.pdu_id,
+                alarm_type="phase_overcurrent",
+                severity="critical",
+                message=f"PDU {pdu.pdu_id} phase overcurrent: "
+                        f"{max_phase_current:.1f}A > {pdu.max_current_per_phase_a:.0f}A",
+                value=max_phase_current,
+            ))
+        else:
+            pdu.overload = False
+        # Breaker trip check (per-branch, simplified as aggregate)
+        if max_phase_current > pdu.breaker_rating_a / pdu.continuous_derating:
+            pdu.breaker_tripped = True
+            alarms.append(PowerAlarm(
+                component=pdu.pdu_id,
+                alarm_type="breaker_trip",
+                severity="critical",
+                message=f"PDU {pdu.pdu_id} breaker tripped",
+                value=max_phase_current,
+            ))
+        # Warn on high utilization
+        if pdu.load_fraction > 0.80 and not pdu.overload:
+            alarms.append(PowerAlarm(
+                component=pdu.pdu_id,
+                alarm_type="high_utilization",
+                severity="warning",
+                message=f"PDU {pdu.pdu_id} at {pdu.load_fraction*100:.0f}% of derated capacity",
+                value=pdu.load_fraction,
+            ))
+        return pdu_loss
+    # -------------------------------------------------------------------
+    # Mutation helpers (for agent actions)
+    # -------------------------------------------------------------------
+    def set_utility_available(self, available: bool) -> None:
+        """Set utility power availability (for scenario injection)."""
+        self._state.utility_available = available
+    def set_ups_mode(self, unit_id: str, mode: UPSMode) -> bool:
+        """Manually set UPS operating mode. Returns True if found."""
+        for ups in self._state.ups_units:
+            if ups.unit_id == unit_id:
+                ups.mode = mode
+                return True
+        return False
+    def inject_ups_fault(self, unit_id: str) -> bool:
+        """Put a UPS into fault mode. Returns True if found."""
+        return self.set_ups_mode(unit_id, UPSMode.FAULT)
+    def clear_ups_fault(self, unit_id: str) -> bool:
+        """Restore a faulted UPS to double conversion. Returns True if found."""
+        for ups in self._state.ups_units:
+            if ups.unit_id == unit_id and ups.mode == UPSMode.FAULT:
+                ups.mode = UPSMode.DOUBLE_CONVERSION
+                return True
+        return False
+    def start_generator(self) -> None:
+        """Manually start the generator."""
+        gen = self._state.generator
+        if gen.state == GeneratorState.OFF:
+            gen.state = GeneratorState.START_DELAY
+            gen.state_elapsed_s = 0.0
+    def stop_generator(self) -> None:
+        """Initiate generator cooldown/shutdown."""
+        gen = self._state.generator
+        if gen.state in (GeneratorState.READY, GeneratorState.LOADED):
+            gen.state = GeneratorState.COOLDOWN
+            gen.state_elapsed_s = 0.0
+            gen.output_power_kw = 0.0
+            gen.load_fraction = 0.0
+    def refuel_generator(self, liters: float | None = None) -> None:
+        """Refuel the generator (default: full tank)."""
+        gen = self._state.generator
+        if liters is None:
+            gen.fuel_level_liters = gen.fuel_tank_liters
+        else:
+            gen.fuel_level_liters = min(
+                gen.fuel_level_liters + liters,
+                gen.fuel_tank_liters,
+            )

simulation/thermal.py ADDED Viewed

	@@ -0,0 +1,515 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+RC thermal network simulation for datacenter zones.
+Physics model (lumped-capacitance, per zone):
+    C_zone × dT_zone/dt = Q_IT - Q_cooling + Q_envelope + Q_internal
+Where:
+    C_zone  = C_air + C_equipment                        [J/K]
+    Q_IT    = sum of rack IT loads × 1000                 [W]
+    Q_cool  = sum of CRAC cooling outputs × 1000          [W]
+    Q_env   = (T_outside - T_zone) / R_envelope           [W]
+    Q_int   = UPS losses + PDU losses + lighting          [W]
+Cold aisle temperature accounts for hot-air recirculation:
+    T_cold_effective = (1-r) × T_supply_weighted + r × T_hot_aisle
+where r is the recirculation factor (0 = perfect containment).
+Hot aisle temperature from server energy balance:
+    T_hot = T_cold + Q_IT / (m_dot_rack × c_p)
+Integration: Forward Euler with configurable dt (default 1.0 s).
+Target: <1 ms per step for a 20-rack, 4-CRAC datacenter.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from ..config import (
+    AIR_DENSITY_KG_M3,
+    AIR_SPECIFIC_HEAT_J_KGK,
+    ASHRAE_CLASSES,
+    DatacenterConfig,
+    RackConfig,
+    CRACConfig,
+    ZoneConfig,
+    cfm_to_m3s,
+    make_default_datacenter_config,
+)
+from .types import (
+    CRACFaultType,
+    CRACState,
+    CRACStatus,
+    DatacenterState,
+    RackState,
+    ZoneState,
+)
+@dataclass
+class ThermalAlarm:
+    """An active thermal alarm.
+    Identifies the rack and zone concerned and carries the temperature and
+    threshold values (°C) together with the alarm severity.
+    """
+    rack_id: str
+    zone_id: str
+    inlet_temp_c: float
+    threshold_c: float
+    severity: str  # "warning" (recommended exceeded) or "critical" (allowable exceeded)
+@dataclass
+class ThermalStepResult:
+    """Result of a single simulation step.
+    Bundles the datacenter state with the alarms and cooling/energy totals
+    reported for that step.
+    """
+    state: DatacenterState
+    alarms: list[ThermalAlarm] = field(default_factory=list)
+    # presumably heat removed by the cooling units vs. electrical power they
+    # draw — TODO confirm against the code that fills these fields
+    total_cooling_output_kw: float = 0.0
+    total_cooling_power_kw: float = 0.0
+    energy_consumed_kwh: float = 0.0  # Energy consumed in this step
+class ThermalSimulation:
+    """Multi-zone RC thermal network simulation.
+    Owns the DatacenterState and advances it forward in time.
+    Each call to step() integrates the thermal ODEs by dt seconds.
+    """
+    def __init__(self, config: DatacenterConfig | None = None):
+        if config is None:
+            config = make_default_datacenter_config()
+        self._config = config
+        self._state = self._build_initial_state(config)
+        self._dt = config.simulation_dt_s
+    @property
+    def state(self) -> DatacenterState:
+        """The live DatacenterState owned by this simulation (not a copy)."""
+        return self._state
+    @property
+    def config(self) -> DatacenterConfig:
+        """The configuration this simulation was constructed with."""
+        return self._config
+    @property
+    def dt(self) -> float:
+        """Default step size in seconds, used by step() when no dt is passed."""
+        return self._dt
+    # ------------------------------------------------------------------
+    # Initialization
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _build_initial_state(config: DatacenterConfig) -> DatacenterState:
+        """Construct the initial DatacenterState from configuration."""
+        zones: list[ZoneState] = []
+        for zc in config.zones:
+            racks = ThermalSimulation._build_racks(zc, zc.initial_cold_aisle_temp_c)
+            cracs = ThermalSimulation._build_cracs(zc)
+            zone = ZoneState(
+                zone_id=zc.zone_id,
+                cold_aisle_temp_c=zc.initial_cold_aisle_temp_c,
+                hot_aisle_temp_c=zc.initial_cold_aisle_temp_c + 15.0,  # Initial estimate
+                humidity_rh=zc.initial_humidity_rh,
+                recirculation_factor=zc.recirculation_factor,
+                racks=racks,
+                crac_units=cracs,
+                air_volume_m3=zc.air_volume_m3,
+                envelope_r_kw=zc.envelope_r_kw,
+                ashrae_class=zc.ashrae_class,
+            )
+            zones.append(zone)
+        state = DatacenterState(
+            zones=zones,
+            outside_temp_c=config.outside_temp_c,
+            outside_humidity_rh=config.outside_humidity_rh,
+            lighting_power_kw=config.lighting_w_per_m2 * config.floor_area_m2 / 1000.0,
+            ups_loss_fraction=config.ups_loss_fraction,
+            pdu_loss_fraction=config.pdu_loss_fraction,
+            sim_time_s=0.0,
+        )
+        # Run a few settling steps so initial temps are physically consistent
+        sim = ThermalSimulation.__new__(ThermalSimulation)
+        sim._state = state
+        sim._config = make_default_datacenter_config()
+        sim._dt = 1.0
+        for _ in range(300):
+            sim._integrate_step(1.0)
+        return state
+    @staticmethod
+    def _build_racks(zone_config: ZoneConfig, initial_temp_c: float) -> list[RackState]:
+        racks: list[RackState] = []
+        for rc in zone_config.racks:
+            airflow_cfm = rc.airflow_cfm_per_kw * rc.it_load_kw
+            airflow_m3s = cfm_to_m3s(airflow_cfm)
+            thermal_mass = rc.num_servers_2u * rc.server_thermal_mass_jk
+            rack = RackState(
+                rack_id=rc.rack_id,
+                row=rc.row,
+                position=rc.position,
+                it_load_kw=rc.it_load_kw,
+                inlet_temp_c=initial_temp_c,
+                outlet_temp_c=initial_temp_c + 15.0,  # Will be corrected by settling
+                airflow_m3s=airflow_m3s,
+                thermal_mass_jk=thermal_mass,
+            )
+            racks.append(rack)
+        return racks
+    @staticmethod
+    def _build_cracs(zone_config: ZoneConfig) -> list[CRACState]:
+        cracs: list[CRACState] = []
+        for cc in zone_config.crac_units:
+            crac = CRACState(
+                unit_id=cc.unit_id,
+                setpoint_c=cc.initial_setpoint_c,
+                supply_temp_c=cc.initial_setpoint_c,
+                fan_speed_pct=cc.initial_fan_speed_pct,
+                max_airflow_m3s=cfm_to_m3s(cc.max_airflow_cfm),
+                rated_capacity_kw=cc.rated_capacity_kw,
+                rated_return_temp_c=cc.rated_return_temp_c,
+                capacity_slope_per_c=cc.capacity_slope_per_c,
+                fan_rated_power_kw=cc.fan_rated_power_kw,
+                cop_rated=cc.cop_rated,
+                cop_degradation_per_c=cc.cop_degradation_per_c,
+                supply_temp_lag_s=cc.supply_temp_lag_s,
+            )
+            cracs.append(crac)
+        return cracs
+    # ------------------------------------------------------------------
+    # Simulation step
+    # ------------------------------------------------------------------
+    def step(self, dt: float | None = None) -> ThermalStepResult:
+        """Advance the simulation by dt seconds.
+        Returns a ThermalStepResult with updated state, alarms, and energy metrics.
+        """
+        if dt is None:
+            dt = self._dt
+        result = self._integrate_step(dt)
+        self._state.sim_time_s += dt
+        return result
+    def step_n(self, n: int, dt: float | None = None) -> ThermalStepResult:
+        """Advance simulation by n steps. Returns result of the last step."""
+        result = ThermalStepResult(state=self._state)
+        for _ in range(n):
+            result = self.step(dt)
+        return result
+    def _integrate_step(self, dt: float) -> ThermalStepResult:
+        """Core integration: one Forward Euler step across all zones.
+        Mutates zone, rack and CRAC state in place. Does not advance
+        ``sim_time_s``; step() does that after calling this method.
+        Physics model — **cold aisle energy balance** (not total-zone):
+        The cold aisle is a mixing volume. Heat flows into/out of it:
+          q_crac    = Σ_i m_dot_crac_i × c_p × (T_supply_i − T_cold)   [CRAC supply mixing]
+          q_recirc  = r × max(m_dot_rack, m_dot_crac) × c_p × (T_hot − T_cold)
+                                                                       [containment leakage]
+          q_natural = (1 − m_dot_crac / m_dot_rack) × m_dot_rack × c_p × (T_hot − T_cold)
+                      (only when m_dot_crac < m_dot_rack)              [uncaptured exhaust]
+          q_env     = (T_outside − T_cold) / R_envelope                [building heat gain]
+          q_int     = UPS losses + PDU losses + lighting               [internal gains]
+        IT heat does NOT appear directly — servers move cold air to the hot
+        aisle, raising T_hot.  IT heat affects the cold aisle only through
+        recirculation (hot air leaking back) and indirectly via CRAC return
+        temperature.
+        Hot aisle temperature (algebraic, not ODE):
+          T_hot = T_cold + Q_IT / (m_dot_rack × c_p)
+        CRAC return air temperature accounts for bypass airflow:
+          When CRAC airflow > rack airflow, excess cold air bypasses servers
+          and returns directly to the CRAC at T_cold, lowering the effective
+          return air temperature and thus CRAC cooling output.
+          T_return = (1 − bypass) × T_hot + bypass × T_cold
+        Args:
+            dt: Step size in seconds.
+        Returns:
+            ThermalStepResult holding the (mutated) state, the ASHRAE alarms
+            raised in this step, total CRAC cooling output and electrical
+            power, and the facility energy consumed over ``dt``.
+        """
+        state = self._state
+        alarms: list[ThermalAlarm] = []
+        total_cooling_output_kw = 0.0
+        total_cooling_power_kw = 0.0
+        total_power_kw = 0.0  # Accumulates IT load only (see end of zone loop)
+        for zone in state.zones:
+            # 1. Update CRAC supply temperatures (first-order lag toward setpoint)
+            for crac in zone.crac_units:
+                crac.update_supply_temp(dt)
+            # 2. Airflow quantities
+            q_it_w = zone.total_it_load_kw * 1000.0
+            m_dot_rack = zone.total_rack_airflow_m3s * AIR_DENSITY_KG_M3   # kg/s
+            m_dot_crac = zone.total_crac_airflow_m3s * AIR_DENSITY_KG_M3   # kg/s
+            # Server temperature rise [°C]
+            if m_dot_rack > 0:
+                dt_server = q_it_w / (m_dot_rack * AIR_SPECIFIC_HEAT_J_KGK)
+            else:
+                dt_server = 50.0  # No airflow → extreme rise
+            # Hot-aisle temperature based on the cold-aisle value from BEFORE
+            # this step's integration; used for the return-air calculation.
+            t_hot = zone.cold_aisle_temp_c + dt_server
+            # 3. Bypass fraction: excess CRAC airflow that bypasses servers
+            if m_dot_crac > 0 and m_dot_rack > 0:
+                bypass_frac = max(0.0, 1.0 - m_dot_rack / m_dot_crac)
+            else:
+                bypass_frac = 0.0
+            # CRAC return air temp (mixed hot exhaust + bypassed cold air)
+            t_return = (1.0 - bypass_frac) * t_hot + bypass_frac * zone.cold_aisle_temp_c
+            # 4. CRAC cooling output (based on bypass-corrected return temp)
+            # NOTE(review): q_cooling_extracted_w and zone_cooling_power_kw are
+            # accumulated below but never read in this method. The reported
+            # cooling output is diagnostic; the cold-aisle balance uses the
+            # supply-mixing term from step 5 instead.
+            q_cooling_extracted_w = 0.0
+            zone_cooling_power_kw = 0.0
+            for crac in zone.crac_units:
+                q_crac_kw = crac.compute_cooling_output_kw(t_return)
+                q_cooling_extracted_w += q_crac_kw * 1000.0
+                total_cooling_output_kw += q_crac_kw
+                p_crac_kw = crac.compute_power_consumption_kw(q_crac_kw, state.outside_temp_c)
+                zone_cooling_power_kw += p_crac_kw
+                total_cooling_power_kw += p_crac_kw
+            # 5. Cold aisle energy balance [all in Watts]
+            # CRAC supply mixing: each CRAC injects air into the cold aisle.
+            # Running CRACs inject air at their supply temp (near setpoint).
+            # Compressor-faulted CRACs with fans running inject air at the
+            # return air temp (air passes through the inactive coil unconditioned).
+            # NOTE(review): current_airflow_m3s is 0 for any status other than
+            # RUNNING, and inject_crac_fault() sets status to FAULT, so the
+            # fault branch below looks reachable only when a fault_type is set
+            # on a unit that is still RUNNING — confirm this is intended.
+            q_crac_mixing_w = 0.0
+            for crac in zone.crac_units:
+                crac_flow = crac.current_airflow_m3s * AIR_DENSITY_KG_M3
+                if crac_flow <= 0:
+                    continue
+                if crac.fault_type in (CRACFaultType.COMPRESSOR, CRACFaultType.REFRIGERANT_LEAK):
+                    effective_supply = t_return  # No cooling — just recirculating
+                else:
+                    effective_supply = crac.supply_temp_c
+                q_crac_mixing_w += crac_flow * AIR_SPECIFIC_HEAT_J_KGK * (
+                    effective_supply - zone.cold_aisle_temp_c
+                )
+            # Hot air entering cold aisle from two mechanisms:
+            #
+            # (a) Containment recirculation: fraction r of air leaks through
+            #     containment gaps regardless of CRAC flow balance.
+            #     Uses max(m_dot_rack, m_dot_crac) — recirculation is driven
+            #     by pressure differentials from whichever airflow is dominant.
+            #     When CRACs are off, server fans still drive leakage.
+            r = zone.recirculation_factor
+            m_dot_dominant = max(m_dot_rack, m_dot_crac)
+            q_recirc_w = r * m_dot_dominant * AIR_SPECIFIC_HEAT_J_KGK * dt_server
+            # (b) Natural return: when CRAC airflow < rack airflow, servers
+            #     exhaust more hot air than CRACs can capture. The uncaptured
+            #     fraction returns to the cold aisle via natural convection.
+            #     When CRACs are completely off, ALL server exhaust returns
+            #     (= Q_IT returns to cold aisle as heat).
+            if m_dot_rack > 0 and m_dot_crac < m_dot_rack:
+                natural_return_frac = 1.0 - m_dot_crac / m_dot_rack
+                q_natural_return_w = (
+                    natural_return_frac * m_dot_rack * AIR_SPECIFIC_HEAT_J_KGK * dt_server
+                )
+            else:
+                q_natural_return_w = 0.0
+            # Envelope heat gain (skipped for a non-positive resistance to
+            # avoid dividing by zero)
+            if zone.envelope_r_kw > 0:
+                q_envelope_w = (state.outside_temp_c - zone.cold_aisle_temp_c) / zone.envelope_r_kw
+            else:
+                q_envelope_w = 0.0
+            # Internal gains (UPS/PDU losses + lighting)
+            q_ups_w = zone.total_it_load_kw * state.ups_loss_fraction * 1000.0
+            q_pdu_w = zone.total_it_load_kw * state.pdu_loss_fraction * 1000.0
+            # Lighting is a facility-wide figure, split evenly across zones
+            num_zones = len(state.zones) if state.zones else 1
+            q_lighting_w = state.lighting_power_kw * 1000.0 / num_zones
+            q_internal_w = q_ups_w + q_pdu_w + q_lighting_w
+            # 6. Net heat into cold aisle [W]
+            q_net_w = (
+                q_crac_mixing_w + q_recirc_w + q_natural_return_w
+                + q_envelope_w + q_internal_w
+            )
+            # 7. Forward Euler integration
+            c_total = zone.compute_thermal_capacitance_jk()
+            if c_total > 0:
+                dT = q_net_w * dt / c_total
+                zone.cold_aisle_temp_c += dT
+            # 8. Update hot aisle (algebraic: T_hot = T_cold + server ΔT),
+            #    now using the freshly integrated cold-aisle temperature
+            if m_dot_rack > 0:
+                zone.hot_aisle_temp_c = (
+                    zone.cold_aisle_temp_c
+                    + q_it_w / (m_dot_rack * AIR_SPECIFIC_HEAT_J_KGK)
+                )
+            else:
+                zone.hot_aisle_temp_c = zone.cold_aisle_temp_c + 50.0
+            # 9. Update individual rack inlet/outlet temperatures
+            #    (every rack in a zone sees the same cold-aisle inlet temperature)
+            for rack in zone.racks:
+                rack.inlet_temp_c = zone.cold_aisle_temp_c
+                rack.outlet_temp_c = rack.compute_outlet_temp()
+            # 10. Check ASHRAE alarms (no alarms if the zone's class is unknown)
+            ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
+            if ashrae:
+                for rack in zone.racks:
+                    # At most one alarm per rack: critical takes precedence
+                    if rack.inlet_temp_c > ashrae.allowable_max_c:
+                        alarms.append(ThermalAlarm(
+                            rack_id=rack.rack_id,
+                            zone_id=zone.zone_id,
+                            inlet_temp_c=rack.inlet_temp_c,
+                            threshold_c=ashrae.allowable_max_c,
+                            severity="critical",
+                        ))
+                    elif rack.inlet_temp_c > ashrae.recommended_max_c:
+                        alarms.append(ThermalAlarm(
+                            rack_id=rack.rack_id,
+                            zone_id=zone.zone_id,
+                            inlet_temp_c=rack.inlet_temp_c,
+                            threshold_c=ashrae.recommended_max_c,
+                            severity="warning",
+                        ))
+            total_power_kw += zone.total_it_load_kw
+        # Energy consumed in this step [kWh]:
+        # IT load + CRAC power + UPS/PDU losses on the IT load + lighting
+        total_facility_kw = total_power_kw + total_cooling_power_kw + (
+            total_power_kw * (state.ups_loss_fraction + state.pdu_loss_fraction)
+            + state.lighting_power_kw
+        )
+        energy_kwh = total_facility_kw * dt / 3600.0
+        return ThermalStepResult(
+            state=state,
+            alarms=alarms,
+            total_cooling_output_kw=total_cooling_output_kw,
+            total_cooling_power_kw=total_cooling_power_kw,
+            energy_consumed_kwh=energy_kwh,
+        )
+    @staticmethod
+    def _compute_weighted_supply_temp(zone: ZoneState) -> float | None:
+        """Flow-weighted average of CRAC supply temperatures.
+        T_supply_weighted = Σ(T_supply_i × m_dot_i) / Σ(m_dot_i)
+        Returns None if no CRACs are producing airflow.
+        """
+        total_flow = 0.0
+        weighted_temp = 0.0
+        for crac in zone.crac_units:
+            flow = crac.current_airflow_m3s
+            if flow > 0:
+                weighted_temp += crac.supply_temp_c * flow
+                total_flow += flow
+        if total_flow <= 0:
+            return None
+        return weighted_temp / total_flow
+    # ------------------------------------------------------------------
+    # Mutation helpers (used by action parser in later phases)
+    # ------------------------------------------------------------------
+    def set_crac_setpoint(self, unit_id: str, setpoint_c: float) -> bool:
+        """Adjust a CRAC unit's supply air temperature setpoint. Returns success."""
+        crac = self._find_crac(unit_id)
+        if crac is None:
+            return False
+        crac.setpoint_c = setpoint_c
+        return True
+    def set_crac_fan_speed(self, unit_id: str, speed_pct: float) -> bool:
+        """Set CRAC fan speed (0-100%). Returns success."""
+        crac = self._find_crac(unit_id)
+        if crac is None:
+            return False
+        crac.fan_speed_pct = max(0.0, min(100.0, speed_pct))
+        return True
+    def set_crac_status(self, unit_id: str, status: CRACStatus) -> bool:
+        """Change CRAC operating status. Returns success."""
+        crac = self._find_crac(unit_id)
+        if crac is None:
+            return False
+        crac.status = status
+        return True
+    def inject_crac_fault(
+        self, unit_id: str, fault_type: CRACFaultType
+    ) -> bool:
+        """Inject a fault into a CRAC unit. Returns success."""
+        crac = self._find_crac(unit_id)
+        if crac is None:
+            return False
+        crac.status = CRACStatus.FAULT
+        crac.fault_type = fault_type
+        return True
+    def clear_crac_fault(self, unit_id: str) -> bool:
+        """Clear a CRAC fault and return to running. Returns success."""
+        crac = self._find_crac(unit_id)
+        if crac is None:
+            return False
+        crac.status = CRACStatus.RUNNING
+        crac.fault_type = CRACFaultType.NONE
+        return True
+    def set_rack_load(self, rack_id: str, load_kw: float) -> bool:
+        """Change a rack's IT load. Returns success."""
+        rack = self._find_rack(rack_id)
+        if rack is None:
+            return False
+        rack.it_load_kw = max(0.0, load_kw)
+        # Update airflow proportionally (servers spin fans with load)
+        from ..config import RackConfig
+        default_cfm_per_kw = RackConfig().airflow_cfm_per_kw
+        rack.airflow_m3s = cfm_to_m3s(default_cfm_per_kw * rack.it_load_kw)
+        return True
+    def set_outside_temp(self, temp_c: float) -> None:
+        """Set the outside temperature [°C] stored on the datacenter state.
+        The value is stored as given; it is read by later simulation steps.
+        """
+        self._state.outside_temp_c = temp_c
+    def _find_crac(self, unit_id: str) -> CRACState | None:
+        for zone in self._state.zones:
+            for crac in zone.crac_units:
+                if crac.unit_id == unit_id:
+                    return crac
+        return None
+    def _find_rack(self, rack_id: str) -> RackState | None:
+        for zone in self._state.zones:
+            for rack in zone.racks:
+                if rack.rack_id == rack_id:
+                    return rack
+        return None
+    def find_zone_for_crac(self, unit_id: str) -> ZoneState | None:
+        """Find the zone containing a given CRAC unit."""
+        for zone in self._state.zones:
+            for crac in zone.crac_units:
+                if crac.unit_id == unit_id:
+                    return zone
+        return None
+    def find_zone_for_rack(self, rack_id: str) -> ZoneState | None:
+        """Find the zone containing a given rack."""
+        for zone in self._state.zones:
+            for rack in zone.racks:
+                if rack.rack_id == rack_id:
+                    return zone
+        return None

simulation/types.py ADDED Viewed

	@@ -0,0 +1,598 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Runtime state dataclasses for the datacenter simulation.
+These are plain dataclasses (not Pydantic) for performance on the simulation
+hot path. Pydantic models are only used at the API boundary (models.py).
+All values in SI units:
+  - Temperature: °C
+  - Power/Heat: kW (for readability; converted to W in physics calculations)
+  - Airflow: m³/s
+  - Thermal capacitance: J/K
+  - Thermal resistance: K/W
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from enum import Enum
+class CRACStatus(Enum):
+    """Operating status of a CRAC unit.
+    CRACState treats every status other than RUNNING as fully off: no
+    airflow, no cooling output and no power draw.
+    """
+    RUNNING = "running"
+    STANDBY = "standby"
+    FAULT = "fault"
+    MAINTENANCE = "maintenance"
+class CRACFaultType(Enum):
+    """Fault category attached to a CRAC unit (NONE when healthy)."""
+    NONE = "none"
+    COMPRESSOR = "compressor"              # CRACState: zero cooling output and compressor power
+    FAN = "fan"                            # CRACState: zero airflow
+    REFRIGERANT_LEAK = "refrigerant_leak"  # CRACState: zero cooling output and compressor power
+    SENSOR = "sensor"                      # Not special-cased by CRACState
+    ELECTRICAL = "electrical"              # Not special-cased by CRACState
+# ---------------------------------------------------------------------------
+# Power subsystem enums
+# ---------------------------------------------------------------------------
+class UPSMode(Enum):
+    """UPS operating mode.
+    Each member's value is its lowercase name.
+    """
+    DOUBLE_CONVERSION = "double_conversion"  # Normal: AC→DC→AC, full protection
+    LINE_INTERACTIVE = "line_interactive"     # Reduced losses, slower transfer
+    ECO = "eco"                              # Bypass with monitoring, minimal losses
+    BYPASS = "bypass"                        # Manual bypass, no protection
+    ON_BATTERY = "on_battery"               # Utility lost, discharging battery
+    FAULT = "fault"                          # UPS fault, load on raw utility or dead
+class GeneratorState(Enum):
+    """Diesel generator state machine states.
+    Members are listed in start-up to shut-down order. GensetState counts
+    only READY and LOADED as available to carry load.
+    """
+    OFF = "off"                  # Not running
+    START_DELAY = "start_delay"  # Programmed delay before cranking
+    CRANKING = "cranking"        # Engine cranking
+    WARMING = "warming"          # Warm-up period before load acceptance
+    READY = "ready"              # Running, ready to accept load
+    LOADED = "loaded"            # Running under load
+    COOLDOWN = "cooldown"        # Unloaded cool-down before shutdown
+class ATSPosition(Enum):
+    """ATS switch position (which source currently feeds the load)."""
+    UTILITY = "utility"            # Load connected to utility power
+    GENERATOR = "generator"        # Load connected to the generator
+    TRANSFERRING = "transferring"  # Mid-transfer (load momentarily interrupted)
+@dataclass
+class RackState:
+    """Runtime state of a single server rack."""
+    rack_id: str
+    row: str
+    position: int
+    # Electrical / thermal load
+    it_load_kw: float            # Current IT power draw (≈ heat dissipation)
+    # Temperatures
+    inlet_temp_c: float          # Cold aisle side (server intake)
+    outlet_temp_c: float         # Hot aisle side (server exhaust)
+    # Airflow
+    airflow_m3s: float           # Total server fan airflow through this rack
+    # Thermal inertia
+    thermal_mass_jk: float       # Equipment thermal capacitance [J/K]
+    def compute_outlet_temp(self) -> float:
+        """Compute outlet temp from energy balance: Q = m_dot * c_p * dT.
+        Returns outlet temperature in °C.
+        """
+        from ..config import AIR_DENSITY_KG_M3, AIR_SPECIFIC_HEAT_J_KGK
+        m_dot = self.airflow_m3s * AIR_DENSITY_KG_M3  # kg/s
+        if m_dot <= 0:
+            # No airflow — temperature rises unboundedly in theory;
+            # clamp to a high value to signal danger
+            return self.inlet_temp_c + 50.0
+        q_w = self.it_load_kw * 1000.0  # Convert kW → W
+        delta_t = q_w / (m_dot * AIR_SPECIFIC_HEAT_J_KGK)
+        return self.inlet_temp_c + delta_t
+@dataclass
+class CRACState:
+    """Runtime state of a CRAC/CRAH cooling unit."""
+    unit_id: str
+    # Operating status
+    status: CRACStatus = CRACStatus.RUNNING
+    fault_type: CRACFaultType = CRACFaultType.NONE
+    # Setpoints and actuals
+    setpoint_c: float = 18.0          # Desired supply air temperature
+    supply_temp_c: float = 18.0       # Actual supply air temperature (lags setpoint)
+    fan_speed_pct: float = 100.0      # 0-100
+    # Rated specifications (from config, immutable during episode)
+    max_airflow_m3s: float = 0.0      # At 100% fan speed
+    rated_capacity_kw: float = 70.0   # At rated return temp
+    rated_return_temp_c: float = 24.0  # Return temp for rated capacity
+    capacity_slope_per_c: float = 0.03 # Fractional capacity change per °C
+    fan_rated_power_kw: float = 5.0
+    cop_rated: float = 3.5
+    cop_degradation_per_c: float = 0.04
+    supply_temp_lag_s: float = 30.0    # First-order lag time constant
+    @property
+    def current_airflow_m3s(self) -> float:
+        """Actual airflow based on fan speed and status."""
+        if self.status != CRACStatus.RUNNING:
+            return 0.0
+        if self.fault_type == CRACFaultType.FAN:
+            return 0.0
+        return self.max_airflow_m3s * (self.fan_speed_pct / 100.0)
+    def compute_cooling_output_kw(self, return_air_temp_c: float) -> float:
+        """Compute actual cooling output [kW].
+        Cooling capacity depends on return air temperature:
+            Q_actual = Q_rated × [1 + α × (T_return - T_rated)]
+        But is also limited by airflow × deltaT:
+            Q_airflow = m_dot × c_p × (T_return - T_supply)
+        The actual output is the minimum of both limits.
+        """
+        from ..config import AIR_DENSITY_KG_M3, AIR_SPECIFIC_HEAT_J_KGK
+        if self.status != CRACStatus.RUNNING:
+            return 0.0
+        if self.fault_type in (CRACFaultType.COMPRESSOR, CRACFaultType.REFRIGERANT_LEAK):
+            return 0.0
+        # Capacity limit (refrigeration cycle capacity)
+        delta_return = return_air_temp_c - self.rated_return_temp_c
+        q_capacity = self.rated_capacity_kw * (1.0 + self.capacity_slope_per_c * delta_return)
+        q_capacity = max(q_capacity, 0.0)
+        # Airflow limit
+        m_dot = self.current_airflow_m3s * AIR_DENSITY_KG_M3  # kg/s
+        if m_dot <= 0:
+            return 0.0
+        delta_t = return_air_temp_c - self.supply_temp_c
+        if delta_t <= 0:
+            return 0.0
+        q_airflow = m_dot * AIR_SPECIFIC_HEAT_J_KGK * delta_t / 1000.0  # W → kW
+        return min(q_capacity, q_airflow)
+    def compute_power_consumption_kw(
+        self, cooling_output_kw: float, outside_temp_c: float
+    ) -> float:
+        """Compute CRAC electrical power consumption [kW].
+        Fan power: cubic relationship with speed (affinity laws).
+            P_fan = P_rated × (speed/100)³
+        Compressor power: Q_cooling / COP
+            COP degrades at higher outside temperatures.
+        """
+        if self.status != CRACStatus.RUNNING:
+            return 0.0
+        # Fan power (affinity law: power ∝ speed³)
+        speed_frac = self.fan_speed_pct / 100.0
+        p_fan = self.fan_rated_power_kw * (speed_frac ** 3)
+        # Compressor power
+        cop = self.cop_rated
+        if outside_temp_c > 35.0:
+            # COP degrades linearly above 35°C
+            cop *= max(0.3, 1.0 - self.cop_degradation_per_c * (outside_temp_c - 35.0))
+        if self.fault_type in (CRACFaultType.COMPRESSOR, CRACFaultType.REFRIGERANT_LEAK):
+            p_compressor = 0.0
+        elif cop > 0 and cooling_output_kw > 0:
+            p_compressor = cooling_output_kw / cop
+        else:
+            p_compressor = 0.0
+        return p_fan + p_compressor
+    def update_supply_temp(self, dt_s: float) -> None:
+        """First-order lag: supply temp approaches setpoint with time constant.
+        T_supply(t+dt) = T_supply(t) + (T_setpoint - T_supply(t)) × (1 - e^(-dt/τ))
+        For small dt/τ this approximates: T += (T_set - T) × dt/τ
+        """
+        import math
+        if self.status != CRACStatus.RUNNING:
+            return
+        if self.fault_type == CRACFaultType.COMPRESSOR:
+            # Compressor fault: supply temp drifts toward return air (no cooling)
+            return
+        if self.supply_temp_lag_s <= 0:
+            self.supply_temp_c = self.setpoint_c
+            return
+        alpha = 1.0 - math.exp(-dt_s / self.supply_temp_lag_s)
+        self.supply_temp_c += (self.setpoint_c - self.supply_temp_c) * alpha
+@dataclass
+class ZoneState:
+    """Runtime state of a thermal zone (a section of the datacenter)."""
+    zone_id: str
+    # Temperatures
+    cold_aisle_temp_c: float = 20.0
+    hot_aisle_temp_c: float = 35.0
+    # Humidity (tracked, not yet fully modeled psychrometrically)
+    humidity_rh: float = 0.45          # Fraction 0-1
+    # Containment / recirculation
+    recirculation_factor: float = 0.08  # 0 = perfect containment
+    # Equipment
+    racks: list[RackState] = field(default_factory=list)
+    crac_units: list[CRACState] = field(default_factory=list)
+    # Zone thermal properties
+    air_volume_m3: float = 500.0
+    envelope_r_kw: float = 0.02       # Thermal resistance to outside [K/W]
+    # ASHRAE class for this zone
+    ashrae_class: str = "A2"
+    @property
+    def total_it_load_kw(self) -> float:
+        return sum(r.it_load_kw for r in self.racks)
+    @property
+    def total_rack_airflow_m3s(self) -> float:
+        return sum(r.airflow_m3s for r in self.racks)
+    @property
+    def total_crac_airflow_m3s(self) -> float:
+        return sum(c.current_airflow_m3s for c in self.crac_units)
+    @property
+    def max_inlet_temp_c(self) -> float:
+        if not self.racks:
+            return self.cold_aisle_temp_c
+        return max(r.inlet_temp_c for r in self.racks)
+    def compute_thermal_capacitance_jk(self) -> float:
+        """Total thermal capacitance of this zone [J/K].
+        C_total = C_air + C_equipment
+        C_air = ρ × V × c_p  (~1-2 kJ/K for typical zone)
+        C_equipment = Σ rack thermal masses  (dominant term, ~100+ kJ/K)
+        """
+        from ..config import AIR_DENSITY_KG_M3, AIR_SPECIFIC_HEAT_J_KGK
+        c_air = AIR_DENSITY_KG_M3 * self.air_volume_m3 * AIR_SPECIFIC_HEAT_J_KGK
+        c_equipment = sum(r.thermal_mass_jk for r in self.racks)
+        return c_air + c_equipment
+# ---------------------------------------------------------------------------
+# Power subsystem state
+# ---------------------------------------------------------------------------
+@dataclass
+class UPSState:
+    """Runtime state of a UPS unit."""
+    unit_id: str
+    # Operating mode
+    mode: UPSMode = UPSMode.DOUBLE_CONVERSION
+    # Load
+    input_power_kw: float = 0.0       # Power entering UPS from utility/generator
+    output_power_kw: float = 0.0      # Power delivered to IT load
+    load_fraction: float = 0.0        # output / rated_capacity (0-1)
+    # Efficiency and losses
+    efficiency: float = 0.97          # Current operating efficiency
+    heat_output_kw: float = 0.0       # Waste heat = input - output
+    # Battery
+    battery_soc: float = 1.0          # State of charge (0-1)
+    battery_power_kw: float = 0.0     # Positive = discharging, negative = charging
+    battery_time_remaining_s: float = 0.0  # Estimated time at current draw
+    # Rated specs (from config, immutable during episode)
+    rated_capacity_kw: float = 500.0
+    loss_c0: float = 0.013
+    loss_c1: float = 0.006
+    loss_c2: float = 0.011
+    battery_capacity_kwh: float = 8.3
+    battery_discharge_efficiency: float = 0.90
+    battery_aging_factor: float = 0.85
+    recharge_rate_kw: float = 5.0
+    def compute_efficiency(self) -> float:
+        """UPS efficiency using quadratic loss model.
+        η(x) = x / (x + c_0 + c_1·x + c_2·x²)
+        where x = load_fraction.
+        At very low loads (x < 0.01), efficiency is undefined / near-zero.
+        """
+        x = self.load_fraction
+        if x < 0.01:
+            return 0.0
+        denominator = x + self.loss_c0 + self.loss_c1 * x + self.loss_c2 * x * x
+        if denominator <= 0:
+            return 0.0
+        return x / denominator
+    def compute_losses_kw(self) -> float:
+        """Power losses at current load.
+        P_loss = P_output × (1/η - 1)
+        """
+        eta = self.compute_efficiency()
+        if eta <= 0:
+            # No-load losses: just transformer/control board idle draw
+            return self.rated_capacity_kw * self.loss_c0
+        return self.output_power_kw * (1.0 / eta - 1.0)
+    def compute_battery_time_remaining_s(self) -> float:
+        """Estimate remaining battery runtime at current discharge rate.
+        t = (SOC × E_battery × η_discharge × aging) / P_discharge
+        """
+        if self.battery_power_kw <= 0:
+            return float("inf")
+        usable_kwh = (
+            self.battery_soc
+            * self.battery_capacity_kwh
+            * self.battery_discharge_efficiency
+            * self.battery_aging_factor
+        )
+        return usable_kwh / self.battery_power_kw * 3600.0  # hours → seconds
+@dataclass
+class PDUState:
+    """Live operating snapshot of a single three-phase PDU."""
+    pdu_id: str
+    # Current on each phase [A]
+    phase_currents_a: list[float] = field(default_factory=lambda: [0.0] * 3)
+    # Power flow [kW]
+    input_power_kw: float = 0.0
+    output_power_kw: float = 0.0
+    heat_output_kw: float = 0.0       # Transformer losses
+    # Loading
+    load_fraction: float = 0.0        # Relative to derated capacity
+    phase_imbalance_pct: float = 0.0  # Largest deviation from the mean phase current
+    # Alarm flags
+    breaker_tripped: bool = False
+    overload: bool = False
+    # Ratings (taken from config)
+    voltage_ll_v: float = 208.0
+    max_current_per_phase_a: float = 24.0
+    num_phases: int = 3
+    breaker_rating_a: float = 20.0
+    efficiency: float = 0.98
+    continuous_derating: float = 0.80
+    @property
+    def nameplate_capacity_kw(self) -> float:
+        """Nameplate capacity in kW: √3 × V_LL × I_phase / 1000.
+        Note: num_phases does not enter this formula.
+        """
+        import math
+        root_three = math.sqrt(3)
+        return root_three * self.voltage_ll_v * self.max_current_per_phase_a / 1000.0
+    @property
+    def derated_capacity_kw(self) -> float:
+        """Nameplate capacity scaled by continuous_derating (NEC 80% rule by default)."""
+        nameplate_kw = self.nameplate_capacity_kw
+        return nameplate_kw * self.continuous_derating
+    def compute_phase_imbalance(self) -> float:
+        """Return the worst phase deviation from the mean current, in percent.
+        imbalance = max(|I_phase - I_avg|) / I_avg × 100
+        Yields 0 when there are no phase readings or the mean is not positive.
+        """
+        currents = self.phase_currents_a
+        if not currents:
+            return 0.0
+        mean_a = sum(currents) / len(currents)
+        if mean_a <= 0:
+            return 0.0
+        worst_a = max(abs(amps - mean_a) for amps in currents)
+        return worst_a / mean_a * 100.0
+    def compute_heat_output_kw(self) -> float:
+        """Return the PDU transformer losses: input power × (1 - efficiency)."""
+        loss_share = 1.0 - self.efficiency
+        return self.input_power_kw * loss_share
+@dataclass
+class GensetState:
+    """Live operating snapshot of a diesel standby generator."""
+    gen_id: str
+    # State machine
+    state: GeneratorState = GeneratorState.OFF
+    state_elapsed_s: float = 0.0      # Seconds spent in the current state
+    # Electrical output
+    output_power_kw: float = 0.0
+    load_fraction: float = 0.0        # output / rated_capacity
+    # Fuel
+    fuel_level_liters: float = 2000.0
+    fuel_consumption_lph: float = 0.0  # Present burn rate
+    # Ratings and timings (taken from config)
+    rated_capacity_kw: float = 750.0
+    start_delay_s: float = 4.0
+    crank_time_s: float = 5.0
+    warmup_time_s: float = 8.0
+    cooldown_time_s: float = 300.0
+    fuel_tank_liters: float = 2000.0
+    consumption_lph_full: float = 180.0
+    @property
+    def is_available(self) -> bool:
+        """True while the generator is in the READY or LOADED state."""
+        load_capable = (GeneratorState.READY, GeneratorState.LOADED)
+        return self.state in load_capable
+    @property
+    def fuel_remaining_hours(self) -> float:
+        """Hours of fuel left at the present burn rate (infinite when nothing is burned)."""
+        burn_lph = self.fuel_consumption_lph
+        if burn_lph <= 0:
+            return float("inf")
+        return self.fuel_level_liters / burn_lph
+    def compute_fuel_consumption_lph(self) -> float:
+        """Return the fuel burn rate for the current state and load, in L/h.
+        Zero while OFF, in START_DELAY or CRANKING; otherwise
+        consumption_lph_full × (0.1 + 0.9 × load_fraction), i.e. a 10% idle
+        floor plus a share proportional to load.
+        """
+        not_burning = (
+            GeneratorState.OFF,
+            GeneratorState.CRANKING,
+            GeneratorState.START_DELAY,
+        )
+        if self.state in not_burning:
+            return 0.0
+        load_share = 0.1 + 0.9 * self.load_fraction
+        return self.consumption_lph_full * load_share
+@dataclass
+class ATSState:
+    """Live operating snapshot of an Automatic Transfer Switch."""
+    ats_id: str
+    position: ATSPosition = ATSPosition.UTILITY
+    transfer_elapsed_ms: float = 0.0   # How far the current transfer has progressed
+    # Ratings (taken from config)
+    transfer_time_ms: float = 100.0
+    retransfer_delay_s: float = 300.0
+    # Retransfer delay timer (counts up once utility is back)
+    retransfer_timer_s: float = 0.0
+    @property
+    def load_powered(self) -> bool:
+        """True unless the switch is in the TRANSFERRING position (the transfer gap)."""
+        in_transfer_gap = self.position == ATSPosition.TRANSFERRING
+        return not in_transfer_gap
+@dataclass
+class PowerState:
+    """Combined state of the power subsystem: UPS units, PDUs, generator, ATS, utility."""
+    ups_units: list[UPSState] = field(default_factory=list)
+    pdus: list[PDUState] = field(default_factory=list)
+    generator: GensetState = field(default_factory=lambda: GensetState(gen_id="GEN-1"))
+    ats: ATSState = field(default_factory=lambda: ATSState(ats_id="ATS-1"))
+    # Utility feed
+    utility_available: bool = True
+    utility_voltage_v: float = 480.0
+    @property
+    def total_ups_loss_kw(self) -> float:
+        """Sum of heat_output_kw over all UPS units."""
+        return sum(unit.heat_output_kw for unit in self.ups_units)
+    @property
+    def total_pdu_loss_kw(self) -> float:
+        """Sum of heat_output_kw over all PDUs."""
+        return sum(unit.heat_output_kw for unit in self.pdus)
+    @property
+    def total_power_overhead_kw(self) -> float:
+        """Electrical overhead of power distribution: UPS losses plus PDU losses."""
+        ups_kw = self.total_ups_loss_kw
+        return ups_kw + self.total_pdu_loss_kw
+    @property
+    def on_generator(self) -> bool:
+        """True when the ATS sits in the GENERATOR position."""
+        return self.ats.position == ATSPosition.GENERATOR
+    @property
+    def power_available(self) -> bool:
+        """Whether the load side currently has power from any source.
+        False during the ATS transfer gap; otherwise follows the availability
+        of whichever source (utility or generator) the ATS is switched to.
+        """
+        switch = self.ats
+        if not switch.load_powered:
+            return False
+        if switch.position == ATSPosition.UTILITY:
+            return self.utility_available
+        if switch.position == ATSPosition.GENERATOR:
+            return self.generator.is_available
+        # Any other position supplies no power
+        return False
+@dataclass
+class DatacenterState:
+    """Root simulation state that ties all subsystems together."""
+    zones: list[ZoneState] = field(default_factory=list)
+    # Power subsystem (None = fall back to the stub loss fractions below)
+    power: PowerState | None = None
+    # Ambient conditions
+    outside_temp_c: float = 35.0
+    outside_humidity_rh: float = 0.40
+    # Facility overhead
+    lighting_power_kw: float = 5.0       # Total lighting load
+    # Stub distribution losses (fractions of IT load),
+    # consulted only while the power subsystem is not initialized
+    ups_loss_fraction: float = 0.05
+    pdu_loss_fraction: float = 0.02
+    # Simulation clock
+    sim_time_s: float = 0.0
+    @property
+    def total_it_load_kw(self) -> float:
+        """Sum of total_it_load_kw over all zones."""
+        return sum(zone.total_it_load_kw for zone in self.zones)
+    @property
+    def total_cooling_power_kw(self) -> float:
+        """Electrical power drawn by every CRAC unit in every zone.
+        Each unit's cooling output is computed from its zone's hot-aisle
+        temperature and then turned into power draw using the outside temperature.
+        """
+        power_kw = 0.0
+        for zone in self.zones:
+            for unit in zone.crac_units:
+                cooling_kw = unit.compute_cooling_output_kw(zone.hot_aisle_temp_c)
+                power_kw += unit.compute_power_consumption_kw(cooling_kw, self.outside_temp_c)
+        return power_kw
+    @property
+    def pue(self) -> float:
+        """Dynamic PUE = total facility power / IT power.
+        Distribution losses come from the power subsystem when one is attached,
+        otherwise from the stub loss fractions. Returns 1.0 when IT load is not positive.
+        """
+        it_kw = self.total_it_load_kw
+        if it_kw <= 0:
+            return 1.0
+        cooling_kw = self.total_cooling_power_kw
+        if self.power is None:
+            distribution_kw = it_kw * (self.ups_loss_fraction + self.pdu_loss_fraction)
+        else:
+            distribution_kw = self.power.total_power_overhead_kw
+        facility_kw = it_kw + cooling_kw + distribution_kw + self.lighting_power_kw
+        return facility_kw / it_kw

tests/__init__.py ADDED Viewed

File without changes

tests/test_environment.py ADDED Viewed

	@@ -0,0 +1,439 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tests for the DC-Ops environment, action parser, and dashboard renderer.
+Validates:
+  - OpenEnv interface contract (reset/step/state)
+  - Action parsing (valid and invalid commands)
+  - Dashboard rendering output format
+  - Episode termination conditions
+  - Fault injection
+  - Reward computation
+"""
+from __future__ import annotations
+import pytest
+from dc_ops_env.models import DcOpsAction, DcOpsObservation
+from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
+# ===========================================================================
+# OpenEnv Interface Contract
+# ===========================================================================
+class TestOpenEnvContract:
+    """Verify the environment satisfies the OpenEnv Environment ABC."""
+    def test_reset_returns_observation(self) -> None:
+        """reset() returns a DcOpsObservation that is not done and has zero reward."""
+        env = DcOpsEnvironment()
+        obs = env.reset()
+        assert isinstance(obs, DcOpsObservation)
+        assert obs.done is False
+        assert obs.reward == 0.0
+    def test_reset_has_dashboard(self) -> None:
+        """The initial observation carries a non-trivial dashboard with its title."""
+        env = DcOpsEnvironment()
+        obs = env.reset()
+        assert len(obs.dashboard) > 100
+        assert "DC-OPS MONITORING DASHBOARD" in obs.dashboard
+    def test_reset_has_available_actions(self) -> None:
+        """The initial observation lists more than five available actions."""
+        env = DcOpsEnvironment()
+        obs = env.reset()
+        assert len(obs.available_actions) > 5
+    def test_step_returns_observation(self) -> None:
+        """step() returns a DcOpsObservation and a single check_status does not end the episode."""
+        env = DcOpsEnvironment()
+        env.reset()
+        obs = env.step(DcOpsAction(command="check_status"))
+        assert isinstance(obs, DcOpsObservation)
+        assert obs.done is False
+    def test_step_advances_step_count(self) -> None:
+        """state.step_count starts at 0 and grows by one per step."""
+        env = DcOpsEnvironment()
+        env.reset()
+        assert env.state.step_count == 0
+        env.step(DcOpsAction(command="wait"))
+        assert env.state.step_count == 1
+        env.step(DcOpsAction(command="wait"))
+        assert env.state.step_count == 2
+    def test_state_has_episode_id(self) -> None:
+        """After reset the state exposes a non-empty episode_id."""
+        env = DcOpsEnvironment()
+        env.reset()
+        assert env.state.episode_id is not None
+        assert len(env.state.episode_id) > 0
+    def test_reset_changes_episode_id(self) -> None:
+        """Two consecutive resets produce different episode ids."""
+        env = DcOpsEnvironment()
+        # Only the state is inspected here, so the returned observations are not kept
+        env.reset()
+        ep1 = env.state.episode_id
+        env.reset()
+        ep2 = env.state.episode_id
+        assert ep1 != ep2
+    def test_observation_metadata_populated(self) -> None:
+        """Initial metadata has IT load, PUE and zones; default IT load is about 160 kW."""
+        env = DcOpsEnvironment()
+        obs = env.reset()
+        assert "total_it_load_kw" in obs.metadata
+        assert "pue" in obs.metadata
+        assert "zones" in obs.metadata
+        assert obs.metadata["total_it_load_kw"] == pytest.approx(160.0, rel=0.01)
+    def test_observation_has_power_metadata(self) -> None:
+        """Initial metadata has a power section reporting utility as available."""
+        env = DcOpsEnvironment()
+        obs = env.reset()
+        assert "power" in obs.metadata
+        assert obs.metadata["power"]["utility_available"] is True
+# ===========================================================================
+# Action Parser Tests
+# ===========================================================================
+class TestActionParser:
+    """Exercise parsing and execution of operator commands."""
+    @staticmethod
+    def _run(command: str) -> DcOpsObservation:
+        """Issue *command* as the first step of a freshly reset environment."""
+        env = DcOpsEnvironment()
+        env.reset()
+        return env.step(DcOpsAction(command=command))
+    def test_diagnose_crac(self) -> None:
+        obs = self._run("diagnose CRAC-1")
+        assert "Diagnostic Report" in obs.action_result
+        assert "CRAC-1" in obs.action_result
+        assert obs.reward > -0.5  # Valid action should not be heavily penalized
+    def test_diagnose_ups(self) -> None:
+        result = self._run("diagnose UPS-1").action_result
+        assert "Diagnostic Report" in result
+        assert "UPS-1" in result
+    def test_diagnose_nonexistent(self) -> None:
+        assert "not found" in self._run("diagnose CRAC-99").action_result
+    def test_adjust_setpoint_valid(self) -> None:
+        result = self._run("adjust_setpoint CRAC-1 22").action_result
+        assert "adjusted" in result.lower()
+        assert "22.0" in result
+    def test_adjust_setpoint_out_of_range(self) -> None:
+        result = self._run("adjust_setpoint CRAC-1 50").action_result
+        # "out of" also matches the longer "out of safe range" wording
+        assert "out of" in result.lower()
+    def test_set_fan_speed(self) -> None:
+        result = self._run("set_fan_speed CRAC-2 80").action_result
+        assert "fan speed" in result.lower()
+        assert "80" in result
+    def test_set_rack_load(self) -> None:
+        assert "12.0" in self._run("set_rack_load A-01 12").action_result
+    def test_start_generator(self) -> None:
+        assert "generator" in self._run("start_generator").action_result.lower()
+    def test_wait_command(self) -> None:
+        assert "no action" in self._run("wait").action_result.lower()
+    def test_check_status(self) -> None:
+        assert "status" in self._run("check_status").action_result.lower()
+    def test_invalid_command(self) -> None:
+        assert "unknown" in self._run("fly_to_the_moon").action_result.lower()
+    def test_empty_command(self) -> None:
+        assert "empty" in self._run("").action_result.lower()
+    def test_case_insensitive(self) -> None:
+        assert "Diagnostic Report" in self._run("DIAGNOSE CRAC-1").action_result
+    def test_start_stop_crac(self) -> None:
+        # Both commands must hit the same environment, so the helper is not used
+        env = DcOpsEnvironment()
+        env.reset()
+        stopped = env.step(DcOpsAction(command="stop_crac CRAC-2"))
+        assert "standby" in stopped.action_result.lower()
+        restarted = env.step(DcOpsAction(command="start_crac CRAC-2"))
+        assert "started" in restarted.action_result.lower()
+# ===========================================================================
+# Dashboard Rendering Tests
+# ===========================================================================
+class TestDashboardRendering:
+    """Check the layout and content of the rendered dashboard text."""
+    @staticmethod
+    def _initial_dashboard(**reset_kwargs) -> str:
+        """Return the dashboard produced by resetting a new environment."""
+        return DcOpsEnvironment().reset(**reset_kwargs).dashboard
+    def test_dashboard_has_cooling_section(self) -> None:
+        dashboard = self._initial_dashboard()
+        assert "COOLING UNITS" in dashboard
+        assert "CRAC-1" in dashboard
+    def test_dashboard_has_zone_temps(self) -> None:
+        dashboard = self._initial_dashboard()
+        assert "ZONE TEMPERATURES" in dashboard
+        assert "zone_a" in dashboard
+    def test_dashboard_has_rack_temps(self) -> None:
+        assert "RACK TEMPERATURES" in self._initial_dashboard()
+    def test_dashboard_has_power(self) -> None:
+        dashboard = self._initial_dashboard()
+        assert "POWER" in dashboard
+        assert "PUE" in dashboard
+    def test_dashboard_has_environment(self) -> None:
+        dashboard = self._initial_dashboard()
+        assert "ENVIRONMENT" in dashboard
+        assert "35.0°C" in dashboard
+    def test_dashboard_shows_alert(self) -> None:
+        dashboard = self._initial_dashboard(alert="Test alert message")
+        assert "ALERT" in dashboard
+        assert "Test alert message" in dashboard
+    def test_dashboard_shows_step_count(self) -> None:
+        env = DcOpsEnvironment()
+        at_reset = env.reset()
+        assert "Step: 0/15" in at_reset.dashboard
+        after_wait = env.step(DcOpsAction(command="wait"))
+        assert "Step: 1/15" in after_wait.dashboard
+    def test_dashboard_shows_ups_status(self) -> None:
+        dashboard = self._initial_dashboard()
+        assert "UPS-1" in dashboard
+        assert "UPS-2" in dashboard
+# ===========================================================================
+# Episode Termination Tests
+# ===========================================================================
+class TestEpisodeTermination:
+    """Check the conditions under which an episode ends."""
+    def test_step_budget_exhaustion(self) -> None:
+        env = DcOpsEnvironment()
+        env.reset(step_budget=3)
+        # Only the last budgeted step (3/3) should finish the episode
+        for expected_done in (False, False, True):
+            obs = env.step(DcOpsAction(command="wait"))
+            assert obs.done is expected_done
+    def test_escalation_terminates(self) -> None:
+        env = DcOpsEnvironment()
+        env.reset()
+        escalated = env.step(DcOpsAction(command="escalate"))
+        assert escalated.done is True
+        assert escalated.reward < 0  # Penalty for escalating
+    def test_step_after_done_is_noop(self) -> None:
+        env = DcOpsEnvironment()
+        env.reset()
+        final = env.step(DcOpsAction(command="escalate"))
+        assert final.done is True
+        # A further step must stay done and report that the episode is over
+        extra = env.step(DcOpsAction(command="wait"))
+        assert extra.done is True
+        assert "already ended" in extra.action_result.lower()
+# ===========================================================================
+# Fault Injection Tests
+# ===========================================================================
+class TestFaultInjection:
+    """Check that faults passed to reset() show up in the first observation."""
+    def test_crac_fault_injection(self) -> None:
+        fault = {
+            "type": "crac_fault",
+            "unit_id": "CRAC-3",
+            "fault": "compressor",
+        }
+        dashboard = DcOpsEnvironment().reset(fault_injection=fault).dashboard
+        # The dashboard is expected to surface the fault
+        assert "COMPRESSOR" in dashboard or "FAULT" in dashboard
+    def test_utility_loss_injection(self) -> None:
+        fault = {"type": "utility_loss"}
+        dashboard = DcOpsEnvironment().reset(fault_injection=fault).dashboard
+        assert "DOWN" in dashboard or "BATTERY" in dashboard
+    def test_outside_temp_injection(self) -> None:
+        fault = {"type": "outside_temp", "temp_c": 45.0}
+        dashboard = DcOpsEnvironment().reset(fault_injection=fault).dashboard
+        assert "45.0°C" in dashboard
+    def test_alert_in_observation(self) -> None:
+        message = "HIGH TEMPERATURE in Zone B"
+        obs = DcOpsEnvironment().reset(alert=message, scenario_type="thermal")
+        assert obs.alert == "HIGH TEMPERATURE in Zone B"
+        assert obs.scenario_type == "thermal"
+# ===========================================================================
+# Reward Tests
+# ===========================================================================
+class TestReward:
+    """Check basic properties of the per-step and cumulative reward."""
+    def test_valid_action_positive_component(self) -> None:
+        """A recognised command should score higher than an unrecognised one."""
+        env = DcOpsEnvironment()
+        rewards = {}
+        # Each command is issued as the first step after a fresh reset
+        for command in ("check_status", "nonsense_command"):
+            env.reset()
+            rewards[command] = env.step(DcOpsAction(command=command)).reward
+        assert rewards["check_status"] > rewards["nonsense_command"]
+    def test_pue_affects_reward(self) -> None:
+        """The initial PUE in metadata exceeds 1.0.
+        Only the PUE value is checked here; the reward itself is not exercised.
+        """
+        env = DcOpsEnvironment()
+        initial = env.reset()
+        assert initial.metadata["pue"] > 1.0  # PUE should always be > 1
+    def test_cumulative_reward_tracked(self) -> None:
+        """Metadata carries cumulative_reward and it is non-zero after one or two waits."""
+        env = DcOpsEnvironment()
+        env.reset()
+        first = env.step(DcOpsAction(command="wait"))
+        assert "cumulative_reward" in first.metadata
+        r1 = first.metadata["cumulative_reward"]
+        second = env.step(DcOpsAction(command="wait"))
+        r2 = second.metadata["cumulative_reward"]
+        # At least one of the two running totals should be non-zero
+        assert r2 != 0 or r1 != 0
+# ===========================================================================
+# Simulation Integration Tests
+# ===========================================================================
+class TestSimulationIntegration:
+    """Check that stepping the environment moves the underlying simulation forward."""
+    @staticmethod
+    def _sim_seconds_per_wait(**reset_kwargs) -> float:
+        """Reset with *reset_kwargs*, issue one wait, and return the sim_time_s delta."""
+        env = DcOpsEnvironment()
+        t0 = env.reset(**reset_kwargs).metadata["sim_time_s"]
+        t1 = env.step(DcOpsAction(command="wait")).metadata["sim_time_s"]
+        return t1 - t0
+    def test_simulation_time_advances(self) -> None:
+        """One step moves sim time by the default 60 s."""
+        assert self._sim_seconds_per_wait() == pytest.approx(60.0, rel=0.01)
+    def test_custom_game_time_per_step(self) -> None:
+        """A game_time_per_step_s passed to reset() sets the per-step advance."""
+        elapsed_s = self._sim_seconds_per_wait(game_time_per_step_s=120.0)
+        assert elapsed_s == pytest.approx(120.0, rel=0.01)
+    def test_setpoint_change_affects_temperature(self) -> None:
+        """Raising CRAC setpoints warms zone_a's cold aisle within a few steps."""
+        env = DcOpsEnvironment()
+        t_cold_before = env.reset().metadata["zones"]["zone_a"]["cold_aisle_temp_c"]
+        # Push both setpoints up significantly
+        for crac_id in ("CRAC-1", "CRAC-2"):
+            env.step(DcOpsAction(command=f"adjust_setpoint {crac_id} 25"))
+        # Give the temperature three steps to respond
+        for _ in range(3):
+            obs = env.step(DcOpsAction(command="wait"))
+        t_cold_after = obs.metadata["zones"]["zone_a"]["cold_aisle_temp_c"]
+        assert t_cold_after > t_cold_before + 0.5, \
+            f"Expected temp increase: {t_cold_before:.1f} → {t_cold_after:.1f}"
+# ===========================================================================
+# Performance Test
+# ===========================================================================
+class TestPerformance:
+    """Guard against full environment steps becoming too slow."""
+    def test_episode_performance(self) -> None:
+        """A reset plus 15 wait steps must finish in under 5 seconds."""
+        import time
+        env = DcOpsEnvironment()
+        # Environment construction is deliberately outside the timed region
+        began = time.perf_counter()
+        env.reset()
+        for _step in range(15):
+            env.step(DcOpsAction(command="wait"))
+        elapsed = time.perf_counter() - began
+        assert elapsed < 5.0, f"Episode took {elapsed:.2f}s, should be < 5s"

tests/test_integration.py ADDED Viewed

	@@ -0,0 +1,535 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Integration tests: full episode playback, config loading, cross-facility.
+Validates:
+  - Known-good action sequences resolve each scenario
+  - Reward signals are well-behaved across full episodes
+  - YAML config loading produces valid, runnable environments
+  - Different facility sizes work correctly
+  - Episode metrics (PUE, temps, rewards) are in expected ranges
+"""
+from __future__ import annotations
+import time
+from pathlib import Path
+import pytest
+from dc_ops_env.config import (
+    BUILTIN_CONFIGS,
+    DatacenterConfig,
+    load_datacenter_config,
+    make_default_datacenter_config,
+)
+from dc_ops_env.models import DcOpsAction, DcOpsObservation
+from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
+from dc_ops_env.scenarios.registry import registered_scenario_ids
+# ===========================================================================
+# Config Loading Tests
+# ===========================================================================
+class TestConfigLoading:
+    """Check YAML config loading and the built-in facility configs."""
+    def test_builtin_configs_exist(self) -> None:
+        """Every entry in BUILTIN_CONFIGS points at an existing file."""
+        for name, path in BUILTIN_CONFIGS.items():
+            assert path.exists(), f"Built-in config '{name}' not found at {path}"
+    @pytest.mark.parametrize("config_name", ["default", "small", "large"])
+    def test_load_builtin(self, config_name: str) -> None:
+        """Each built-in config loads and every zone has racks and CRAC units."""
+        cfg = load_datacenter_config(config_name)
+        assert isinstance(cfg, DatacenterConfig)
+        assert len(cfg.zones) > 0
+        for zone in cfg.zones:
+            assert len(zone.racks) > 0
+            assert len(zone.crac_units) > 0
+    def test_load_by_path(self) -> None:
+        """A config can be loaded from an explicit path."""
+        cfg = load_datacenter_config(BUILTIN_CONFIGS["default"])
+        assert cfg.name == "DC-OPS Default Facility"
+    def test_load_nonexistent_raises(self) -> None:
+        """A missing file raises FileNotFoundError."""
+        with pytest.raises(FileNotFoundError):
+            load_datacenter_config("/nonexistent/path.yaml")
+    def test_default_yaml_matches_programmatic(self) -> None:
+        """The YAML default agrees with make_default_datacenter_config()."""
+        yaml_cfg = load_datacenter_config("default")
+        prog_cfg = make_default_datacenter_config()
+        assert yaml_cfg.name == prog_cfg.name
+        assert len(yaml_cfg.zones) == len(prog_cfg.zones)
+        assert yaml_cfg.outside_temp_c == prog_cfg.outside_temp_c
+        # Rack and CRAC totals must match across all zones
+        for attr in ("racks", "crac_units"):
+            yaml_total = sum(len(getattr(z, attr)) for z in yaml_cfg.zones)
+            prog_total = sum(len(getattr(z, attr)) for z in prog_cfg.zones)
+            assert yaml_total == prog_total
+    def test_small_facility_dimensions(self) -> None:
+        """Small facility: 1 zone, 10 racks, 80 kW IT load, 1 UPS."""
+        cfg = load_datacenter_config("small")
+        assert len(cfg.zones) == 1
+        racks = [r for z in cfg.zones for r in z.racks]
+        assert len(racks) == 10
+        assert sum(r.it_load_kw for r in racks) == pytest.approx(80.0)
+        assert len(cfg.power.ups_units) == 1
+    def test_large_facility_dimensions(self) -> None:
+        """Large facility: 4 zones, 60 racks, 600 kW IT load, 4 UPS units."""
+        cfg = load_datacenter_config("large")
+        assert len(cfg.zones) == 4
+        racks = [r for z in cfg.zones for r in z.racks]
+        assert len(racks) == 60
+        assert sum(r.it_load_kw for r in racks) == pytest.approx(600.0)
+        assert len(cfg.power.ups_units) == 4
+    def test_large_facility_has_h1_zone(self) -> None:
+        """Large facility has exactly one H1 zone whose racks each draw 20 kW."""
+        cfg = load_datacenter_config("large")
+        h1_zones = [z for z in cfg.zones if z.ashrae_class == "H1"]
+        assert len(h1_zones) == 1
+        # The H1 zone is the high-density one: higher per-rack load
+        (h1_zone,) = h1_zones
+        assert all(rack.it_load_kw == 20.0 for rack in h1_zone.racks)
+# ===========================================================================
+# Config-to-Environment Tests
+# ===========================================================================
+class TestConfigToEnvironment:
+    """Check that loaded configs yield environments that actually run."""
+    @staticmethod
+    def _reset_with(config_name: str):
+        """Return (env, first observation) for an environment reset with the named config."""
+        env = DcOpsEnvironment()
+        return env, env.reset(config=load_datacenter_config(config_name))
+    @pytest.mark.parametrize("config_name", ["default", "small", "large"])
+    def test_env_runs_with_config(self, config_name: str) -> None:
+        """The environment resets and takes a step with each built-in config."""
+        env, first = self._reset_with(config_name)
+        assert isinstance(first, DcOpsObservation)
+        assert first.done is False
+        stepped = env.step(DcOpsAction(command="check_status"))
+        assert isinstance(stepped, DcOpsObservation)
+    def test_small_facility_pue(self) -> None:
+        """Small facility's initial PUE lies strictly between 1.1 and 2.5."""
+        _, first = self._reset_with("small")
+        pue = first.metadata["pue"]
+        assert 1.1 < pue < 2.5, f"PUE {pue} out of realistic range"
+    def test_large_facility_total_load(self) -> None:
+        """Large facility reports about 600 kW total IT load."""
+        _, first = self._reset_with("large")
+        assert first.metadata["total_it_load_kw"] == pytest.approx(600.0, rel=0.01)
+# ===========================================================================
+# Full Episode Playback: Thermal Scenarios
+# ===========================================================================
+class TestEpisodePlaybackThermal:
+    """Replay scripted action sequences against the thermal scenarios."""
+    @staticmethod
+    def _wait(env: DcOpsEnvironment, max_steps: int) -> DcOpsObservation:
+        """Issue up to *max_steps* wait commands, stopping early once the episode is done."""
+        obs = None
+        for _ in range(max_steps):
+            obs = env.step(DcOpsAction(command="wait"))
+            if obs.done:
+                break
+        return obs
+    def test_a1_optimal_episode(self) -> None:
+        """A1 (Cooling Setpoint Optimization): raising setpoints should lower PUE.
+        Sequence: check_status → raise every CRAC setpoint → wait for convergence.
+        """
+        env = DcOpsEnvironment()
+        pue_initial = env.reset(scenario="A1").metadata["pue"]
+        # Status check first (procedure bonus)
+        obs = env.step(DcOpsAction(command="check_status"))
+        assert not obs.done
+        # Aggressively raise all four CRAC setpoints to 24°C
+        for crac_id in ("CRAC-1", "CRAC-2", "CRAC-3", "CRAC-4"):
+            env.step(DcOpsAction(command=f"adjust_setpoint {crac_id} 24"))
+        # Let temperatures converge
+        pue_final = self._wait(env, 5).metadata["pue"]
+        # Lower PUE is better
+        assert pue_final < pue_initial, (
+            f"PUE should improve: {pue_initial:.2f} → {pue_final:.2f}"
+        )
+    def test_a2_optimal_episode(self) -> None:
+        """A2 (Thermal Event Response): diagnose CRAC-3, then lean on the other units.
+        Sequence: diagnose → max fan speed on survivors → lower their setpoints → wait.
+        """
+        env = DcOpsEnvironment()
+        env.reset(scenario="A2")
+        diagnosis = env.step(DcOpsAction(command="diagnose CRAC-3"))
+        # Case-insensitive match also covers an upper-case "COMPRESSOR"
+        assert "compressor" in diagnosis.action_result.lower()
+        survivors = ("CRAC-1", "CRAC-2", "CRAC-4")
+        for crac_id in survivors:
+            env.step(DcOpsAction(command=f"set_fan_speed {crac_id} 100"))
+        for crac_id in survivors:
+            env.step(DcOpsAction(command=f"adjust_setpoint {crac_id} 16"))
+        final = self._wait(env, 8)
+        # Full resolution is not required; the running reward just must not collapse
+        assert final.metadata["cumulative_reward"] > -5.0
+    def test_a4_episode_with_load_shedding(self) -> None:
+        """A4 (CRAC Failure Cascade): diagnose both failures, compensate, shed load.
+        Hardest thermal scenario, with two CRACs down.
+        """
+        env = DcOpsEnvironment()
+        env.reset(scenario="A4")
+        playbook = [
+            # Diagnose both failed units
+            "diagnose CRAC-1",
+            "diagnose CRAC-3",
+            # Max out the surviving CRACs
+            "set_fan_speed CRAC-2 100",
+            "set_fan_speed CRAC-4 100",
+            "adjust_setpoint CRAC-2 15",
+            "adjust_setpoint CRAC-4 15",
+            # Shed load on the hottest racks
+            "set_rack_load A-01 4",
+            "set_rack_load A-02 4",
+            "set_rack_load B-01 4",
+            "set_rack_load B-02 4",
+        ]
+        for command in playbook:
+            env.step(DcOpsAction(command=command))
+        final = self._wait(env, 10)
+        # May not fully resolve, but should make progress
+        assert final.metadata["cumulative_reward"] > -10.0
+# ===========================================================================
+# Full Episode Playback: Power Scenarios
+# ===========================================================================
+class TestEpisodePlaybackPower:
+    """Full episode playback with known-good action sequences for power scenarios."""
+    def test_b1_optimal_episode(self) -> None:
+        """B1 (UPS Alarm Response): diagnose UPS, acknowledge alarm.
+        Simple 2-step resolution.
+        """
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="B1")
+        # 1. Diagnose UPS status
+        obs = env.step(DcOpsAction(command="diagnose UPS-1"))
+        assert not obs.done
+        # 2. Acknowledge the alarm
+        obs = env.step(DcOpsAction(command="acknowledge_alarm"))
+        assert obs.done, "B1 should resolve after diagnose + acknowledge"
+        # Speed bonus: (10 - 2) / 10 = 0.8
+        assert obs.reward > 0.5, "Should have significant speed bonus"
+    def test_b3_optimal_episode(self) -> None:
+        """B3 (Generator Test Protocol): follow the correct test sequence.
+        check_status → start_generator → diagnose GEN-1 → stop_generator → acknowledge.
+        """
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="B3")
+        # Follow correct protocol
+        obs = env.step(DcOpsAction(command="check_status"))
+        assert not obs.done
+        obs = env.step(DcOpsAction(command="start_generator"))
+        assert not obs.done
+        # Wait for generator to start (30s game time per step, gen startup ~17s)
+        obs = env.step(DcOpsAction(command="wait"))
+        obs = env.step(DcOpsAction(command="diagnose GEN-1"))
+        assert not obs.done
+        obs = env.step(DcOpsAction(command="stop_generator"))
+        assert not obs.done
+        # Wait for cooldown
+        obs = env.step(DcOpsAction(command="wait"))
+        obs = env.step(DcOpsAction(command="acknowledge_alarm"))
+        assert obs.done, "B3 should resolve after full protocol"
+    def test_b4_episode_with_load_shedding(self) -> None:
+        """B4 (Power Failure Cascade): manage battery, wait for generator.
+        Generator starts automatically on utility loss. Agent monitors
+        and sheds load to extend battery life.
+        """
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="B4")
+        # 1. Diagnose to understand the situation
+        obs = env.step(DcOpsAction(command="diagnose UPS-1"))
+        obs = env.step(DcOpsAction(command="diagnose UPS-2"))
+        # 2. Shed non-critical load to extend battery
+        obs = env.step(DcOpsAction(command="set_rack_load A-01 4"))
+        obs = env.step(DcOpsAction(command="set_rack_load B-01 4"))
+        # 3. Check generator status
+        obs = env.step(DcOpsAction(command="diagnose GEN-1"))
+        # 4. Wait for generator to come online and stabilize
+        for _ in range(14):
+            obs = env.step(DcOpsAction(command="wait"))
+            if obs.done:
+                break
+        # B4 is hard — may or may not resolve, but should make progress
+        assert obs.metadata["cumulative_reward"] > -10.0
+# ===========================================================================
+# Reward Signal Quality
+# ===========================================================================
+class TestRewardSignalQuality:
+    """Validate that reward signals are well-behaved across full episodes."""
+    def test_rewards_bounded_per_step(self) -> None:
+        """Every per-step reward should be bounded."""
+        env = DcOpsEnvironment()
+        env.reset(scenario="A2")
+        for _ in range(15):
+            obs = env.step(DcOpsAction(command="wait"))
+            # Base reward is [-1, 1], speed bonus can add up to 1.0
+            assert -2.0 <= obs.reward <= 2.0, f"Reward {obs.reward} out of bounds"
+            if obs.done:
+                break
+    def test_good_actions_beat_bad_actions(self) -> None:
+        """An optimal sequence should yield higher cumulative reward than a bad one."""
+        env = DcOpsEnvironment()
+        # Good episode: diagnose then fix
+        env.reset(scenario="B1")
+        env.step(DcOpsAction(command="diagnose UPS-1"))
+        obs_good = env.step(DcOpsAction(command="acknowledge_alarm"))
+        r_good = obs_good.metadata["cumulative_reward"]
+        # Bad episode: just wait
+        env.reset(scenario="B1")
+        for _ in range(10):
+            obs_bad = env.step(DcOpsAction(command="wait"))
+            if obs_bad.done:
+                break
+        r_bad = obs_bad.metadata["cumulative_reward"]
+        assert r_good > r_bad, f"Good ({r_good:.2f}) should beat bad ({r_bad:.2f})"
+    def test_procedure_bonus_visible(self) -> None:
+        """Following correct procedure should yield higher cumulative reward.
+        Full episode comparison: both episodes do the same actions, but one
+        follows procedure (check_status first) and the other doesn't.
+        """
+        env = DcOpsEnvironment()
+        # With procedure: check_status → adjust_setpoint → wait
+        env.reset(scenario="A1")
+        env.step(DcOpsAction(command="check_status"))
+        env.step(DcOpsAction(command="adjust_setpoint CRAC-1 22"))
+        obs_proc = env.step(DcOpsAction(command="wait"))
+        r_with = obs_proc.metadata["cumulative_reward"]
+        # Without procedure: wait → adjust_setpoint → wait (no check_status)
+        env.reset(scenario="A1")
+        env.step(DcOpsAction(command="wait"))
+        env.step(DcOpsAction(command="adjust_setpoint CRAC-1 22"))
+        obs_noproc = env.step(DcOpsAction(command="wait"))
+        r_without = obs_noproc.metadata["cumulative_reward"]
+        assert r_with > r_without, (
+            f"Procedure bonus not visible: with={r_with:.3f} vs without={r_without:.3f}"
+        )
+    @pytest.mark.parametrize("scenario_id", registered_scenario_ids())
+    def test_no_nan_rewards(self, scenario_id: str) -> None:
+        """No scenario should produce NaN rewards."""
+        import math
+        env = DcOpsEnvironment()
+        env.reset(scenario=scenario_id)
+        for _ in range(5):
+            obs = env.step(DcOpsAction(command="check_status"))
+            assert not math.isnan(obs.reward), f"NaN reward in {scenario_id}"
+            assert not math.isinf(obs.reward), f"Inf reward in {scenario_id}"
+            if obs.done:
+                break
+# ===========================================================================
+# Cross-Facility Scenario Tests
+# ===========================================================================
+class TestCrossFacility:
+    """Validate scenarios work with different facility configs."""
+    def test_scenario_with_small_facility(self) -> None:
+        """Scenarios should adapt to smaller configs that have compatible CRACs."""
+        cfg = load_datacenter_config("small")
+        env = DcOpsEnvironment()
+        # Run without a scenario, just with small config
+        obs = env.reset(config=cfg, step_budget=5)
+        assert obs.done is False
+        # Basic operations should work
+        obs = env.step(DcOpsAction(command="check_status"))
+        assert "status" in obs.action_result.lower()
+        obs = env.step(DcOpsAction(command="diagnose CRAC-1"))
+        assert "Diagnostic Report" in obs.action_result
+    def test_large_facility_steady_state(self) -> None:
+        """Large facility should reach reasonable steady state."""
+        cfg = load_datacenter_config("large")
+        env = DcOpsEnvironment()
+        obs = env.reset(config=cfg, step_budget=10)
+        pue = obs.metadata["pue"]
+        assert 1.1 < pue < 3.0, f"Large facility PUE {pue} unrealistic"
+        total_cooling = obs.metadata["total_cooling_power_kw"]
+        total_it = obs.metadata["total_it_load_kw"]
+        assert total_cooling > 0
+        assert total_it > 0
+# ===========================================================================
+# Episode Metrics & Physics Consistency
+# ===========================================================================
+class TestEpisodeMetrics:
+    """Validate physics consistency across episode metrics."""
+    def test_pue_always_above_one(self) -> None:
+        """PUE should always be >= 1.0 (physically impossible otherwise)."""
+        env = DcOpsEnvironment()
+        env.reset(scenario="A1")
+        for _ in range(10):
+            obs = env.step(DcOpsAction(command="wait"))
+            assert obs.metadata["pue"] >= 1.0
+            if obs.done:
+                break
+    def test_higher_load_raises_temperature(self) -> None:
+        """Adding rack load should cause temperature to rise."""
+        env = DcOpsEnvironment()
+        obs = env.reset()
+        t_before = obs.metadata["zones"]["zone_a"]["cold_aisle_temp_c"]
+        # Significantly increase multiple racks' load
+        env.step(DcOpsAction(command="set_rack_load A-01 15"))
+        env.step(DcOpsAction(command="set_rack_load A-02 15"))
+        env.step(DcOpsAction(command="set_rack_load A-03 15"))
+        # Wait for thermal response
+        for _ in range(7):
+            obs = env.step(DcOpsAction(command="wait"))
+        t_after = obs.metadata["zones"]["zone_a"]["cold_aisle_temp_c"]
+        assert t_after > t_before, (
+            f"Temp should rise with more load: {t_before:.1f} → {t_after:.1f}"
+        )
+    def test_sim_time_monotonically_increases(self) -> None:
+        """Simulation time should always advance."""
+        env = DcOpsEnvironment()
+        obs = env.reset()
+        prev_time = obs.metadata["sim_time_s"]
+        for _ in range(5):
+            obs = env.step(DcOpsAction(command="wait"))
+            assert obs.metadata["sim_time_s"] > prev_time
+            prev_time = obs.metadata["sim_time_s"]
+# ===========================================================================
+# Performance Tests
+# ===========================================================================
+class TestIntegrationPerformance:
+    """Validate performance across different facility sizes."""
+    @pytest.mark.parametrize("config_name", ["default", "small", "large"])
+    def test_episode_completes_fast(self, config_name: str) -> None:
+        """Full episode should complete quickly for any facility size."""
+        cfg = load_datacenter_config(config_name)
+        env = DcOpsEnvironment()
+        start = time.perf_counter()
+        env.reset(config=cfg, step_budget=10)
+        for _ in range(10):
+            env.step(DcOpsAction(command="wait"))
+        elapsed = time.perf_counter() - start
+        assert elapsed < 10.0, (
+            f"{config_name} facility 10-step episode took {elapsed:.2f}s, should be <10s"
+        )
+    def test_all_scenarios_full_episode_under_10s(self) -> None:
+        """Running every scenario for its full step budget should be fast."""
+        env = DcOpsEnvironment()
+        total_start = time.perf_counter()
+        for sid in registered_scenario_ids():
+            env.reset(scenario=sid)
+            for _ in range(20):  # Max budget across scenarios
+                obs = env.step(DcOpsAction(command="wait"))
+                if obs.done:
+                    break
+        total_elapsed = time.perf_counter() - total_start
+        assert total_elapsed < 15.0, (
+            f"All {len(registered_scenario_ids())} scenarios took {total_elapsed:.2f}s"
+        )

tests/test_power.py ADDED Viewed

	@@ -0,0 +1,743 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tests for the power subsystem simulation.
+Validates:
+  - UPS quadratic efficiency model against published data
+  - UPS battery discharge/charge dynamics
+  - PDU loss calculations and three-phase current distribution
+  - Generator state machine and fuel consumption
+  - ATS transfer timing
+  - Full utility-loss → generator-takeover scenario
+"""
+from __future__ import annotations
+import math
+import pytest
+from dc_ops_env.config import (
+    ATSConfig,
+    GeneratorConfig,
+    PDUConfig,
+    PowerConfig,
+    UPSConfig,
+)
+from dc_ops_env.simulation.power import PowerAlarm, PowerSimulation, PowerStepResult
+from dc_ops_env.simulation.types import (
+    ATSPosition,
+    GeneratorState,
+    UPSMode,
+)
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def make_simple_power_config(
+    num_ups: int = 1,
+    num_pdus: int = 1,
+    ups_capacity_kw: float = 500.0,
+) -> PowerConfig:
+    """Create a minimal power config for testing."""
+    return PowerConfig(
+        ups_units=[
+            UPSConfig(unit_id=f"UPS-{i+1}", rated_capacity_kw=ups_capacity_kw)
+            for i in range(num_ups)
+        ],
+        pdus=[
+            PDUConfig(pdu_id=f"PDU-{i+1}")
+            for i in range(num_pdus)
+        ],
+        generator=GeneratorConfig(),
+        ats=ATSConfig(),
+    )
+# ===========================================================================
+# UPS Efficiency Tests
+# ===========================================================================
+class TestUPSEfficiency:
+    """Validate UPS quadratic loss model against reference data.
+    APC WP-108 Table: 500 kVA double-conversion UPS efficiency
+      25% load → ~90.5%
+      50% load → ~93.6%
+      75% load → ~94.0%
+      100% load → ~93.9%
+    """
+    def test_efficiency_at_25_percent(self) -> None:
+        """Efficiency at 25% load: η = 0.25/(0.25+0.013+0.006×0.25+0.011×0.0625) ≈ 94.3%."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=125.0)
+        sim.step(1.0, 125.0)  # 125/500 = 25%
+        ups = sim.state.ups_units[0]
+        assert 0.93 <= ups.efficiency <= 0.96, f"η={ups.efficiency:.3f}"
+    def test_efficiency_at_50_percent(self) -> None:
+        """Efficiency at 50% load: η ≈ 96.4%."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=250.0)
+        sim.step(1.0, 250.0)
+        ups = sim.state.ups_units[0]
+        assert 0.95 <= ups.efficiency <= 0.97, f"η={ups.efficiency:.3f}"
+    def test_efficiency_at_75_percent(self) -> None:
+        """Efficiency at 75% load: η ≈ 96.9%."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=375.0)
+        sim.step(1.0, 375.0)
+        ups = sim.state.ups_units[0]
+        assert 0.96 <= ups.efficiency <= 0.98, f"η={ups.efficiency:.3f}"
+    def test_efficiency_at_100_percent(self) -> None:
+        """Efficiency at 100% load: η ≈ 97.1%."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=500.0)
+        sim.step(1.0, 500.0)
+        ups = sim.state.ups_units[0]
+        assert 0.96 <= ups.efficiency <= 0.98, f"η={ups.efficiency:.3f}"
+    def test_efficiency_peak_around_75_percent(self) -> None:
+        """Peak efficiency should occur around 50-75% load, not at extremes."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=0.0)
+        efficiencies = {}
+        for load_pct in [10, 25, 50, 75, 100]:
+            load_kw = 500.0 * load_pct / 100.0
+            sim2 = PowerSimulation(make_simple_power_config(), it_load_kw=load_kw)
+            sim2.step(1.0, load_kw)
+            efficiencies[load_pct] = sim2.state.ups_units[0].efficiency
+        # Peak should be between 50-100%, not at 10%
+        peak_pct = max(efficiencies, key=efficiencies.get)
+        assert peak_pct >= 50, f"Peak at {peak_pct}%, efficiencies: {efficiencies}"
+    def test_losses_are_positive(self) -> None:
+        """UPS losses should always be positive (waste heat)."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        sim.step(1.0, 160.0)
+        ups = sim.state.ups_units[0]
+        assert ups.heat_output_kw > 0, "UPS must produce waste heat"
+    def test_eco_mode_higher_efficiency(self) -> None:
+        """Eco mode should have higher efficiency than double conversion."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        sim.step(1.0, 160.0)
+        eta_dc = sim.state.ups_units[0].efficiency
+        sim2 = PowerSimulation(make_simple_power_config(), it_load_kw=160.0)
+        sim2.set_ups_mode("UPS-1", UPSMode.ECO)
+        sim2.step(1.0, 160.0)
+        eta_eco = sim2.state.ups_units[0].efficiency
+        assert eta_eco > eta_dc, f"Eco {eta_eco:.3f} should > DC {eta_dc:.3f}"
+# ===========================================================================
+# UPS Battery Tests
+# ===========================================================================
+class TestUPSBattery:
+    """Validate battery discharge and charge dynamics."""
+    def test_battery_discharge_on_utility_loss(self) -> None:
+        """Battery SOC should decrease when utility is lost."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        # Verify initial SOC = 100%
+        assert sim.state.ups_units[0].battery_soc == 1.0
+        # Kill utility
+        sim.set_utility_available(False)
+        # Run for 60 seconds
+        for _ in range(60):
+            sim.step(1.0, 160.0)
+        ups = sim.state.ups_units[0]
+        assert ups.mode == UPSMode.ON_BATTERY
+        assert ups.battery_soc < 1.0, "SOC should decrease on battery"
+        assert ups.battery_soc > 0.5, "SOC shouldn't drop too fast in 60s"
+    def test_battery_runtime_estimation(self) -> None:
+        """Battery time remaining estimate should be reasonable.
+        8.3 kWh battery, 0.9 discharge eff, 0.85 aging, 160 kW load:
+        usable = 8.3 × 0.9 × 0.85 = 6.35 kWh
+        At 160 kW: ~143 seconds (~2.4 min)
+        """
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        sim.set_utility_available(False)
+        sim.step(1.0, 160.0)
+        ups = sim.state.ups_units[0]
+        assert ups.mode == UPSMode.ON_BATTERY
+        assert 60 < ups.battery_time_remaining_s < 300, \
+            f"Runtime {ups.battery_time_remaining_s:.0f}s should be 1-5 min for 160kW"
+    def test_battery_exhaustion(self) -> None:
+        """Battery should eventually exhaust and UPS should fault."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        sim.set_utility_available(False)
+        # Run until battery dies (should be ~2-3 min)
+        max_steps = 600  # 10 min max
+        exhausted = False
+        for _ in range(max_steps):
+            result = sim.step(1.0, 160.0)
+            if sim.state.ups_units[0].mode == UPSMode.FAULT:
+                exhausted = True
+                break
+        assert exhausted, "Battery should exhaust within 10 minutes at 160 kW"
+        assert sim.state.ups_units[0].battery_soc == 0.0
+    def test_battery_recharge_after_utility_restored(self) -> None:
+        """Battery should recharge when utility is restored."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=80.0)
+        # Discharge for 30 seconds
+        sim.set_utility_available(False)
+        for _ in range(30):
+            sim.step(1.0, 80.0)
+        soc_after_discharge = sim.state.ups_units[0].battery_soc
+        # Restore utility
+        sim.set_utility_available(True)
+        for _ in range(300):  # 5 min recharge
+            sim.step(1.0, 80.0)
+        soc_after_recharge = sim.state.ups_units[0].battery_soc
+        assert soc_after_recharge > soc_after_discharge, \
+            f"SOC should increase: {soc_after_discharge:.3f} → {soc_after_recharge:.3f}"
+    def test_battery_low_alarm(self) -> None:
+        """Should get low battery alarm when SOC drops below 25%."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        sim.set_utility_available(False)
+        all_alarms: list[PowerAlarm] = []
+        for _ in range(600):
+            result = sim.step(1.0, 160.0)
+            all_alarms.extend(result.alarms)
+            if sim.state.ups_units[0].battery_soc < 0.10:
+                break
+        alarm_types = [a.alarm_type for a in all_alarms]
+        assert "battery_low" in alarm_types or "battery_critical" in alarm_types, \
+            f"Should have low battery alarm, got: {alarm_types}"
+# ===========================================================================
+# PDU Tests
+# ===========================================================================
+class TestPDU:
+    """Validate PDU power distribution and loss calculations."""
+    def test_pdu_losses_at_nominal(self) -> None:
+        """PDU losses should be ~2% of load (98% efficiency)."""
+        config = make_simple_power_config(num_pdus=1)
+        sim = PowerSimulation(config, it_load_kw=5.0)
+        result = sim.step(1.0, 5.0)
+        pdu = sim.state.pdus[0]
+        expected_loss = 5.0 * (1.0 / 0.98 - 1.0)  # ~0.102 kW
+        assert abs(pdu.heat_output_kw - expected_loss) < 0.01, \
+            f"PDU loss {pdu.heat_output_kw:.3f} kW, expected {expected_loss:.3f}"
+    def test_phase_current_calculation(self) -> None:
+        """Phase currents should match P = √3 × V_LL × I_L formula.
+        5 kW load at 208V: I_total = 5000 / (√3 × 208) = 13.88 A
+        Per phase (balanced): 13.88 / 3 = 4.63 A
+        """
+        config = make_simple_power_config(num_pdus=1)
+        sim = PowerSimulation(config, it_load_kw=5.0)
+        sim.step(1.0, 5.0)
+        pdu = sim.state.pdus[0]
+        expected_total = 5000.0 / (math.sqrt(3) * 208.0)
+        expected_per_phase = expected_total / 3.0
+        for i, current in enumerate(pdu.phase_currents_a):
+            assert abs(current - expected_per_phase) < 0.1, \
+                f"Phase {i} current {current:.2f}A, expected {expected_per_phase:.2f}A"
+    def test_pdu_nameplate_capacity(self) -> None:
+        """Nameplate capacity = √3 × 208V × 24A ≈ 8.65 kW."""
+        config = make_simple_power_config(num_pdus=1)
+        sim = PowerSimulation(config, it_load_kw=1.0)
+        sim.step(1.0, 1.0)
+        pdu = sim.state.pdus[0]
+        expected = math.sqrt(3) * 208.0 * 24.0 / 1000.0
+        assert abs(pdu.nameplate_capacity_kw - expected) < 0.01
+    def test_pdu_derated_capacity(self) -> None:
+        """Derated capacity = nameplate × 0.80."""
+        config = make_simple_power_config(num_pdus=1)
+        sim = PowerSimulation(config, it_load_kw=1.0)
+        sim.step(1.0, 1.0)
+        pdu = sim.state.pdus[0]
+        expected = pdu.nameplate_capacity_kw * 0.80
+        assert abs(pdu.derated_capacity_kw - expected) < 0.01
+    def test_pdu_overcurrent_alarm(self) -> None:
+        """Overloading a PDU beyond phase current limit should trigger alarm.
+        Phase current = P / (√3 × V_LL) / num_phases_factor
+        For total_current > 24A per-phase: need I_total > 72A
+        I_total = P / (√3 × 208) = P / 360.2
+        So P > 72 × 360.2 / 3 ≈ 8.65 kW won't do it because per_phase = I_total/3
+        Actually: per_phase = (P×1000)/(√3×208) / 3, need per_phase > 24A
+        per_phase > 24 → P > 24 × 3 × √3 × 208 / 1000 = 25.95 kW
+        """
+        config = make_simple_power_config(num_pdus=1)
+        sim = PowerSimulation(config, it_load_kw=27.0)
+        result = sim.step(1.0, 27.0)
+        alarm_types = [a.alarm_type for a in result.alarms]
+        assert "phase_overcurrent" in alarm_types, f"Expected overcurrent alarm, got {alarm_types}"
+    def test_multiple_pdus_share_load(self) -> None:
+        """Load should be distributed across PDUs."""
+        config = make_simple_power_config(num_pdus=4)
+        sim = PowerSimulation(config, it_load_kw=20.0)
+        sim.step(1.0, 20.0)
+        for pdu in sim.state.pdus:
+            assert abs(pdu.output_power_kw - 5.0) < 0.01
+# ===========================================================================
+# Generator Tests
+# ===========================================================================
+class TestGenerator:
+    """Validate generator state machine and fuel consumption."""
+    def test_generator_startup_sequence(self) -> None:
+        """Generator should progress: OFF → START_DELAY → CRANKING → WARMING → READY."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        gen = sim.state.generator
+        assert gen.state == GeneratorState.OFF
+        # Start generator
+        sim.start_generator()
+        assert gen.state == GeneratorState.START_DELAY
+        # Run through start delay (4s)
+        for _ in range(5):
+            sim.step(1.0, 160.0)
+        assert gen.state == GeneratorState.CRANKING
+        # Run through cranking (5s)
+        for _ in range(6):
+            sim.step(1.0, 160.0)
+        assert gen.state == GeneratorState.WARMING
+        # Run through warmup (8s)
+        for _ in range(9):
+            sim.step(1.0, 160.0)
+        assert gen.state == GeneratorState.READY
+    def test_generator_total_startup_time(self) -> None:
+        """Total startup time should be ~17s (4 + 5 + 8)."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        sim.start_generator()
+        # Run until ready
+        steps = 0
+        for steps in range(1, 100):
+            sim.step(1.0, 160.0)
+            if sim.state.generator.is_available:
+                break
+        # 4s delay + 5s crank + 8s warmup = 17s, allow ±2s
+        assert 15 <= steps <= 20, f"Startup took {steps}s, expected ~17s"
+    def test_fuel_consumption_under_load(self) -> None:
+        """Fuel should be consumed when generator is loaded."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        gen = sim.state.generator
+        initial_fuel = gen.fuel_level_liters
+        # Trigger utility loss to get generator running and loaded
+        sim.set_utility_available(False)
+        # Run for 30 seconds (enough for startup + some loaded time)
+        for _ in range(30):
+            sim.step(1.0, 160.0)
+        assert gen.fuel_level_liters < initial_fuel, "Fuel should be consumed"
+    def test_fuel_consumption_rate(self) -> None:
+        """Fuel rate = full_rate × (0.1 + 0.9 × load_fraction).
+        At 160kW / 750kW = 21.3% load:
+        rate = 180 × (0.1 + 0.9 × 0.213) = 180 × 0.292 = 52.6 L/hr
+        In 1 hour: ~52.6 liters consumed
+        """
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        # Must disable utility so ATS stays on generator for full hour
+        sim.set_utility_available(False)
+        # Manually put generator into loaded state for cleaner test
+        gen = sim.state.generator
+        gen.state = GeneratorState.LOADED
+        gen.load_fraction = 160.0 / 750.0
+        gen.output_power_kw = 160.0
+        sim.state.ats.position = ATSPosition.GENERATOR
+        initial_fuel = gen.fuel_level_liters
+        # Run for 1 hour
+        for _ in range(3600):
+            sim.step(1.0, 160.0)
+        consumed = initial_fuel - gen.fuel_level_liters
+        expected_rate = 180.0 * (0.1 + 0.9 * (160.0 / 750.0))
+        # Allow 10% tolerance
+        assert abs(consumed - expected_rate) < expected_rate * 0.15, \
+            f"Consumed {consumed:.1f}L/hr, expected ~{expected_rate:.1f}L/hr"
+    def test_generator_cooldown(self) -> None:
+        """Generator should cool down for 5 minutes before shutdown."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        # Get generator running
+        gen = sim.state.generator
+        gen.state = GeneratorState.LOADED
+        gen.output_power_kw = 160.0
+        # Stop generator
+        sim.stop_generator()
+        assert gen.state == GeneratorState.COOLDOWN
+        # Run cooldown (300s)
+        for i in range(299):
+            sim.step(1.0, 160.0)
+            assert gen.state == GeneratorState.COOLDOWN, f"Still cooling at {i+1}s"
+        # Should transition to OFF after 300s
+        sim.step(1.0, 160.0)
+        assert gen.state == GeneratorState.OFF
+    def test_fuel_exhaustion(self) -> None:
+        """Generator should shut down when fuel runs out."""
+        config = PowerConfig(
+            ups_units=[UPSConfig(unit_id="UPS-1")],
+            pdus=[PDUConfig(pdu_id="PDU-1")],
+            generator=GeneratorConfig(fuel_tank_liters=1.0),  # Very small tank
+            ats=ATSConfig(),
+        )
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        gen = sim.state.generator
+        gen.state = GeneratorState.LOADED
+        gen.load_fraction = 160.0 / 750.0
+        gen.output_power_kw = 160.0
+        sim.state.ats.position = ATSPosition.GENERATOR
+        sim.set_utility_available(False)
+        # Run until fuel runs out (1L / 52.6 L/hr ≈ 68 seconds)
+        all_alarms: list[PowerAlarm] = []
+        for _ in range(200):
+            result = sim.step(1.0, 160.0)
+            all_alarms.extend(result.alarms)
+            if gen.state == GeneratorState.OFF:
+                break
+        assert gen.state == GeneratorState.OFF
+        assert gen.fuel_level_liters == 0.0
+        alarm_types = [a.alarm_type for a in all_alarms]
+        assert "fuel_exhausted" in alarm_types
+# ===========================================================================
+# ATS Tests
+# ===========================================================================
+class TestATS:
+    """Validate Automatic Transfer Switch behavior."""
+    def test_ats_starts_on_utility(self) -> None:
+        """ATS should start in UTILITY position."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        assert sim.state.ats.position == ATSPosition.UTILITY
+    def test_ats_transfers_on_utility_loss(self) -> None:
+        """ATS should begin transfer when utility is lost."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        sim.set_utility_available(False)
+        sim.step(0.001, 160.0)  # Tiny step to trigger detection
+        assert sim.state.ats.position == ATSPosition.TRANSFERRING
+    def test_ats_waits_for_generator(self) -> None:
+        """ATS should stay TRANSFERRING until generator is ready."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        sim.set_utility_available(False)
+        # Run for 5 seconds (generator still starting up)
+        for _ in range(5):
+            sim.step(1.0, 160.0)
+        # Should still be transferring because generator isn't ready yet
+        gen = sim.state.generator
+        assert not gen.is_available
+        assert sim.state.ats.position == ATSPosition.TRANSFERRING
+    def test_ats_completes_transfer_to_generator(self) -> None:
+        """ATS should transfer to generator once it's ready."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        sim.set_utility_available(False)
+        # Run long enough for generator startup (~17s) + transfer
+        for _ in range(25):
+            sim.step(1.0, 160.0)
+        assert sim.state.ats.position == ATSPosition.GENERATOR
+        assert sim.state.generator.state == GeneratorState.LOADED
+    def test_ats_retransfer_delay(self) -> None:
+        """ATS should wait retransfer_delay (300s) before switching back to utility."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        # Lose utility and get on generator
+        sim.set_utility_available(False)
+        for _ in range(25):
+            sim.step(1.0, 160.0)
+        assert sim.state.ats.position == ATSPosition.GENERATOR
+        # Restore utility
+        sim.set_utility_available(True)
+        # Run for 200s — should still be on generator
+        for _ in range(200):
+            sim.step(1.0, 160.0)
+        assert sim.state.ats.position == ATSPosition.GENERATOR
+        # Run past 300s retransfer delay
+        for _ in range(150):
+            sim.step(1.0, 160.0)
+        # Should be transferring back or on utility
+        ats_pos = sim.state.ats.position
+        assert ats_pos in (ATSPosition.TRANSFERRING, ATSPosition.UTILITY), \
+            f"Expected transfer back, got {ats_pos}"
+# ===========================================================================
+# Full Scenario Tests
+# ===========================================================================
+class TestUtilityLossScenario:
+    """End-to-end utility loss and recovery scenario."""
+    def test_full_utility_loss_and_recovery(self) -> None:
+        """Complete scenario: utility loss → battery bridge → generator → recovery.
+        Timeline:
+          t=0: Utility fails
+          t=0-17s: UPS on battery, generator starting
+          t=17s: Generator ready, ATS transfers
+          t=17s+: On generator power
+          t=100s: Utility restored
+          t=400s: Retransfer to utility (after 300s delay)
+        """
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        # Phase 1: Utility loss
+        sim.set_utility_available(False)
+        # Run through startup sequence
+        ups_on_battery = False
+        gen_ready = False
+        on_generator = False
+        for t in range(1, 30):
+            result = sim.step(1.0, 160.0)
+            if sim.state.ups_units[0].mode == UPSMode.ON_BATTERY:
+                ups_on_battery = True
+            if sim.state.generator.is_available:
+                gen_ready = True
+            if sim.state.ats.position == ATSPosition.GENERATOR:
+                on_generator = True
+        assert ups_on_battery, "UPS should have been on battery"
+        assert gen_ready, "Generator should be ready by 30s"
+        assert on_generator, "Should be on generator by 30s"
+        # Phase 2: Running on generator
+        result = sim.step(1.0, 160.0)
+        assert result.on_generator
+        assert sim.state.generator.state == GeneratorState.LOADED
+        # Phase 3: Utility restored
+        sim.set_utility_available(True)
+        # Run past retransfer delay (300s)
+        for _ in range(350):
+            sim.step(1.0, 160.0)
+        # Should be back on utility (or transferring)
+        assert sim.state.ats.position in (ATSPosition.UTILITY, ATSPosition.TRANSFERRING)
+    def test_power_available_during_transfer(self) -> None:
+        """UPS should bridge the gap during ATS transfer."""
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        # Initial: power available
+        result = sim.step(1.0, 160.0)
+        assert result.power_available
+        # During utility loss, UPS provides power
+        sim.set_utility_available(False)
+        for _ in range(5):
+            result = sim.step(1.0, 160.0)
+        # UPS is on battery, still providing power
+        assert sim.state.ups_units[0].mode == UPSMode.ON_BATTERY
+        # The IT load is still being served
+        assert sim.state.ups_units[0].output_power_kw > 0
+# ===========================================================================
+# Integration with DatacenterState Tests
+# ===========================================================================
+class TestPowerStateIntegration:
+    """Test PowerState integration with DatacenterState."""
+    def test_datacenter_state_with_power(self) -> None:
+        """DatacenterState should use PowerState for PUE when available."""
+        from dc_ops_env.simulation.types import DatacenterState, PowerState, UPSState, PDUState
+        ups = UPSState(unit_id="UPS-1", heat_output_kw=5.0)
+        pdu = PDUState(pdu_id="PDU-1", heat_output_kw=1.0)
+        power = PowerState(ups_units=[ups], pdus=[pdu])
+        state = DatacenterState(
+            power=power,
+            lighting_power_kw=5.0,
+        )
+        # With no zones (no IT load), PUE should be 1.0
+        assert state.pue == 1.0
+    def test_datacenter_state_without_power_uses_stubs(self) -> None:
+        """DatacenterState without PowerState should use stub fractions."""
+        from dc_ops_env.simulation.types import DatacenterState
+        state = DatacenterState(
+            ups_loss_fraction=0.05,
+            pdu_loss_fraction=0.02,
+        )
+        # Should use the stub loss fractions (backward compat)
+        assert state.power is None
+# ===========================================================================
+# Performance Test
+# ===========================================================================
+class TestPerformance:
+    """Ensure power simulation is fast enough for RL training."""
+    def test_steps_per_second(self) -> None:
+        """Power sim should sustain >10,000 steps/sec."""
+        import time
+        config = make_simple_power_config(num_ups=2, num_pdus=20)
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        n_steps = 5000
+        start = time.perf_counter()
+        for _ in range(n_steps):
+            sim.step(1.0, 160.0)
+        elapsed = time.perf_counter() - start
+        steps_per_sec = n_steps / elapsed
+        assert steps_per_sec > 10_000, \
+            f"Only {steps_per_sec:.0f} steps/sec, need >10,000"
+# ===========================================================================
+# Mutation Helper Tests
+# ===========================================================================
+class TestMutationHelpers:
+    """Test convenience methods for scenario injection."""
+    def test_set_utility_available(self) -> None:
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        assert sim.state.utility_available is True
+        sim.set_utility_available(False)
+        assert sim.state.utility_available is False
+    def test_set_ups_mode(self) -> None:
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        assert sim.set_ups_mode("UPS-1", UPSMode.ECO)
+        assert sim.state.ups_units[0].mode == UPSMode.ECO
+        assert not sim.set_ups_mode("UPS-999", UPSMode.ECO)
+    def test_inject_and_clear_ups_fault(self) -> None:
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        assert sim.inject_ups_fault("UPS-1")
+        assert sim.state.ups_units[0].mode == UPSMode.FAULT
+        assert sim.clear_ups_fault("UPS-1")
+        assert sim.state.ups_units[0].mode == UPSMode.DOUBLE_CONVERSION
+    def test_start_stop_generator(self) -> None:
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        sim.start_generator()
+        assert sim.state.generator.state == GeneratorState.START_DELAY
+        # Run to READY
+        for _ in range(20):
+            sim.step(1.0, 160.0)
+        assert sim.state.generator.is_available
+        sim.stop_generator()
+        assert sim.state.generator.state == GeneratorState.COOLDOWN
+    def test_refuel_generator(self) -> None:
+        config = make_simple_power_config()
+        sim = PowerSimulation(config, it_load_kw=160.0)
+        gen = sim.state.generator
+        gen.fuel_level_liters = 500.0
+        sim.refuel_generator(200.0)
+        assert gen.fuel_level_liters == 700.0
+        sim.refuel_generator()  # Full tank
+        assert gen.fuel_level_liters == gen.fuel_tank_liters

tests/test_rewards.py ADDED Viewed

	@@ -0,0 +1,650 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tests for the multi-objective reward function.
+Validates:
+  - softplus numerical stability
+  - Individual reward component behavior and bounds
+  - Weight profiles sum to 1.0
+  - Delta-based progress tracking
+  - Action quality heuristics
+  - End-to-end reward computation
+  - Integration with the full environment
+"""
+from __future__ import annotations
+import math
+import pytest
+from dc_ops_env.rewards.reward_function import (
+    RewardComponents,
+    RewardFunction,
+    RewardWeights,
+    WEIGHT_PROFILES,
+    softplus,
+)
+from dc_ops_env.config import (
+    ASHRAE_CLASSES,
+    make_default_datacenter_config,
+)
+from dc_ops_env.simulation.thermal import ThermalSimulation
+from dc_ops_env.simulation.power import PowerSimulation
+from dc_ops_env.simulation.types import UPSMode
+from dc_ops_env.actions.parser import CommandResult
+from dc_ops_env.scenarios.base import ScenarioResult
+# ===========================================================================
+# Helpers
+# ===========================================================================
+def _make_thermal_sim(setpoint_c: float = 20.0) -> ThermalSimulation:
+    """Create a warmed-up thermal simulation with a given CRAC setpoint."""
+    config = make_default_datacenter_config()
+    for zone_cfg in config.zones:
+        for crac_cfg in zone_cfg.crac_units:
+            crac_cfg.initial_setpoint_c = setpoint_c
+    sim = ThermalSimulation(config)
+    # Warmup to reach steady state
+    for _ in range(120):
+        sim.step(1.0)
+    return sim
+def _make_power_sim(
+    utility_available: bool = True,
+) -> PowerSimulation:
+    """Create a power simulation with default config."""
+    config = make_default_datacenter_config()
+    it_load = 160.0  # Default total IT load
+    power_sim = PowerSimulation(config.power, it_load_kw=it_load)
+    if not utility_available:
+        power_sim.set_utility_available(False)
+        # Step a bit so UPS transitions to battery
+        for _ in range(5):
+            power_sim.step(1.0, it_load)
+    return power_sim
+def _ok_cmd(name: str = "check_status") -> CommandResult:
+    return CommandResult(success=True, message="OK", command_name=name)
+def _fail_cmd() -> CommandResult:
+    return CommandResult(success=False, message="Unknown command", command_name="")
+# ===========================================================================
+# softplus Unit Tests
+# ===========================================================================
+class TestSoftplus:
+    """Validate the numerically stable softplus implementation."""
+    def test_softplus_positive(self) -> None:
+        assert softplus(1.0) == pytest.approx(math.log1p(math.exp(1.0)), abs=1e-10)
+    def test_softplus_zero(self) -> None:
+        assert softplus(0.0) == pytest.approx(math.log(2.0), abs=1e-10)
+    def test_softplus_negative(self) -> None:
+        assert softplus(-5.0) == pytest.approx(math.log1p(math.exp(-5.0)), abs=1e-10)
+    def test_softplus_large_positive_clamp(self) -> None:
+        """x > 20 should return x directly (avoid exp overflow)."""
+        assert softplus(25.0) == 25.0
+        assert softplus(100.0) == 100.0
+    def test_softplus_large_negative_clamp(self) -> None:
+        """x < -20 should return 0.0 (avoid underflow noise)."""
+        assert softplus(-25.0) == 0.0
+        assert softplus(-100.0) == 0.0
+    def test_softplus_monotonic(self) -> None:
+        """softplus should be monotonically increasing."""
+        values = [-10, -5, -1, 0, 1, 5, 10, 15]
+        results = [softplus(x) for x in values]
+        for i in range(len(results) - 1):
+            assert results[i] < results[i + 1]
+    def test_softplus_always_nonnegative(self) -> None:
+        for x in [-20, -10, -1, 0, 1, 10]:
+            assert softplus(x) >= 0.0
+# ===========================================================================
+# Weight Profile Tests
+# ===========================================================================
+class TestWeightProfiles:
+    """Validate weight profiles sum to 1.0 and are well-formed."""
+    @pytest.mark.parametrize("profile_name", ["thermal", "power", "default"])
+    def test_weights_sum_to_one(self, profile_name: str) -> None:
+        w = WEIGHT_PROFILES[profile_name]
+        total = (
+            w.thermal_safety + w.power_safety + w.efficiency
+            + w.scenario_progress + w.procedure + w.action_quality
+        )
+        assert total == pytest.approx(1.0, abs=1e-6)
+    @pytest.mark.parametrize("profile_name", ["thermal", "power", "default"])
+    def test_weights_nonnegative(self, profile_name: str) -> None:
+        w = WEIGHT_PROFILES[profile_name]
+        assert w.thermal_safety >= 0
+        assert w.power_safety >= 0
+        assert w.efficiency >= 0
+        assert w.scenario_progress >= 0
+        assert w.procedure >= 0
+        assert w.action_quality >= 0
+    def test_thermal_profile_emphasizes_thermal(self) -> None:
+        w = WEIGHT_PROFILES["thermal"]
+        assert w.thermal_safety >= w.power_safety
+        assert w.thermal_safety >= w.efficiency
+    def test_power_profile_emphasizes_power(self) -> None:
+        w = WEIGHT_PROFILES["power"]
+        assert w.power_safety >= w.thermal_safety
+        assert w.power_safety >= w.efficiency
+    def test_unknown_profile_falls_back_to_default(self) -> None:
+        rf = RewardFunction(scenario_type="unknown_type")
+        # Should use default weights without error
+        thermal_sim = _make_thermal_sim()
+        components = rf.compute(
+            thermal_sim, None, _ok_cmd(), "check_status", ["check_status"], None,
+        )
+        assert isinstance(components, RewardComponents)
+# ===========================================================================
+# Thermal Safety Component Tests
+# ===========================================================================
+class TestThermalSafety:
+    """Validate the dual-softplus thermal safety barrier."""
+    def test_safe_temps_near_zero(self) -> None:
+        """With comfortable temps (20°C setpoint), penalty should be near 0."""
+        thermal_sim = _make_thermal_sim(setpoint_c=20.0)
+        r = RewardFunction._thermal_safety(thermal_sim)
+        # Should be in [-1, 0] and close to 0 for safe temps
+        assert -0.3 <= r <= 0.0
+    def test_returns_negative_or_zero(self) -> None:
+        """Thermal safety should never return positive values."""
+        for sp in [15.0, 20.0, 24.0]:
+            thermal_sim = _make_thermal_sim(setpoint_c=sp)
+            r = RewardFunction._thermal_safety(thermal_sim)
+            assert r <= 0.0
+    def test_higher_setpoint_more_penalty(self) -> None:
+        """Higher setpoints → hotter temps → more penalty."""
+        r_low = RewardFunction._thermal_safety(_make_thermal_sim(15.0))
+        r_high = RewardFunction._thermal_safety(_make_thermal_sim(24.0))
+        # Higher setpoint should yield equal or more negative reward
+        assert r_high <= r_low
+    def test_bounded_to_neg_one(self) -> None:
+        """Even extreme temps should be bounded to [-1, 0] via tanh."""
+        thermal_sim = _make_thermal_sim(setpoint_c=15.0)
+        # Force extreme rack inlet temps
+        for zone in thermal_sim.state.zones:
+            for rack in zone.racks:
+                rack.inlet_temp_c = 50.0
+        r = RewardFunction._thermal_safety(thermal_sim)
+        assert r >= -1.0
+        assert r <= 0.0
+# ===========================================================================
+# Power Safety Component Tests
+# ===========================================================================
+class TestPowerSafety:
+    """Validate UPS battery and fault penalty."""
+    def test_no_power_sim_returns_zero(self) -> None:
+        assert RewardFunction._power_safety(None) == 0.0
+    def test_utility_available_near_zero(self) -> None:
+        """Normal operation (utility available) should have near-zero penalty."""
+        power_sim = _make_power_sim(utility_available=True)
+        r = RewardFunction._power_safety(power_sim)
+        # On utility with full battery → no penalty
+        assert -0.15 <= r <= 0.0
+    def test_on_battery_gives_penalty(self) -> None:
+        """UPS on battery should yield a meaningful penalty."""
+        power_sim = _make_power_sim(utility_available=False)
+        r = RewardFunction._power_safety(power_sim)
+        assert r < 0.0  # Should be negative when on battery
+    def test_low_soc_increases_penalty(self) -> None:
+        """Lower SOC while on battery should increase penalty."""
+        power_sim = _make_power_sim(utility_available=False)
+        # Force low SOC
+        for ups in power_sim.state.ups_units:
+            ups.battery_soc = 0.3
+        r_low = RewardFunction._power_safety(power_sim)
+        power_sim2 = _make_power_sim(utility_available=False)
+        for ups in power_sim2.state.ups_units:
+            ups.battery_soc = 0.8
+        r_high = RewardFunction._power_safety(power_sim2)
+        assert r_low < r_high  # Low SOC → more negative
+    def test_fault_mode_heavy_penalty(self) -> None:
+        """UPS in FAULT mode should yield heavy penalty."""
+        power_sim = _make_power_sim(utility_available=True)
+        for ups in power_sim.state.ups_units:
+            ups.mode = UPSMode.FAULT
+        r = RewardFunction._power_safety(power_sim)
+        assert r < -0.7  # Should be very negative
+    def test_bounded(self) -> None:
+        """Power safety should be in [-1, 0]."""
+        power_sim = _make_power_sim(utility_available=False)
+        for ups in power_sim.state.ups_units:
+            ups.mode = UPSMode.FAULT
+            ups.battery_soc = 0.0
+        r = RewardFunction._power_safety(power_sim)
+        assert -1.0 <= r <= 0.0
+# ===========================================================================
+# Efficiency Component Tests
+# ===========================================================================
+class TestEfficiency:
+    """Validate PUE-based efficiency penalty."""
+    def test_low_pue_near_zero_penalty(self) -> None:
+        """PUE close to 1.0 should yield near-zero penalty."""
+        thermal_sim = _make_thermal_sim(20.0)
+        r = RewardFunction._efficiency(thermal_sim, None)
+        pue = thermal_sim.state.pue
+        # PUE is typically 1.4-1.8 in our sim, so some penalty is expected
+        assert -0.5 <= r <= 0.0
+    def test_returns_negative_or_zero(self) -> None:
+        thermal_sim = _make_thermal_sim(20.0)
+        r = RewardFunction._efficiency(thermal_sim, None)
+        assert r <= 0.0
+    def test_bounded(self) -> None:
+        """Even extreme PUE should be bounded."""
+        thermal_sim = _make_thermal_sim(15.0)
+        # Force extreme PUE by manipulating state
+        thermal_sim.state._pue = 5.0
+        r = RewardFunction._efficiency(thermal_sim, None)
+        assert -1.0 <= r <= 0.0
+# ===========================================================================
+# Scenario Progress Component Tests
+# ===========================================================================
+class TestScenarioProgress:
+    """Validate delta-based progress reward."""
+    def test_no_scenario_returns_zero(self) -> None:
+        rf = RewardFunction()
+        assert rf._scenario_progress(None) == 0.0
+    def test_first_step_progress(self) -> None:
+        """First step with progress > 0 should yield positive delta."""
+        rf = RewardFunction()
+        result = ScenarioResult(progress=0.5)
+        r = rf._scenario_progress(result)
+        assert r == pytest.approx(0.5)
+    def test_delta_tracking(self) -> None:
+        """Only the delta should be rewarded, not cumulative progress."""
+        rf = RewardFunction()
+        r1 = rf._scenario_progress(ScenarioResult(progress=0.3))
+        assert r1 == pytest.approx(0.3)
+        r2 = rf._scenario_progress(ScenarioResult(progress=0.3))
+        assert r2 == pytest.approx(0.0)  # No change → no reward
+        r3 = rf._scenario_progress(ScenarioResult(progress=0.7))
+        assert r3 == pytest.approx(0.4)  # 0.7 - 0.3
+    def test_negative_delta_penalized(self) -> None:
+        """Progress regression should yield negative reward."""
+        rf = RewardFunction()
+        rf._scenario_progress(ScenarioResult(progress=0.8))
+        r = rf._scenario_progress(ScenarioResult(progress=0.5))
+        assert r == pytest.approx(-0.3)
+    def test_bounded(self) -> None:
+        """Progress delta should be clamped to [-1, 1]."""
+        rf = RewardFunction()
+        r = rf._scenario_progress(ScenarioResult(progress=1.0))
+        assert -1.0 <= r <= 1.0
+    def test_reset_clears_state(self) -> None:
+        """reset() should clear the previous progress."""
+        rf = RewardFunction()
+        rf._scenario_progress(ScenarioResult(progress=0.5))
+        rf.reset()
+        r = rf._scenario_progress(ScenarioResult(progress=0.3))
+        assert r == pytest.approx(0.3)  # From 0, not from 0.5
+# ===========================================================================
+# Procedure Component Tests
+# ===========================================================================
+class TestProcedure:
+    """Validate procedural correctness pass-through."""
+    def test_no_scenario_returns_zero(self) -> None:
+        assert RewardFunction._procedure(None) == 0.0
+    def test_positive_procedure_reward(self) -> None:
+        r = RewardFunction._procedure(ScenarioResult(procedure_reward=0.3))
+        assert r == pytest.approx(0.3)
+    def test_negative_procedure_reward(self) -> None:
+        r = RewardFunction._procedure(ScenarioResult(procedure_reward=-0.2))
+        assert r == pytest.approx(-0.2)
+    def test_clamped_to_bounds(self) -> None:
+        r = RewardFunction._procedure(ScenarioResult(procedure_reward=5.0))
+        assert r == 1.0
+        r = RewardFunction._procedure(ScenarioResult(procedure_reward=-5.0))
+        assert r == -1.0
+# ===========================================================================
+# Action Quality Component Tests
+# ===========================================================================
+class TestActionQuality:
+    """Validate contextual action quality assessment."""
+    def test_invalid_command_penalty(self) -> None:
+        thermal_sim = _make_thermal_sim()
+        r = RewardFunction._action_quality(
+            _fail_cmd(), "nonsense", ["nonsense"], thermal_sim, None,
+        )
+        assert r == pytest.approx(-0.5)
+    def test_diagnose_rewarded(self) -> None:
+        thermal_sim = _make_thermal_sim()
+        r = RewardFunction._action_quality(
+            _ok_cmd("diagnose"), "diagnose CRAC-1", ["diagnose CRAC-1"],
+            thermal_sim, None,
+        )
+        assert r == pytest.approx(0.3)
+    def test_check_status_rewarded(self) -> None:
+        thermal_sim = _make_thermal_sim()
+        r = RewardFunction._action_quality(
+            _ok_cmd("check_status"), "check_status", ["check_status"],
+            thermal_sim, None,
+        )
+        assert r == pytest.approx(0.3)
+    def test_intervention_rewarded(self) -> None:
+        thermal_sim = _make_thermal_sim()
+        r = RewardFunction._action_quality(
+            _ok_cmd("adjust_setpoint"), "adjust_setpoint CRAC-1 22",
+            ["adjust_setpoint CRAC-1 22"], thermal_sim, None,
+        )
+        assert r == pytest.approx(0.2)
+    def test_acknowledge_rewarded(self) -> None:
+        thermal_sim = _make_thermal_sim()
+        r = RewardFunction._action_quality(
+            _ok_cmd("acknowledge_alarm"), "acknowledge_alarm",
+            ["acknowledge_alarm"], thermal_sim, None,
+        )
+        assert r == pytest.approx(0.1)
+    def test_repeated_command_penalized(self) -> None:
+        """Repeated non-whitelisted command should be penalized."""
+        thermal_sim = _make_thermal_sim()
+        # Use adjust_setpoint (not whitelisted) instead of check_status
+        history = ["adjust_setpoint CRAC-1 20", "adjust_setpoint CRAC-1 20"]
+        r = RewardFunction._action_quality(
+            _ok_cmd("adjust_setpoint"), "adjust_setpoint CRAC-1 20", history,
+            thermal_sim, None,
+        )
+        assert r == pytest.approx(-0.2)
+    def test_repeated_whitelisted_not_penalized(self) -> None:
+        """Repeated check_status/wait should NOT be penalized."""
+        thermal_sim = _make_thermal_sim()
+        history = ["check_status", "check_status"]
+        r = RewardFunction._action_quality(
+            _ok_cmd("check_status"), "check_status", history,
+            thermal_sim, None,
+        )
+        assert r == pytest.approx(0.3)  # Still gets diagnose/check_status bonus
+    def test_wait_no_concern_neutral(self) -> None:
+        """Waiting when nothing is wrong should be neutral (0.0)."""
+        thermal_sim = _make_thermal_sim(20.0)  # Safe temps
+        r = RewardFunction._action_quality(
+            _ok_cmd("wait"), "wait", ["wait"], thermal_sim, None,
+        )
+        assert r == pytest.approx(0.0)
+    def test_wait_during_concern_penalized(self) -> None:
+        """Waiting during a thermal concern should be penalized."""
+        thermal_sim = _make_thermal_sim(20.0)
+        # Force rack inlet temps above recommended max to create concern
+        for zone in thermal_sim.state.zones:
+            ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
+            if ashrae:
+                for rack in zone.racks:
+                    rack.inlet_temp_c = ashrae.recommended_max_c + 2.0
+        r = RewardFunction._action_quality(
+            _ok_cmd("wait"), "wait", ["wait"], thermal_sim, None,
+        )
+        assert r == pytest.approx(-0.2)
+    def test_wait_during_battery_with_gen_starting(self) -> None:
+        """Waiting while UPS on battery but generator starting is acceptable."""
+        thermal_sim = _make_thermal_sim(20.0)
+        power_sim = _make_power_sim(utility_available=False)
+        # Generator should be in startup sequence (auto-started by ATS)
+        r = RewardFunction._action_quality(
+            _ok_cmd("wait"), "wait", ["wait"], thermal_sim, power_sim,
+        )
+        assert r == pytest.approx(0.1)  # Waiting for gen warmup is reasonable
+    def test_wait_during_thermal_concern_penalized(self) -> None:
+        """Waiting during a thermal concern (no power issue) is penalized."""
+        thermal_sim = _make_thermal_sim(20.0)
+        for zone in thermal_sim.state.zones:
+            ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
+            if ashrae:
+                for rack in zone.racks:
+                    rack.inlet_temp_c = ashrae.recommended_max_c + 2.0
+        r = RewardFunction._action_quality(
+            _ok_cmd("wait"), "wait", ["wait"], thermal_sim, None,
+        )
+        assert r == pytest.approx(-0.2)
+# ===========================================================================
+# Full Compute Tests
+# ===========================================================================
+class TestRewardCompute:
+    """Validate full reward computation."""
+    def test_compute_returns_components(self) -> None:
+        rf = RewardFunction(scenario_type="thermal")
+        thermal_sim = _make_thermal_sim()
+        components = rf.compute(
+            thermal_sim, None, _ok_cmd(), "check_status",
+            ["check_status"], None,
+        )
+        assert isinstance(components, RewardComponents)
+        assert hasattr(components, "total")
+        assert hasattr(components, "thermal_safety")
+    def test_total_bounded(self) -> None:
+        """Total reward should be in [-1, 1]."""
+        rf = RewardFunction(scenario_type="thermal")
+        thermal_sim = _make_thermal_sim()
+        components = rf.compute(
+            thermal_sim, None, _ok_cmd(), "check_status",
+            ["check_status"], None,
+        )
+        assert -1.0 <= components.total <= 1.0
+    def test_total_bounded_worst_case(self) -> None:
+        """Even with all-negative components, total should be >= -1."""
+        rf = RewardFunction(scenario_type="thermal")
+        thermal_sim = _make_thermal_sim()
+        # Force extreme conditions
+        for zone in thermal_sim.state.zones:
+            for rack in zone.racks:
+                rack.inlet_temp_c = 50.0
+        components = rf.compute(
+            thermal_sim, None, _fail_cmd(), "nonsense",
+            ["nonsense"],
+            ScenarioResult(procedure_reward=-1.0, progress=0.0),
+        )
+        assert components.total >= -1.0
+    def test_valid_action_better_than_invalid(self) -> None:
+        """Same conditions, valid action should score higher than invalid."""
+        rf = RewardFunction(scenario_type="default")
+        thermal_sim = _make_thermal_sim()
+        c_valid = rf.compute(
+            thermal_sim, None, _ok_cmd(), "check_status",
+            ["check_status"], None,
+        )
+        rf.reset()
+        c_invalid = rf.compute(
+            thermal_sim, None, _fail_cmd(), "nonsense",
+            ["nonsense"], None,
+        )
+        assert c_valid.total > c_invalid.total
+    def test_progress_delta_affects_total(self) -> None:
+        """Making progress should increase total reward."""
+        rf = RewardFunction(scenario_type="thermal")
+        thermal_sim = _make_thermal_sim()
+        c1 = rf.compute(
+            thermal_sim, None, _ok_cmd("diagnose"), "diagnose CRAC-1",
+            ["diagnose CRAC-1"],
+            ScenarioResult(progress=0.5),
+        )
+        c2 = rf.compute(
+            thermal_sim, None, _ok_cmd("diagnose"), "diagnose CRAC-2",
+            ["diagnose CRAC-1", "diagnose CRAC-2"],
+            ScenarioResult(progress=0.5),  # No change
+        )
+        # Step with progress delta should score higher (all else similar)
+        assert c1.scenario_progress > c2.scenario_progress
+    def test_with_power_sim(self) -> None:
+        """Compute should work with both thermal and power sims."""
+        rf = RewardFunction(scenario_type="power")
+        thermal_sim = _make_thermal_sim()
+        power_sim = _make_power_sim(utility_available=True)
+        components = rf.compute(
+            thermal_sim, power_sim, _ok_cmd(), "check_status",
+            ["check_status"], None,
+        )
+        assert -1.0 <= components.total <= 1.0
+    def test_custom_weights(self) -> None:
+        """Custom weights should override profile."""
+        custom = RewardWeights(
+            thermal_safety=0.0, power_safety=0.0, efficiency=0.0,
+            scenario_progress=0.0, procedure=0.0, action_quality=1.0,
+        )
+        rf = RewardFunction(weights=custom)
+        thermal_sim = _make_thermal_sim()
+        c = rf.compute(
+            thermal_sim, None, _ok_cmd("diagnose"), "diagnose CRAC-1",
+            ["diagnose CRAC-1"], None,
+        )
+        # With only action_quality weighted, total should equal action_quality
+        assert c.total == pytest.approx(c.action_quality, abs=0.01)
+# ===========================================================================
+# Integration with Full Environment
+# ===========================================================================
+class TestRewardIntegration:
+    """Validate reward function works correctly inside the environment.
+    Each test builds its own DcOpsEnvironment and imports the environment
+    and action classes locally inside the test method.
+    """
+    def test_scenario_reward_uses_reward_function(self) -> None:
+        """Environment should use RewardFunction, not old _compute_reward."""
+        from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
+        from dc_ops_env.models import DcOpsAction
+        env = DcOpsEnvironment()
+        env.reset(scenario="A1")  # Cooling setpoint optimization
+        obs = env.step(DcOpsAction(command="check_status"))
+        # Reward should be a float from the new system
+        assert isinstance(obs.reward, float)
+        assert obs.reward != 0.0  # Should have some signal
+    def test_escalation_has_penalty(self) -> None:
+        """Escalation should be penalized relative to a normal action.
+        Two fresh environments are reset to the same scenario so that the
+        first-step rewards of the two commands can be compared directly.
+        """
+        from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
+        from dc_ops_env.models import DcOpsAction
+        # Get reward for a normal first action
+        env1 = DcOpsEnvironment()
+        env1.reset(scenario="A2")
+        obs_normal = env1.step(DcOpsAction(command="check_status"))
+        # Get reward for escalation
+        env2 = DcOpsEnvironment()
+        env2.reset(scenario="A2")
+        obs_esc = env2.step(DcOpsAction(command="escalate"))
+        # Escalating is expected to terminate the episode immediately.
+        assert obs_esc.done is True
+        # Escalation should yield less reward than a check_status
+        assert obs_esc.reward < obs_normal.reward
+    def test_scenario_resolution_has_speed_bonus(self) -> None:
+        """Resolving a scenario early should yield a speed bonus."""
+        from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
+        from dc_ops_env.models import DcOpsAction
+        env = DcOpsEnvironment()
+        env.reset(scenario="B1")  # UPS Alarm Response
+        # Solve B1: diagnose UPS then acknowledge
+        env.step(DcOpsAction(command="diagnose UPS-1"))
+        obs = env.step(DcOpsAction(command="acknowledge_alarm"))
+        # Should be resolved with speed bonus
+        assert obs.done is True
+        # Speed bonus = (budget - steps) / budget = (10 - 2) / 10 = 0.8
+        # Total reward includes base + speed bonus, should be positive
+        assert obs.reward > 0.5
+    def test_reward_function_reset_on_env_reset(self) -> None:
+        """RewardFunction state should reset between episodes.
+        NOTE(review): the only assertion is that the first reward of the
+        second episode is a float; the reset itself is not checked directly.
+        """
+        from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
+        from dc_ops_env.models import DcOpsAction
+        env = DcOpsEnvironment()
+        # Episode 1
+        env.reset(scenario="A1")
+        env.step(DcOpsAction(command="check_status"))
+        # Episode 2 — progress delta should start fresh
+        env.reset(scenario="A1")
+        obs = env.step(DcOpsAction(command="check_status"))
+        assert isinstance(obs.reward, float)

tests/test_scenarios.py ADDED Viewed

	@@ -0,0 +1,415 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tests for the scenario framework.
+Validates:
+  - Scenario registry (registration, lookup, filtering)
+  - Scenario base class (procedure checking)
+  - Each scenario: initialization, fault injection, resolution detection
+  - Scenario integration with the environment
+"""
+from __future__ import annotations
+import pytest
+from dc_ops_env.models import DcOpsAction, DcOpsObservation
+from dc_ops_env.scenarios import (
+    Scenario,
+    ScenarioResult,
+    get_scenario,
+    list_scenarios,
+    random_scenario,
+    registered_scenario_ids,
+)
+from dc_ops_env.scenarios.base import ProcedureRule
+from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
+# ===========================================================================
+# Registry Tests
+# ===========================================================================
+class TestRegistry:
+    """Test scenario registration and lookup."""
+    def test_all_scenarios_registered(self) -> None:
+        ids = registered_scenario_ids()
+        assert "A1" in ids
+        assert "A2" in ids
+        assert "A4" in ids
+        assert "B1" in ids
+        assert "B3" in ids
+        assert "B4" in ids
+    def test_get_scenario_by_id(self) -> None:
+        s = get_scenario("A1")
+        assert s.scenario_id == "A1"
+        assert s.name == "Cooling Setpoint Optimization"
+    def test_get_scenario_unknown_raises(self) -> None:
+        with pytest.raises(KeyError, match="Unknown scenario"):
+            get_scenario("Z99")
+    def test_list_by_type(self) -> None:
+        thermal = list_scenarios(scenario_type="thermal")
+        assert all(s.scenario_type == "thermal" for s in thermal)
+        assert len(thermal) == 3  # A1, A2, A4
+        power = list_scenarios(scenario_type="power")
+        assert all(s.scenario_type == "power" for s in power)
+        assert len(power) == 3  # B1, B3, B4
+    def test_list_by_difficulty(self) -> None:
+        easy = list_scenarios(difficulty="easy")
+        assert all(s.difficulty == "easy" for s in easy)
+        assert len(easy) >= 2  # A1, B3
+        hard = list_scenarios(difficulty="hard")
+        assert all(s.difficulty == "hard" for s in hard)
+        assert len(hard) >= 2  # A4, B4
+    def test_random_scenario(self) -> None:
+        s = random_scenario(seed=42)
+        assert isinstance(s, Scenario)
+    def test_random_scenario_filtered(self) -> None:
+        s = random_scenario(scenario_type="thermal", difficulty="easy", seed=42)
+        assert s.scenario_type == "thermal"
+        assert s.difficulty == "easy"
+    def test_random_scenario_no_match_raises(self) -> None:
+        with pytest.raises(ValueError, match="No scenarios match"):
+            random_scenario(scenario_type="network")
+# ===========================================================================
+# Procedure Checking Tests
+# ===========================================================================
+class TestProcedureChecking:
+    """Test the procedural correctness reward mechanism.
+    check_procedure() is called with the current command and the command
+    history; the sign of the returned value is what these tests inspect.
+    """
+    def test_procedure_bonus_when_satisfied(self) -> None:
+        """A diagnose earlier in the history earns a positive value on adjust."""
+        s = get_scenario("A2")
+        # History has diagnose, then adjust_setpoint
+        history = ["diagnose CRAC-3", "adjust_setpoint CRAC-4 20"]
+        reward = s.check_procedure("adjust_setpoint CRAC-4 20", history)
+        assert reward > 0, f"Expected bonus, got {reward}"
+    def test_procedure_penalty_when_not_satisfied(self) -> None:
+        """Adjusting with no prior diagnose in the history earns a negative value."""
+        s = get_scenario("A2")
+        # No diagnose before adjust_setpoint
+        history = ["adjust_setpoint CRAC-4 20"]
+        reward = s.check_procedure("adjust_setpoint CRAC-4 20", history)
+        assert reward < 0, f"Expected penalty, got {reward}"
+    def test_no_procedure_rules_returns_zero(self) -> None:
+        """A command that triggers no procedure rule should return 0.
+        Despite the test name, the scenario used here (A1) does define rules;
+        the test relies on "wait" not matching any of them.
+        """
+        # No rule-free scenario is constructed; A1 is reused instead.
+        s = get_scenario("A1")  # A1 has rules, but let's test the mechanism
+        reward = s.check_procedure("wait", ["wait"])
+        # "wait" doesn't match any trigger_command, so should be 0
+        assert reward == 0.0
+# ===========================================================================
+# A1: Cooling Setpoint Optimization Tests
+# ===========================================================================
+class TestA1CoolingSetpoint:
+    """Test the A1 scenario lifecycle.
+    Covers the initial observation, the initial PUE reading, and the effect
+    of raising CRAC setpoints on PUE.
+    """
+    def test_initialization(self) -> None:
+        """Reset reports a thermal scenario whose alert mentions setpoint or PUE."""
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="A1")
+        assert obs.scenario_type == "thermal"
+        # "setpoint" is matched case-insensitively; "PUE" is matched as written.
+        assert "setpoint" in obs.alert.lower() or "PUE" in obs.alert
+    def test_initial_pue_is_high(self) -> None:
+        """With 15°C setpoints, PUE should be higher than optimal."""
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="A1")
+        pue = obs.metadata["pue"]
+        # At 15°C setpoints, PUE should be elevated
+        assert pue > 1.5, f"Initial PUE {pue:.2f} should be > 1.5"
+    def test_raising_setpoint_improves_pue(self) -> None:
+        """Raising CRAC setpoints should reduce PUE."""
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="A1")
+        pue_before = obs.metadata["pue"]
+        # Raise all setpoints to 22°C (within ASHRAE A2 recommended)
+        for crac_id in ["CRAC-1", "CRAC-2", "CRAC-3", "CRAC-4"]:
+            env.step(DcOpsAction(command=f"adjust_setpoint {crac_id} 22"))
+        # Wait for thermal convergence
+        # (4 adjust steps + 3 wait steps = 7 environment steps in total).
+        for _ in range(3):
+            obs = env.step(DcOpsAction(command="wait"))
+        pue_after = obs.metadata["pue"]
+        assert pue_after < pue_before, \
+            f"PUE should decrease: {pue_before:.3f} → {pue_after:.3f}"
+# ===========================================================================
+# A2: Thermal Event Response Tests
+# ===========================================================================
+class TestA2ThermalEvent:
+    """Test the A2 scenario lifecycle."""
+    def test_initialization(self) -> None:
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="A2")
+        assert obs.scenario_type == "thermal"
+        assert "CRAC-3" in obs.alert
+    def test_crac_fault_visible_in_dashboard(self) -> None:
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="A2")
+        assert "COMPRESSOR" in obs.dashboard or "FAULT" in obs.dashboard
+    def test_diagnose_reveals_fault(self) -> None:
+        env = DcOpsEnvironment()
+        env.reset(scenario="A2")
+        obs = env.step(DcOpsAction(command="diagnose CRAC-3"))
+        assert "compressor" in obs.action_result.lower()
+        assert "FAULT DETECTED" in obs.action_result
+    def test_procedure_bonus_for_diagnose_first(self) -> None:
+        """Diagnosing before adjusting should yield higher reward."""
+        # Run 1: diagnose first, then adjust
+        env1 = DcOpsEnvironment()
+        env1.reset(scenario="A2")
+        obs1a = env1.step(DcOpsAction(command="diagnose CRAC-3"))
+        obs1b = env1.step(DcOpsAction(command="adjust_setpoint CRAC-4 20"))
+        r_with_diagnose = obs1b.reward
+        # Run 2: adjust without diagnosing
+        env2 = DcOpsEnvironment()
+        env2.reset(scenario="A2")
+        obs2 = env2.step(DcOpsAction(command="adjust_setpoint CRAC-4 20"))
+        r_without_diagnose = obs2.reward
+        assert r_with_diagnose > r_without_diagnose, \
+            f"Diagnose-first should yield higher reward: {r_with_diagnose:.3f} vs {r_without_diagnose:.3f}"
+# ===========================================================================
+# A4: CRAC Failure Cascade Tests
+# ===========================================================================
+class TestA4CRACCascade:
+    """Test the A4 scenario lifecycle.
+    Covers the initial alert, the two faulted CRAC units, the per-step game
+    time, and how A4 compares with A2.
+    """
+    def test_initialization(self) -> None:
+        """Reset reports a thermal scenario whose alert names CRAC-1 and CRAC-3."""
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="A4")
+        assert obs.scenario_type == "thermal"
+        assert "CRAC-1" in obs.alert
+        assert "CRAC-3" in obs.alert
+    def test_two_cracs_faulted(self) -> None:
+        """Diagnose output mentions a compressor for CRAC-1 and a fan for CRAC-3."""
+        env = DcOpsEnvironment()
+        env.reset(scenario="A4")
+        obs1 = env.step(DcOpsAction(command="diagnose CRAC-1"))
+        assert "compressor" in obs1.action_result.lower()
+        obs3 = env.step(DcOpsAction(command="diagnose CRAC-3"))
+        assert "fan" in obs3.action_result.lower()
+    def test_cascade_has_faster_time(self) -> None:
+        """A4 uses 30s per step (urgent scenario)."""
+        s = get_scenario("A4")
+        assert s.game_time_per_step_s == 30.0
+    def test_harder_than_a2(self) -> None:
+        """A4 should have a step budget at least as large as A2 and be "hard"."""
+        a2 = get_scenario("A2")
+        a4 = get_scenario("A4")
+        # Equality is accepted: the comparison is >=, not a strict >.
+        assert a4.step_budget >= a2.step_budget
+        assert a4.difficulty == "hard"
+# ===========================================================================
+# B1: UPS Alarm Response Tests
+# ===========================================================================
+class TestB1UPSAlarm:
+    """Test the B1 scenario lifecycle.
+    Covers the initial alert, the initial battery state of charge, and the
+    two-step resolution sequence.
+    """
+    def test_initialization(self) -> None:
+        """Reset reports a power scenario whose alert mentions the UPS."""
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="B1")
+        assert obs.scenario_type == "power"
+        assert "UPS" in obs.alert
+    def test_battery_partially_drained(self) -> None:
+        """UPS battery should be partially drained (brief outage)."""
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="B1")
+        # The state of charge is compared against 1.0 as the "full" value.
+        ups_soc = obs.metadata["power"]["UPS-1"]["battery_soc"]
+        assert ups_soc < 1.0, f"Battery should be partially drained, SOC={ups_soc}"
+    def test_resolution_requires_diagnose_and_ack(self) -> None:
+        """B1 resolves when agent diagnoses UPS AND acknowledges alarm."""
+        env = DcOpsEnvironment()
+        env.reset(scenario="B1")
+        obs = env.step(DcOpsAction(command="diagnose UPS-1"))
+        assert obs.done is False  # Not resolved yet
+        obs = env.step(DcOpsAction(command="acknowledge_alarm"))
+        assert obs.done is True  # Now resolved
+# ===========================================================================
+# B3: Generator Test Protocol Tests
+# ===========================================================================
+class TestB3GeneratorTest:
+    """Test the B3 scenario lifecycle.
+    Covers the initial alert, a full command sequence that ends the episode,
+    and the per-step game time.
+    """
+    def test_initialization(self) -> None:
+        """Reset reports a power scenario whose alert mentions the generator."""
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="B3")
+        assert obs.scenario_type == "power"
+        assert "generator" in obs.alert.lower()
+    def test_correct_protocol_resolves(self) -> None:
+        """Following correct protocol should resolve the scenario.
+        The sequence is: check status, start the generator, wait twice,
+        diagnose GEN-1, stop the generator, acknowledge the alarm.
+        """
+        env = DcOpsEnvironment()
+        env.reset(scenario="B3")
+        env.step(DcOpsAction(command="check_status"))
+        env.step(DcOpsAction(command="start_generator"))
+        # Wait for generator to start up
+        env.step(DcOpsAction(command="wait"))
+        env.step(DcOpsAction(command="wait"))
+        env.step(DcOpsAction(command="diagnose GEN-1"))
+        env.step(DcOpsAction(command="stop_generator"))
+        # Only the final observation is inspected.
+        obs = env.step(DcOpsAction(command="acknowledge_alarm"))
+        assert obs.done is True
+    def test_uses_30s_steps(self) -> None:
+        """B3 advances 30 seconds of game time per step."""
+        s = get_scenario("B3")
+        assert s.game_time_per_step_s == 30.0
+# ===========================================================================
+# B4: Power Failure Cascade Tests
+# ===========================================================================
+class TestB4PowerCascade:
+    """Test the B4 scenario lifecycle.
+    Covers the initial alert, utility availability, the UPS operating mode,
+    and the scenario's timing and difficulty settings.
+    """
+    def test_initialization(self) -> None:
+        """Reset reports a power scenario whose alert mentions utility or power."""
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="B4")
+        assert obs.scenario_type == "power"
+        assert "utility" in obs.alert.lower() or "power" in obs.alert.lower()
+    def test_utility_is_down(self) -> None:
+        """The initial observation reports the utility feed as unavailable."""
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="B4")
+        assert obs.metadata["power"]["utility_available"] is False
+    def test_ups_on_battery(self) -> None:
+        """UPS should be on battery after utility loss.
+        NOTE(review): "double_conversion" is accepted as well as "on_battery",
+        so this does not strictly require battery mode; confirm the intent.
+        """
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="B4")
+        # After warmup + fault injection, UPS should be on battery
+        ups_mode = obs.metadata["power"]["UPS-1"]["mode"]
+        assert ups_mode in ("on_battery", "double_conversion"), f"UPS mode: {ups_mode}"
+    def test_fast_time_progression(self) -> None:
+        """B4 advances 15 seconds of game time per step and is rated hard."""
+        s = get_scenario("B4")
+        assert s.game_time_per_step_s == 15.0
+        assert s.difficulty == "hard"
+# ===========================================================================
+# Environment Scenario Integration Tests
+# ===========================================================================
+class TestScenarioIntegration:
+    """Test scenario integration with the environment.
+    Covers the ways a scenario can be passed to reset(), step-budget
+    handling, episode termination on resolution, and random selection.
+    """
+    def test_scenario_by_id_string(self) -> None:
+        """reset() accepts a scenario ID string and reports it in metadata."""
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="A1")
+        assert obs.metadata["scenario"]["id"] == "A1"
+    def test_scenario_by_instance(self) -> None:
+        """reset() accepts a scenario object and reports its ID in metadata."""
+        env = DcOpsEnvironment()
+        s = get_scenario("B3")
+        obs = env.reset(scenario=s)
+        assert obs.metadata["scenario"]["id"] == "B3"
+    def test_scenario_step_budget_used(self) -> None:
+        """Without an override, steps_remaining starts at the scenario budget."""
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="A1")
+        assert obs.steps_remaining == 10  # A1 budget
+    def test_scenario_kwargs_override(self) -> None:
+        """Explicit kwargs should override scenario defaults."""
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario="A1", step_budget=5)
+        assert obs.steps_remaining == 5
+    def test_no_scenario_backward_compat(self) -> None:
+        """Environment should work without a scenario (backward compat)."""
+        env = DcOpsEnvironment()
+        obs = env.reset()
+        # With no scenario there is no "scenario" metadata key and the type is empty.
+        assert "scenario" not in obs.metadata
+        assert obs.scenario_type == ""
+    def test_scenario_resolution_ends_episode(self) -> None:
+        """When scenario is resolved, episode should end with done=True."""
+        env = DcOpsEnvironment()
+        env.reset(scenario="B1")
+        # Resolve B1: diagnose + acknowledge
+        env.step(DcOpsAction(command="diagnose UPS-1"))
+        obs = env.step(DcOpsAction(command="acknowledge_alarm"))
+        assert obs.done is True
+    def test_speed_bonus_on_resolution(self) -> None:
+        """Resolving early should give a speed bonus."""
+        env = DcOpsEnvironment()
+        env.reset(scenario="B1")  # Budget: 10
+        env.step(DcOpsAction(command="diagnose UPS-1"))  # Step 1
+        obs = env.step(DcOpsAction(command="acknowledge_alarm"))  # Step 2
+        # Speed bonus = (10 - 2) / 10 = 0.8
+        # Total reward should include this bonus
+        assert obs.reward > 0.5, f"Expected speed bonus in reward, got {obs.reward:.3f}"
+    def test_random_scenario_via_reset(self) -> None:
+        """reset(random_scenario=True) should pick a random scenario."""
+        env = DcOpsEnvironment()
+        obs = env.reset(random_scenario=True, seed=42)
+        # Whichever scenario is picked, its ID must be a registered one.
+        assert "scenario" in obs.metadata
+        assert obs.metadata["scenario"]["id"] in registered_scenario_ids()
+# ===========================================================================
+# All Scenarios Smoke Test
+# ===========================================================================
+class TestAllScenariosSmoke:
+    """Smoke test: every scenario can initialize and run 3 steps."""
+    # registered_scenario_ids() is called when this class body is executed,
+    # producing one test case per ID it returns at that moment.
+    @pytest.mark.parametrize("scenario_id", registered_scenario_ids())
+    def test_scenario_runs(self, scenario_id: str) -> None:
+        """Reset into the given scenario, then issue three "wait" commands."""
+        env = DcOpsEnvironment()
+        obs = env.reset(scenario=scenario_id)
+        assert isinstance(obs, DcOpsObservation)
+        assert obs.done is False
+        # The rendered dashboard must be longer than 100 characters.
+        assert len(obs.dashboard) > 100
+        # Run 3 steps
+        for _ in range(3):
+            obs = env.step(DcOpsAction(command="wait"))
+            assert isinstance(obs, DcOpsObservation)

tests/test_thermal.py ADDED Viewed

	@@ -0,0 +1,499 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Physics validation tests for the thermal simulation.
+These tests verify that the simulation produces physically plausible behavior:
+1. Steady-state temperatures are in expected ranges
+2. CRAC failure causes predictable temperature rise rates
+3. Total cooling loss leads to thermal runaway at ~5°C/min
+4. Setpoint changes propagate with correct time constants
+5. Energy conservation holds
+6. PUE is in realistic range
+7. Recirculation raises cold aisle temperature
+8. Performance target: 1000 steps < 1 second
+"""
+import time
+import pytest
+from dc_ops_env.config import (
+    ASHRAE_CLASSES,
+    CRACConfig,
+    DatacenterConfig,
+    RackConfig,
+    ZoneConfig,
+    make_default_datacenter_config,
+)
+from dc_ops_env.simulation.thermal import ThermalSimulation
+from dc_ops_env.simulation.types import CRACFaultType, CRACStatus
+@pytest.fixture
+def default_sim() -> ThermalSimulation:
+    """Default datacenter: 2 zones × 10 racks × 2 CRACs, 160 kW total IT."""
+    # No config is passed, so ThermalSimulation falls back to its own default.
+    return ThermalSimulation()
+@pytest.fixture
+def single_zone_sim() -> ThermalSimulation:
+    """Minimal single-zone facility for isolated testing."""
+    racks = [
+        RackConfig(rack_id=f"A-{i:02d}", row="A", position=i, it_load_kw=8.0)
+        for i in range(1, 6)  # 5 racks × 8 kW = 40 kW IT
+    ]
+    cracs = [
+        CRACConfig(unit_id="CRAC-1", rated_capacity_kw=70.0),
+    ]
+    config = DatacenterConfig(
+        name="Test Single Zone",
+        zones=[
+            ZoneConfig(
+                zone_id="zone_a",
+                racks=racks,
+                crac_units=cracs,
+                air_volume_m3=300.0,
+                recirculation_factor=0.05,
+            )
+        ],
+        outside_temp_c=35.0,
+        floor_area_m2=300.0,
+    )
+    return ThermalSimulation(config)
+class TestSteadyState:
+    """Test that the simulation converges to physically plausible steady state.
+    Each test first advances the default simulation by a fixed number of
+    steps and then inspects the per-zone or per-rack state.
+    """
+    def test_cold_aisle_in_ashrae_range(self, default_sim: ThermalSimulation):
+        """Cold aisle should be near the zone's ASHRAE recommended range.
+        A tolerance of 2.0°C is allowed beyond each end of the recommended
+        range for the zone's own ASHRAE class.
+        """
+        # Run 600 steps (10 minutes) to ensure steady state
+        default_sim.step_n(600)
+        for zone in default_sim.state.zones:
+            ashrae = ASHRAE_CLASSES[zone.ashrae_class]
+            assert zone.cold_aisle_temp_c >= ashrae.recommended_min_c - 2.0, (
+                f"Zone {zone.zone_id}: cold aisle {zone.cold_aisle_temp_c:.1f}°C "
+                f"below ASHRAE min {ashrae.recommended_min_c}°C"
+            )
+            assert zone.cold_aisle_temp_c <= ashrae.recommended_max_c + 2.0, (
+                f"Zone {zone.zone_id}: cold aisle {zone.cold_aisle_temp_c:.1f}°C "
+                f"above ASHRAE max {ashrae.recommended_max_c}°C"
+            )
+    def test_hot_aisle_warmer_than_cold(self, default_sim: ThermalSimulation):
+        """Hot aisle must always be warmer than cold aisle."""
+        default_sim.step_n(300)
+        for zone in default_sim.state.zones:
+            assert zone.hot_aisle_temp_c > zone.cold_aisle_temp_c, (
+                f"Zone {zone.zone_id}: hot aisle {zone.hot_aisle_temp_c:.1f}°C "
+                f"not warmer than cold aisle {zone.cold_aisle_temp_c:.1f}°C"
+            )
+    def test_hot_cold_delta_reasonable(self, default_sim: ThermalSimulation):
+        """Temperature delta across racks should be plausible for standard density.
+        10-20°C is the typical expectation; the assertion accepts the wider
+        open interval 5-25°C.
+        """
+        # At 8 kW/rack with ~160 CFM/kW airflow, ΔT ≈ 8000 / (0.605 × 1005) ≈ 13°C
+        default_sim.step_n(300)
+        for zone in default_sim.state.zones:
+            delta = zone.hot_aisle_temp_c - zone.cold_aisle_temp_c
+            assert 5.0 < delta < 25.0, (
+                f"Zone {zone.zone_id}: ΔT = {delta:.1f}°C outside expected range 5-25°C"
+            )
+    def test_pue_realistic(self, default_sim: ThermalSimulation):
+        """PUE should be in realistic range (1.1 - 2.0) at steady state."""
+        default_sim.step_n(300)
+        pue = default_sim.state.pue
+        assert 1.1 <= pue <= 2.0, f"PUE {pue:.2f} outside realistic range 1.1-2.0"
+    def test_rack_inlet_equals_cold_aisle(self, default_sim: ThermalSimulation):
+        """All rack inlets in a zone should equal the zone cold aisle temp."""
+        default_sim.step_n(300)
+        for zone in default_sim.state.zones:
+            for rack in zone.racks:
+                # "Equal" here means within 0.01°C.
+                assert abs(rack.inlet_temp_c - zone.cold_aisle_temp_c) < 0.01, (
+                    f"Rack {rack.rack_id}: inlet {rack.inlet_temp_c:.2f}°C "
+                    f"!= zone cold {zone.cold_aisle_temp_c:.2f}°C"
+                )
+    def test_rack_outlet_consistent_with_load(self, default_sim: ThermalSimulation):
+        """Rack outlet temp should be consistent with Q = m_dot × c_p × ΔT."""
+        from dc_ops_env.config import AIR_DENSITY_KG_M3, AIR_SPECIFIC_HEAT_J_KGK
+        default_sim.step_n(300)
+        for zone in default_sim.state.zones:
+            for rack in zone.racks:
+                # Mass flow from volumetric airflow, then ΔT = Q / (m_dot × c_p)
+                # with the rack load converted from kW to W.
+                m_dot = rack.airflow_m3s * AIR_DENSITY_KG_M3
+                expected_dt = (rack.it_load_kw * 1000.0) / (m_dot * AIR_SPECIFIC_HEAT_J_KGK)
+                actual_dt = rack.outlet_temp_c - rack.inlet_temp_c
+                assert abs(actual_dt - expected_dt) < 0.1, (
+                    f"Rack {rack.rack_id}: ΔT {actual_dt:.2f}°C vs expected {expected_dt:.2f}°C"
+                )
+class TestCRACFailure:
+    """Test thermal response to CRAC unit failures.
+    Each test settles the default simulation for 300 steps before injecting
+    a compressor fault on CRAC-1.
+    """
+    def test_single_crac_failure_temp_rises(self, default_sim: ThermalSimulation):
+        """Losing 1 of 2 CRACs should cause temperature increase.
+        With N+1 cooling provisioning (2 CRACs for 80 kW IT load, each
+        rated at 70 kW), losing one CRAC means the faulted unit's fans
+        still run but blow unconditioned air (at return temp), actively
+        warming the cold aisle. Temperature should rise noticeably.
+        """
+        # Settle first
+        default_sim.step_n(300)
+        temp_before = default_sim.state.zones[0].cold_aisle_temp_c
+        # Fail one CRAC in zone A
+        default_sim.inject_crac_fault("CRAC-1", CRACFaultType.COMPRESSOR)
+        # Run 10 minutes (600 steps at dt=1s) — longer for N+1 systems
+        default_sim.step_n(600)
+        temp_after = default_sim.state.zones[0].cold_aisle_temp_c
+        # "Noticeably" is quantified as a rise of more than 0.5°C.
+        assert temp_after > temp_before + 0.5, (
+            f"Temperature should rise after CRAC failure: {temp_before:.1f} → {temp_after:.1f}°C"
+        )
+    def test_single_crac_failure_other_zone_unaffected(self, default_sim: ThermalSimulation):
+        """CRAC failure in zone A should not directly affect zone B."""
+        default_sim.step_n(300)
+        # zones[1] is read as zone B; the fault is injected on CRAC-1.
+        temp_b_before = default_sim.state.zones[1].cold_aisle_temp_c
+        default_sim.inject_crac_fault("CRAC-1", CRACFaultType.COMPRESSOR)
+        default_sim.step_n(300)
+        temp_b_after = default_sim.state.zones[1].cold_aisle_temp_c
+        # Zone B has its own CRACs, so temp should be nearly unchanged
+        # (small change possible due to shared outside temp / lighting)
+        assert abs(temp_b_after - temp_b_before) < 2.0, (
+            f"Zone B temp changed too much: {temp_b_before:.1f} → {temp_b_after:.1f}°C"
+        )
+    def test_crac_recovery(self, default_sim: ThermalSimulation):
+        """Clearing a CRAC fault should allow temperature to recover."""
+        default_sim.step_n(300)
+        default_sim.inject_crac_fault("CRAC-1", CRACFaultType.COMPRESSOR)
+        default_sim.step_n(600)  # Let temp rise for 10 min
+        temp_during_fault = default_sim.state.zones[0].cold_aisle_temp_c
+        default_sim.clear_crac_fault("CRAC-1")
+        default_sim.step_n(600)  # Give time to recover
+        temp_recovered = default_sim.state.zones[0].cold_aisle_temp_c
+        # Recovery is quantified as a drop of more than 0.3°C from the faulted reading.
+        assert temp_recovered < temp_during_fault - 0.3, (
+            f"Temperature should drop after fault cleared: "
+            f"{temp_during_fault:.1f} → {temp_recovered:.1f}°C"
+        )
+class TestTotalCoolingLoss:
+    """Test behavior when all cooling is lost."""
+    def test_temp_rise_rate_approximately_5c_per_minute(self, single_zone_sim: ThermalSimulation):
+        """With all cooling off, temperature should rise ~5°C/min.
+        Reference: Active Power WP-105, Electronics Cooling literature.
+        At standard IT densities, initial rate is ~5°C/min or more.
+        For our config: 40 kW IT in a zone with ~5 × 20 × 11.1 kJ/K = 1110 kJ/K
+        thermal mass (equipment) + ~360 kJ/K air ≈ 1470 kJ/K total.
+        dT/dt = Q_net / C = 40,000 W / 1,470,000 J/K ≈ 0.027 °C/s ≈ 1.6 °C/min
+        With envelope heat gain at 35°C outside, the actual rate will be slightly
+        higher. For a smaller zone with 5 racks, the rate is ~1.6°C/min.
+        For higher-density or lower-mass zones it can reach 5°C/min.
+        """
+        single_zone_sim.step_n(300)  # Settle
+        temp_before = single_zone_sim.state.zones[0].cold_aisle_temp_c
+        # Kill all cooling
+        single_zone_sim.inject_crac_fault("CRAC-1", CRACFaultType.COMPRESSOR)
+        # Run 2 minutes
+        single_zone_sim.step_n(120)
+        temp_after = single_zone_sim.state.zones[0].cold_aisle_temp_c
+        rise_rate_per_min = (temp_after - temp_before) / 2.0  # °C/min
+        # Accept 0.5 - 8 °C/min depending on thermal mass
+        assert rise_rate_per_min > 0.5, (
+            f"Temperature rise too slow: {rise_rate_per_min:.2f} °C/min"
+        )
+        assert rise_rate_per_min < 8.0, (
+            f"Temperature rise too fast: {rise_rate_per_min:.2f} °C/min"
+        )
+    def test_reaches_critical_in_reasonable_time(self, single_zone_sim: ThermalSimulation):
+        """With all cooling off, should reach ASHRAE allowable max within ~10-20 min."""
+        single_zone_sim.step_n(300)  # Settle
+        single_zone_sim.inject_crac_fault("CRAC-1", CRACFaultType.COMPRESSOR)
+        ashrae = ASHRAE_CLASSES["A2"]
+        max_temp = ashrae.allowable_max_c  # 35°C
+        # Run up to 30 minutes (1800 steps)
+        reached_critical = False
+        for step in range(1800):
+            single_zone_sim.step()
+            if single_zone_sim.state.zones[0].cold_aisle_temp_c > max_temp:
+                reached_critical = True
+                time_to_critical_min = (step + 1) / 60.0
+                break
+        assert reached_critical, (
+            f"Never reached {max_temp}°C in 30 min. "
+            f"Final temp: {single_zone_sim.state.zones[0].cold_aisle_temp_c:.1f}°C"
+        )
+        assert time_to_critical_min < 25.0, (
+            f"Took {time_to_critical_min:.1f} min to reach critical — too slow"
+        )
+class TestSetpointChanges:
+    """Test CRAC setpoint change dynamics.
+    Each test settles the single-zone simulation for 300 steps, changes the
+    CRAC-1 setpoint, and checks the resulting temperature response.
+    """
+    def test_setpoint_increase_raises_cold_aisle(self, single_zone_sim: ThermalSimulation):
+        """Raising CRAC setpoint should raise cold aisle temperature."""
+        single_zone_sim.step_n(300)
+        temp_before = single_zone_sim.state.zones[0].cold_aisle_temp_c
+        # Raise setpoint by 5°C
+        # NOTE(review): "by 5°C" assumes the default setpoint is 18°C — confirm.
+        single_zone_sim.set_crac_setpoint("CRAC-1", 23.0)
+        single_zone_sim.step_n(300)
+        temp_after = single_zone_sim.state.zones[0].cold_aisle_temp_c
+        assert temp_after > temp_before + 2.0, (
+            f"Cold aisle should rise with higher setpoint: {temp_before:.1f} → {temp_after:.1f}°C"
+        )
+    def test_setpoint_decrease_lowers_cold_aisle(self, single_zone_sim: ThermalSimulation):
+        """Lowering CRAC setpoint should lower cold aisle temperature."""
+        single_zone_sim.step_n(300)
+        temp_before = single_zone_sim.state.zones[0].cold_aisle_temp_c
+        single_zone_sim.set_crac_setpoint("CRAC-1", 14.0)
+        single_zone_sim.step_n(300)
+        temp_after = single_zone_sim.state.zones[0].cold_aisle_temp_c
+        # The required drop (more than 1.0°C) is smaller than the rise required above.
+        assert temp_after < temp_before - 1.0, (
+            f"Cold aisle should drop with lower setpoint: {temp_before:.1f} → {temp_after:.1f}°C"
+        )
+    def test_supply_temp_lag(self, single_zone_sim: ThermalSimulation):
+        """Supply temp should lag setpoint with ~30s time constant."""
+        single_zone_sim.step_n(300)
+        # NOTE(review): this reference is taken before stepping and read again
+        # afterwards; that assumes the CRAC state object is updated in place.
+        crac = single_zone_sim.state.zones[0].crac_units[0]
+        old_supply = crac.supply_temp_c
+        # Step change in setpoint
+        single_zone_sim.set_crac_setpoint("CRAC-1", old_supply + 10.0)
+        # After 1 time constant (30s), should be ~63% of the way there
+        single_zone_sim.step_n(30)
+        expected_63pct = old_supply + 10.0 * 0.632
+        actual = crac.supply_temp_c
+        # Allow ±1.5°C tolerance
+        assert abs(actual - expected_63pct) < 1.5, (
+            f"After 1τ, supply temp {actual:.1f}°C, expected ~{expected_63pct:.1f}°C"
+        )
+class TestRecirculation:
+    """Test hot-air recirculation effects."""
+    def test_higher_recirculation_raises_cold_aisle(self):
+        """Higher recirculation factor should result in warmer cold aisle."""
+        configs = []
+        for r in [0.0, 0.15, 0.30]:
+            racks = [RackConfig(rack_id=f"A-{i}", row="A", position=i) for i in range(1, 6)]
+            cracs = [CRACConfig(unit_id="CRAC-1")]
+            cfg = DatacenterConfig(
+                zones=[ZoneConfig(
+                    zone_id="zone_a", racks=racks, crac_units=cracs,
+                    recirculation_factor=r, air_volume_m3=300.0,
+                )],
+                floor_area_m2=300.0,
+            )
+            configs.append(cfg)
+        temps = []
+        for cfg in configs:
+            sim = ThermalSimulation(cfg)
+            sim.step_n(600)
+            temps.append(sim.state.zones[0].cold_aisle_temp_c)
+        # Each higher recirculation factor should produce a warmer cold aisle
+        assert temps[1] > temps[0], (
+            f"r=0.15 ({temps[1]:.1f}°C) should be warmer than r=0.0 ({temps[0]:.1f}°C)"
+        )
+        assert temps[2] > temps[1], (
+            f"r=0.30 ({temps[2]:.1f}°C) should be warmer than r=0.15 ({temps[1]:.1f}°C)"
+        )
+class TestFanSpeedEffects:
+    """Test fan speed control on cooling and power."""
+    def test_reduced_fan_speed_raises_temp(self, single_zone_sim: ThermalSimulation):
+        """Reducing fan speed should reduce airflow and raise temperatures.
+        At 50% fan speed, CRAC airflow drops to 50% but cooling injection
+        rate (m_dot × c_p × ΔT) drops proportionally, shifting the
+        equilibrium cold aisle temp upward. With a well-provisioned CRAC
+        the shift is modest (~0.5-1.5°C).
+        """
+        single_zone_sim.step_n(300)
+        temp_before = single_zone_sim.state.zones[0].cold_aisle_temp_c
+        single_zone_sim.set_crac_fan_speed("CRAC-1", 50.0)
+        single_zone_sim.step_n(600)  # More time to reach new equilibrium
+        temp_after = single_zone_sim.state.zones[0].cold_aisle_temp_c
+        assert temp_after > temp_before + 0.3, (
+            f"Reduced fan speed should raise temp: {temp_before:.1f} → {temp_after:.1f}°C"
+        )
+    def test_fan_power_cubic_law(self, single_zone_sim: ThermalSimulation):
+        """Fan power should follow cubic law: P ∝ speed³."""
+        crac = single_zone_sim.state.zones[0].crac_units[0]
+        rated_power = crac.fan_rated_power_kw
+        # At 50% speed, power should be 0.5³ = 0.125 of rated
+        crac.fan_speed_pct = 50.0
+        # Fan power is part of compute_power_consumption, but we can test the formula
+        expected_fan_power = rated_power * (0.5 ** 3)
+        actual_fan_power = rated_power * (crac.fan_speed_pct / 100.0) ** 3
+        assert abs(actual_fan_power - expected_fan_power) < 0.01
+class TestOutsideTemperature:
+    """Test outside temperature effects."""
+    def test_hotter_outside_increases_cooling_power(self):
+        """Higher outside temp should degrade COP and increase cooling power."""
+        temps = [20.0, 35.0, 45.0]
+        cooling_powers = []
+        for t_out in temps:
+            racks = [RackConfig(rack_id=f"A-{i}", row="A", position=i) for i in range(1, 6)]
+            cracs = [CRACConfig(unit_id="CRAC-1")]
+            cfg = DatacenterConfig(
+                zones=[ZoneConfig(
+                    zone_id="zone_a", racks=racks, crac_units=cracs,
+                    air_volume_m3=300.0,
+                )],
+                outside_temp_c=t_out,
+                floor_area_m2=300.0,
+            )
+            sim = ThermalSimulation(cfg)
+            sim.step_n(600)
+            cooling_powers.append(sim.state.total_cooling_power_kw)
+        # Higher outside temp → higher cooling power (degraded COP)
+        assert cooling_powers[1] > cooling_powers[0], (
+            f"Cooling power at 35°C ({cooling_powers[1]:.1f} kW) should exceed "
+            f"at 20°C ({cooling_powers[0]:.1f} kW)"
+        )
+        assert cooling_powers[2] > cooling_powers[1], (
+            f"Cooling power at 45°C ({cooling_powers[2]:.1f} kW) should exceed "
+            f"at 35°C ({cooling_powers[1]:.1f} kW)"
+        )
+class TestEnergyConservation:
+    """Test that energy bookkeeping is consistent."""
+    def test_energy_positive(self, default_sim: ThermalSimulation):
+        """Energy consumed per step should always be positive."""
+        for _ in range(100):
+            result = default_sim.step()
+            assert result.energy_consumed_kwh > 0, "Energy per step must be positive"
+    def test_cooling_output_matches_heat_at_steady_state(
+        self, default_sim: ThermalSimulation
+    ):
+        """At thermal equilibrium, CRAC extraction ≈ IT load + overhead.
+        The CRAC-extracted heat includes bypass airflow effects (cold air
+        that bypasses servers and returns to CRACs at T_cold instead of T_hot).
+        Total extraction should reasonably cover IT load plus internal gains.
+        """
+        default_sim.step_n(600)
+        result = default_sim.step()
+        total_it_kw = default_sim.state.total_it_load_kw
+        q_cooling = result.total_cooling_output_kw
+        # With bypass-corrected model, CRAC extraction ≈ IT load plus
+        # overhead (UPS/PDU/lighting losses + envelope gain ≈ 10-20% of IT)
+        ratio = q_cooling / total_it_kw if total_it_kw > 0 else 0
+        assert 0.5 < ratio < 2.0, (
+            f"Cooling/IT ratio {ratio:.2f} outside plausible range. "
+            f"Cooling: {q_cooling:.1f} kW, IT: {total_it_kw:.1f} kW"
+        )
+class TestPerformance:
+    """Test simulation speed meets target: <1ms per step."""
+    def test_1000_steps_under_1_second(self, default_sim: ThermalSimulation):
+        """1000 steps should complete in under 1 second for a 20-rack DC."""
+        start = time.perf_counter()
+        default_sim.step_n(1000)
+        elapsed = time.perf_counter() - start
+        assert elapsed < 1.0, (
+            f"1000 steps took {elapsed:.3f}s — exceeds 1s target"
+        )
+        # Report throughput
+        steps_per_sec = 1000.0 / elapsed
+        print(f"\nPerformance: {steps_per_sec:.0f} steps/sec ({elapsed*1000:.1f} ms for 1000 steps)")
+class TestMutationHelpers:
+    """Test that mutation helpers work correctly."""
+    def test_set_crac_setpoint(self, default_sim: ThermalSimulation):
+        assert default_sim.set_crac_setpoint("CRAC-1", 22.0)
+        crac = default_sim._find_crac("CRAC-1")
+        assert crac is not None
+        assert crac.setpoint_c == 22.0
+    def test_set_invalid_crac(self, default_sim: ThermalSimulation):
+        assert not default_sim.set_crac_setpoint("CRAC-99", 22.0)
+    def test_set_fan_speed_clamped(self, default_sim: ThermalSimulation):
+        assert default_sim.set_crac_fan_speed("CRAC-1", 150.0)
+        crac = default_sim._find_crac("CRAC-1")
+        assert crac is not None
+        assert crac.fan_speed_pct == 100.0
+    def test_inject_and_clear_fault(self, default_sim: ThermalSimulation):
+        assert default_sim.inject_crac_fault("CRAC-2", CRACFaultType.FAN)
+        crac = default_sim._find_crac("CRAC-2")
+        assert crac is not None
+        assert crac.status == CRACStatus.FAULT
+        assert crac.fault_type == CRACFaultType.FAN
+        assert crac.current_airflow_m3s == 0.0
+        assert default_sim.clear_crac_fault("CRAC-2")
+        assert crac.status == CRACStatus.RUNNING
+        assert crac.fault_type == CRACFaultType.NONE
+    def test_set_rack_load(self, default_sim: ThermalSimulation):
+        assert default_sim.set_rack_load("A-01", 12.0)
+        rack = default_sim._find_rack("A-01")
+        assert rack is not None
+        assert rack.it_load_kw == 12.0
+        assert rack.airflow_m3s > 0  # Airflow updated proportionally

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff