Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- Dockerfile +82 -0
- README.md +494 -5
- __init__.py +16 -0
- actions/__init__.py +11 -0
- actions/parser.py +356 -0
- client.py +69 -0
- config.py +549 -0
- data/datacenter_configs/default.yaml +101 -0
- data/datacenter_configs/large_facility.yaml +157 -0
- data/datacenter_configs/small_facility.yaml +67 -0
- models.py +83 -0
- openenv.yaml +7 -0
- openenv_dc_ops_env.egg-info/PKG-INFO +9 -0
- openenv_dc_ops_env.egg-info/SOURCES.txt +20 -0
- openenv_dc_ops_env.egg-info/dependency_links.txt +1 -0
- openenv_dc_ops_env.egg-info/entry_points.txt +2 -0
- openenv_dc_ops_env.egg-info/requires.txt +5 -0
- openenv_dc_ops_env.egg-info/top_level.txt +1 -0
- pyproject.toml +50 -0
- rendering/__init__.py +11 -0
- rendering/dashboard.py +262 -0
- rewards/__init__.py +23 -0
- rewards/reward_function.py +428 -0
- scenarios/__init__.py +31 -0
- scenarios/base.py +195 -0
- scenarios/power_scenarios.py +496 -0
- scenarios/registry.py +81 -0
- scenarios/thermal_scenarios.py +443 -0
- server/__init__.py +11 -0
- server/app.py +101 -0
- server/dc_ops_env_environment.py +532 -0
- server/requirements.txt +6 -0
- server/static/index.html +911 -0
- simulation/__init__.py +49 -0
- simulation/power.py +668 -0
- simulation/thermal.py +515 -0
- simulation/types.py +598 -0
- tests/__init__.py +0 -0
- tests/test_environment.py +439 -0
- tests/test_integration.py +535 -0
- tests/test_power.py +743 -0
- tests/test_rewards.py +650 -0
- tests/test_scenarios.py +415 -0
- tests/test_thermal.py +499 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# Multi-stage build using openenv-base
|
| 8 |
+
# This Dockerfile is flexible and works for both:
|
| 9 |
+
# - In-repo environments (with local OpenEnv sources)
|
| 10 |
+
# - Standalone environments (with openenv from PyPI/Git)
|
| 11 |
+
# The build script (openenv build) handles context detection and sets appropriate build args.
|
| 12 |
+
|
| 13 |
+
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
|
| 14 |
+
FROM ${BASE_IMAGE} AS builder
|
| 15 |
+
|
| 16 |
+
WORKDIR /app
|
| 17 |
+
|
| 18 |
+
# Ensure git is available (required for installing dependencies from VCS)
|
| 19 |
+
RUN apt-get update && \
|
| 20 |
+
apt-get install -y --no-install-recommends git && \
|
| 21 |
+
rm -rf /var/lib/apt/lists/*
|
| 22 |
+
|
| 23 |
+
# Build argument to control whether we're building standalone or in-repo
|
| 24 |
+
ARG BUILD_MODE=in-repo
|
| 25 |
+
ARG ENV_NAME=dc_ops_env
|
| 26 |
+
|
| 27 |
+
# Copy environment code (always at root of build context)
|
| 28 |
+
COPY . /app/env
|
| 29 |
+
|
| 30 |
+
# For in-repo builds, openenv is already vendored in the build context
|
| 31 |
+
# For standalone builds, openenv will be installed via pyproject.toml
|
| 32 |
+
WORKDIR /app/env
|
| 33 |
+
|
| 34 |
+
# Ensure uv is available (for local builds where base image lacks it)
|
| 35 |
+
RUN if ! command -v uv >/dev/null 2>&1; then \
|
| 36 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh && \
|
| 37 |
+
mv /root/.local/bin/uv /usr/local/bin/uv && \
|
| 38 |
+
mv /root/.local/bin/uvx /usr/local/bin/uvx; \
|
| 39 |
+
fi
|
| 40 |
+
|
| 41 |
+
# Install dependencies using uv sync
|
| 42 |
+
# If uv.lock exists, use it; otherwise resolve on the fly
|
| 43 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 44 |
+
if [ -f uv.lock ]; then \
|
| 45 |
+
uv sync --frozen --no-install-project --no-editable; \
|
| 46 |
+
else \
|
| 47 |
+
uv sync --no-install-project --no-editable; \
|
| 48 |
+
fi
|
| 49 |
+
|
| 50 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 51 |
+
if [ -f uv.lock ]; then \
|
| 52 |
+
uv sync --frozen --no-editable; \
|
| 53 |
+
else \
|
| 54 |
+
uv sync --no-editable; \
|
| 55 |
+
fi
|
| 56 |
+
|
| 57 |
+
# Final runtime stage
|
| 58 |
+
FROM ${BASE_IMAGE}
|
| 59 |
+
|
| 60 |
+
WORKDIR /app
|
| 61 |
+
|
| 62 |
+
# Copy the virtual environment from builder
|
| 63 |
+
COPY --from=builder /app/env/.venv /app/.venv
|
| 64 |
+
|
| 65 |
+
# Copy the environment code — directory name MUST match package name
|
| 66 |
+
# so that relative imports (from ..config, from ..simulation, etc.) resolve correctly
|
| 67 |
+
COPY --from=builder /app/env /app/dc_ops_env
|
| 68 |
+
|
| 69 |
+
# Set PATH to use the virtual environment
|
| 70 |
+
ENV PATH="/app/.venv/bin:$PATH"
|
| 71 |
+
|
| 72 |
+
# Set PYTHONPATH so dc_ops_env is discoverable as a proper package
|
| 73 |
+
ENV PYTHONPATH="/app:$PYTHONPATH"
|
| 74 |
+
|
| 75 |
+
# Health check
|
| 76 |
+
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
|
| 77 |
+
CMD curl -f http://localhost:8000/health || exit 1
|
| 78 |
+
|
| 79 |
+
# Run the FastAPI server using the venv's Python to ensure correct dependencies
|
| 80 |
+
# Fully-qualified module path ensures dc_ops_env is the top-level package
|
| 81 |
+
ENV ENABLE_WEB_INTERFACE=true
|
| 82 |
+
CMD ["/app/.venv/bin/python", "-m", "uvicorn", "dc_ops_env.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
README.md
CHANGED
|
@@ -1,10 +1,499 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: DC-Ops Environment Server
|
| 3 |
+
emoji: 🖥️
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
app_port: 8000
|
| 9 |
+
base_path: /web
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
+
- reinforcement-learning
|
| 13 |
+
- datacenter
|
| 14 |
+
- simulation
|
| 15 |
---
|
| 16 |
|
| 17 |
+
# DC-Ops Environment
|
| 18 |
+
|
| 19 |
+
A physics-based datacenter operations environment for training LLM agents, built on Meta's [OpenEnv](https://github.com/meta-pytorch/OpenEnv) framework.
|
| 20 |
+
|
| 21 |
+
The agent reads a text-based NOC dashboard and issues natural-language operator commands — exactly as a human datacenter operator would.
|
| 22 |
+
|
| 23 |
+
## Quick Start
|
| 24 |
+
|
| 25 |
+
### Prerequisites
|
| 26 |
+
|
| 27 |
+
- Python 3.10+
|
| 28 |
+
- [uv](https://docs.astral.sh/uv/) (recommended) or pip
|
| 29 |
+
- Docker (for containerized deployment)
|
| 30 |
+
|
| 31 |
+
### Install & Run Locally
|
| 32 |
+
|
| 33 |
+
```bash
|
| 34 |
+
# Clone the repository
|
| 35 |
+
git clone <repo-url>
|
| 36 |
+
cd dc_ops_env
|
| 37 |
+
|
| 38 |
+
# Install dependencies
|
| 39 |
+
uv sync
|
| 40 |
+
|
| 41 |
+
# Run the test suite (256 tests, <10s)
|
| 42 |
+
uv run pytest tests/ -v
|
| 43 |
+
|
| 44 |
+
# Start the server
|
| 45 |
+
uv run server
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
The server starts at `http://localhost:8000` with:
|
| 49 |
+
- **Web UI** → `http://localhost:8000/web`
|
| 50 |
+
- **API docs** → `http://localhost:8000/docs`
|
| 51 |
+
- **Health check** → `http://localhost:8000/health`
|
| 52 |
+
- **WebSocket** → `ws://localhost:8000/ws`
|
| 53 |
+
|
| 54 |
+
### Run with Docker
|
| 55 |
+
|
| 56 |
+
```bash
|
| 57 |
+
# Build the image
|
| 58 |
+
docker build -t dc-ops:latest -f server/Dockerfile .
|
| 59 |
+
|
| 60 |
+
# Run the container
|
| 61 |
+
docker run -d -p 8000:8000 dc-ops:latest
|
| 62 |
+
|
| 63 |
+
# Verify it's running
|
| 64 |
+
curl http://localhost:8000/health
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
---
|
| 68 |
+
|
| 69 |
+
## OpenEnv Integration
|
| 70 |
+
|
| 71 |
+
DC-Ops is a fully compliant [OpenEnv](https://github.com/meta-pytorch/OpenEnv) environment. OpenEnv provides:
|
| 72 |
+
- **MCP tool-based interactions** for LLM agents (WebSocket `/ws`)
|
| 73 |
+
- **HTTP orchestration layer** for training pipelines (`/reset`, `/step`, `/state`)
|
| 74 |
+
- **HuggingFace Spaces deployment** via `openenv push`
|
| 75 |
+
- **TRL/GRPO integration** for RL training with `GRPOTrainer`
|
| 76 |
+
|
| 77 |
+
### Action & Observation Models
|
| 78 |
+
|
| 79 |
+
**DcOpsAction** — the agent's command:
|
| 80 |
+
```python
|
| 81 |
+
class DcOpsAction(Action):
|
| 82 |
+
command: str # e.g., "diagnose CRAC-3", "adjust_setpoint CRAC-1 20"
|
| 83 |
+
reasoning: str # Optional chain-of-thought
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
**DcOpsObservation** — what the agent sees:
|
| 87 |
+
```python
|
| 88 |
+
class DcOpsObservation(Observation):
|
| 89 |
+
dashboard: str # Text-rendered monitoring dashboard
|
| 90 |
+
available_actions: list # Valid commands the agent can issue
|
| 91 |
+
alert: str # Current active alert message
|
| 92 |
+
scenario_type: str # "thermal", "power", etc.
|
| 93 |
+
steps_remaining: int # Steps left in episode budget
|
| 94 |
+
action_result: str # Feedback from last action
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
### Available Commands
|
| 98 |
+
|
| 99 |
+
| Command | Format | Description |
|
| 100 |
+
|---------|--------|-------------|
|
| 101 |
+
| `diagnose` | `diagnose <unit_id>` | Inspect a CRAC/UPS/PDU for faults |
|
| 102 |
+
| `adjust_setpoint` | `adjust_setpoint <crac_id> <temp_c>` | Change CRAC supply air setpoint |
|
| 103 |
+
| `set_fan_speed` | `set_fan_speed <crac_id> <pct>` | Set CRAC fan speed (0-100%) |
|
| 104 |
+
| `set_rack_load` | `set_rack_load <rack_id> <kw>` | Adjust rack IT load (migrate workload) |
|
| 105 |
+
| `start_crac` | `start_crac <crac_id>` | Start a standby CRAC unit |
|
| 106 |
+
| `stop_crac` | `stop_crac <crac_id>` | Put a CRAC into standby |
|
| 107 |
+
| `start_generator` | `start_generator` | Manually start the diesel generator |
|
| 108 |
+
| `stop_generator` | `stop_generator` | Initiate generator cooldown |
|
| 109 |
+
| `set_ups_mode` | `set_ups_mode <ups_id> <mode>` | Set UPS mode (eco/double_conversion/bypass) |
|
| 110 |
+
| `refuel_generator` | `refuel_generator [liters]` | Refuel (default: full tank) |
|
| 111 |
+
| `acknowledge_alarm` | `acknowledge_alarm` | Acknowledge current alert |
|
| 112 |
+
| `check_status` | `check_status` | Request full status report |
|
| 113 |
+
| `escalate` | `escalate` | Escalate to senior engineer |
|
| 114 |
+
| `wait` | `wait` | Take no action this step |
|
| 115 |
+
|
| 116 |
+
---
|
| 117 |
+
|
| 118 |
+
## Using the Client
|
| 119 |
+
|
| 120 |
+
### Programmatic Usage (Python)
|
| 121 |
+
|
| 122 |
+
```python
|
| 123 |
+
from dc_ops_env import DcOpsAction, DcOpsEnv
|
| 124 |
+
|
| 125 |
+
# Connect to a running server
|
| 126 |
+
async with DcOpsEnv(base_url="http://localhost:8000") as env:
|
| 127 |
+
# Reset with a specific scenario
|
| 128 |
+
result = await env.reset(scenario="A2")
|
| 129 |
+
print(result.observation.dashboard)
|
| 130 |
+
|
| 131 |
+
# Agent loop
|
| 132 |
+
while not result.done:
|
| 133 |
+
result = await env.step(
|
| 134 |
+
DcOpsAction(
|
| 135 |
+
command="diagnose CRAC-3",
|
| 136 |
+
reasoning="CRAC-3 shows compressor failure, need to investigate"
|
| 137 |
+
)
|
| 138 |
+
)
|
| 139 |
+
print(f"Reward: {result.reward}")
|
| 140 |
+
print(result.observation.dashboard)
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
### From Docker Image
|
| 144 |
+
|
| 145 |
+
```python
|
| 146 |
+
from dc_ops_env import DcOpsAction, DcOpsEnv
|
| 147 |
+
|
| 148 |
+
# Start environment from Docker (auto-manages container lifecycle)
|
| 149 |
+
env = DcOpsEnv.from_docker_image("dc-ops:latest")
|
| 150 |
+
|
| 151 |
+
try:
|
| 152 |
+
result = env.reset(scenario="A2")
|
| 153 |
+
for _ in range(15):
|
| 154 |
+
result = env.step(DcOpsAction(command="check_status"))
|
| 155 |
+
if result.done:
|
| 156 |
+
break
|
| 157 |
+
finally:
|
| 158 |
+
env.close()
|
| 159 |
+
```
|
| 160 |
+
|
| 161 |
+
### Concurrent Sessions
|
| 162 |
+
|
| 163 |
+
The server supports multiple concurrent WebSocket sessions for parallel training:
|
| 164 |
+
|
| 165 |
+
```python
|
| 166 |
+
# In server/app.py — adjust max_concurrent_envs
|
| 167 |
+
app = create_app(
|
| 168 |
+
DcOpsEnvironment,
|
| 169 |
+
DcOpsAction,
|
| 170 |
+
DcOpsObservation,
|
| 171 |
+
max_concurrent_envs=16, # Scale up for parallel RL
|
| 172 |
+
)
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
```python
|
| 176 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 177 |
+
from dc_ops_env import DcOpsAction, DcOpsEnv
|
| 178 |
+
|
| 179 |
+
def run_episode(scenario_id: str):
|
| 180 |
+
with DcOpsEnv(base_url="http://localhost:8000") as env:
|
| 181 |
+
result = env.reset(scenario=scenario_id)
|
| 182 |
+
total_reward = 0.0
|
| 183 |
+
while not result.done:
|
| 184 |
+
result = env.step(DcOpsAction(command="check_status"))
|
| 185 |
+
total_reward += result.reward
|
| 186 |
+
return scenario_id, total_reward
|
| 187 |
+
|
| 188 |
+
# Run 8 episodes concurrently
|
| 189 |
+
scenarios = ["A1", "A2", "A4", "B1", "B3", "B4", "A2", "B4"]
|
| 190 |
+
with ThreadPoolExecutor(max_workers=8) as executor:
|
| 191 |
+
results = list(executor.map(run_episode, scenarios))
|
| 192 |
+
```
|
| 193 |
+
|
| 194 |
+
---
|
| 195 |
+
|
| 196 |
+
## Scenarios
|
| 197 |
+
|
| 198 |
+
6 operational scenarios across 3 difficulty levels:
|
| 199 |
+
|
| 200 |
+
| ID | Scenario | Difficulty | Type | Fault |
|
| 201 |
+
|----|----------|------------|------|-------|
|
| 202 |
+
| A1 | Cooling Setpoint Optimization | Easy | Thermal | CRACs at 15°C (wasteful) |
|
| 203 |
+
| A2 | Thermal Event Response | Medium | Thermal | CRAC-3 compressor failure |
|
| 204 |
+
| A4 | CRAC Failure Cascade | Hard | Thermal | CRAC-1 compressor + CRAC-3 fan |
|
| 205 |
+
| B1 | UPS Alarm Response | Medium | Power | UPS transferred to battery |
|
| 206 |
+
| B3 | Generator Test Protocol | Easy | Power | None (routine test) |
|
| 207 |
+
| B4 | Power Failure Cascade | Hard | Power | Utility loss + extended gen warmup |
|
| 208 |
+
|
| 209 |
+
Reset with a specific scenario:
|
| 210 |
+
```python
|
| 211 |
+
result = env.reset(scenario="A2") # By ID
|
| 212 |
+
result = env.reset(random_scenario=True) # Random
|
| 213 |
+
result = env.reset(random_scenario=True, difficulty="hard") # Random hard
|
| 214 |
+
```
|
| 215 |
+
|
| 216 |
+
---
|
| 217 |
+
|
| 218 |
+
## Configuration
|
| 219 |
+
|
| 220 |
+
### Built-in Facility Configs
|
| 221 |
+
|
| 222 |
+
Three YAML configurations are included:
|
| 223 |
+
|
| 224 |
+
| Config | Zones | Racks | IT Load | CRACs | Use Case |
|
| 225 |
+
|--------|-------|-------|---------|-------|----------|
|
| 226 |
+
| `default` | 2 | 20 | 160 kW | 4 × 70 kW | Standard facility |
|
| 227 |
+
| `small` | 1 | 10 | 80 kW | 2 × 70 kW | Edge / branch office |
|
| 228 |
+
| `large` | 4 | 60 | 600 kW | 8 × 100 kW | Multi-zone + GPU (H1) |
|
| 229 |
+
|
| 230 |
+
```python
|
| 231 |
+
from dc_ops_env.config import load_datacenter_config
|
| 232 |
+
|
| 233 |
+
# Load a built-in config
|
| 234 |
+
config = load_datacenter_config("small")
|
| 235 |
+
|
| 236 |
+
# Load a custom YAML file
|
| 237 |
+
config = load_datacenter_config("/path/to/my_datacenter.yaml")
|
| 238 |
+
|
| 239 |
+
# Use with environment
|
| 240 |
+
result = env.reset(scenario="A2", config=config)
|
| 241 |
+
```
|
| 242 |
+
|
| 243 |
+
### Custom YAML Configuration
|
| 244 |
+
|
| 245 |
+
Create your own datacenter layout:
|
| 246 |
+
|
| 247 |
+
```yaml
|
| 248 |
+
name: "My Custom Facility"
|
| 249 |
+
outside_temp_c: 35.0
|
| 250 |
+
outside_humidity_rh: 0.40
|
| 251 |
+
simulation_dt_s: 1.0
|
| 252 |
+
|
| 253 |
+
zones:
|
| 254 |
+
- zone_id: zone_a
|
| 255 |
+
containment_type: cold_aisle
|
| 256 |
+
recirculation_factor: 0.08
|
| 257 |
+
air_volume_m3: 500.0
|
| 258 |
+
envelope_r_kw: 0.02
|
| 259 |
+
initial_cold_aisle_temp_c: 20.0
|
| 260 |
+
ashrae_class: A2
|
| 261 |
+
racks:
|
| 262 |
+
- { rack_id: A-01, row: A, position: 1, it_load_kw: 8.0,
|
| 263 |
+
num_servers_2u: 20, server_thermal_mass_jk: 11100.0,
|
| 264 |
+
airflow_cfm_per_kw: 160.0 }
|
| 265 |
+
# ... more racks
|
| 266 |
+
crac_units:
|
| 267 |
+
- { unit_id: CRAC-1, rated_capacity_kw: 70.0,
|
| 268 |
+
rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03,
|
| 269 |
+
max_airflow_cfm: 12000.0, fan_rated_power_kw: 5.0,
|
| 270 |
+
cop_rated: 3.5, initial_setpoint_c: 18.0,
|
| 271 |
+
initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
|
| 272 |
+
|
| 273 |
+
power:
|
| 274 |
+
utility_voltage_v: 480.0
|
| 275 |
+
utility_available: true
|
| 276 |
+
ups_units:
|
| 277 |
+
- { unit_id: UPS-1, rated_capacity_kw: 500.0,
|
| 278 |
+
loss_c0: 0.013, loss_c1: 0.006, loss_c2: 0.011,
|
| 279 |
+
battery_capacity_kwh: 8.3, battery_discharge_efficiency: 0.90,
|
| 280 |
+
battery_aging_factor: 0.85, recharge_rate_kw: 5.0,
|
| 281 |
+
initial_mode: double_conversion }
|
| 282 |
+
pdus:
|
| 283 |
+
- { pdu_id: PDU-A-01, voltage_ll_v: 208.0,
|
| 284 |
+
max_current_per_phase_a: 24.0, num_phases: 3,
|
| 285 |
+
efficiency: 0.98, continuous_derating: 0.80 }
|
| 286 |
+
generator:
|
| 287 |
+
gen_id: GEN-1
|
| 288 |
+
rated_capacity_kw: 750.0
|
| 289 |
+
start_delay_s: 4.0
|
| 290 |
+
crank_time_s: 5.0
|
| 291 |
+
warmup_time_s: 8.0
|
| 292 |
+
fuel_tank_liters: 2000.0
|
| 293 |
+
consumption_lph_full: 180.0
|
| 294 |
+
cooldown_time_s: 300.0
|
| 295 |
+
ats:
|
| 296 |
+
ats_id: ATS-1
|
| 297 |
+
transfer_time_ms: 100.0
|
| 298 |
+
retransfer_delay_s: 300.0
|
| 299 |
+
```
|
| 300 |
+
|
| 301 |
+
See [data/datacenter_configs/](data/datacenter_configs/) for complete examples.
|
| 302 |
+
|
| 303 |
+
---
|
| 304 |
+
|
| 305 |
+
## TRL / GRPO Training Integration
|
| 306 |
+
|
| 307 |
+
DC-Ops integrates directly with HuggingFace TRL's `GRPOTrainer` via the OpenEnv `environment_factory` pattern:
|
| 308 |
+
|
| 309 |
+
```python
|
| 310 |
+
from trl import GRPOTrainer, GRPOConfig
|
| 311 |
+
from dc_ops_env import DcOpsAction, DcOpsEnv
|
| 312 |
+
|
| 313 |
+
def dc_ops_environment_factory():
|
| 314 |
+
"""Factory that returns a DC-Ops environment instance."""
|
| 315 |
+
env = DcOpsEnv(base_url="http://localhost:8000")
|
| 316 |
+
return env
|
| 317 |
+
|
| 318 |
+
config = GRPOConfig(
|
| 319 |
+
model_name_or_path="your-base-model",
|
| 320 |
+
# ... training hyperparameters
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
trainer = GRPOTrainer(
|
| 324 |
+
args=config,
|
| 325 |
+
environment_factory=dc_ops_environment_factory,
|
| 326 |
+
# ... other args
|
| 327 |
+
)
|
| 328 |
+
|
| 329 |
+
trainer.train()
|
| 330 |
+
```
|
| 331 |
+
|
| 332 |
+
For multi-environment parallel training, run multiple servers or increase `max_concurrent_envs` and spawn concurrent clients.
|
| 333 |
+
|
| 334 |
+
---
|
| 335 |
+
|
| 336 |
+
## Deploy to HuggingFace Spaces
|
| 337 |
+
|
| 338 |
+
### Using OpenEnv CLI
|
| 339 |
+
|
| 340 |
+
The simplest way to deploy:
|
| 341 |
+
|
| 342 |
+
```bash
|
| 343 |
+
# From the dc_ops_env/ directory (where openenv.yaml is located)
|
| 344 |
+
cd dc_ops_env
|
| 345 |
+
|
| 346 |
+
# Login to HuggingFace (if not already)
|
| 347 |
+
huggingface-cli login
|
| 348 |
+
|
| 349 |
+
# Push to HuggingFace Spaces
|
| 350 |
+
openenv push
|
| 351 |
+
|
| 352 |
+
# Or with options
|
| 353 |
+
openenv push --repo-id your-username/dc-ops-env --private
|
| 354 |
+
openenv push --namespace your-org
|
| 355 |
+
```
|
| 356 |
+
|
| 357 |
+
### What Gets Deployed
|
| 358 |
+
|
| 359 |
+
The `openenv push` command:
|
| 360 |
+
1. Validates the `openenv.yaml` manifest
|
| 361 |
+
2. Builds a Docker Space on HuggingFace
|
| 362 |
+
3. Uploads all environment code
|
| 363 |
+
|
| 364 |
+
Your deployed Space will be available at:
|
| 365 |
+
`https://huggingface.co/spaces/<repo-id>`
|
| 366 |
+
|
| 367 |
+
The Space includes:
|
| 368 |
+
- **Web Interface** at `/web` — Interactive scenario browser and dashboard viewer
|
| 369 |
+
- **API Documentation** at `/docs` — Full OpenAPI/Swagger interface
|
| 370 |
+
- **Health Check** at `/health` — Container health monitoring
|
| 371 |
+
- **WebSocket** at `/ws` — Persistent session endpoint for agent connections
|
| 372 |
+
|
| 373 |
+
### Connecting to a Deployed Space
|
| 374 |
+
|
| 375 |
+
```python
|
| 376 |
+
from dc_ops_env import DcOpsAction, DcOpsEnv
|
| 377 |
+
|
| 378 |
+
# Connect to your HuggingFace Space
|
| 379 |
+
space_url = "https://your-username-dc-ops-env.hf.space"
|
| 380 |
+
|
| 381 |
+
async with DcOpsEnv(base_url=space_url) as env:
|
| 382 |
+
result = await env.reset(scenario="A2")
|
| 383 |
+
print(result.observation.dashboard)
|
| 384 |
+
```
|
| 385 |
+
|
| 386 |
+
### CLI Options
|
| 387 |
+
|
| 388 |
+
| Option | Description |
|
| 389 |
+
|--------|-------------|
|
| 390 |
+
| `--directory`, `-d` | Directory containing the OpenEnv environment (default: current) |
|
| 391 |
+
| `--repo-id`, `-r` | Repository ID `username/repo-name` (default: from openenv.yaml) |
|
| 392 |
+
| `--base-image`, `-b` | Override base Docker image |
|
| 393 |
+
| `--private` | Deploy as a private Space |
|
| 394 |
+
| `--namespace` | HuggingFace namespace (user or org) |
|
| 395 |
+
|
| 396 |
+
---
|
| 397 |
+
|
| 398 |
+
## Development
|
| 399 |
+
|
| 400 |
+
### Running Tests
|
| 401 |
+
|
| 402 |
+
```bash
|
| 403 |
+
# All tests (256 tests)
|
| 404 |
+
uv run pytest tests/ -v
|
| 405 |
+
|
| 406 |
+
# Specific test modules
|
| 407 |
+
uv run pytest tests/test_thermal.py -v # Thermal physics
|
| 408 |
+
uv run pytest tests/test_power.py -v # Power systems
|
| 409 |
+
uv run pytest tests/test_environment.py -v    # Environment
|
| 410 |
+
uv run pytest tests/test_rewards.py -v # Reward function
|
| 411 |
+
uv run pytest tests/test_scenarios.py -v # Scenario framework
|
| 412 |
+
uv run pytest tests/test_integration.py -v # End-to-end episodes
|
| 413 |
+
|
| 414 |
+
# With coverage
|
| 415 |
+
uv run pytest tests/ --cov=dc_ops_env --cov-report=term-missing
|
| 416 |
+
```
|
| 417 |
+
|
| 418 |
+
### Direct Environment Testing (No Server)
|
| 419 |
+
|
| 420 |
+
Test the environment logic without the HTTP/WebSocket layer:
|
| 421 |
+
|
| 422 |
+
```python
|
| 423 |
+
from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
|
| 424 |
+
from dc_ops_env.models import DcOpsAction
|
| 425 |
+
|
| 426 |
+
env = DcOpsEnvironment()
|
| 427 |
+
obs = env.reset(scenario="A2")
|
| 428 |
+
print(obs.dashboard)
|
| 429 |
+
|
| 430 |
+
obs = env.step(DcOpsAction(command="diagnose CRAC-3"))
|
| 431 |
+
print(f"Reward: {obs.reward}")
|
| 432 |
+
print(obs.dashboard)
|
| 433 |
+
```
|
| 434 |
+
|
| 435 |
+
### Running the Server Locally
|
| 436 |
+
|
| 437 |
+
```bash
|
| 438 |
+
# Via entry point (recommended)
|
| 439 |
+
uv run server
|
| 440 |
+
|
| 441 |
+
# With custom port
|
| 442 |
+
uv run server --port 8001
|
| 443 |
+
|
| 444 |
+
# Via uvicorn directly (with auto-reload for development)
|
| 445 |
+
uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
|
| 446 |
+
|
| 447 |
+
# Production (multi-worker)
|
| 448 |
+
uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
|
| 449 |
+
```
|
| 450 |
+
|
| 451 |
+
---
|
| 452 |
+
|
| 453 |
+
## Project Structure
|
| 454 |
+
|
| 455 |
+
```
|
| 456 |
+
dc_ops_env/
|
| 457 |
+
├── openenv.yaml # OpenEnv manifest
|
| 458 |
+
├── pyproject.toml # Dependencies and metadata
|
| 459 |
+
├── README.md # This file (HF Space README)
|
| 460 |
+
├── __init__.py # Exports: DcOpsEnv, DcOpsAction, DcOpsObservation
|
| 461 |
+
├── config.py # Physical constants, ASHRAE limits, YAML loader
|
| 462 |
+
├── models.py # Pydantic Action/Observation models
|
| 463 |
+
├── client.py # DcOpsEnv (EnvClient subclass)
|
| 464 |
+
├── simulation/
|
| 465 |
+
│ ├── thermal.py # RC thermal network (zones, racks, CRACs)
|
| 466 |
+
│ ├── power.py # UPS, PDU, generator, ATS models
|
| 467 |
+
│ └── types.py # Runtime state dataclasses
|
| 468 |
+
├── scenarios/
|
| 469 |
+
│ ├── base.py # Abstract Scenario + ProcedureRule
|
| 470 |
+
│ ├── registry.py # Scenario registration and selection
|
| 471 |
+
│ ├── thermal_scenarios.py # A1, A2, A4
|
| 472 |
+
│ └── power_scenarios.py # B1, B3, B4
|
| 473 |
+
├── rewards/
|
| 474 |
+
│ └── reward_function.py # 6-component composite reward
|
| 475 |
+
├── rendering/
|
| 476 |
+
│ └── dashboard.py # State → text dashboard
|
| 477 |
+
├── actions/
|
| 478 |
+
│ └── parser.py # Deterministic command parser
|
| 479 |
+
├── server/
|
| 480 |
+
│ ├── dc_ops_env_environment.py # OpenEnv Environment implementation
|
| 481 |
+
│ ├── app.py # FastAPI application
|
| 482 |
+
│ └── Dockerfile # Container image
|
| 483 |
+
├── data/
|
| 484 |
+
│ └── datacenter_configs/ # YAML facility definitions
|
| 485 |
+
│ ├── default.yaml # 2 zones, 20 racks, 160 kW
|
| 486 |
+
│ ├── small_facility.yaml # 1 zone, 10 racks, 80 kW
|
| 487 |
+
│ └── large_facility.yaml # 4 zones, 60 racks, 600 kW
|
| 488 |
+
└── tests/ # 256 tests across 6 modules
|
| 489 |
+
├── test_thermal.py
|
| 490 |
+
├── test_power.py
|
| 491 |
+
├── test_environment.py
|
| 492 |
+
├── test_rewards.py
|
| 493 |
+
├── test_scenarios.py
|
| 494 |
+
└── test_integration.py
|
| 495 |
+
```
|
| 496 |
+
|
| 497 |
+
## License
|
| 498 |
+
|
| 499 |
+
BSD-style license. See [LICENSE](../LICENSE) for details.
|
__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Dc Ops Env Environment."""
|
| 8 |
+
|
| 9 |
+
from .client import DcOpsEnv
|
| 10 |
+
from .models import DcOpsAction, DcOpsObservation
|
| 11 |
+
|
| 12 |
+
__all__ = [
|
| 13 |
+
"DcOpsAction",
|
| 14 |
+
"DcOpsObservation",
|
| 15 |
+
"DcOpsEnv",
|
| 16 |
+
]
|
actions/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Action parsing for the DC-Ops environment."""
|
| 8 |
+
|
| 9 |
+
from .parser import parse_command
|
| 10 |
+
|
| 11 |
+
__all__ = ["parse_command"]
|
actions/parser.py
ADDED
|
@@ -0,0 +1,356 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Deterministic action parser for operator commands.
|
| 9 |
+
|
| 10 |
+
Parses natural-language commands from the LLM agent into simulation mutations.
|
| 11 |
+
Uses regex matching for speed and testability — no LLM-in-the-loop.
|
| 12 |
+
|
| 13 |
+
Command format: command_name [target] [value]
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
import re
|
| 19 |
+
from dataclasses import dataclass
|
| 20 |
+
from typing import Any
|
| 21 |
+
|
| 22 |
+
from ..simulation.thermal import ThermalSimulation
|
| 23 |
+
from ..simulation.power import PowerSimulation
|
| 24 |
+
from ..simulation.types import (
|
| 25 |
+
CRACFaultType,
|
| 26 |
+
CRACStatus,
|
| 27 |
+
UPSMode,
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@dataclass
class CommandResult:
    """Outcome handed back to the caller after a command has been processed.

    Handlers in this module build instances positionally as
    ``(success, message, command_name, target)``, so the field order is part
    of the interface.
    """

    # Whether the command succeeded.
    success: bool
    # Human-readable feedback text.
    message: str
    # Name of the command that produced this result; empty when not supplied.
    command_name: str = ""
    # Identifier of the unit the command addressed; empty when not supplied.
    target: str = ""
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
# Available commands for the agent
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
# Help text shown to the agent, one entry per supported command.
# Keep these descriptions in sync with the handlers below: `diagnose` only
# knows about CRAC and UPS units, and `set_ups_mode` accepts the four modes
# listed in `_handle_set_ups_mode`.
AVAILABLE_ACTIONS: list[str] = [
    "diagnose <unit_id> — Inspect a CRAC/UPS for faults",
    "adjust_setpoint <crac_id> <temp_c> — Change CRAC supply air setpoint",
    "set_fan_speed <crac_id> <pct> — Set CRAC fan speed (0-100%)",
    "set_rack_load <rack_id> <kw> — Adjust rack IT load (migrate workload)",
    "start_crac <crac_id> — Start a standby CRAC unit",
    "stop_crac <crac_id> — Put a CRAC into standby",
    "start_generator — Manually start the diesel generator",
    "stop_generator — Initiate generator cooldown",
    "set_ups_mode <ups_id> <mode> — Set UPS mode (eco/double_conversion/line_interactive/bypass)",
    "refuel_generator [liters] — Refuel (default: full tank)",
    "acknowledge_alarm — Acknowledge current alert",
    "check_status — Request full status report",
    "escalate — Escalate to senior engineer",
    "wait — Take no action this step",
]
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def parse_command(
    command: str,
    thermal_sim: ThermalSimulation,
    power_sim: PowerSimulation | None = None,
) -> CommandResult:
    """Parse and execute an operator command.

    The command is stripped of surrounding whitespace and tried against each
    entry of ``_COMMAND_TABLE`` in order; the first pattern that matches at
    the start of the string decides which handler runs.

    Args:
        command: Raw command string from the agent. ``None`` is treated like
            an empty string.
        thermal_sim: Thermal simulation to mutate.
        power_sim: Power simulation to mutate (optional).

    Returns:
        CommandResult with success status and feedback message. An empty
        command or one that matches no pattern yields a failed result.
    """
    cmd = (command or "").strip()
    if not cmd:
        return CommandResult(False, "Empty command. Use 'check_status' or see available actions.")

    # Try each handler in order
    for pattern, handler in _COMMAND_TABLE:
        if isinstance(pattern, re.Pattern):
            # re.match() rejects a flags argument together with a compiled
            # pattern, so a precompiled entry is matched with its own flags.
            match = pattern.match(cmd)
        else:
            match = re.match(pattern, cmd, re.IGNORECASE)
        if match:
            return handler(match, thermal_sim, power_sim)

    return CommandResult(
        False,
        f"Unknown command: '{cmd}'. Use 'check_status' for available actions.",
        command_name="unknown",
    )
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
# ---------------------------------------------------------------------------
|
| 94 |
+
# Command handlers
|
| 95 |
+
# ---------------------------------------------------------------------------
|
| 96 |
+
def _handle_diagnose(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Inspect a unit for faults and report status.

    CRAC units in every thermal zone are searched first, then UPS units when
    a power simulation is supplied. Unit IDs are compared case-insensitively
    and the first unit that matches is reported.

    Args:
        match: Regex match whose group 1 is the unit ID to inspect.
        thermal: Thermal simulation whose zones and CRAC units are searched.
        power: Power simulation whose UPS units are searched, or None.

    Returns:
        A successful CommandResult carrying a multi-line report, or a failed
        one when neither a CRAC nor a UPS has the requested ID.
    """
    target = match.group(1)
    wanted = target.lower()  # computed once; compared against every unit below

    # Check CRACs
    for zone in thermal.state.zones:
        for crac in zone.crac_units:
            if crac.unit_id.lower() != wanted:
                continue
            lines = [
                f"=== Diagnostic Report: {crac.unit_id} ===",
                f"Status: {crac.status.value}",
                f"Fault: {crac.fault_type.value}",
                f"Setpoint: {crac.setpoint_c:.1f}°C",
                f"Supply Temp: {crac.supply_temp_c:.1f}°C",
                f"Fan Speed: {crac.fan_speed_pct:.0f}%",
                f"Airflow: {crac.current_airflow_m3s:.3f} m³/s",
            ]
            if crac.fault_type != CRACFaultType.NONE:
                lines.append(f">> FAULT DETECTED: {crac.fault_type.value}")
                lines.append(">> Recommended: repair or replace component")
            else:
                lines.append(">> No faults detected. Unit operating normally.")
            return CommandResult(True, "\n".join(lines), "diagnose", target)

    # Check UPS. Explicit None test (not truthiness) to match the other
    # power-related handlers in this module.
    if power is not None:
        for ups in power.state.ups_units:
            if ups.unit_id.lower() != wanted:
                continue
            lines = [
                f"=== Diagnostic Report: {ups.unit_id} ===",
                f"Mode: {ups.mode.value}",
                f"Load: {ups.load_fraction * 100:.1f}%",
                f"Efficiency: {ups.efficiency * 100:.1f}%",
                f"Battery SOC: {ups.battery_soc * 100:.0f}%",
                f"Output: {ups.output_power_kw:.1f} kW",
                f"Losses: {ups.heat_output_kw:.1f} kW",
            ]
            return CommandResult(True, "\n".join(lines), "diagnose", target)

    return CommandResult(False, f"Unit '{target}' not found.", "diagnose", target)
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def _handle_adjust_setpoint(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Change the supply-air setpoint of a CRAC unit.

    Args:
        match: Regex match; group 1 is the CRAC ID, group 2 the setpoint.
        thermal: Thermal simulation that receives the new setpoint.
        power: Unused; present so all handlers share one signature.

    Returns:
        A failed CommandResult when the value is not a number, lies outside
        10-35, or the simulation reports no such CRAC; otherwise success.
    """
    name = "adjust_setpoint"
    target = match.group(1)

    try:
        setpoint_c = float(match.group(2))
    except (ValueError, IndexError):
        return CommandResult(False, "Invalid temperature value.", name, target)

    if setpoint_c > 35.0 or setpoint_c < 10.0:
        message = f"Setpoint {setpoint_c:.1f}°C out of safe range (10-35°C)."
        return CommandResult(False, message, name, target)

    # The simulation returns a falsy value when it cannot apply the setpoint.
    if not thermal.set_crac_setpoint(target, setpoint_c):
        return CommandResult(False, f"CRAC '{target}' not found.", name, target)

    message = (
        f"Setpoint for {target} adjusted to {setpoint_c:.1f}°C. "
        "Supply temp will converge over ~30 seconds."
    )
    return CommandResult(True, message, name, target)
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _handle_set_fan_speed(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Set the fan speed of a CRAC unit.

    Args:
        match: Regex match; group 1 is the CRAC ID, group 2 the percentage.
        thermal: Thermal simulation that receives the new fan speed.
        power: Unused; present so all handlers share one signature.

    Returns:
        A failed CommandResult when the value is not a number, lies outside
        0-100, or the simulation reports no such CRAC; otherwise success.
    """
    name = "set_fan_speed"
    target = match.group(1)

    try:
        speed_pct = float(match.group(2))
    except (ValueError, IndexError):
        return CommandResult(False, "Invalid fan speed value.", name, target)

    if speed_pct > 100 or speed_pct < 0:
        message = f"Fan speed {speed_pct:.0f}% out of range (0-100%)."
        return CommandResult(False, message, name, target)

    # The simulation returns a falsy value when it cannot apply the speed.
    if not thermal.set_crac_fan_speed(target, speed_pct):
        return CommandResult(False, f"CRAC '{target}' not found.", name, target)

    return CommandResult(True, f"Fan speed for {target} set to {speed_pct:.0f}%.", name, target)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def _handle_set_rack_load(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Set the IT load of a rack.

    Args:
        match: Regex match; group 1 is the rack ID, group 2 the load in kW.
        thermal: Thermal simulation that receives the new load.
        power: Unused; present so all handlers share one signature.

    Returns:
        A failed CommandResult when the value is not a number, lies outside
        0-30, or the simulation reports no such rack; otherwise success.
    """
    name = "set_rack_load"
    target = match.group(1)

    try:
        load_kw = float(match.group(2))
    except (ValueError, IndexError):
        return CommandResult(False, "Invalid load value.", name, target)

    if load_kw > 30 or load_kw < 0:
        message = f"Load {load_kw:.1f} kW out of range (0-30 kW)."
        return CommandResult(False, message, name, target)

    # The simulation returns a falsy value when it cannot apply the load.
    if not thermal.set_rack_load(target, load_kw):
        return CommandResult(False, f"Rack '{target}' not found.", name, target)

    return CommandResult(True, f"IT load for rack {target} set to {load_kw:.1f} kW.", name, target)
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def _handle_start_crac(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Switch a CRAC unit to RUNNING.

    The first CRAC whose ID equals group 1 of ``match`` (case-insensitively)
    is used. Starting is refused when that unit is already running or has a
    fault type other than NONE.

    Args:
        match: Regex match whose group 1 is the CRAC ID.
        thermal: Thermal simulation whose zones are searched.
        power: Unused; present so all handlers share one signature.

    Returns:
        CommandResult describing whether the unit was started.
    """
    name = "start_crac"
    target = match.group(1)

    every_crac = (unit for zone in thermal.state.zones for unit in zone.crac_units)
    crac = next(
        (unit for unit in every_crac if unit.unit_id.lower() == target.lower()),
        None,
    )
    if crac is None:
        return CommandResult(False, f"CRAC '{target}' not found.", name, target)

    if crac.status == CRACStatus.RUNNING:
        return CommandResult(False, f"{target} is already running.", name, target)

    if crac.fault_type != CRACFaultType.NONE:
        message = (
            f"{target} has an active fault ({crac.fault_type.value}). "
            "Clear the fault before starting."
        )
        return CommandResult(False, message, name, target)

    crac.status = CRACStatus.RUNNING
    return CommandResult(True, f"{target} started.", name, target)
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def _handle_stop_crac(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Switch a CRAC unit to STANDBY.

    The first CRAC whose ID equals group 1 of ``match`` (case-insensitively)
    is used. The request is refused when that unit is already in standby.

    Args:
        match: Regex match whose group 1 is the CRAC ID.
        thermal: Thermal simulation whose zones are searched.
        power: Unused; present so all handlers share one signature.

    Returns:
        CommandResult describing whether the unit was placed in standby.
    """
    name = "stop_crac"
    target = match.group(1)

    every_crac = (unit for zone in thermal.state.zones for unit in zone.crac_units)
    crac = next(
        (unit for unit in every_crac if unit.unit_id.lower() == target.lower()),
        None,
    )
    if crac is None:
        return CommandResult(False, f"CRAC '{target}' not found.", name, target)

    if crac.status == CRACStatus.STANDBY:
        return CommandResult(False, f"{target} is already in standby.", name, target)

    crac.status = CRACStatus.STANDBY
    return CommandResult(True, f"{target} placed in standby.", name, target)
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def _handle_start_generator(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Ask the power simulation to start the generator.

    Returns a failed CommandResult when no power simulation was supplied.
    """
    name = "start_generator"
    if power is not None:
        power.start_generator()
        return CommandResult(True, "Generator start sequence initiated.", name)
    return CommandResult(False, "Power subsystem not available.", name)
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def _handle_stop_generator(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Ask the power simulation to stop the generator.

    Returns a failed CommandResult when no power simulation was supplied.
    """
    name = "stop_generator"
    if power is not None:
        power.stop_generator()
        return CommandResult(True, "Generator cooldown initiated.", name)
    return CommandResult(False, "Power subsystem not available.", name)
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def _handle_set_ups_mode(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Switch a UPS unit to a named operating mode.

    Args:
        match: Regex match; group 1 is the UPS ID, group 2 the mode name
            (lower-cased and stripped before lookup).
        thermal: Unused; present so all handlers share one signature.
        power: Power simulation that applies the mode, or None.

    Returns:
        A failed CommandResult when there is no power simulation, the mode
        name is not recognised, or the simulation reports no such UPS;
        otherwise success.
    """
    name = "set_ups_mode"
    if power is None:
        return CommandResult(False, "Power subsystem not available.", name)

    target = match.group(1)
    mode_str = match.group(2).lower().strip()

    # Insertion order matters: it is the order shown in the error message.
    modes_by_name = {
        "double_conversion": UPSMode.DOUBLE_CONVERSION,
        "eco": UPSMode.ECO,
        "line_interactive": UPSMode.LINE_INTERACTIVE,
        "bypass": UPSMode.BYPASS,
    }
    mode = modes_by_name.get(mode_str)
    if mode is None:
        valid = ", ".join(modes_by_name.keys())
        return CommandResult(False, f"Unknown UPS mode '{mode_str}'. Valid: {valid}", name, target)

    if not power.set_ups_mode(target, mode):
        return CommandResult(False, f"UPS '{target}' not found.", name, target)

    return CommandResult(True, f"{target} set to {mode_str} mode.", name, target)
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def _handle_refuel_generator(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Refuel the generator, by a given amount or with no amount at all.

    Args:
        match: Regex match whose group 1 is the optional liters value; an
            empty or missing group means no amount was given.
        thermal: Unused; present so all handlers share one signature.
        power: Power simulation that performs the refuel, or None.

    Returns:
        A failed CommandResult when there is no power simulation or the
        liters value is not a number; otherwise success.
    """
    name = "refuel_generator"
    if power is None:
        return CommandResult(False, "Power subsystem not available.", name)

    raw_liters = match.group(1)
    if not raw_liters:
        # No amount given: call the simulation without an argument.
        power.refuel_generator()
        return CommandResult(True, "Generator refueled to full tank.", name)

    try:
        liters = float(raw_liters)
    except ValueError:
        return CommandResult(False, "Invalid liters value.", name)

    power.refuel_generator(liters)
    return CommandResult(True, f"Added {liters:.0f}L to generator.", name)
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
def _handle_acknowledge_alarm(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Report that the alarm was acknowledged; neither simulation is touched."""
    return CommandResult(
        success=True,
        message="Alarm acknowledged.",
        command_name="acknowledge_alarm",
    )
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
def _handle_check_status(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Return a fixed confirmation message; neither simulation is touched."""
    return CommandResult(
        success=True,
        message="Full status displayed in dashboard.",
        command_name="check_status",
    )
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
def _handle_escalate(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Return the escalation message; neither simulation is touched here."""
    message = "Incident escalated to senior datacenter engineer. Episode ending."
    return CommandResult(success=True, message=message, command_name="escalate")
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
def _handle_wait(
    match: re.Match, thermal: ThermalSimulation, power: PowerSimulation | None
) -> CommandResult:
    """Return a no-op confirmation; neither simulation is touched."""
    return CommandResult(
        success=True,
        message="Waiting. No action taken.",
        command_name="wait",
    )
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
# ---------------------------------------------------------------------------
|
| 338 |
+
# Command table: (regex_pattern, handler_function)
|
| 339 |
+
# Order matters — first match wins.
|
| 340 |
+
# ---------------------------------------------------------------------------
|
| 341 |
+
# Each entry pairs a regex with the handler that runs when the regex matches
# at the start of the stripped command. Entries are tried top to bottom.
_COMMAND_TABLE: list[tuple[re.Pattern | str, Any]] = [
    (r"diagnose\s+(\S+)", _handle_diagnose),
    (r"adjust_setpoint\s+(\S+)\s+([\d.]+)", _handle_adjust_setpoint),
    (r"set_fan_speed\s+(\S+)\s+([\d.]+)", _handle_set_fan_speed),
    (r"(?:set_rack_load|migrate_workload)\s+(\S+)\s+([\d.]+)", _handle_set_rack_load),
    (r"start_crac\s+(\S+)", _handle_start_crac),
    (r"stop_crac\s+(\S+)", _handle_stop_crac),
    (r"start_generator\b", _handle_start_generator),
    (r"stop_generator\b", _handle_stop_generator),
    (r"set_ups_mode\s+(\S+)\s+(\S+)", _handle_set_ups_mode),
    # Either "<name> <number>..." or the bare command. The previous pattern
    # (\s*([\d.]*)) also matched "refuel_generator -50", "refuel_generator abc"
    # and "refuel_generator_now" with an empty group, which the handler
    # treats as a request for a full-tank refuel.
    (r"refuel_generator\b(?:\s+([\d.]+)|\s*$)", _handle_refuel_generator),
    (r"acknowledge_alarm\b", _handle_acknowledge_alarm),
    (r"check_status\b", _handle_check_status),
    (r"escalate\b", _handle_escalate),
    (r"wait\b", _handle_wait),
]
|
client.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""DC-Ops Environment Client."""
|
| 8 |
+
|
| 9 |
+
from typing import Dict
|
| 10 |
+
|
| 11 |
+
from openenv.core import EnvClient
|
| 12 |
+
from openenv.core.client_types import StepResult
|
| 13 |
+
from openenv.core.env_server.types import State
|
| 14 |
+
|
| 15 |
+
from .models import DcOpsAction, DcOpsObservation
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class DcOpsEnv(EnvClient[DcOpsAction, DcOpsObservation, State]):
    """
    Client for the DC-Ops Environment.

    Connects to the environment server over WebSocket and provides
    reset/step/state methods for interacting with the datacenter simulation.
    This subclass only supplies the payload conversions below.

    Example:
        >>> async with DcOpsEnv(base_url="http://localhost:8000") as client:
        ...     result = await client.reset()
        ...     print(result.observation.dashboard)
        ...
        ...     result = await client.step(DcOpsAction(command="diagnose CRAC-1"))
        ...     print(result.observation.dashboard)
    """

    def _step_payload(self, action: DcOpsAction) -> Dict:
        """Build the JSON body for a step message from ``action``.

        ``reasoning`` is included only when it is truthy.
        """
        body = {"command": action.command}
        reasoning = action.reasoning
        if reasoning:
            body["reasoning"] = reasoning
        return body

    def _parse_result(self, payload: Dict) -> StepResult[DcOpsObservation]:
        """Turn a server response into a ``StepResult[DcOpsObservation]``.

        Observation fields missing from the response fall back to empty
        values; ``done`` and ``reward`` are read from the top level of the
        payload and copied onto both the observation and the result.
        """
        obs_fields = payload.get("observation", {})
        is_done = payload.get("done", False)
        reward = payload.get("reward")

        observation = DcOpsObservation(
            dashboard=obs_fields.get("dashboard", ""),
            available_actions=obs_fields.get("available_actions", []),
            alert=obs_fields.get("alert", ""),
            scenario_type=obs_fields.get("scenario_type", ""),
            steps_remaining=obs_fields.get("steps_remaining", 0),
            action_result=obs_fields.get("action_result", ""),
            done=is_done,
            reward=reward,
            metadata=obs_fields.get("metadata", {}),
        )
        return StepResult(observation=observation, reward=reward, done=is_done)

    def _parse_state(self, payload: Dict) -> State:
        """Turn a server response into a ``State`` (episode id and step count)."""
        episode_id = payload.get("episode_id")
        step_count = payload.get("step_count", 0)
        return State(episode_id=episode_id, step_count=step_count)
|
config.py
ADDED
|
@@ -0,0 +1,549 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Physical constants, ASHRAE thermal guidelines, and unit conversion utilities.
|
| 9 |
+
|
| 10 |
+
All internal simulation values use SI units:
|
| 11 |
+
- Temperature: °C (Celsius)
|
| 12 |
+
- Power/Heat: W (Watts)
|
| 13 |
+
- Energy: J (Joules)
|
| 14 |
+
- Airflow: m³/s
|
| 15 |
+
- Thermal capacitance: J/K
|
| 16 |
+
- Thermal resistance: K/W
|
| 17 |
+
- Time: s (seconds)
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
from dataclasses import dataclass, field
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
from typing import Union
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# ---------------------------------------------------------------------------
|
| 28 |
+
# Air properties (dry air at standard conditions: ~20 °C, 101.325 kPa)
|
| 29 |
+
# ---------------------------------------------------------------------------
|
| 30 |
+
AIR_DENSITY_KG_M3 = 1.2  # kg/m³
AIR_SPECIFIC_HEAT_J_KGK = 1005.0  # J/(kg·K)
# Product of the two values above: 1206.0 J/(m³·K)
AIR_RHO_CP = AIR_DENSITY_KG_M3 * AIR_SPECIFIC_HEAT_J_KGK
|
| 33 |
+
|
| 34 |
+
# ---------------------------------------------------------------------------
|
| 35 |
+
# Unit conversion helpers
|
| 36 |
+
# ---------------------------------------------------------------------------
|
| 37 |
+
# Airflow: cubic feet per minute <-> cubic metres per second
CFM_TO_M3S = 4.71947e-4  # 1 CFM = 4.71947 × 10⁻⁴ m³/s
M3S_TO_CFM = 1.0 / CFM_TO_M3S  # ≈ 2118.88
# Cooling capacity: tons of refrigeration <-> kW thermal
TONS_TO_KW = 3.517  # 1 ton of refrigeration = 3.517 kW thermal
KW_TO_TONS = 1.0 / TONS_TO_KW
# Heat rate: BTU/hr <-> W
BTU_HR_TO_W = 0.29307107  # 1 BTU/hr = 0.293 W
W_TO_BTU_HR = 1.0 / BTU_HR_TO_W  # ≈ 3.412


def fahrenheit_to_celsius(f: float) -> float:
    """Convert a temperature from degrees Fahrenheit to degrees Celsius."""
    above_freezing_f = f - 32.0
    return above_freezing_f * 5.0 / 9.0


def celsius_to_fahrenheit(c: float) -> float:
    """Convert a temperature from degrees Celsius to degrees Fahrenheit."""
    scaled = c * 9.0 / 5.0
    return scaled + 32.0


def cfm_to_m3s(cfm: float) -> float:
    """Convert an airflow from CFM to m³/s."""
    return CFM_TO_M3S * cfm


def m3s_to_cfm(m3s: float) -> float:
    """Convert an airflow from m³/s to CFM."""
    return M3S_TO_CFM * m3s
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# ---------------------------------------------------------------------------
|
| 62 |
+
# ASHRAE TC 9.9 Thermal Guidelines, 5th Edition (2021)
|
| 63 |
+
#
|
| 64 |
+
# Each class defines recommended and allowable operating envelopes for
|
| 65 |
+
# server inlet temperatures and humidity.
|
| 66 |
+
# ---------------------------------------------------------------------------
|
| 67 |
+
@dataclass(frozen=True)
class ASHRAEClass:
    """Immutable record of one ASHRAE equipment class's thermal envelope."""

    # Class label, e.g. "A1".
    name: str
    # Recommended temperature range, °C.
    recommended_min_c: float
    recommended_max_c: float
    # Allowable temperature range, °C.
    allowable_min_c: float
    allowable_max_c: float
    # Upper dew-point limit, °C.
    max_dew_point_c: float
    # Upper relative-humidity limit as a fraction, e.g. 0.80 = 80%.
    max_rh: float
    # Optional free-text description of the class.
    description: str = ""
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# Enterprise servers, storage.
ASHRAE_A1 = ASHRAEClass(
    name="A1",
    recommended_min_c=18.0, recommended_max_c=27.0,
    allowable_min_c=15.0, allowable_max_c=32.0,
    max_dew_point_c=17.0, max_rh=0.80,
    description="Enterprise servers, storage",
)

# Volume servers.
ASHRAE_A2 = ASHRAEClass(
    name="A2",
    recommended_min_c=18.0, recommended_max_c=27.0,
    allowable_min_c=10.0, allowable_max_c=35.0,
    max_dew_point_c=21.0, max_rh=0.80,
    description="Volume servers",
)

# Extended temperature range.
ASHRAE_A3 = ASHRAEClass(
    name="A3",
    recommended_min_c=18.0, recommended_max_c=27.0,
    allowable_min_c=5.0, allowable_max_c=40.0,
    max_dew_point_c=24.0, max_rh=0.85,
    description="Extended temperature range",
)

# Maximum temperature flexibility.
ASHRAE_A4 = ASHRAEClass(
    name="A4",
    recommended_min_c=18.0, recommended_max_c=27.0,
    allowable_min_c=5.0, allowable_max_c=45.0,
    max_dew_point_c=24.0, max_rh=0.90,
    description="Maximum temperature flexibility",
)

# High-density / AI / HPC.
ASHRAE_H1 = ASHRAEClass(
    name="H1",
    recommended_min_c=18.0, recommended_max_c=22.0,
    allowable_min_c=5.0, allowable_max_c=25.0,
    max_dew_point_c=17.0, max_rh=0.80,
    description="High-density / AI / HPC",
)

# Lookup by class label; each key is the corresponding envelope's `name`.
ASHRAE_CLASSES: dict[str, ASHRAEClass] = {
    envelope.name: envelope
    for envelope in (ASHRAE_A1, ASHRAE_A2, ASHRAE_A3, ASHRAE_A4, ASHRAE_H1)
}
|
| 142 |
+
|
| 143 |
+
# Minimum humidity boundary (all classes):
|
| 144 |
+
# Higher of dew point -12 °C OR 8% RH
|
| 145 |
+
ASHRAE_MIN_DEW_POINT_C = -12.0  # °C
ASHRAE_MIN_RH = 0.08  # fraction (8% RH)

# Rate-of-change limits
ASHRAE_RATE_LIMIT_SOLID_STATE_C_PER_HR = 20.0  # max °C change per hour
ASHRAE_RATE_LIMIT_SOLID_STATE_C_PER_15MIN = 5.0  # max °C change per 15 minutes

# Sensor accuracy
ASHRAE_SENSOR_ACCURACY_STANDARD_C = 0.5
ASHRAE_SENSOR_ACCURACY_HIGH_DENSITY_C = 0.3
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
# ---------------------------------------------------------------------------
|
| 158 |
+
# Default datacenter configuration
|
| 159 |
+
# ---------------------------------------------------------------------------
|
| 160 |
+
@dataclass
class CRACConfig:
    """Settings describing one CRAC/CRAH cooling unit."""

    unit_id: str = "CRAC-1"
    # Nominal cooling capacity at rated conditions.
    rated_capacity_kw: float = 70.0
    # Return air temperature at which the capacity is rated.
    rated_return_temp_c: float = 24.0
    # Fractional capacity increase per °C above the rated return temperature.
    capacity_slope_per_c: float = 0.03
    # Maximum airflow at 100% fan speed.
    max_airflow_cfm: float = 12000.0
    # Fan power at 100% speed.
    fan_rated_power_kw: float = 5.0
    # Coefficient of performance at design conditions.
    cop_rated: float = 3.5
    # Fractional COP decrease per °C of outside temperature above 35°C.
    cop_degradation_per_c: float = 0.04
    # Default supply air setpoint.
    initial_setpoint_c: float = 18.0
    # Default fan speed.
    initial_fan_speed_pct: float = 100.0
    # Time constant for the supply temperature to reach the setpoint.
    supply_temp_lag_s: float = 30.0
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
@dataclass
class RackConfig:
    """Settings describing one server rack."""

    rack_id: str = "A-01"
    row: str = "A"
    position: int = 1
    # IT power draw.
    it_load_kw: float = 8.0
    # Number of 2U servers.
    num_servers_2u: int = 20
    # 11.1 kJ/K per 2U server (measured experimentally).
    server_thermal_mass_jk: float = 11100.0
    # Server fan airflow per kW of IT load.
    airflow_cfm_per_kw: float = 160.0
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
@dataclass
class ZoneConfig:
    """Settings describing one thermal zone (a section of the datacenter)."""

    zone_id: str = "zone_a"
    # Each instance gets its own fresh, empty lists.
    racks: list[RackConfig] = field(default_factory=list)
    crac_units: list[CRACConfig] = field(default_factory=list)
    # One of "cold_aisle", "hot_aisle", "none".
    containment_type: str = "cold_aisle"
    # 0 = perfect containment, 0.3 = none.
    recirculation_factor: float = 0.08
    # Zone air volume.
    air_volume_m3: float = 500.0
    # Thermal resistance to outside (K/W).
    envelope_r_kw: float = 0.02
    initial_cold_aisle_temp_c: float = 20.0
    initial_humidity_rh: float = 0.45
    ashrae_class: str = "A2"
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
# ---------------------------------------------------------------------------
|
| 204 |
+
# Power distribution configuration
|
| 205 |
+
# ---------------------------------------------------------------------------
|
| 206 |
+
@dataclass
class UPSConfig:
    """Configuration for a UPS unit.

    Efficiency model (quadratic loss):
        η(x) = x / (x + c_0 + c_1·x + c_2·x²)
    where x = load_fraction (0 to 1).

    Default coefficients from APC White Paper 108 (modern double-conversion):
        c_0 = 0.013 (no-load: transformers, logic boards)
        c_1 = 0.006 (proportional: conduction losses)
        c_2 = 0.011 (square-law: I²R in conductors)
    """

    unit_id: str = "UPS-1"
    rated_capacity_kw: float = 500.0
    # Quadratic loss coefficients (fractions of rated capacity)
    loss_c0: float = 0.013  # No-load losses
    loss_c1: float = 0.006  # Proportional losses
    loss_c2: float = 0.011  # Square-law losses
    # Battery
    # 8.3 kWh at the default 500 kW rating is 8.3 / 500 h, i.e. about 1 minute
    # of full-load runtime before derating; ~10 minutes would need ~83 kWh.
    # NOTE(review): the original comment said "~10 min at full load" — confirm
    # which of the capacity or the stated runtime is intended.
    battery_capacity_kwh: float = 8.3
    battery_discharge_efficiency: float = 0.90
    battery_aging_factor: float = 0.85  # End-of-life derating
    battery_temp_c: float = 25.0  # Battery room temperature
    # Recharge: at 5 kW, refilling 8.3 kWh takes about 100 minutes.
    recharge_rate_kw: float = 5.0  # Max recharge rate
    # Operating mode: "double_conversion", "line_interactive", "eco", "bypass"
    initial_mode: str = "double_conversion"
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
@dataclass
class PDUConfig:
    """Settings describing one three-phase PDU.

    The defaults follow the US standard: 208V L-L / 120V L-N, 24A per phase.
        Total nameplate: √3 × 208 × 24 ≈ 8,646 W.
        80% NEC continuous derating: 6,917 W.

    For comparison, European: 400V L-L / 230V L-N, 32A per phase.
        Total nameplate: √3 × 400 × 32 ≈ 22,170 W.
    """

    pdu_id: str = "PDU-A1"
    # Line-to-line voltage.
    voltage_ll_v: float = 208.0
    max_current_per_phase_a: float = 24.0
    num_phases: int = 3
    # Per-branch circuit breaker.
    breaker_rating_a: float = 20.0
    num_outlets: int = 48
    # Transformer efficiency (2% losses).
    efficiency: float = 0.98
    # NEC 80% rule for continuous loads.
    continuous_derating: float = 0.80
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
@dataclass
class GeneratorConfig:
    """Static parameters for a diesel standby generator.

    The start-up sequence (NFPA 110 Type 10) runs
        start delay -> cranking -> warm-up -> ready to accept load
    and takes 10-20 seconds overall; the default timings below add up
    to 17 s.
    """

    gen_id: str = "GEN-1"
    rated_capacity_kw: float = 750.0

    # Start-up phases, in the order they occur.
    start_delay_s: float = 4.0  # programmed wait before cranking begins
    crank_time_s: float = 5.0  # how long the engine cranks
    warmup_time_s: float = 8.0  # warm-up before load may be accepted

    # Fuel supply.
    fuel_tank_liters: float = 2000.0
    consumption_lph_full: float = 180.0  # liters per hour at full load

    # Unloaded cool-down after running (5 minutes by default).
    cooldown_time_s: float = 300.0
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
@dataclass
class ATSConfig:
    """Static parameters for an Automatic Transfer Switch (ATS)."""

    ats_id: str = "ATS-1"
    # Duration of the mechanical transfer, in milliseconds.
    transfer_time_ms: float = 100.0
    # Seconds to wait before switching back to the utility feed.
    retransfer_delay_s: float = 300.0
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
@dataclass
class PowerConfig:
    """Bundle of every power-infrastructure component of a facility.

    Holds any number of UPS units and PDUs, plus exactly one generator
    and one ATS, together with the state of the utility feed.
    """

    # Both lists start empty; default_factory gives each instance its own.
    ups_units: list[UPSConfig] = field(default_factory=list)
    pdus: list[PDUConfig] = field(default_factory=list)
    # A default-constructed generator / ATS is used when none is supplied.
    generator: GeneratorConfig = field(default_factory=GeneratorConfig)
    ats: ATSConfig = field(default_factory=ATSConfig)
    # Voltage of the main utility feed.
    utility_voltage_v: float = 480.0
    utility_available: bool = True
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
@dataclass
class DatacenterConfig:
    """Top-level description of an entire datacenter.

    Combines the thermal zones, the power infrastructure, the outdoor
    conditions and the simulation timestep.
    """

    name: str = "DC-OPS Facility"
    zones: list[ZoneConfig] = field(default_factory=list)
    power: PowerConfig = field(default_factory=PowerConfig)

    # Outdoor conditions.
    outside_temp_c: float = 35.0
    outside_humidity_rh: float = 0.40

    # Lighting density (10 W/m² is typical) and the floor area it applies to.
    lighting_w_per_m2: float = 10.0
    floor_area_m2: float = 500.0

    # Integration timestep of the simulation, in seconds.
    simulation_dt_s: float = 1.0

    # Flat loss fractions retained only for backward compatibility with the
    # Phase 1 thermal simulation.
    ups_loss_fraction: float = 0.05
    pdu_loss_fraction: float = 0.02
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
def make_default_datacenter_config() -> DatacenterConfig:
    """Build the stock facility: two zones of ten racks, four CRACs in total.

    Layout:
      - zone_a / zone_b, each with ten 8 kW racks (rows A and B) and two
        CRAC units (CRAC-1/2 and CRAC-3/4 respectively)

    Power infrastructure:
      - 2x UPS (N+1 redundant, 500 kW each against 160 kW of total IT load)
      - 20x PDUs (one per rack, US 3-phase 208V/24A defaults)
      - 1x diesel generator (750 kW)
      - 1x ATS

    Returns:
        A freshly constructed DatacenterConfig; nothing is shared between
        calls.
    """
    rows = ("A", "B")

    # Ten racks per row, numbered 01..10.
    racks_by_row = {
        row: [
            RackConfig(
                rack_id=f"{row}-{slot:02d}",
                row=row,
                position=slot,
                it_load_kw=8.0,
            )
            for slot in range(1, 11)
        ]
        for row in rows
    }

    # CRAC units are numbered across the whole facility, two per zone.
    cracs_by_row = {
        "A": [CRACConfig(unit_id="CRAC-1"), CRACConfig(unit_id="CRAC-2")],
        "B": [CRACConfig(unit_id="CRAC-3"), CRACConfig(unit_id="CRAC-4")],
    }

    zones = [
        ZoneConfig(
            zone_id=f"zone_{row.lower()}",
            racks=racks_by_row[row],
            crac_units=cracs_by_row[row],
            air_volume_m3=600.0,
        )
        for row in rows
    ]

    # One PDU per rack, row A first and then row B.
    all_racks = racks_by_row["A"] + racks_by_row["B"]
    power = PowerConfig(
        ups_units=[
            UPSConfig(unit_id=f"UPS-{index}", rated_capacity_kw=500.0)
            for index in (1, 2)
        ],
        pdus=[PDUConfig(pdu_id=f"PDU-{rack.rack_id}") for rack in all_racks],
        generator=GeneratorConfig(gen_id="GEN-1", rated_capacity_kw=750.0),
        ats=ATSConfig(ats_id="ATS-1"),
    )

    return DatacenterConfig(
        name="DC-OPS Default Facility",
        zones=zones,
        power=power,
        outside_temp_c=35.0,
        outside_humidity_rh=0.40,
        floor_area_m2=1200.0,
    )
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
# ---------------------------------------------------------------------------
|
| 380 |
+
# YAML config loader
|
| 381 |
+
# ---------------------------------------------------------------------------
|
| 382 |
+
# Directory, next to this module, that holds the bundled YAML configs.
_CONFIG_DIR = Path(__file__).parent / "data" / "datacenter_configs"

# Short built-in name -> bundled YAML file (resolved relative to this package).
BUILTIN_CONFIGS: dict[str, Path] = {
    short_name: _CONFIG_DIR / file_name
    for short_name, file_name in (
        ("default", "default.yaml"),
        ("small", "small_facility.yaml"),
        ("large", "large_facility.yaml"),
    )
}
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
def load_datacenter_config(source: Union[str, Path]) -> DatacenterConfig:
    """Load a DatacenterConfig from a YAML file or built-in name.

    Args:
        source: Either a built-in name ("default", "small", "large"),
            or a path to a YAML file.

    Returns:
        Fully constructed DatacenterConfig. An empty YAML document yields
        a config made entirely of defaults.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
        ValueError: If the YAML document's top level is not a mapping.

    Examples:
        config = load_datacenter_config("small")
        config = load_datacenter_config("/path/to/custom.yaml")
    """
    import yaml

    # Built-in names take precedence; anything else is treated as a path.
    if isinstance(source, str) and source in BUILTIN_CONFIGS:
        path = BUILTIN_CONFIGS[source]
    else:
        path = Path(source)

    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")

    # The bundled configs contain non-ASCII characters, so decode as UTF-8
    # explicitly instead of relying on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # safe_load returns None for an empty document; treat that as "no
    # overrides" rather than failing later on None.get(...).
    if data is None:
        data = {}
    if not isinstance(data, dict):
        raise ValueError(
            f"Config file must contain a YAML mapping at the top level: {path}"
        )

    return _dict_to_datacenter_config(data)
|
| 421 |
+
|
| 422 |
+
|
| 423 |
+
def _dict_to_datacenter_config(data: dict) -> DatacenterConfig:
    """Convert a raw YAML dict into a DatacenterConfig.

    Args:
        data: Top-level mapping parsed from a config file. Every key is
            optional; missing scalar keys fall back to the defaults below.

    Returns:
        DatacenterConfig with its zones and power section converted.
    """
    # A YAML key written with no value (e.g. a bare "zones:") parses to None,
    # which .get(key, default) returns as-is; "or" maps that to empty.
    zones = [_dict_to_zone_config(z) for z in data.get("zones") or []]
    power = _dict_to_power_config(data.get("power") or {})

    return DatacenterConfig(
        name=data.get("name", "DC-OPS Facility"),
        zones=zones,
        power=power,
        outside_temp_c=data.get("outside_temp_c", 35.0),
        outside_humidity_rh=data.get("outside_humidity_rh", 0.40),
        lighting_w_per_m2=data.get("lighting_w_per_m2", 10.0),
        floor_area_m2=data.get("floor_area_m2", 500.0),
        simulation_dt_s=data.get("simulation_dt_s", 1.0),
        ups_loss_fraction=data.get("ups_loss_fraction", 0.05),
        pdu_loss_fraction=data.get("pdu_loss_fraction", 0.02),
    )
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
def _dict_to_zone_config(data: dict) -> ZoneConfig:
    """Convert a raw dict into a ZoneConfig.

    Args:
        data: Mapping for one entry of the "zones" list. Every key is
            optional; missing scalar keys fall back to the defaults below.

    Returns:
        ZoneConfig with its racks and CRAC units converted.
    """
    # A key present with a null value (bare "racks:") yields None from
    # .get(key, default); "or []" treats it the same as a missing key.
    racks = [_dict_to_rack_config(r) for r in data.get("racks") or []]
    cracs = [_dict_to_crac_config(c) for c in data.get("crac_units") or []]

    return ZoneConfig(
        zone_id=data.get("zone_id", "zone_a"),
        racks=racks,
        crac_units=cracs,
        containment_type=data.get("containment_type", "cold_aisle"),
        recirculation_factor=data.get("recirculation_factor", 0.08),
        air_volume_m3=data.get("air_volume_m3", 500.0),
        envelope_r_kw=data.get("envelope_r_kw", 0.02),
        initial_cold_aisle_temp_c=data.get("initial_cold_aisle_temp_c", 20.0),
        initial_humidity_rh=data.get("initial_humidity_rh", 0.45),
        ashrae_class=data.get("ashrae_class", "A2"),
    )
|
| 459 |
+
|
| 460 |
+
|
| 461 |
+
def _dict_to_rack_config(data: dict) -> RackConfig:
    """Build a RackConfig from one rack mapping.

    Any key absent from ``data`` takes the fallback listed below.
    """
    # (key, fallback) pairs; each key is also the RackConfig keyword name.
    fallbacks = (
        ("rack_id", "A-01"),
        ("row", "A"),
        ("position", 1),
        ("it_load_kw", 8.0),
        ("num_servers_2u", 20),
        ("server_thermal_mass_jk", 11100.0),
        ("airflow_cfm_per_kw", 160.0),
    )
    kwargs = {key: data.get(key, fallback) for key, fallback in fallbacks}
    return RackConfig(**kwargs)
|
| 472 |
+
|
| 473 |
+
|
| 474 |
+
def _dict_to_crac_config(data: dict) -> CRACConfig:
    """Build a CRACConfig from one CRAC-unit mapping.

    Any key absent from ``data`` takes the fallback given in the call below.
    """
    lookup = data.get  # every field is an optional key with a fallback

    unit_id = lookup("unit_id", "CRAC-1")
    rated_capacity_kw = lookup("rated_capacity_kw", 70.0)
    rated_return_temp_c = lookup("rated_return_temp_c", 24.0)
    capacity_slope_per_c = lookup("capacity_slope_per_c", 0.03)
    max_airflow_cfm = lookup("max_airflow_cfm", 12000.0)
    fan_rated_power_kw = lookup("fan_rated_power_kw", 5.0)
    cop_rated = lookup("cop_rated", 3.5)
    cop_degradation_per_c = lookup("cop_degradation_per_c", 0.04)
    initial_setpoint_c = lookup("initial_setpoint_c", 18.0)
    initial_fan_speed_pct = lookup("initial_fan_speed_pct", 100.0)
    supply_temp_lag_s = lookup("supply_temp_lag_s", 30.0)

    return CRACConfig(
        unit_id=unit_id,
        rated_capacity_kw=rated_capacity_kw,
        rated_return_temp_c=rated_return_temp_c,
        capacity_slope_per_c=capacity_slope_per_c,
        max_airflow_cfm=max_airflow_cfm,
        fan_rated_power_kw=fan_rated_power_kw,
        cop_rated=cop_rated,
        cop_degradation_per_c=cop_degradation_per_c,
        initial_setpoint_c=initial_setpoint_c,
        initial_fan_speed_pct=initial_fan_speed_pct,
        supply_temp_lag_s=supply_temp_lag_s,
    )
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
def _dict_to_power_config(data: dict) -> PowerConfig:
    """Convert a raw dict into a PowerConfig.

    Args:
        data: Mapping for the "power" section. Every key is optional;
            missing scalar keys fall back to the defaults below.

    Returns:
        PowerConfig with its UPS units, PDUs, generator and ATS converted.
    """
    # A key written with no value (bare "pdus:" or "generator:") parses to
    # None, which .get(key, default) hands back unchanged; "or" maps that to
    # an empty list/dict so the section is treated as omitted.
    ups = [_dict_to_ups_config(u) for u in data.get("ups_units") or []]
    pdus = [_dict_to_pdu_config(p) for p in data.get("pdus") or []]
    gen_data = data.get("generator") or {}
    ats_data = data.get("ats") or {}

    return PowerConfig(
        ups_units=ups,
        pdus=pdus,
        generator=GeneratorConfig(
            gen_id=gen_data.get("gen_id", "GEN-1"),
            rated_capacity_kw=gen_data.get("rated_capacity_kw", 750.0),
            start_delay_s=gen_data.get("start_delay_s", 4.0),
            crank_time_s=gen_data.get("crank_time_s", 5.0),
            warmup_time_s=gen_data.get("warmup_time_s", 8.0),
            fuel_tank_liters=gen_data.get("fuel_tank_liters", 2000.0),
            consumption_lph_full=gen_data.get("consumption_lph_full", 180.0),
            cooldown_time_s=gen_data.get("cooldown_time_s", 300.0),
        ),
        ats=ATSConfig(
            ats_id=ats_data.get("ats_id", "ATS-1"),
            transfer_time_ms=ats_data.get("transfer_time_ms", 100.0),
            retransfer_delay_s=ats_data.get("retransfer_delay_s", 300.0),
        ),
        utility_voltage_v=data.get("utility_voltage_v", 480.0),
        utility_available=data.get("utility_available", True),
    )
|
| 519 |
+
|
| 520 |
+
|
| 521 |
+
def _dict_to_ups_config(data: dict) -> UPSConfig:
    """Build a UPSConfig from one UPS mapping.

    Any key absent from ``data`` takes the fallback listed below.
    """
    # Fallback per key; each key is also the UPSConfig keyword name.
    fallbacks = {
        "unit_id": "UPS-1",
        "rated_capacity_kw": 500.0,
        "loss_c0": 0.013,
        "loss_c1": 0.006,
        "loss_c2": 0.011,
        "battery_capacity_kwh": 8.3,
        "battery_discharge_efficiency": 0.90,
        "battery_aging_factor": 0.85,
        "battery_temp_c": 25.0,
        "recharge_rate_kw": 5.0,
        "initial_mode": "double_conversion",
    }
    # Only the known keys are read; extra keys in ``data`` are ignored.
    kwargs = {
        key: data.get(key, fallback) for key, fallback in fallbacks.items()
    }
    return UPSConfig(**kwargs)
|
| 536 |
+
|
| 537 |
+
|
| 538 |
+
def _dict_to_pdu_config(data: dict) -> PDUConfig:
    """Build a PDUConfig from one PDU mapping.

    Any key absent from ``data`` takes the fallback given below.
    """
    lookup = data.get  # every field is an optional key with a fallback

    # Electrical characteristics.
    voltage_ll_v = lookup("voltage_ll_v", 208.0)
    max_current_per_phase_a = lookup("max_current_per_phase_a", 24.0)
    num_phases = lookup("num_phases", 3)
    breaker_rating_a = lookup("breaker_rating_a", 20.0)

    return PDUConfig(
        pdu_id=lookup("pdu_id", "PDU-A1"),
        voltage_ll_v=voltage_ll_v,
        max_current_per_phase_a=max_current_per_phase_a,
        num_phases=num_phases,
        breaker_rating_a=breaker_rating_a,
        num_outlets=lookup("num_outlets", 48),
        efficiency=lookup("efficiency", 0.98),
        continuous_derating=lookup("continuous_derating", 0.80),
    )
|
data/datacenter_configs/default.yaml
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# DC-OPS Default Facility Configuration
|
| 2 |
+
# 2 zones, 20 racks (10/zone), 160 kW total IT load
|
| 3 |
+
# N+1 cooling (4 CRACs × 70 kW for 160 kW IT)
|
| 4 |
+
# N+1 power (2 UPS × 500 kW, 1 generator 750 kW)
|
| 5 |
+
# Location: Phoenix, AZ (hot climate)
|
| 6 |
+
|
| 7 |
+
name: "DC-OPS Default Facility"
|
| 8 |
+
|
| 9 |
+
outside_temp_c: 35.0
|
| 10 |
+
outside_humidity_rh: 0.40
|
| 11 |
+
lighting_w_per_m2: 10.0
|
| 12 |
+
floor_area_m2: 1200.0
|
| 13 |
+
simulation_dt_s: 1.0
|
| 14 |
+
|
| 15 |
+
zones:
|
| 16 |
+
- zone_id: zone_a
|
| 17 |
+
containment_type: cold_aisle
|
| 18 |
+
recirculation_factor: 0.08
|
| 19 |
+
air_volume_m3: 600.0
|
| 20 |
+
envelope_r_kw: 0.02
|
| 21 |
+
initial_cold_aisle_temp_c: 20.0
|
| 22 |
+
initial_humidity_rh: 0.45
|
| 23 |
+
ashrae_class: A2
|
| 24 |
+
racks:
|
| 25 |
+
- { rack_id: A-01, row: A, position: 1, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 26 |
+
- { rack_id: A-02, row: A, position: 2, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 27 |
+
- { rack_id: A-03, row: A, position: 3, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 28 |
+
- { rack_id: A-04, row: A, position: 4, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 29 |
+
- { rack_id: A-05, row: A, position: 5, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 30 |
+
- { rack_id: A-06, row: A, position: 6, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 31 |
+
- { rack_id: A-07, row: A, position: 7, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 32 |
+
- { rack_id: A-08, row: A, position: 8, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 33 |
+
- { rack_id: A-09, row: A, position: 9, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 34 |
+
- { rack_id: A-10, row: A, position: 10, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 35 |
+
crac_units:
|
| 36 |
+
- { unit_id: CRAC-1, rated_capacity_kw: 70.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 12000.0, fan_rated_power_kw: 5.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
|
| 37 |
+
- { unit_id: CRAC-2, rated_capacity_kw: 70.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 12000.0, fan_rated_power_kw: 5.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
|
| 38 |
+
|
| 39 |
+
- zone_id: zone_b
|
| 40 |
+
containment_type: cold_aisle
|
| 41 |
+
recirculation_factor: 0.08
|
| 42 |
+
air_volume_m3: 600.0
|
| 43 |
+
envelope_r_kw: 0.02
|
| 44 |
+
initial_cold_aisle_temp_c: 20.0
|
| 45 |
+
initial_humidity_rh: 0.45
|
| 46 |
+
ashrae_class: A2
|
| 47 |
+
racks:
|
| 48 |
+
- { rack_id: B-01, row: B, position: 1, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 49 |
+
- { rack_id: B-02, row: B, position: 2, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 50 |
+
- { rack_id: B-03, row: B, position: 3, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 51 |
+
- { rack_id: B-04, row: B, position: 4, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 52 |
+
- { rack_id: B-05, row: B, position: 5, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 53 |
+
- { rack_id: B-06, row: B, position: 6, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 54 |
+
- { rack_id: B-07, row: B, position: 7, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 55 |
+
- { rack_id: B-08, row: B, position: 8, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 56 |
+
- { rack_id: B-09, row: B, position: 9, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 57 |
+
- { rack_id: B-10, row: B, position: 10, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 58 |
+
crac_units:
|
| 59 |
+
- { unit_id: CRAC-3, rated_capacity_kw: 70.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 12000.0, fan_rated_power_kw: 5.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
|
| 60 |
+
- { unit_id: CRAC-4, rated_capacity_kw: 70.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 12000.0, fan_rated_power_kw: 5.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
|
| 61 |
+
|
| 62 |
+
power:
|
| 63 |
+
utility_voltage_v: 480.0
|
| 64 |
+
utility_available: true
|
| 65 |
+
ups_units:
|
| 66 |
+
- { unit_id: UPS-1, rated_capacity_kw: 500.0, loss_c0: 0.013, loss_c1: 0.006, loss_c2: 0.011, battery_capacity_kwh: 8.3, battery_discharge_efficiency: 0.90, battery_aging_factor: 0.85, battery_temp_c: 25.0, recharge_rate_kw: 5.0, initial_mode: double_conversion }
|
| 67 |
+
- { unit_id: UPS-2, rated_capacity_kw: 500.0, loss_c0: 0.013, loss_c1: 0.006, loss_c2: 0.011, battery_capacity_kwh: 8.3, battery_discharge_efficiency: 0.90, battery_aging_factor: 0.85, battery_temp_c: 25.0, recharge_rate_kw: 5.0, initial_mode: double_conversion }
|
| 68 |
+
pdus:
|
| 69 |
+
- { pdu_id: PDU-A-01, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 70 |
+
- { pdu_id: PDU-A-02, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 71 |
+
- { pdu_id: PDU-A-03, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 72 |
+
- { pdu_id: PDU-A-04, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 73 |
+
- { pdu_id: PDU-A-05, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 74 |
+
- { pdu_id: PDU-A-06, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 75 |
+
- { pdu_id: PDU-A-07, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 76 |
+
- { pdu_id: PDU-A-08, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 77 |
+
- { pdu_id: PDU-A-09, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 78 |
+
- { pdu_id: PDU-A-10, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 79 |
+
- { pdu_id: PDU-B-01, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 80 |
+
- { pdu_id: PDU-B-02, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 81 |
+
- { pdu_id: PDU-B-03, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 82 |
+
- { pdu_id: PDU-B-04, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 83 |
+
- { pdu_id: PDU-B-05, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 84 |
+
- { pdu_id: PDU-B-06, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 85 |
+
- { pdu_id: PDU-B-07, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 86 |
+
- { pdu_id: PDU-B-08, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 87 |
+
- { pdu_id: PDU-B-09, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 88 |
+
- { pdu_id: PDU-B-10, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 89 |
+
generator:
|
| 90 |
+
gen_id: GEN-1
|
| 91 |
+
rated_capacity_kw: 750.0
|
| 92 |
+
start_delay_s: 4.0
|
| 93 |
+
crank_time_s: 5.0
|
| 94 |
+
warmup_time_s: 8.0
|
| 95 |
+
fuel_tank_liters: 2000.0
|
| 96 |
+
consumption_lph_full: 180.0
|
| 97 |
+
cooldown_time_s: 300.0
|
| 98 |
+
ats:
|
| 99 |
+
ats_id: ATS-1
|
| 100 |
+
transfer_time_ms: 100.0
|
| 101 |
+
retransfer_delay_s: 300.0
|
data/datacenter_configs/large_facility.yaml
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Large Facility Configuration
|
| 2 |
+
# 4 zones, 80 racks (20/zone), 640 kW total IT load
|
| 3 |
+
# N+1 cooling (8 CRACs × 100 kW for 640 kW IT)
|
| 4 |
+
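# 2N power (4 UPS × 500 kW, 2 generators × 750 kW)
# NOTE(review): config.py's PowerConfig holds a single `generator` entry — confirm how the second generator is represented.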
|
| 5 |
+
# Mixed ASHRAE classes: A2 standard + H1 high-density
|
| 6 |
+
# Use case: enterprise datacenter with GPU/HPC section
|
| 7 |
+
|
| 8 |
+
name: "DC-OPS Large Facility"
|
| 9 |
+
|
| 10 |
+
outside_temp_c: 35.0
|
| 11 |
+
outside_humidity_rh: 0.35
|
| 12 |
+
lighting_w_per_m2: 10.0
|
| 13 |
+
floor_area_m2: 4000.0
|
| 14 |
+
simulation_dt_s: 1.0
|
| 15 |
+
|
| 16 |
+
zones:
|
| 17 |
+
# Standard-density zones (A2 class, 8 kW/rack)
|
| 18 |
+
- zone_id: zone_a
|
| 19 |
+
containment_type: cold_aisle
|
| 20 |
+
recirculation_factor: 0.06 # Excellent containment
|
| 21 |
+
air_volume_m3: 800.0
|
| 22 |
+
envelope_r_kw: 0.015
|
| 23 |
+
initial_cold_aisle_temp_c: 20.0
|
| 24 |
+
initial_humidity_rh: 0.45
|
| 25 |
+
ashrae_class: A2
|
| 26 |
+
racks:
|
| 27 |
+
- { rack_id: A-01, row: A, position: 1, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 28 |
+
- { rack_id: A-02, row: A, position: 2, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 29 |
+
- { rack_id: A-03, row: A, position: 3, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 30 |
+
- { rack_id: A-04, row: A, position: 4, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 31 |
+
- { rack_id: A-05, row: A, position: 5, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 32 |
+
- { rack_id: A-06, row: A, position: 6, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 33 |
+
- { rack_id: A-07, row: A, position: 7, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 34 |
+
- { rack_id: A-08, row: A, position: 8, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 35 |
+
- { rack_id: A-09, row: A, position: 9, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 36 |
+
- { rack_id: A-10, row: A, position: 10, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 37 |
+
- { rack_id: A-11, row: A, position: 11, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 38 |
+
- { rack_id: A-12, row: A, position: 12, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 39 |
+
- { rack_id: A-13, row: A, position: 13, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 40 |
+
- { rack_id: A-14, row: A, position: 14, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 41 |
+
- { rack_id: A-15, row: A, position: 15, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 42 |
+
- { rack_id: A-16, row: A, position: 16, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 43 |
+
- { rack_id: A-17, row: A, position: 17, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 44 |
+
- { rack_id: A-18, row: A, position: 18, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 45 |
+
- { rack_id: A-19, row: A, position: 19, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 46 |
+
- { rack_id: A-20, row: A, position: 20, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 47 |
+
crac_units:
|
| 48 |
+
- { unit_id: CRAC-1, rated_capacity_kw: 100.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 15000.0, fan_rated_power_kw: 7.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
|
| 49 |
+
- { unit_id: CRAC-2, rated_capacity_kw: 100.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 15000.0, fan_rated_power_kw: 7.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
|
| 50 |
+
|
| 51 |
+
- zone_id: zone_b
|
| 52 |
+
containment_type: cold_aisle
|
| 53 |
+
recirculation_factor: 0.06
|
| 54 |
+
air_volume_m3: 800.0
|
| 55 |
+
envelope_r_kw: 0.015
|
| 56 |
+
initial_cold_aisle_temp_c: 20.0
|
| 57 |
+
initial_humidity_rh: 0.45
|
| 58 |
+
ashrae_class: A2
|
| 59 |
+
racks:
|
| 60 |
+
- { rack_id: B-01, row: B, position: 1, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 61 |
+
- { rack_id: B-02, row: B, position: 2, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 62 |
+
- { rack_id: B-03, row: B, position: 3, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 63 |
+
- { rack_id: B-04, row: B, position: 4, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 64 |
+
- { rack_id: B-05, row: B, position: 5, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 65 |
+
- { rack_id: B-06, row: B, position: 6, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 66 |
+
- { rack_id: B-07, row: B, position: 7, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 67 |
+
- { rack_id: B-08, row: B, position: 8, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 68 |
+
- { rack_id: B-09, row: B, position: 9, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 69 |
+
- { rack_id: B-10, row: B, position: 10, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 70 |
+
- { rack_id: B-11, row: B, position: 11, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 71 |
+
- { rack_id: B-12, row: B, position: 12, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 72 |
+
- { rack_id: B-13, row: B, position: 13, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 73 |
+
- { rack_id: B-14, row: B, position: 14, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 74 |
+
- { rack_id: B-15, row: B, position: 15, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 75 |
+
- { rack_id: B-16, row: B, position: 16, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 76 |
+
- { rack_id: B-17, row: B, position: 17, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 77 |
+
- { rack_id: B-18, row: B, position: 18, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 78 |
+
- { rack_id: B-19, row: B, position: 19, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 79 |
+
- { rack_id: B-20, row: B, position: 20, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 80 |
+
crac_units:
|
| 81 |
+
- { unit_id: CRAC-3, rated_capacity_kw: 100.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 15000.0, fan_rated_power_kw: 7.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
|
| 82 |
+
- { unit_id: CRAC-4, rated_capacity_kw: 100.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 15000.0, fan_rated_power_kw: 7.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
|
| 83 |
+
|
| 84 |
+
# High-density GPU zone (H1 class, 20 kW/rack)
|
| 85 |
+
- zone_id: zone_c
|
| 86 |
+
containment_type: hot_aisle
|
| 87 |
+
recirculation_factor: 0.05 # Hot aisle containment — tighter
|
| 88 |
+
air_volume_m3: 800.0
|
| 89 |
+
envelope_r_kw: 0.015
|
| 90 |
+
initial_cold_aisle_temp_c: 20.0
|
| 91 |
+
initial_humidity_rh: 0.45
|
| 92 |
+
ashrae_class: H1 # High-density class: 18-22°C recommended, 25°C allowable max
|
| 93 |
+
racks:
|
| 94 |
+
- { rack_id: C-01, row: C, position: 1, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
|
| 95 |
+
- { rack_id: C-02, row: C, position: 2, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
|
| 96 |
+
- { rack_id: C-03, row: C, position: 3, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
|
| 97 |
+
- { rack_id: C-04, row: C, position: 4, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
|
| 98 |
+
- { rack_id: C-05, row: C, position: 5, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
|
| 99 |
+
- { rack_id: C-06, row: C, position: 6, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
|
| 100 |
+
- { rack_id: C-07, row: C, position: 7, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
|
| 101 |
+
- { rack_id: C-08, row: C, position: 8, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
|
| 102 |
+
- { rack_id: C-09, row: C, position: 9, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
|
| 103 |
+
- { rack_id: C-10, row: C, position: 10, it_load_kw: 20.0, num_servers_2u: 10, server_thermal_mass_jk: 15000.0, airflow_cfm_per_kw: 120.0 }
|
| 104 |
+
crac_units:
|
| 105 |
+
- { unit_id: CRAC-5, rated_capacity_kw: 100.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 15000.0, fan_rated_power_kw: 7.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 17.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
|
| 106 |
+
- { unit_id: CRAC-6, rated_capacity_kw: 100.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 15000.0, fan_rated_power_kw: 7.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 17.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
|
| 107 |
+
|
| 108 |
+
# Standard-density zone D
|
| 109 |
+
- zone_id: zone_d
|
| 110 |
+
containment_type: cold_aisle
|
| 111 |
+
recirculation_factor: 0.06
|
| 112 |
+
air_volume_m3: 800.0
|
| 113 |
+
envelope_r_kw: 0.015
|
| 114 |
+
initial_cold_aisle_temp_c: 20.0
|
| 115 |
+
initial_humidity_rh: 0.45
|
| 116 |
+
ashrae_class: A2
|
| 117 |
+
racks:
|
| 118 |
+
- { rack_id: D-01, row: D, position: 1, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 119 |
+
- { rack_id: D-02, row: D, position: 2, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 120 |
+
- { rack_id: D-03, row: D, position: 3, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 121 |
+
- { rack_id: D-04, row: D, position: 4, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 122 |
+
- { rack_id: D-05, row: D, position: 5, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 123 |
+
- { rack_id: D-06, row: D, position: 6, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 124 |
+
- { rack_id: D-07, row: D, position: 7, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 125 |
+
- { rack_id: D-08, row: D, position: 8, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 126 |
+
- { rack_id: D-09, row: D, position: 9, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 127 |
+
- { rack_id: D-10, row: D, position: 10, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 128 |
+
crac_units:
|
| 129 |
+
- { unit_id: CRAC-7, rated_capacity_kw: 100.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 15000.0, fan_rated_power_kw: 7.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
|
| 130 |
+
- { unit_id: CRAC-8, rated_capacity_kw: 100.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 15000.0, fan_rated_power_kw: 7.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
|
| 131 |
+
|
| 132 |
+
power:
|
| 133 |
+
utility_voltage_v: 480.0
|
| 134 |
+
utility_available: true
|
| 135 |
+
ups_units:
|
| 136 |
+
- { unit_id: UPS-1, rated_capacity_kw: 500.0, loss_c0: 0.013, loss_c1: 0.006, loss_c2: 0.011, battery_capacity_kwh: 12.0, battery_discharge_efficiency: 0.90, battery_aging_factor: 0.85, battery_temp_c: 25.0, recharge_rate_kw: 8.0, initial_mode: double_conversion }
|
| 137 |
+
- { unit_id: UPS-2, rated_capacity_kw: 500.0, loss_c0: 0.013, loss_c1: 0.006, loss_c2: 0.011, battery_capacity_kwh: 12.0, battery_discharge_efficiency: 0.90, battery_aging_factor: 0.85, battery_temp_c: 25.0, recharge_rate_kw: 8.0, initial_mode: double_conversion }
|
| 138 |
+
- { unit_id: UPS-3, rated_capacity_kw: 500.0, loss_c0: 0.013, loss_c1: 0.006, loss_c2: 0.011, battery_capacity_kwh: 12.0, battery_discharge_efficiency: 0.90, battery_aging_factor: 0.85, battery_temp_c: 25.0, recharge_rate_kw: 8.0, initial_mode: double_conversion }
|
| 139 |
+
- { unit_id: UPS-4, rated_capacity_kw: 500.0, loss_c0: 0.013, loss_c1: 0.006, loss_c2: 0.011, battery_capacity_kwh: 12.0, battery_discharge_efficiency: 0.90, battery_aging_factor: 0.85, battery_temp_c: 25.0, recharge_rate_kw: 8.0, initial_mode: double_conversion }
|
| 140 |
+
pdus:
|
| 141 |
+
- { pdu_id: PDU-A-01, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 142 |
+
- { pdu_id: PDU-B-01, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 143 |
+
- { pdu_id: PDU-C-01, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 144 |
+
- { pdu_id: PDU-D-01, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 145 |
+
generator:
|
| 146 |
+
gen_id: GEN-1
|
| 147 |
+
rated_capacity_kw: 1500.0
|
| 148 |
+
start_delay_s: 4.0
|
| 149 |
+
crank_time_s: 5.0
|
| 150 |
+
warmup_time_s: 8.0
|
| 151 |
+
fuel_tank_liters: 5000.0
|
| 152 |
+
consumption_lph_full: 400.0
|
| 153 |
+
cooldown_time_s: 300.0
|
| 154 |
+
ats:
|
| 155 |
+
ats_id: ATS-1
|
| 156 |
+
transfer_time_ms: 100.0
|
| 157 |
+
retransfer_delay_s: 300.0
|
data/datacenter_configs/small_facility.yaml
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Small Facility Configuration
|
| 2 |
+
# 1 zone, 10 racks, 80 kW total IT load
|
| 3 |
+
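# N+1 cooling (2 CRACs × 70 kW = 140 kW rated, for 80 kW IT). NOTE(review): a single unit's 70 kW rating is below the 80 kW IT load at rated return temp — confirm the N+1 claim.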
|
| 4 |
+
# Single UPS, smaller generator
|
| 5 |
+
# Use case: edge datacenter, branch office
|
| 6 |
+
|
| 7 |
+
name: "DC-OPS Small Facility"
|
| 8 |
+
|
| 9 |
+
outside_temp_c: 30.0
|
| 10 |
+
outside_humidity_rh: 0.50
|
| 11 |
+
lighting_w_per_m2: 10.0
|
| 12 |
+
floor_area_m2: 300.0
|
| 13 |
+
simulation_dt_s: 1.0
|
| 14 |
+
|
| 15 |
+
zones:
|
| 16 |
+
- zone_id: zone_a
|
| 17 |
+
containment_type: cold_aisle
|
| 18 |
+
recirculation_factor: 0.10 # Slightly less tight containment
|
| 19 |
+
air_volume_m3: 300.0
|
| 20 |
+
envelope_r_kw: 0.03 # Less insulation than large facility
|
| 21 |
+
initial_cold_aisle_temp_c: 20.0
|
| 22 |
+
initial_humidity_rh: 0.45
|
| 23 |
+
ashrae_class: A2
|
| 24 |
+
racks:
|
| 25 |
+
- { rack_id: A-01, row: A, position: 1, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 26 |
+
- { rack_id: A-02, row: A, position: 2, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 27 |
+
- { rack_id: A-03, row: A, position: 3, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 28 |
+
- { rack_id: A-04, row: A, position: 4, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 29 |
+
- { rack_id: A-05, row: A, position: 5, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 30 |
+
- { rack_id: A-06, row: A, position: 6, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 31 |
+
- { rack_id: A-07, row: A, position: 7, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 32 |
+
- { rack_id: A-08, row: A, position: 8, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 33 |
+
- { rack_id: A-09, row: A, position: 9, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 34 |
+
- { rack_id: A-10, row: A, position: 10, it_load_kw: 8.0, num_servers_2u: 20, server_thermal_mass_jk: 11100.0, airflow_cfm_per_kw: 160.0 }
|
| 35 |
+
crac_units:
|
| 36 |
+
- { unit_id: CRAC-1, rated_capacity_kw: 70.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 12000.0, fan_rated_power_kw: 5.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
|
| 37 |
+
- { unit_id: CRAC-2, rated_capacity_kw: 70.0, rated_return_temp_c: 24.0, capacity_slope_per_c: 0.03, max_airflow_cfm: 12000.0, fan_rated_power_kw: 5.0, cop_rated: 3.5, cop_degradation_per_c: 0.04, initial_setpoint_c: 18.0, initial_fan_speed_pct: 100.0, supply_temp_lag_s: 30.0 }
|
| 38 |
+
|
| 39 |
+
power:
|
| 40 |
+
utility_voltage_v: 480.0
|
| 41 |
+
utility_available: true
|
| 42 |
+
ups_units:
|
| 43 |
+
- { unit_id: UPS-1, rated_capacity_kw: 200.0, loss_c0: 0.013, loss_c1: 0.006, loss_c2: 0.011, battery_capacity_kwh: 5.0, battery_discharge_efficiency: 0.90, battery_aging_factor: 0.85, battery_temp_c: 25.0, recharge_rate_kw: 3.0, initial_mode: double_conversion }
|
| 44 |
+
pdus:
|
| 45 |
+
- { pdu_id: PDU-A-01, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 46 |
+
- { pdu_id: PDU-A-02, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 47 |
+
- { pdu_id: PDU-A-03, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 48 |
+
- { pdu_id: PDU-A-04, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 49 |
+
- { pdu_id: PDU-A-05, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 50 |
+
- { pdu_id: PDU-A-06, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 51 |
+
- { pdu_id: PDU-A-07, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 52 |
+
- { pdu_id: PDU-A-08, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 53 |
+
- { pdu_id: PDU-A-09, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 54 |
+
- { pdu_id: PDU-A-10, voltage_ll_v: 208.0, max_current_per_phase_a: 24.0, num_phases: 3, breaker_rating_a: 20.0, num_outlets: 48, efficiency: 0.98, continuous_derating: 0.80 }
|
| 55 |
+
generator:
|
| 56 |
+
gen_id: GEN-1
|
| 57 |
+
rated_capacity_kw: 300.0
|
| 58 |
+
start_delay_s: 4.0
|
| 59 |
+
crank_time_s: 5.0
|
| 60 |
+
warmup_time_s: 8.0
|
| 61 |
+
fuel_tank_liters: 1000.0
|
| 62 |
+
consumption_lph_full: 80.0
|
| 63 |
+
cooldown_time_s: 300.0
|
| 64 |
+
ats:
|
| 65 |
+
ats_id: ATS-1
|
| 66 |
+
transfer_time_ms: 100.0
|
| 67 |
+
retransfer_delay_s: 300.0
|
models.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Pydantic models for the DC-Ops Environment.
|
| 9 |
+
|
| 10 |
+
Action: Natural-language operator commands (e.g., "adjust_setpoint CRAC-1 20").
|
| 11 |
+
Observation: Text dashboard + structured metadata for the LLM agent.
|
| 12 |
+
|
| 13 |
+
These use OpenEnv's Action/Observation base classes which enforce
|
| 14 |
+
`extra="forbid"` — only declared fields are allowed.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
from typing import Any, Dict, List
|
| 20 |
+
|
| 21 |
+
from openenv.core.env_server.types import Action, Observation
|
| 22 |
+
from pydantic import Field
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class DcOpsAction(Action):
    """A single operator command sent to the environment by the LLM agent.

    After reading the dashboard observation, the agent answers with one
    command string shaped as ``command_name [target] [value]``.

    Examples:
        - "diagnose CRAC-3"
        - "adjust_setpoint CRAC-1 20"
        - "increase_fan_speed CRAC-2 80"
        - "start_generator"
        - "acknowledge_alarm"
        - "escalate"
    """

    # Required: the raw command text.
    command: str = Field(
        ...,
        description="Operator command (e.g., 'diagnose CRAC-3', 'adjust_setpoint CRAC-1 20')",
    )
    # Optional free-form rationale; defaults to an empty string.
    reasoning: str = Field(default="", description="Optional chain-of-thought reasoning from the agent")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class DcOpsObservation(Observation):
    """Observation returned to the agent: a text monitoring dashboard plus context.

    ``dashboard`` carries the full text rendering of the current datacenter
    state, laid out like an operator's monitoring screen, and is the main
    field the LLM agent reads. Structured data is available through the
    ``metadata`` dict inherited from the base ``Observation``.
    """

    # Every field has a default, so an observation can be built incrementally.
    dashboard: str = Field(default="", description="Text-rendered monitoring dashboard")
    available_actions: List[str] = Field(default_factory=list, description="Valid commands the agent can issue")
    alert: str = Field(default="", description="Current active alert message, if any")
    scenario_type: str = Field(default="", description="Type of scenario (thermal, power, network, incident)")
    steps_remaining: int = Field(default=0, description="Steps left in episode budget")
    action_result: str = Field(
        default="",
        description="Feedback from the last action (success/error message)",
    )
|
openenv.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: dc_ops_env
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
| 7 |
+
|
openenv_dc_ops_env.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: openenv-dc_ops_env
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: Dc Ops Env environment for OpenEnv
|
| 5 |
+
Requires-Python: >=3.10
|
| 6 |
+
Requires-Dist: openenv-core[core]>=0.2.1
|
| 7 |
+
Provides-Extra: dev
|
| 8 |
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
| 9 |
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
openenv_dc_ops_env.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
__init__.py
|
| 3 |
+
client.py
|
| 4 |
+
models.py
|
| 5 |
+
pyproject.toml
|
| 6 |
+
openenv_dc_ops_env.egg-info/PKG-INFO
|
| 7 |
+
openenv_dc_ops_env.egg-info/SOURCES.txt
|
| 8 |
+
openenv_dc_ops_env.egg-info/dependency_links.txt
|
| 9 |
+
openenv_dc_ops_env.egg-info/entry_points.txt
|
| 10 |
+
openenv_dc_ops_env.egg-info/requires.txt
|
| 11 |
+
openenv_dc_ops_env.egg-info/top_level.txt
|
| 12 |
+
server/__init__.py
|
| 13 |
+
server/app.py
|
| 14 |
+
server/dc_ops_env_environment.py
|
| 15 |
+
tests/test_environment.py
|
| 16 |
+
tests/test_integration.py
|
| 17 |
+
tests/test_power.py
|
| 18 |
+
tests/test_rewards.py
|
| 19 |
+
tests/test_scenarios.py
|
| 20 |
+
tests/test_thermal.py
|
openenv_dc_ops_env.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
openenv_dc_ops_env.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
server = dc_ops_env.server.app:main
|
openenv_dc_ops_env.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.1
|
| 2 |
+
|
| 3 |
+
[dev]
|
| 4 |
+
pytest>=8.0.0
|
| 5 |
+
pytest-cov>=4.0.0
|
openenv_dc_ops_env.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
pyproject.toml
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
[build-system]
|
| 8 |
+
requires = ["setuptools>=45", "wheel"]
|
| 9 |
+
build-backend = "setuptools.build_meta"
|
| 10 |
+
|
| 11 |
+
[project]
|
| 12 |
+
name = "openenv-dc_ops_env"
|
| 13 |
+
version = "0.1.0"
|
| 14 |
+
description = "Dc Ops Env environment for OpenEnv"
|
| 15 |
+
requires-python = ">=3.10"
|
| 16 |
+
dependencies = [
|
| 17 |
+
# Core OpenEnv runtime (provides FastAPI server + HTTP client types)
|
| 18 |
+
# install from github
|
| 19 |
+
# "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
|
| 20 |
+
"openenv-core[core]>=0.2.1",
|
| 21 |
+
# Environment-specific dependencies
|
| 22 |
+
# Add all dependencies needed for your environment here
|
| 23 |
+
# Examples:
|
| 24 |
+
# "numpy>=1.19.0",
|
| 25 |
+
# "torch>=2.0.0",
|
| 26 |
+
# "gymnasium>=0.29.0",
|
| 27 |
+
# "openspiel>=1.0.0",
|
| 28 |
+
# "smolagents>=1.22.0,<2",
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
[project.optional-dependencies]
|
| 32 |
+
dev = [
|
| 33 |
+
"pytest>=8.0.0",
|
| 34 |
+
"pytest-cov>=4.0.0",
|
| 35 |
+
]
|
| 36 |
+
|
| 37 |
+
[project.scripts]
|
| 38 |
+
# Server entry point - enables running via: uv run --project . server
|
| 39 |
+
# or: python -m dc_ops_env.server.app
|
| 40 |
+
server = "dc_ops_env.server.app:main"
|
| 41 |
+
|
| 42 |
+
[tool.setuptools]
|
| 43 |
+
include-package-data = true
|
| 44 |
+
|
| 45 |
+
[tool.setuptools.packages.find]
|
| 46 |
+
where = ["."]
|
| 47 |
+
include = ["dc_ops_env*"]
|
| 48 |
+
|
| 49 |
+
[tool.setuptools.package-dir]
|
| 50 |
+
dc_ops_env = "."
|
rendering/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Dashboard rendering for the DC-Ops environment."""
|
| 8 |
+
|
| 9 |
+
from .dashboard import render_dashboard
|
| 10 |
+
|
| 11 |
+
__all__ = ["render_dashboard"]
|
rendering/dashboard.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Renders simulation state into a text-based monitoring dashboard.
|
| 9 |
+
|
| 10 |
+
The dashboard mimics what a real datacenter operator would see on their
|
| 11 |
+
NOC (Network Operations Center) screens. It is the primary observation
|
| 12 |
+
for the LLM agent.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
from ..config import ASHRAE_CLASSES, m3s_to_cfm
|
| 18 |
+
from ..simulation.types import (
|
| 19 |
+
CRACFaultType,
|
| 20 |
+
CRACState,
|
| 21 |
+
CRACStatus,
|
| 22 |
+
DatacenterState,
|
| 23 |
+
PowerState,
|
| 24 |
+
ZoneState,
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def render_dashboard(
    state: DatacenterState,
    *,
    alert: str = "",
    step: int = 0,
    max_steps: int = 15,
    scenario_type: str = "",
    top_n_racks: int = 5,
) -> str:
    """Render the full monitoring dashboard as a framed text string.

    Sections are emitted in a fixed order: header, optional alert, cooling
    units, zone temperatures, hottest racks, power, environment, footer.

    Args:
        state: Current datacenter simulation state.
        alert: Active alert message to display prominently. An empty string
            omits the alert section entirely.
        step: Current step number in the episode.
        max_steps: Maximum steps in the episode.
        scenario_type: Type of scenario being run; appended to the status
            line in brackets when non-empty.
        top_n_racks: How many of the hottest racks (by inlet temperature) to
            list in the rack section. Defaults to 5; negative values are
            treated as 0.

    Returns:
        Multi-line string formatted as a monitoring dashboard.
    """
    w = 68  # Inner width of the dashboard frame
    lines: list[str] = []

    def hline(char: str = "═") -> str:
        # Section separator spanning the full inner width.
        return f"╠{char * w}╣"

    def row(text: str) -> str:
        # Left-aligned content padded to the frame; longer text is not truncated.
        return f"║ {text:<{w - 2}} ║"

    table_rule = "-" * (w - 2)

    # Header
    lines.append(f"╔{'═' * w}╗")
    title = "DC-OPS MONITORING DASHBOARD"
    lines.append(f"║{title:^{w}}║")
    sim_min = state.sim_time_s / 60.0
    status_line = f"Sim Time: {sim_min:.1f} min Step: {step}/{max_steps}"
    if scenario_type:
        status_line += f" [{scenario_type}]"
    lines.append(row(status_line))

    # Alert section
    if alert:
        lines.append(hline())
        alert_prefix = "!! ALERT: "
        # Characters of alert text that fit on the first line after the prefix.
        remaining = w - 2 - len(alert_prefix)
        if len(alert) <= remaining:
            lines.append(row(f"{alert_prefix}{alert}"))
        else:
            lines.append(row(f"{alert_prefix}{alert[:remaining]}"))
            # Continuation lines carry the rest of the alert in fixed-size chunks.
            chunk = w - 4
            for i in range(remaining, len(alert), chunk):
                lines.append(row(f" {alert[i:i + chunk]}"))

    # Cooling Units
    lines.append(hline())
    lines.append(row("COOLING UNITS"))
    lines.append(row(f"{'Unit':<10} {'Status':<12} {'Setpoint':>8} {'Supply':>8} {'Fan%':>5} {'CFM':>7} {'kW':>6}"))
    lines.append(row(table_rule))

    for zone in state.zones:
        for crac in zone.crac_units:
            lines.append(row(_format_crac_row(crac, state.outside_temp_c, zone.hot_aisle_temp_c)))

    # Zone Temperatures
    lines.append(hline())
    lines.append(row("ZONE TEMPERATURES"))
    lines.append(row(f"{'Zone':<8} {'Cold Aisle':>10} {'Hot Aisle':>10} {'Max Inlet':>10} {'IT Load':>8} {'Class':>6}"))
    lines.append(row(table_rule))

    for zone in state.zones:
        lines.append(row(_format_zone_row(zone)))

    # Rack Detail (hottest racks across all zones)
    shown = max(0, top_n_racks)  # a negative count would otherwise slice from the end
    lines.append(hline())
    lines.append(row(f"RACK TEMPERATURES (top {shown} hottest)"))
    lines.append(row(f"{'Rack':<8} {'Inlet':>8} {'Outlet':>8} {'Load kW':>8} {'CFM':>7}"))
    lines.append(row(table_rule))

    # Collect all racks, sort by inlet temp descending
    all_racks = [rack for zone in state.zones for rack in zone.racks]
    all_racks.sort(key=lambda r: r.inlet_temp_c, reverse=True)
    for rack in all_racks[:shown]:
        cfm = m3s_to_cfm(rack.airflow_m3s)
        lines.append(row(
            f"{rack.rack_id:<8} {rack.inlet_temp_c:>7.1f}°C {rack.outlet_temp_c:>7.1f}°C "
            f"{rack.it_load_kw:>7.1f} {cfm:>7.0f}"
        ))

    # Power Section
    lines.append(hline())
    lines.append(row("POWER"))

    p_it = state.total_it_load_kw
    p_cooling = state.total_cooling_power_kw
    pue = state.pue

    lines.append(row(
        f"IT Load: {p_it:.1f} kW | Cooling: {p_cooling:.1f} kW | PUE: {pue:.2f}"
    ))

    if state.power is not None:
        lines.append(row(_format_power_section(state.power)))
        lines.append(row(_format_ups_summary(state.power)))
    else:
        # No power state attached: show placeholders instead.
        lines.append(row("UPS: N/A | Generator: N/A"))

    # Environment
    lines.append(hline())
    lines.append(row("ENVIRONMENT"))
    lines.append(row(
        f"Outside: {state.outside_temp_c:.1f}°C | "
        f"Humidity: {state.outside_humidity_rh * 100:.0f}% RH"
    ))

    # Footer
    lines.append(f"╚{'═' * w}╝")

    return "\n".join(lines)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def _format_crac_row(crac: CRACState, outside_temp_c: float, hot_aisle_temp_c: float) -> str:
    """Build the dashboard table row describing one CRAC unit.

    Args:
        crac: The CRAC unit to describe.
        outside_temp_c: Outside temperature passed to the unit's power model.
        hot_aisle_temp_c: Hot-aisle temperature passed to the unit's cooling model.

    Returns:
        A fixed-width row: unit id, status, setpoint, supply temperature,
        fan speed, airflow in CFM and electrical power in kW.
    """
    status = crac.status

    # Status column: a fault shows its specific type when one is recorded.
    if status == CRACStatus.FAULT:
        if crac.fault_type != CRACFaultType.NONE:
            label = crac.fault_type.value.upper()
        else:
            label = "FAULT"
        status_text = f"!! {label}"
    else:
        # Anything that is neither MAINTENANCE nor STANDBY is shown as RUNNING.
        status_text = {
            CRACStatus.MAINTENANCE: "MAINT",
            CRACStatus.STANDBY: "STANDBY",
        }.get(status, "RUNNING")

    # Supply temperature is only shown for a unit that is actually running.
    supply_text = f"{crac.supply_temp_c:.1f}°C" if status == CRACStatus.RUNNING else "---"

    airflow_cfm = m3s_to_cfm(crac.current_airflow_m3s)

    # Electrical draw is derived from the cooling output at current conditions.
    cooling_kw = crac.compute_cooling_output_kw(hot_aisle_temp_c)
    power_kw = crac.compute_power_consumption_kw(cooling_kw, outside_temp_c)

    left = f"{crac.unit_id:<10} {status_text:<12} {crac.setpoint_c:>7.1f}°C "
    right = f"{supply_text:>8} {crac.fan_speed_pct:>5.0f} {airflow_cfm:>7.0f} {power_kw:>6.1f}"
    return left + right
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def _format_zone_row(zone: ZoneState) -> str:
    """Build the dashboard table row describing one zone.

    The max-inlet column is followed by a two-character marker: ``*`` when
    the hottest inlet exceeds the zone's ASHRAE recommended maximum, ``!!``
    when it exceeds the allowable maximum, blank otherwise (including when
    the zone's ASHRAE class is not found in ``ASHRAE_CLASSES``).
    """
    limits = ASHRAE_CLASSES.get(zone.ashrae_class)
    peak_inlet = zone.max_inlet_temp_c

    flag = ""
    if limits:
        # The allowable breach takes precedence over the recommended one.
        if peak_inlet > limits.allowable_max_c:
            flag = "!!"
        elif peak_inlet > limits.recommended_max_c:
            flag = "*"

    columns = (
        f"{zone.zone_id:<8} {zone.cold_aisle_temp_c:>9.1f}°C ",
        f"{zone.hot_aisle_temp_c:>9.1f}°C {peak_inlet:>8.1f}°C{flag:<2}",
        f"{zone.total_it_load_kw:>7.1f} {zone.ashrae_class:>6}",
    )
    return "".join(columns)
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def _format_power_section(power: PowerState) -> str:
    """Summarize utility, generator and ATS status on one line.

    Returns:
        The individual status fragments joined with ``" | "``. A generator
        state or ATS position that is not explicitly handled below simply
        contributes no fragment.
    """
    # Imported locally, as in the rest of this function's history.
    from ..simulation.types import ATSPosition
    from ..simulation.types import GeneratorState as GS

    segments: list[str] = [
        "Utility: NORMAL" if power.utility_available else "Utility: DOWN"
    ]

    # Generator fragment
    generator = power.generator
    phase = generator.state
    if phase == GS.OFF:
        segments.append("Gen: OFF")
    elif phase == GS.LOADED:
        hours_left = generator.fuel_remaining_hours
        # Large runtimes are capped for display.
        fuel_text = f"{hours_left:.1f}h" if hours_left < 100 else ">100h"
        segments.append(
            f"Gen: LOADED {generator.load_fraction * 100:.0f}% (fuel: {fuel_text})"
        )
    elif phase in (GS.START_DELAY, GS.CRANKING, GS.WARMING):
        segments.append(f"Gen: STARTING ({phase.value})")
    elif phase == GS.READY:
        segments.append("Gen: READY")
    elif phase == GS.COOLDOWN:
        segments.append("Gen: COOLDOWN")

    # ATS fragment
    ats_labels = {
        ATSPosition.UTILITY: "ATS: UTILITY",
        ATSPosition.GENERATOR: "ATS: GENERATOR",
        ATSPosition.TRANSFERRING: "ATS: TRANSFERRING",
    }
    ats_text = ats_labels.get(power.ats.position)
    if ats_text is not None:
        segments.append(ats_text)

    return " | ".join(segments)
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def _format_ups_summary(power: PowerState) -> str:
    """Summarize every UPS unit on one ``"UPS: ..."`` line.

    Per unit the fragment is:
      * on battery: state of charge and, when finite, minutes of runtime left;
      * fault: just ``FAULT``;
      * any other mode: the mode name, load percentage and efficiency.

    Returns ``"UPS: N/A"`` when there are no UPS units.
    """
    units = power.ups_units
    if not units:
        return "UPS: N/A"

    def describe(unit) -> str:
        # All displayed quantities are read up front for every unit.
        charge_pct = unit.battery_soc * 100
        mode_label = unit.mode.value.upper().replace("_", " ")
        load_pct = unit.load_fraction * 100
        efficiency_pct = unit.efficiency * 100

        raw_mode = unit.mode.value
        if raw_mode == "fault":
            return f"{unit.unit_id}: FAULT"
        if raw_mode != "on_battery":
            return f"{unit.unit_id}: {mode_label} {load_pct:.0f}% η{efficiency_pct:.0f}%"

        # On battery: append the remaining runtime only when it is finite.
        remaining_s = unit.battery_time_remaining_s
        if remaining_s < float("inf"):
            runtime = f" {remaining_s / 60.0:.0f}min"
        else:
            runtime = ""
        return f"{unit.unit_id}: BATTERY {charge_pct:.0f}%{runtime}"

    return "UPS: " + " | ".join(describe(unit) for unit in units)
|
rewards/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Reward system for DC-Ops environment."""
|
| 8 |
+
|
| 9 |
+
from .reward_function import (
|
| 10 |
+
RewardComponents,
|
| 11 |
+
RewardFunction,
|
| 12 |
+
RewardWeights,
|
| 13 |
+
WEIGHT_PROFILES,
|
| 14 |
+
softplus,
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
# Public API of the rewards package: exactly the names re-exported from
# .reward_function above.
__all__ = [
    "RewardComponents",
    "RewardFunction",
    "RewardWeights",
    "WEIGHT_PROFILES",
    "softplus",
]
|
rewards/reward_function.py
ADDED
|
@@ -0,0 +1,428 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Multi-objective reward function for DC-Ops environment.
|
| 8 |
+
|
| 9 |
+
Research-informed design:
|
| 10 |
+
- Softplus barrier functions for safety constraints
|
| 11 |
+
(Google/DeepMind 2017, ICLR 2025 DC Cooling)
|
| 12 |
+
- Delta-based progress rewards for credit assignment
|
| 13 |
+
(process reward model literature)
|
| 14 |
+
- Normalized components in [-1, 1] via tanh
|
| 15 |
+
- Scenario-type-aware weight profiles
|
| 16 |
+
|
| 17 |
+
All components are bounded to [-1, 1]. Total reward is the weighted sum,
|
| 18 |
+
clamped to [-1, 1].
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
import math
|
| 24 |
+
from dataclasses import dataclass, field
|
| 25 |
+
from typing import Optional
|
| 26 |
+
|
| 27 |
+
from ..config import ASHRAE_CLASSES
|
| 28 |
+
from ..simulation.thermal import ThermalSimulation
|
| 29 |
+
from ..simulation.power import PowerSimulation
|
| 30 |
+
from ..simulation.types import UPSMode
|
| 31 |
+
from ..actions.parser import CommandResult
|
| 32 |
+
from ..scenarios.base import ScenarioResult
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ---------------------------------------------------------------------------
|
| 36 |
+
# Numerically stable softplus
|
| 37 |
+
# ---------------------------------------------------------------------------
|
| 38 |
+
def softplus(x: float) -> float:
    """Return softplus(x) = ln(1 + exp(x)) without overflowing.

    Outside the window [-20, 20] the asymptotes are returned directly:

    - x < -20: 0.0 (the true value is below ~2e-9)
    - x > 20: x itself (avoids overflow in exp)
    """
    if x < -20.0:
        return 0.0
    return x if x > 20.0 else math.log1p(math.exp(x))
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# ---------------------------------------------------------------------------
|
| 52 |
+
# Reward components dataclass
|
| 53 |
+
# ---------------------------------------------------------------------------
|
| 54 |
+
@dataclass
class RewardComponents:
    """Individual reward components for logging and analysis.

    One instance is returned per step by ``RewardFunction.compute``. Every
    field defaults to 0.0.
    """

    # Score from RewardFunction._thermal_safety (ASHRAE inlet-temperature barriers).
    thermal_safety: float = 0.0
    # Score from RewardFunction._power_safety (UPS battery / fault penalty).
    power_safety: float = 0.0
    # Score from RewardFunction._efficiency (PUE-based penalty).
    efficiency: float = 0.0
    # Score from RewardFunction._scenario_progress (change in scenario progress).
    scenario_progress: float = 0.0
    # Score from RewardFunction._procedure (scenario procedure reward, clamped).
    procedure: float = 0.0
    # Score from RewardFunction._action_quality (validity / repetition / action type).
    action_quality: float = 0.0
    # Not set by RewardFunction.compute in this module; stays at its default there.
    speed_bonus: float = 0.0
    # Weighted sum of the component scores, clamped to [-1, 1].
    total: float = 0.0
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# ---------------------------------------------------------------------------
|
| 69 |
+
# Weight profiles
|
| 70 |
+
# ---------------------------------------------------------------------------
|
| 71 |
+
@dataclass
class RewardWeights:
    """Weights for reward components. Should sum to 1.0.

    Each weight multiplies the like-named score in
    ``RewardFunction.compute``. The sum is not validated here; the defaults
    below add up to 1.0.
    """

    thermal_safety: float = 0.30
    power_safety: float = 0.10
    efficiency: float = 0.15
    scenario_progress: float = 0.25
    procedure: float = 0.15
    action_quality: float = 0.05
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# Weight profiles keyed by scenario type. RewardFunction.__init__ looks up the
# scenario type here and falls back to "default" for unknown types. Each
# profile's weights sum to 1.0.
WEIGHT_PROFILES: dict[str, RewardWeights] = {
    # Thermal scenarios: thermal safety and scenario progress dominate.
    "thermal": RewardWeights(
        thermal_safety=0.30,
        power_safety=0.05,
        efficiency=0.10,
        scenario_progress=0.30,
        procedure=0.20,
        action_quality=0.05,
    ),
    # Power scenarios: power safety, progress and procedure dominate.
    "power": RewardWeights(
        thermal_safety=0.10,
        power_safety=0.25,
        efficiency=0.05,
        scenario_progress=0.30,
        procedure=0.25,
        action_quality=0.05,
    ),
    # Fallback profile: scenario progress and procedure carry no weight.
    "default": RewardWeights(
        thermal_safety=0.30,
        power_safety=0.15,
        efficiency=0.25,
        scenario_progress=0.0,
        procedure=0.0,
        action_quality=0.30,
    ),
}
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# ---------------------------------------------------------------------------
|
| 112 |
+
# Softplus barrier constants
|
| 113 |
+
# ---------------------------------------------------------------------------
|
| 114 |
+
# Thermal barriers
# The two alphas divide the temperature excess before it is passed to
# softplus(), so a smaller alpha gives a sharper barrier.
_ALPHA_RECOMMENDED = 2.0  # °C transition width at recommended limit
_ALPHA_ALLOWABLE = 1.5  # °C transition width at allowable limit
_ALLOWABLE_WEIGHT = 3.0  # Multiplier on the allowable-limit barrier term
_THERMAL_NORM = 8.0  # Normalization so T=40°C (A2) → R≈-0.97

# Thermal safety positive baseline — small reward for being well within limits
# Based on DCRL-Green (ICLR 2025): agents learn faster with a positive signal
# for maintaining safe state, not just penalties for violations.
_SAFE_MARGIN_C = 3.0  # °C below recommended max to qualify as "safe"
_SAFE_BASELINE = 0.1  # Small positive reward when all zones safe

# Power barriers
_SOC_THRESHOLD = 0.5  # Concern increases below 50% SOC
_SOC_ALPHA = 0.15  # Sharp transition around threshold
_UPS_FAULT_PENALTY = 5.0  # Fixed penalty for UPS fault
_POWER_NORM = 4.0  # Normalization constant (divides the summed penalty before tanh)

# Efficiency
_PUE_NORM = 2.0  # PUE sensitivity: PUE=3.0 → R≈-0.76

# Action quality
# Command names that may be repeated without the repeated-command penalty.
_REPEAT_WHITELIST = frozenset({"wait", "check_status"})
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
# ---------------------------------------------------------------------------
|
| 140 |
+
# Main reward function
|
| 141 |
+
# ---------------------------------------------------------------------------
|
| 142 |
+
class RewardFunction:
    """Composable, research-informed reward function for DC operations.

    Every step yields six component scores (thermal safety, power safety,
    efficiency, scenario progress, procedure, action quality). They are
    combined with the active ``RewardWeights`` and the weighted sum is
    clamped to [-1, 1].

    The only state kept between steps is the previous scenario progress,
    which the delta-based progress component needs.

    Usage:
        rf = RewardFunction(scenario_type="thermal")
        rf.reset()  # Call at episode start

        # Each step:
        components = rf.compute(thermal_sim, power_sim, cmd_result,
                                action_command, action_history, scenario_result)
        reward = components.total
    """

    def __init__(
        self,
        scenario_type: str = "default",
        weights: Optional[RewardWeights] = None,
    ) -> None:
        """Select the weight profile.

        Args:
            scenario_type: Key into ``WEIGHT_PROFILES``; unknown keys fall
                back to the "default" profile.
            weights: Explicit weights; when given they override the profile.
        """
        self._scenario_type = scenario_type
        self._weights = weights or WEIGHT_PROFILES.get(
            scenario_type, WEIGHT_PROFILES["default"]
        )
        self._prev_progress: float = 0.0

    def reset(self) -> None:
        """Reset state between episodes."""
        self._prev_progress = 0.0

    def compute(
        self,
        thermal_sim: ThermalSimulation,
        power_sim: Optional[PowerSimulation],
        cmd_result: CommandResult,
        action_command: str,
        action_history: list[str],
        scenario_result: Optional[ScenarioResult],
    ) -> RewardComponents:
        """Compute all reward components and weighted total.

        Note: this call advances the stored scenario progress, so it must be
        invoked exactly once per step.

        Returns RewardComponents with per-component values and total.
        Total is clamped to [-1, 1]. ``speed_bonus`` is left at its default.
        """
        r_thermal = self._thermal_safety(thermal_sim)
        r_power = self._power_safety(power_sim)
        r_efficiency = self._efficiency(thermal_sim, power_sim)
        r_progress = self._scenario_progress(scenario_result)
        r_procedure = self._procedure(scenario_result)
        r_action = self._action_quality(
            cmd_result, action_command, action_history, thermal_sim, power_sim,
        )

        w = self._weights
        total = (
            w.thermal_safety * r_thermal
            + w.power_safety * r_power
            + w.efficiency * r_efficiency
            + w.scenario_progress * r_progress
            + w.procedure * r_procedure
            + w.action_quality * r_action
        )

        total = max(-1.0, min(1.0, total))

        return RewardComponents(
            thermal_safety=r_thermal,
            power_safety=r_power,
            efficiency=r_efficiency,
            scenario_progress=r_progress,
            procedure=r_procedure,
            action_quality=r_action,
            total=total,
        )

    # -------------------------------------------------------------------
    # Component implementations
    # -------------------------------------------------------------------

    @staticmethod
    def _thermal_safety(thermal_sim: ThermalSimulation) -> float:
        """ASHRAE compliance via dual softplus barriers.

        Returns value in [-1, _SAFE_BASELINE].
        Two barriers per zone: recommended (gentle) and allowable (steep).
        Averaged across zones so the signal is independent of zone count.
        Zones whose ASHRAE class is unknown are not evaluated. With no zones
        at all the result is 0.0.

        Positive baseline (+0.1) when ALL evaluated zones are well within
        safe range (>= _SAFE_MARGIN_C below recommended max). This provides
        a positive signal for maintaining good state, not just avoiding
        violations. (Informed by DCRL-Green, ICLR 2025.)
        """
        zones = thermal_sim.state.zones
        if not zones:
            return 0.0

        n_zones = len(zones)
        penalty = 0.0
        all_safe = True

        for zone in zones:
            ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
            if not ashrae:
                continue

            t = zone.max_inlet_temp_c
            rec_max = ashrae.recommended_max_c
            allow_max = ashrae.allowable_max_c

            # Zone must be at least _SAFE_MARGIN_C below the recommended max.
            # Written as "not <=" so a NaN temperature counts as unsafe.
            if not (t <= rec_max - _SAFE_MARGIN_C):
                all_safe = False

            # Soft barrier at recommended limit
            penalty += softplus((t - rec_max) / _ALPHA_RECOMMENDED) / n_zones
            # Harder barrier at allowable limit
            penalty += (
                _ALLOWABLE_WEIGHT
                * softplus((t - allow_max) / _ALPHA_ALLOWABLE)
                / n_zones
            )

        # The baseline is gated on the safe-margin check alone. The softplus
        # penalty is still ~0.2 exactly at the margin and only drops below
        # 1e-6 about 27 °C under the recommended limit, so additionally
        # requiring a near-zero penalty made the baseline unreachable at any
        # realistic inlet temperature.
        if all_safe:
            return _SAFE_BASELINE

        return -math.tanh(penalty / _THERMAL_NORM)

    @staticmethod
    def _power_safety(power_sim: Optional[PowerSimulation]) -> float:
        """UPS battery and fault condition penalty.

        Returns value in [-1, 0] (0.0 when there is no power simulation).
        Penalty compounds across multiple failing UPS units: a unit on
        battery adds a softplus barrier around _SOC_THRESHOLD, a faulted
        unit adds the fixed _UPS_FAULT_PENALTY.
        """
        if power_sim is None:
            return 0.0

        penalty = 0.0
        for ups in power_sim.state.ups_units:
            if ups.mode == UPSMode.ON_BATTERY:
                penalty += softplus((_SOC_THRESHOLD - ups.battery_soc) / _SOC_ALPHA)
            elif ups.mode == UPSMode.FAULT:
                penalty += _UPS_FAULT_PENALTY

        return -math.tanh(penalty / _POWER_NORM)

    @staticmethod
    def _efficiency(
        thermal_sim: ThermalSimulation,
        power_sim: Optional[PowerSimulation],
    ) -> float:
        """PUE-based energy efficiency penalty.

        Returns value in [-1, 0].
        PUE 1.0 (ideal) → 0, PUE 2.0 → -0.46, PUE 3.0 → -0.76.

        During power emergencies (any UPS on battery or in fault),
        efficiency is suppressed to zero — the agent should not be penalized
        for load shedding that increases PUE but correctly preserves battery
        life.
        """
        # Suppress efficiency signal during power emergencies
        if power_sim is not None:
            for ups in power_sim.state.ups_units:
                if ups.mode in (UPSMode.ON_BATTERY, UPSMode.FAULT):
                    return 0.0

        pue = thermal_sim.state.pue
        # A PUE below 1.0 is not physically meaningful (e.g. a 0 placeholder
        # when there is no IT load); clamp the excess so it can never turn
        # into a positive reward and the documented [-1, 0] range holds.
        excess = max(0.0, pue - 1.0)
        return -math.tanh(excess / _PUE_NORM)

    def _scenario_progress(self, scenario_result: Optional[ScenarioResult]) -> float:
        """Delta-based progress toward scenario resolution.

        Returns value in [-1, 1] (0.0 when there is no scenario result, in
        which case the stored progress is left untouched).
        Rewards the CHANGE in progress — gives credit to the action that
        actually caused forward progress. Updates the stored progress.
        """
        if scenario_result is None:
            return 0.0

        current = scenario_result.progress
        delta = current - self._prev_progress
        self._prev_progress = current

        return max(-1.0, min(1.0, delta))

    @staticmethod
    def _procedure(scenario_result: Optional[ScenarioResult]) -> float:
        """Procedural correctness from scenario rules.

        Returns the scenario's ``procedure_reward`` clamped to [-1, 1], or
        0.0 when there is no scenario result.
        """
        if scenario_result is None:
            return 0.0
        return max(-1.0, min(1.0, scenario_result.procedure_reward))

    @staticmethod
    def _action_quality(
        cmd_result: CommandResult,
        action_command: str,
        action_history: list[str],
        thermal_sim: ThermalSimulation,
        power_sim: Optional[PowerSimulation],
    ) -> float:
        """Action quality assessment.

        Returns one of a small set of fixed scores in [-0.5, 0.3].
        Considers: validity, repetition, action type, urgency context.
        """
        if not cmd_result.success:
            return -0.5

        cmd_lower = action_command.strip().lower()
        name = cmd_result.command_name

        # Check for exact repeated command — but whitelist commands that
        # are legitimately repeatable (wait, check_status).
        if name not in _REPEAT_WHITELIST:
            # The last history entry is excluded from the comparison.
            # NOTE(review): presumably the caller has already appended the
            # current command to action_history — confirm against the caller.
            prior = {h.strip().lower() for h in action_history[:-1]}
            if cmd_lower in prior:
                return -0.2

        # "wait" quality depends on whether there's an active concern
        if name == "wait":
            if _has_active_concern(thermal_sim, power_sim):
                # Waiting during a power event where we're waiting for
                # generator startup is acceptable — check if generator
                # is in startup sequence.
                if power_sim is not None and _generator_starting(power_sim):
                    return 0.1  # Waiting for gen to warm up is reasonable
                return -0.2  # Waiting while a concern is active
            return 0.0  # Nothing wrong, waiting is fine

        # Information-gathering actions are valuable
        if name in ("diagnose", "check_status"):
            return 0.3

        # Active interventions
        if name in (
            "adjust_setpoint", "set_fan_speed", "set_rack_load",
            "migrate_workload", "start_generator", "stop_generator",
            "set_ups_mode", "start_crac", "stop_crac", "refuel_generator",
        ):
            return 0.2

        # Administrative
        if name == "acknowledge_alarm":
            return 0.1

        # Escalation carries a small fixed negative score here.
        # NOTE(review): the previous comment claimed escalation is handled
        # solely by scenario procedure rules with no extra penalty in this
        # function, which contradicts the -0.1 below — confirm the intent.
        if name == "escalate":
            return -0.1

        return 0.1  # Other valid commands
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
# ---------------------------------------------------------------------------
|
| 401 |
+
# Helpers
|
| 402 |
+
# ---------------------------------------------------------------------------
|
| 403 |
+
def _has_active_concern(
    thermal_sim: ThermalSimulation,
    power_sim: Optional[PowerSimulation],
) -> bool:
    """Report whether any thermal or power condition currently needs attention.

    A concern exists when a zone's hottest inlet exceeds its ASHRAE
    recommended maximum, or when any UPS unit is running on battery.
    Zones with an unknown ASHRAE class are ignored.
    """
    # Thermal side: any zone above its recommended limit.
    for zone in thermal_sim.state.zones:
        limits = ASHRAE_CLASSES.get(zone.ashrae_class)
        if not limits:
            continue
        if zone.max_inlet_temp_c > limits.recommended_max_c:
            return True

    if not power_sim:
        return False

    # Power side: any UPS discharging its battery.
    # NOTE(review): a UPS in FAULT mode is not counted here, although
    # RewardFunction._efficiency treats FAULT as a power emergency — confirm.
    return any(
        unit.mode == UPSMode.ON_BATTERY for unit in power_sim.state.ups_units
    )
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
def _generator_starting(power_sim: PowerSimulation) -> bool:
    """Return True while the generator is in START_DELAY, CRANKING or WARMING.

    Used to decide whether waiting is a reasonable action for the agent.
    """
    from ..simulation.types import GeneratorState

    startup_phases = (
        GeneratorState.START_DELAY,
        GeneratorState.CRANKING,
        GeneratorState.WARMING,
    )
    current_phase = power_sim.state.generator.state
    return current_phase in startup_phases
|
scenarios/__init__.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Datacenter operation scenarios."""
|
| 8 |
+
|
| 9 |
+
from .base import ProcedureRule, Scenario, ScenarioResult
|
| 10 |
+
from .registry import (
|
| 11 |
+
get_scenario,
|
| 12 |
+
list_scenarios,
|
| 13 |
+
random_scenario,
|
| 14 |
+
register_scenario,
|
| 15 |
+
registered_scenario_ids,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
# Import scenario modules to trigger registration
|
| 19 |
+
from . import thermal_scenarios # noqa: F401
|
| 20 |
+
from . import power_scenarios # noqa: F401
|
| 21 |
+
|
| 22 |
+
# Public API of the scenarios package: the base types from .base and the
# registry helpers from .registry imported above.
__all__ = [
    "ProcedureRule",
    "Scenario",
    "ScenarioResult",
    "get_scenario",
    "list_scenarios",
    "random_scenario",
    "register_scenario",
    "registered_scenario_ids",
]
|
scenarios/base.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Abstract base class for datacenter operation scenarios.
|
| 9 |
+
|
| 10 |
+
A Scenario defines:
|
| 11 |
+
- Initial datacenter configuration overrides
|
| 12 |
+
- Fault injection (what goes wrong)
|
| 13 |
+
- Available actions for the agent
|
| 14 |
+
- Resolution criteria (how to "win")
|
| 15 |
+
- Scenario-specific reward shaping
|
| 16 |
+
- Procedural correctness rules (diagnose before repair, etc.)
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
|
| 21 |
+
from abc import ABC, abstractmethod
|
| 22 |
+
from dataclasses import dataclass, field
|
| 23 |
+
from typing import Any
|
| 24 |
+
|
| 25 |
+
from ..config import DatacenterConfig
|
| 26 |
+
from ..simulation.thermal import ThermalSimulation
|
| 27 |
+
from ..simulation.power import PowerSimulation
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass
class ProcedureRule:
    """A procedural correctness rule for reward shaping.

    Pure data holder; the rule is not evaluated by this class itself.

    Attributes:
        required_before: Commands that must appear before `trigger_command`.
        trigger_command: The command this rule applies to.
        bonus: Reward bonus if required_before was satisfied (default 0.3).
        penalty: Reward penalty if trigger_command issued without
            required_before (default -0.2, i.e. already negative).
        description: Human-readable explanation.
    """
    required_before: list[str]
    trigger_command: str
    bonus: float = 0.3
    penalty: float = -0.2
    description: str = ""
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@dataclass
class ScenarioResult:
    """Outcome of checking scenario state after a step.

    Returned by ``Scenario.evaluate_step``. All fields have defaults, so an
    empty result means "not resolved, no reward, no progress".

    Attributes:
        resolved: True if the incident is successfully resolved.
        resolution_message: Human-readable message on resolution.
        scenario_reward: Legacy scenario-specific reward (kept for compat).
        procedure_reward: Procedural correctness reward from check_procedure().
        progress: Normalized [0, 1] progress toward resolution.
            Used by the delta-based reward function for credit assignment.
        info: Additional scenario-specific data for logging.
    """
    resolved: bool = False
    resolution_message: str = ""
    scenario_reward: float = 0.0
    procedure_reward: float = 0.0
    progress: float = 0.0
    # default_factory gives every result its own dict instead of a shared one.
    info: dict[str, Any] = field(default_factory=dict)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class Scenario(ABC):
    """Abstract base class for datacenter operation scenarios.

    Lifecycle:
        1. Environment calls `reset_state()` at the start of each episode
        2. Environment calls `configure(config)` to get modified DatacenterConfig
        3. Environment calls `inject_fault(thermal_sim, power_sim)` after warmup
        4. Each step, environment calls `evaluate_step(...)` for reward + resolution
        5. Environment uses `alert_message`, `step_budget`, etc. for episode control

    Subclasses must implement all abstract methods/properties.
    """

    @abstractmethod
    def configure(self, base_config: DatacenterConfig) -> DatacenterConfig:
        """Optionally modify the datacenter configuration for this scenario.

        Override to change rack loads, outside temperature, number of CRACs, etc.
        Return the base_config unchanged if no modifications needed.
        """

    def reset_state(self) -> None:
        """Reset mutable episode state between episodes.

        Called by the environment at the start of each episode, before
        configure() / inject_fault(). Subclasses with mutable state
        (counters, flags) MUST override this and reset them.
        """

    @abstractmethod
    def inject_fault(
        self,
        thermal_sim: ThermalSimulation,
        power_sim: PowerSimulation | None,
    ) -> None:
        """Inject the fault or initial condition into the running simulation.

        Called after warmup, so the datacenter is at quasi-steady-state.
        """

    @abstractmethod
    def evaluate_step(
        self,
        thermal_sim: ThermalSimulation,
        power_sim: PowerSimulation | None,
        action_command: str,
        action_history: list[str],
        step: int,
    ) -> ScenarioResult:
        """Evaluate the current state after a step.

        Returns ScenarioResult with:
            - resolved: True if the incident is successfully resolved
            - scenario_reward: Scenario-specific reward component
            - procedure_reward: Procedural correctness reward
        """

    @property
    @abstractmethod
    def scenario_id(self) -> str:
        """Unique identifier, e.g. 'A1', 'B4'."""

    @property
    @abstractmethod
    def name(self) -> str:
        """Human-readable scenario name."""

    @property
    @abstractmethod
    def scenario_type(self) -> str:
        """Category: 'thermal', 'power', 'network', 'incident'."""

    @property
    @abstractmethod
    def difficulty(self) -> str:
        """'easy', 'medium', 'hard'."""

    @property
    @abstractmethod
    def step_budget(self) -> int:
        """Maximum steps allowed for this scenario."""

    @property
    @abstractmethod
    def alert_message(self) -> str:
        """Initial alert shown to the agent."""

    @property
    def game_time_per_step_s(self) -> float:
        """Simulation time per agent step. Override for faster/slower scenarios."""
        return 60.0

    @property
    def procedure_rules(self) -> list[ProcedureRule]:
        """Procedural correctness rules. Override to define scenario-specific rules."""
        return []

    @property
    def available_actions(self) -> list[str] | None:
        """Override to restrict available actions. None = all actions available."""
        return None

    def check_procedure(self, action_command: str, action_history: list[str]) -> float:
        """Check procedural correctness of the current action against history.

        Only the first whitespace-separated word of each command is compared,
        and the comparison is case-insensitive on both the agent's commands
        and the rule definitions.

        Args:
            action_command: The command issued this step.
            action_history: Commands issued so far. The last entry is
                excluded from the prerequisite check (it is treated as the
                current action — NOTE(review): confirm the environment
                appends the current command before calling this).

        Returns:
            Sum of `bonus` for every triggered rule whose `required_before`
            commands all appear earlier in the history, and `penalty` for
            every triggered rule with a missing prerequisite. 0.0 when no
            rule is triggered.
        """
        # procedure_rules is a property that may build a fresh list on every
        # access, so read it exactly once.
        rules = self.procedure_rules
        if not rules:
            return 0.0

        # Extract just the command name (first word), case-folded.
        words = action_command.split()
        cmd_name = words[0].lower() if words else ""

        # Set of command names already issued, for O(1) prerequisite lookups.
        seen_cmds = {
            entry.split()[0].lower()
            for entry in action_history[:-1]
            if entry.strip()
        }

        reward = 0.0
        for rule in rules:
            # Normalize the rule side too: a rule written as "Escalate" must
            # still match the lowercased agent command.
            if cmd_name != rule.trigger_command.lower():
                continue
            if all(req.lower() in seen_cmds for req in rule.required_before):
                reward += rule.bonus
            else:
                reward += rule.penalty
        return reward
|
scenarios/power_scenarios.py
ADDED
|
@@ -0,0 +1,496 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Power operation scenarios (Category B).
|
| 9 |
+
|
| 10 |
+
B1: UPS Alarm Response (Medium)
|
| 11 |
+
- UPS switches to battery after utility micro-outage
|
| 12 |
+
- Agent must verify UPS status, check battery, ensure generator readiness
|
| 13 |
+
B3: Generator Test Protocol (Easy)
|
| 14 |
+
- Monthly generator test — agent must follow proper procedure
|
| 15 |
+
- Start generator, verify output, run loaded test, cooldown, shutdown
|
| 16 |
+
B4: Power Failure Cascade (Hard)
|
| 17 |
+
- Full utility loss + generator slow to come online (extended warm-up)
|
| 18 |
+
- Agent must manage UPS battery time, shed load, troubleshoot generator
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
from ..config import ASHRAE_CLASSES, DatacenterConfig
|
| 24 |
+
from ..simulation.thermal import ThermalSimulation
|
| 25 |
+
from ..simulation.power import PowerSimulation
|
| 26 |
+
from ..simulation.types import GeneratorState, UPSMode
|
| 27 |
+
from .base import ProcedureRule, Scenario, ScenarioResult
|
| 28 |
+
from .registry import register_scenario
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# ===========================================================================
|
| 32 |
+
# B1: UPS Alarm Response (Medium)
|
| 33 |
+
# ===========================================================================
|
| 34 |
+
@register_scenario
class UPSAlarmResponse(Scenario):
    """Scenario B1: respond to a UPS that briefly transferred to battery.

    A short utility dip pushed the UPS onto battery; utility is already
    back by the time the agent sees the alert. Expected investigation:
        1. Check UPS status (diagnose UPS-1)
        2. Verify battery SOC
        3. Verify generator is in standby and ready
        4. Verify ATS is back on utility
        5. Acknowledge the alarm

    Resolution: the agent has issued a UPS diagnose command AND an
    acknowledge command. The system recovers by itself; the scenario
    scores the procedure.
    """

    _BATTERY_DRAIN_SECONDS = 30  # Brief outage duration

    def __init__(self) -> None:
        super().__init__()
        self._diagnosed_ups = self._acknowledged = False

    def reset_state(self) -> None:
        """Clear the per-episode diagnose/acknowledge flags."""
        self._diagnosed_ups = self._acknowledged = False

    def configure(self, base_config: DatacenterConfig) -> DatacenterConfig:
        """Return the configuration untouched; B1 needs no changes."""
        return base_config

    def inject_fault(
        self,
        thermal_sim: ThermalSimulation,
        power_sim: PowerSimulation | None,
    ) -> None:
        """Leave every UPS looking as if a short outage just ended."""
        if power_sim is not None:
            for unit in power_sim.state.ups_units:
                # ~15% of the battery was consumed while on battery...
                unit.battery_soc = 0.85
                # ...and the unit is already back on utility.
                unit.mode = UPSMode.DOUBLE_CONVERSION

    def evaluate_step(
        self,
        thermal_sim: ThermalSimulation,
        power_sim: PowerSimulation | None,
        action_command: str,
        action_history: list[str],
        step: int,
    ) -> ScenarioResult:
        """Update the diagnose/acknowledge flags and score the step."""
        normalized = action_command.strip().lower()

        # Latch the two milestones; once set they stay set for the episode.
        if normalized.startswith("diagnose") and "ups" in normalized:
            self._diagnosed_ups = True
        if normalized.startswith("acknowledge"):
            self._acknowledged = True

        diagnosed = self._diagnosed_ups
        acknowledged = self._acknowledged
        is_resolved = diagnosed and acknowledged

        # Reward for proper investigation: 0.3 for diagnosis, 0.2 for the ack.
        investigation_reward = (0.3 if diagnosed else 0.0) + (0.2 if acknowledged else 0.0)

        procedure_reward = self.check_procedure(action_command, action_history)

        # Progress: half for the diagnosis, half for the acknowledgement.
        completion = (0.5 if diagnosed else 0.0) + (0.5 if acknowledged else 0.0)

        if is_resolved:
            message = "UPS alarm properly investigated and acknowledged."
        else:
            message = ""

        return ScenarioResult(
            resolved=is_resolved,
            resolution_message=message,
            scenario_reward=investigation_reward,
            procedure_reward=procedure_reward,
            progress=completion,
            info={
                "diagnosed_ups": diagnosed,
                "acknowledged": acknowledged,
            },
        )

    @property
    def scenario_id(self) -> str:
        return "B1"

    @property
    def name(self) -> str:
        return "UPS Alarm Response"

    @property
    def scenario_type(self) -> str:
        return "power"

    @property
    def difficulty(self) -> str:
        return "medium"

    @property
    def step_budget(self) -> int:
        return 10

    @property
    def alert_message(self) -> str:
        return (
            "WARNING: UPS-1 transferred to battery at 14:23:05. "
            "Utility restored at 14:23:35. Battery SOC: 85%. "
            "Verify system status and acknowledge."
        )

    @property
    def procedure_rules(self) -> list[ProcedureRule]:
        """Single rule: a diagnose command must precede acknowledge_alarm."""
        diagnose_first = ProcedureRule(
            required_before=["diagnose"],
            trigger_command="acknowledge_alarm",
            bonus=0.3,
            penalty=-0.2,
            description="Diagnose UPS before acknowledging alarm",
        )
        return [diagnose_first]
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
# ===========================================================================
|
| 165 |
+
# B3: Generator Test Protocol (Easy)
|
| 166 |
+
# ===========================================================================
|
| 167 |
+
@register_scenario
class GeneratorTestProtocol(Scenario):
    """Scenario B3: walk through the monthly generator test.

    Expected command sequence:
        1. check_status      — Review current system state
        2. start_generator   — Initiate startup
        3. diagnose GEN-1    — Verify engine started and output is stable
        4. stop_generator    — Initiate cooldown
        5. acknowledge_alarm — Log test completion

    Resolution: the start, verify, stop and acknowledge milestones were all
    reached, in that order.
    """

    def __init__(self) -> None:
        super().__init__()
        self._started = False
        self._verified = False
        self._stopped = False
        self._completed = False

    def reset_state(self) -> None:
        """Clear all four protocol milestones for a new episode."""
        self._started = self._verified = self._stopped = self._completed = False

    def configure(self, base_config: DatacenterConfig) -> DatacenterConfig:
        """Return the configuration untouched; B3 needs no changes."""
        return base_config

    def inject_fault(
        self,
        thermal_sim: ThermalSimulation,
        power_sim: PowerSimulation | None,
    ) -> None:
        """Do nothing: this is a routine test procedure, not a fault."""
        return None

    def evaluate_step(
        self,
        thermal_sim: ThermalSimulation,
        power_sim: PowerSimulation | None,
        action_command: str,
        action_history: list[str],
        step: int,
    ) -> ScenarioResult:
        """Advance the protocol milestones and score the step."""
        normalized = action_command.strip().lower()

        # Each milestone only latches once the one before it has been reached.
        if normalized.startswith("start_generator"):
            self._started = True
        if self._started and normalized.startswith("diagnose") and "gen" in normalized:
            self._verified = True
        if normalized.startswith("stop_generator") and self._started and self._verified:
            self._stopped = True
        if normalized.startswith("acknowledge") and self._stopped:
            self._completed = True

        # Informational only: is the generator anywhere in its run cycle?
        active_states = (
            GeneratorState.READY, GeneratorState.LOADED,
            GeneratorState.WARMING, GeneratorState.CRANKING,
            GeneratorState.START_DELAY,
        )
        gen_running = bool(power_sim) and power_sim.state.generator.state in active_states

        # (milestone reached, scenario reward weight), in protocol order.
        # Accumulated one at a time so float rounding matches a plain
        # chain of `+=` statements.
        milestones = (
            (self._started, 0.1),
            (self._verified, 0.2),
            (self._stopped, 0.2),
            (self._completed, 0.3),
        )
        scenario_reward = 0.0
        progress = 0.0
        for reached, weight in milestones:
            if reached:
                scenario_reward += weight
                progress += 0.25  # 25% per protocol step

        procedure_reward = self.check_procedure(action_command, action_history)

        done = self._completed
        return ScenarioResult(
            resolved=done,
            resolution_message="Generator test protocol completed successfully." if done else "",
            scenario_reward=scenario_reward,
            procedure_reward=procedure_reward,
            progress=progress,
            info={
                "started": self._started,
                "verified": self._verified,
                "stopped": self._stopped,
                "completed": self._completed,
                "gen_running": gen_running,
            },
        )

    @property
    def scenario_id(self) -> str:
        return "B3"

    @property
    def name(self) -> str:
        return "Generator Test Protocol"

    @property
    def scenario_type(self) -> str:
        return "power"

    @property
    def difficulty(self) -> str:
        return "easy"

    @property
    def step_budget(self) -> int:
        return 10

    @property
    def alert_message(self) -> str:
        return (
            "SCHEDULED: Monthly generator test due. "
            "Follow standard test protocol: start, verify, loaded test, shutdown."
        )

    @property
    def procedure_rules(self) -> list[ProcedureRule]:
        """Two ordering rules: status check before start, start before stop."""
        status_before_start = ProcedureRule(
            required_before=["check_status"],
            trigger_command="start_generator",
            bonus=0.2,
            penalty=-0.1,
            description="Check system status before starting generator",
        )
        start_before_stop = ProcedureRule(
            required_before=["start_generator"],
            trigger_command="stop_generator",
            bonus=0.2,
            penalty=-0.3,
            description="Must start generator before stopping it",
        )
        return [status_before_start, start_before_stop]

    @property
    def game_time_per_step_s(self) -> float:
        # Generator startup is ~17s, so 30s per step lets agent observe transitions
        return 30.0
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
# ===========================================================================
|
| 327 |
+
# B4: Power Failure Cascade (Hard)
|
| 328 |
+
# ===========================================================================
|
| 329 |
+
@register_scenario
class PowerFailureCascade(Scenario):
    """Full utility loss with degraded generator response.

    Scenario: Utility power fails. Generator starts but takes longer than
    usual (warm-up extended). UPS batteries are bridging the gap.
    Meanwhile, battery SOC is dropping.

    The agent must:
        1. Diagnose UPS status and battery levels
        2. Verify generator startup sequence
        3. Shed non-critical IT load to extend battery life
        4. Monitor temperatures (no cooling compressors during transfer)
        5. Stabilize once generator is online

    Resolution: for `_CONSECUTIVE_STABLE_STEPS` consecutive steps, the
    generator is LOADED, every zone's max inlet temperature is within its
    ASHRAE allowable maximum, and no UPS battery is below
    `_MIN_BATTERY_SOC` (10%). Independently of resolution, the scenario
    reward is penalized while the lowest battery is below
    `_LOW_BATTERY_SOC` (20%).
    """

    _CONSECUTIVE_STABLE_STEPS = 2
    _MIN_BATTERY_SOC = 0.10  # below this a UPS counts as "not ok" for resolution
    _LOW_BATTERY_SOC = 0.20  # below this the reward is penalized

    def __init__(self) -> None:
        super().__init__()
        self._stable_count = 0

    def reset_state(self) -> None:
        """Reset the consecutive-stable-step counter for a new episode."""
        self._stable_count = 0

    def configure(self, base_config: DatacenterConfig) -> DatacenterConfig:
        """Lengthen generator warm-up to make the outage harder to ride out.

        Note: mutates and returns the passed-in `base_config`.
        """
        # Extend generator warmup to make it more challenging
        base_config.power.generator.warmup_time_s = 15.0  # Longer than default 8s
        return base_config

    def inject_fault(
        self,
        thermal_sim: ThermalSimulation,
        power_sim: PowerSimulation | None,
    ) -> None:
        """Cut utility power; a no-op when there is no power simulation."""
        if power_sim is None:
            return
        power_sim.set_utility_available(False)

    def evaluate_step(
        self,
        thermal_sim: ThermalSimulation,
        power_sim: PowerSimulation | None,
        action_command: str,
        action_history: list[str],
        step: int,
    ) -> ScenarioResult:
        """Score the step on temperatures, generator state and battery SOC."""
        # Check temperatures: track the worst overshoot above allowable max.
        all_within_allowable = True
        max_over = 0.0
        for zone in thermal_sim.state.zones:
            ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
            if not ashrae:
                # Unknown ASHRAE class: zone is not evaluated.
                continue
            overshoot = zone.max_inlet_temp_c - ashrae.allowable_max_c
            if zone.max_inlet_temp_c > ashrae.allowable_max_c:
                all_within_allowable = False
                max_over = max(max_over, overshoot)

        # Check power recovery. Without a power sim the generator can never
        # be LOADED, so the scenario cannot resolve.
        gen_loaded = False
        battery_ok = True
        min_soc = None
        if power_sim:
            power_state = power_sim.state
            gen_loaded = power_state.generator.state == GeneratorState.LOADED
            for ups in power_state.ups_units:
                if ups.battery_soc < self._MIN_BATTERY_SOC:
                    battery_ok = False
            # With no UPS units there is nothing to drain: treat as full.
            min_soc = (
                min(u.battery_soc for u in power_state.ups_units)
                if power_state.ups_units
                else 1.0
            )

        stable = all_within_allowable and gen_loaded and battery_ok
        if stable:
            self._stable_count += 1
        else:
            # Any unstable step restarts the consecutive-step requirement.
            self._stable_count = 0

        resolved = self._stable_count >= self._CONSECUTIVE_STABLE_STEPS

        # Reward shaping
        scenario_reward = 0.0
        # Penalty for temperature overshoot
        if max_over > 0:
            scenario_reward -= max_over * 1.5
        # Reward for generator online
        if gen_loaded:
            scenario_reward += 0.3
        # Penalty for low battery
        if min_soc is not None and min_soc < self._LOW_BATTERY_SOC:
            scenario_reward -= (self._LOW_BATTERY_SOC - min_soc) * 5.0

        procedure_reward = self.check_procedure(action_command, action_history)

        # Progress: partial credit per condition, full credit for stability
        conditions_met = sum([gen_loaded, all_within_allowable, battery_ok])
        if conditions_met == 3:
            progress = 0.5 + 0.5 * min(1.0, self._stable_count / self._CONSECUTIVE_STABLE_STEPS)
        else:
            progress = (conditions_met / 3.0) * 0.5

        return ScenarioResult(
            resolved=resolved,
            resolution_message="Power failure resolved. Generator online, temps stable." if resolved else "",
            scenario_reward=scenario_reward,
            procedure_reward=procedure_reward,
            progress=progress,
            info={
                "max_overshoot_c": max_over,
                "gen_loaded": gen_loaded,
                "battery_ok": battery_ok,
                "stable_count": self._stable_count,
            },
        )

    @property
    def scenario_id(self) -> str:
        return "B4"

    @property
    def name(self) -> str:
        return "Power Failure Cascade"

    @property
    def scenario_type(self) -> str:
        return "power"

    @property
    def difficulty(self) -> str:
        return "hard"

    @property
    def step_budget(self) -> int:
        return 20

    @property
    def alert_message(self) -> str:
        return (
            "CRITICAL: Utility power lost. UPS on battery. "
            "Generator startup in progress. "
            "Battery SOC declining. Immediate action required."
        )

    @property
    def procedure_rules(self) -> list[ProcedureRule]:
        """Ordering rule for load shedding plus an unconditional escalate penalty."""
        return [
            ProcedureRule(
                required_before=["diagnose"],
                trigger_command="set_rack_load",
                bonus=0.3,
                penalty=-0.1,
                description="Diagnose before shedding load",
            ),
            ProcedureRule(
                # With no prerequisites the rule is always "satisfied"
                # (all() of an empty sequence is True), so check_procedure
                # applies `bonus`, never `penalty`. The penalty therefore
                # has to be carried by `bonus` for escalation to cost
                # anything; `penalty` is kept equal for clarity.
                required_before=[],
                trigger_command="escalate",
                bonus=-0.5,
                penalty=-0.5,
                description="Escalation during power cascade is heavily penalized",
            ),
        ]

    @property
    def game_time_per_step_s(self) -> float:
        # Fast progression — every second counts with battery draining
        return 15.0
|
scenarios/registry.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Scenario registry for selecting scenarios by ID, type, or difficulty.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import random
|
| 14 |
+
from typing import Optional
|
| 15 |
+
|
| 16 |
+
from .base import Scenario
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Global registry: scenario_id → Scenario class
|
| 20 |
+
_REGISTRY: dict[str, type[Scenario]] = {}
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def register_scenario(cls: type[Scenario]) -> type[Scenario]:
    """Class decorator that adds a scenario class to the global registry.

    The class is stored under the `scenario_id` reported by a freshly
    constructed instance, so it must be constructible with no arguments.
    Registering a second class with the same ID replaces the first.

    Usage:
        @register_scenario
        class MyCoolScenario(Scenario):
            ...

    Returns:
        The class itself, unchanged.
    """
    # scenario_id is an instance property, so a throwaway instance is needed
    # just to read it.
    key = cls().scenario_id
    _REGISTRY[key] = cls
    return cls
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def get_scenario(scenario_id: str) -> Scenario:
    """Build a fresh instance of the scenario registered under `scenario_id`.

    Raises:
        KeyError: If no scenario with that ID (e.g. 'A1', 'B4') is
            registered; the message lists the available IDs.
    """
    scenario_cls = _REGISTRY.get(scenario_id)
    if scenario_cls is not None:
        return scenario_cls()

    available = ", ".join(sorted(_REGISTRY.keys()))
    raise KeyError(f"Unknown scenario '{scenario_id}'. Available: {available}")
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def list_scenarios(
    *,
    scenario_type: Optional[str] = None,
    difficulty: Optional[str] = None,
) -> list[Scenario]:
    """Instantiate every registered scenario that passes the given filters.

    A filter that is None (or otherwise falsy, e.g. an empty string) is
    ignored. Results follow the registry's iteration order, and a new
    instance is created on every call.
    """

    def _accepted(candidate: Scenario) -> bool:
        # A filter only rejects when it is set AND the attribute differs.
        if scenario_type and candidate.scenario_type != scenario_type:
            return False
        if difficulty and candidate.difficulty != difficulty:
            return False
        return True

    instances = (scenario_cls() for scenario_cls in _REGISTRY.values())
    return [candidate for candidate in instances if _accepted(candidate)]
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def random_scenario(
    *,
    scenario_type: Optional[str] = None,
    difficulty: Optional[str] = None,
    seed: Optional[int] = None,
) -> Scenario:
    """Return one randomly chosen scenario instance matching the filters.

    Args:
        scenario_type: Optional category filter, passed to list_scenarios().
        difficulty: Optional difficulty filter, passed to list_scenarios().
        seed: Seed for a dedicated `random.Random`; the module-level
            random state is never touched.

    Raises:
        ValueError: If no registered scenario matches the filters.
    """
    pool = list_scenarios(scenario_type=scenario_type, difficulty=difficulty)
    if pool:
        return random.Random(seed).choice(pool)

    raise ValueError(
        f"No scenarios match type={scenario_type!r}, difficulty={difficulty!r}"
    )
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def registered_scenario_ids() -> list[str]:
    """List the IDs of all registered scenarios, sorted ascending."""
    # Iterating a dict yields its keys, so no explicit .keys() is needed.
    return sorted(_REGISTRY)
|
scenarios/thermal_scenarios.py
ADDED
|
@@ -0,0 +1,443 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Thermal operation scenarios (Category A).
|
| 9 |
+
|
| 10 |
+
A1: Cooling Setpoint Optimization (Easy)
|
| 11 |
+
- PUE is high because setpoints are too low
|
| 12 |
+
- Agent must raise setpoints to improve efficiency without violating ASHRAE
|
| 13 |
+
A2: Thermal Event Response (Medium)
|
| 14 |
+
- Single CRAC failure causes temperature rise
|
| 15 |
+
- Agent must diagnose, compensate, and stabilize
|
| 16 |
+
A4: CRAC Failure Cascade (Hard)
|
| 17 |
+
- Two CRACs fail in quick succession
|
| 18 |
+
- Agent must triage, redistribute cooling, migrate workload
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
from ..config import ASHRAE_CLASSES, DatacenterConfig
|
| 24 |
+
from ..simulation.thermal import ThermalSimulation
|
| 25 |
+
from ..simulation.power import PowerSimulation
|
| 26 |
+
from ..simulation.types import CRACFaultType
|
| 27 |
+
from .base import ProcedureRule, Scenario, ScenarioResult
|
| 28 |
+
from .registry import register_scenario
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# ===========================================================================
|
| 32 |
+
# A1: Cooling Setpoint Optimization (Easy)
|
| 33 |
+
# ===========================================================================
|
| 34 |
+
@register_scenario
class CoolingSetpointOptimization(Scenario):
    """Scenario A1 (easy): raise over-cooled CRAC setpoints to reduce PUE.

    Initial condition: every CRAC starts at a 15°C setpoint (overly
    aggressive cooling), which wastes energy and keeps PUE unnecessarily high.

    Goal: raise setpoints closer to the ASHRAE recommended range (18-27°C for
    A2) while keeping all inlet temperatures within recommended limits.

    Resolution: PUE drops below ``_PUE_TARGET`` AND no zone's maximum inlet
    temperature exceeds its ASHRAE recommended maximum.
    """

    _PUE_TARGET = 1.6  # Achievable PUE with proper setpoints
    _PUE_BASELINE = 2.0  # Approximate PUE at the initial 15°C setpoints
    _INITIAL_SETPOINT_C = 15.0  # Deliberately too cold, wasting energy

    def configure(self, base_config: DatacenterConfig) -> DatacenterConfig:
        """Set every CRAC's initial setpoint to ``_INITIAL_SETPOINT_C``.

        The passed-in config is modified in place and returned.
        """
        for zone_cfg in base_config.zones:
            for crac_cfg in zone_cfg.crac_units:
                crac_cfg.initial_setpoint_c = self._INITIAL_SETPOINT_C
        return base_config

    def inject_fault(
        self,
        thermal_sim: ThermalSimulation,
        power_sim: PowerSimulation | None,
    ) -> None:
        """Do nothing: this is an optimization scenario with no fault.

        The "problem" is already baked into the config (low setpoints).
        """

    @staticmethod
    def _exceeds_recommended_max(zone) -> bool:
        """Return True if the zone's max inlet temp is above its ASHRAE recommended max.

        Zones whose ASHRAE class is not found in ``ASHRAE_CLASSES`` are
        treated as compliant.
        """
        ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
        return bool(ashrae) and zone.max_inlet_temp_c > ashrae.recommended_max_c

    def evaluate_step(
        self,
        thermal_sim: ThermalSimulation,
        power_sim: PowerSimulation | None,
        action_command: str,
        action_history: list[str],
        step: int,
    ) -> ScenarioResult:
        """Score the current step by PUE improvement and temperature compliance.

        Returns a ``ScenarioResult`` where:
          * ``resolved`` is True when PUE is below ``_PUE_TARGET`` and every
            zone is within its recommended maximum;
          * ``scenario_reward`` is half of the PUE headroom below
            ``_PUE_BASELINE`` (never negative);
          * ``progress`` is 70% normalized PUE improvement plus 30% for
            temperature compliance.
        """
        dc = thermal_sim.state
        pue = dc.pue

        # any() stops at the first non-compliant zone.
        all_within_recommended = not any(
            self._exceeds_recommended_max(zone) for zone in dc.zones
        )

        # Reward: improvement toward target PUE (higher is better as PUE drops).
        pue_headroom = self._PUE_BASELINE - pue
        pue_reward = max(0.0, pue_headroom)

        resolved = pue < self._PUE_TARGET and all_within_recommended
        procedure_reward = self.check_procedure(action_command, action_history)

        # Progress: fraction of the baseline-to-target gap closed, clamped to [0, 1].
        pue_progress = max(
            0.0,
            min(1.0, pue_headroom / (self._PUE_BASELINE - self._PUE_TARGET)),
        )
        temp_factor = 1.0 if all_within_recommended else 0.0
        progress = 0.7 * pue_progress + 0.3 * temp_factor

        return ScenarioResult(
            resolved=resolved,
            resolution_message="PUE optimized within target range." if resolved else "",
            scenario_reward=pue_reward * 0.5,
            procedure_reward=procedure_reward,
            progress=progress,
            info={"pue": pue, "target_pue": self._PUE_TARGET},
        )

    @property
    def scenario_id(self) -> str:
        """Scenario identifier."""
        return "A1"

    @property
    def name(self) -> str:
        """Human-readable scenario name."""
        return "Cooling Setpoint Optimization"

    @property
    def scenario_type(self) -> str:
        """Scenario category label."""
        return "thermal"

    @property
    def difficulty(self) -> str:
        """Difficulty label."""
        return "easy"

    @property
    def step_budget(self) -> int:
        """Step budget for this scenario."""
        return 10

    @property
    def alert_message(self) -> str:
        """Alert text for this scenario."""
        return (
            "NOTICE: PUE exceeds 1.8 — cooling setpoints may be suboptimal. "
            "Review CRAC setpoints and adjust for energy efficiency."
        )

    @property
    def procedure_rules(self) -> list[ProcedureRule]:
        """Procedure rules: status should be checked before changing setpoints."""
        return [
            ProcedureRule(
                required_before=["check_status"],
                trigger_command="adjust_setpoint",
                bonus=0.2,
                penalty=-0.1,
                description="Check status before adjusting setpoints",
            ),
        ]
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
# ===========================================================================
|
| 146 |
+
# A2: Thermal Event Response (Medium)
|
| 147 |
+
# ===========================================================================
|
| 148 |
+
@register_scenario
class ThermalEventResponse(Scenario):
    """Scenario A2 (medium): recover from a single CRAC compressor failure.

    One CRAC unit loses its compressor, reducing cooling capacity. With N+1
    provisioning the remaining CRACs can carry the load, but the agent
    is expected to:
      1. Diagnose the failed unit
      2. Increase fan speeds or adjust setpoints on the remaining CRACs
      3. Optionally reduce load on the hottest racks

    Resolution: all inlet temps within the recommended range for 2+
    consecutive steps.
    """

    _FAILED_UNIT = "CRAC-3"
    _CONSECUTIVE_STABLE_STEPS = 2

    def __init__(self) -> None:
        super().__init__()
        # Number of consecutive evaluated steps with every zone in range.
        self._stable_count = 0

    def reset_state(self) -> None:
        """Clear the consecutive-stable-step counter."""
        self._stable_count = 0

    def configure(self, base_config: DatacenterConfig) -> DatacenterConfig:
        """Return the base config unchanged; the default layout is used as is."""
        return base_config

    def inject_fault(
        self,
        thermal_sim: ThermalSimulation,
        power_sim: PowerSimulation | None,
    ) -> None:
        """Inject a compressor fault on ``_FAILED_UNIT``."""
        thermal_sim.inject_crac_fault(self._FAILED_UNIT, CRACFaultType.COMPRESSOR)

    def evaluate_step(
        self,
        thermal_sim: ThermalSimulation,
        power_sim: PowerSimulation | None,
        action_command: str,
        action_history: list[str],
        step: int,
    ) -> ScenarioResult:
        """Score the step by how far zones sit above their recommended maximum.

        Zones whose ASHRAE class is not in ``ASHRAE_CLASSES`` are ignored.
        The scenario resolves once every checked zone has stayed within its
        recommended maximum for ``_CONSECUTIVE_STABLE_STEPS`` evaluations in
        a row.
        """
        # Collect the excess (in °C) of every zone above its recommended max.
        excesses = []
        for zone in thermal_sim.state.zones:
            limits = ASHRAE_CLASSES.get(zone.ashrae_class)
            if limits and zone.max_inlet_temp_c > limits.recommended_max_c:
                excesses.append(zone.max_inlet_temp_c - limits.recommended_max_c)

        in_range = not excesses
        worst_excess = max(excesses, default=0.0)

        # Any out-of-range step restarts the stability streak.
        self._stable_count = self._stable_count + 1 if in_range else 0
        streak_needed = self._CONSECUTIVE_STABLE_STEPS
        is_resolved = self._stable_count >= streak_needed

        # Scenario reward: penalty proportional to temperature overshoot.
        if worst_excess > 0:
            step_reward = -worst_excess * 0.5
        else:
            step_reward = 0.1

        rule_reward = self.check_procedure(action_command, action_history)

        # Progress: partial credit for being close, full credit for stability.
        if in_range:
            progress_value = 0.5 + 0.5 * min(1.0, self._stable_count / streak_needed)
        else:
            progress_value = max(0.0, 0.4 / (1.0 + worst_excess))

        message = ""
        if is_resolved:
            message = "Thermal event stabilized. All zones within recommended range."

        return ScenarioResult(
            resolved=is_resolved,
            resolution_message=message,
            scenario_reward=step_reward,
            procedure_reward=rule_reward,
            progress=progress_value,
            info={"max_overshoot_c": worst_excess, "stable_count": self._stable_count},
        )

    @property
    def scenario_id(self) -> str:
        """Scenario identifier."""
        return "A2"

    @property
    def name(self) -> str:
        """Human-readable scenario name."""
        return "Thermal Event Response"

    @property
    def scenario_type(self) -> str:
        """Scenario category label."""
        return "thermal"

    @property
    def difficulty(self) -> str:
        """Difficulty label."""
        return "medium"

    @property
    def step_budget(self) -> int:
        """Step budget for this scenario."""
        return 15

    @property
    def alert_message(self) -> str:
        """Alert text naming the failed unit."""
        return (
            f"CRITICAL: {self._FAILED_UNIT} compressor failure detected. "
            "Zone B temperatures rising. Investigate and stabilize."
        )

    @property
    def procedure_rules(self) -> list[ProcedureRule]:
        """Procedure rules: diagnose before acting; escalation is discouraged."""
        diagnose_before_setpoint = ProcedureRule(
            required_before=["diagnose"],
            trigger_command="adjust_setpoint",
            bonus=0.3,
            penalty=-0.2,
            description="Diagnose the fault before adjusting setpoints",
        )
        diagnose_before_fan = ProcedureRule(
            required_before=["diagnose"],
            trigger_command="set_fan_speed",
            bonus=0.3,
            penalty=-0.2,
            description="Diagnose the fault before adjusting fan speed",
        )
        # NOTE(review): required_before is empty here, so whether the penalty
        # can ever apply depends on how Scenario.check_procedure treats an
        # empty prerequisite list — confirm against scenarios/base.py.
        escalation = ProcedureRule(
            required_before=[],
            trigger_command="escalate",
            bonus=0.0,
            penalty=-0.3,
            description="Escalated without attempting diagnosis or fix",
        )
        return [diagnose_before_setpoint, diagnose_before_fan, escalation]
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
# ===========================================================================
|
| 285 |
+
# A4: CRAC Failure Cascade (Hard)
|
| 286 |
+
# ===========================================================================
|
| 287 |
+
@register_scenario
class CRACFailureCascade(Scenario):
    """Scenario A4 (hard): two CRACs fail, overwhelming remaining cooling capacity.

    CRAC-1 has a compressor failure and CRAC-3 has a fan failure.
    With only 2 of 4 CRACs operational, cooling is severely degraded.
    The agent must:
      1. Diagnose both failures
      2. Aggressively compensate (max fan speeds, lower setpoints on survivors)
      3. Reduce IT load on hottest racks (workload migration)
      4. Monitor and stabilize before thermal runaway

    Resolution: all inlet temps at or below the ASHRAE allowable max for 2+
    consecutive steps.
    """

    _FAILED_UNITS = [
        ("CRAC-1", CRACFaultType.COMPRESSOR),
        ("CRAC-3", CRACFaultType.FAN),
    ]
    _CONSECUTIVE_STABLE_STEPS = 2
    _THOROUGH_DIAGNOSIS_BONUS = 0.2  # Added when every failed unit was diagnosed

    def __init__(self) -> None:
        super().__init__()
        # Number of consecutive evaluated steps with every zone within allowable.
        self._stable_count = 0

    def reset_state(self) -> None:
        """Clear the consecutive-stable-step counter."""
        self._stable_count = 0

    def configure(self, base_config: DatacenterConfig) -> DatacenterConfig:
        """Return the base config unchanged."""
        return base_config

    def inject_fault(
        self,
        thermal_sim: ThermalSimulation,
        power_sim: PowerSimulation | None,
    ) -> None:
        """Inject every fault listed in ``_FAILED_UNITS`` into the thermal sim."""
        for unit_id, fault_type in self._FAILED_UNITS:
            thermal_sim.inject_crac_fault(unit_id, fault_type)

    @staticmethod
    def _diagnosed_units(action_history: list[str]) -> set[str]:
        """Return upper-cased targets of every ``diagnose <unit>`` command in history."""
        diagnosed: set[str] = set()
        for entry in action_history:
            parts = entry.strip().split()
            if len(parts) >= 2 and parts[0].lower() == "diagnose":
                diagnosed.add(parts[1].upper())
        return diagnosed

    def evaluate_step(
        self,
        thermal_sim: ThermalSimulation,
        power_sim: PowerSimulation | None,
        action_command: str,
        action_history: list[str],
        step: int,
    ) -> ScenarioResult:
        """Score the step by overshoot above the ASHRAE allowable maximum.

        Zones whose ASHRAE class is not in ``ASHRAE_CLASSES`` are ignored.
        The scenario resolves once every checked zone has stayed within its
        allowable maximum for ``_CONSECUTIVE_STABLE_STEPS`` evaluations in a
        row. A procedure bonus is added on each evaluation where the action
        history contains a ``diagnose`` command for every unit in
        ``_FAILED_UNITS``.
        """
        dc = thermal_sim.state

        all_within_allowable = True
        max_over = 0.0
        for zone in dc.zones:
            ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
            if not ashrae:
                continue
            if zone.max_inlet_temp_c > ashrae.allowable_max_c:
                all_within_allowable = False
                max_over = max(max_over, zone.max_inlet_temp_c - ashrae.allowable_max_c)

        # Any out-of-range step restarts the stability streak.
        if all_within_allowable:
            self._stable_count += 1
        else:
            self._stable_count = 0

        resolved = self._stable_count >= self._CONSECUTIVE_STABLE_STEPS

        # Heavy penalty for being over allowable.
        scenario_reward = -max_over * 2.0 if max_over > 0 else 0.2

        procedure_reward = self.check_procedure(action_command, action_history)

        # Bonus for thorough diagnosis. The required units come from
        # _FAILED_UNITS so this check cannot drift from the injected faults.
        diagnosed_units = self._diagnosed_units(action_history)
        required_units = {unit_id.upper() for unit_id, _ in self._FAILED_UNITS}
        if required_units and required_units <= diagnosed_units:
            procedure_reward += self._THOROUGH_DIAGNOSIS_BONUS

        # Progress: partial credit for being close, full credit for stability.
        if all_within_allowable:
            progress = 0.5 + 0.5 * min(1.0, self._stable_count / self._CONSECUTIVE_STABLE_STEPS)
        else:
            progress = max(0.0, 0.4 / (1.0 + max_over))

        return ScenarioResult(
            resolved=resolved,
            resolution_message="CRAC cascade stabilized. Temps within allowable range." if resolved else "",
            scenario_reward=scenario_reward,
            procedure_reward=procedure_reward,
            progress=progress,
            info={
                "max_overshoot_c": max_over,
                "stable_count": self._stable_count,
                # Sorted so the reported list has a deterministic order.
                "diagnosed_units": sorted(diagnosed_units),
            },
        )

    @property
    def scenario_id(self) -> str:
        """Scenario identifier."""
        return "A4"

    @property
    def name(self) -> str:
        """Human-readable scenario name."""
        return "CRAC Failure Cascade"

    @property
    def scenario_type(self) -> str:
        """Scenario category label."""
        return "thermal"

    @property
    def difficulty(self) -> str:
        """Difficulty label."""
        return "hard"

    @property
    def step_budget(self) -> int:
        """Step budget for this scenario."""
        return 20

    @property
    def alert_message(self) -> str:
        """Alert text describing both failures."""
        return (
            "CRITICAL: Multiple CRAC failures detected. "
            "CRAC-1 compressor fault, CRAC-3 fan fault. "
            "Temperatures rising rapidly. Immediate action required."
        )

    @property
    def procedure_rules(self) -> list[ProcedureRule]:
        """Procedure rules: diagnose before any compensating action."""
        return [
            ProcedureRule(
                required_before=["diagnose"],
                trigger_command="adjust_setpoint",
                bonus=0.2,
                penalty=-0.3,
                description="Diagnose before adjusting setpoints during cascade",
            ),
            ProcedureRule(
                required_before=["diagnose"],
                trigger_command="set_fan_speed",
                bonus=0.2,
                penalty=-0.3,
                description="Diagnose before adjusting fan speed during cascade",
            ),
            ProcedureRule(
                required_before=["diagnose"],
                trigger_command="set_rack_load",
                bonus=0.3,
                penalty=-0.1,
                description="Diagnose before migrating workloads",
            ),
        ]

    @property
    def game_time_per_step_s(self) -> float:
        """Simulated seconds per agent step (30 s — the cascade is urgent)."""
        return 30.0
|
server/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Dc Ops Env environment server components."""
|
| 8 |
+
|
| 9 |
+
from .dc_ops_env_environment import DcOpsEnvironment
|
| 10 |
+
|
| 11 |
+
__all__ = ["DcOpsEnvironment"]
|
server/app.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
FastAPI application for the Dc Ops Env Environment.
|
| 9 |
+
|
| 10 |
+
This module creates an HTTP server that exposes the DcOpsEnvironment
|
| 11 |
+
over HTTP and WebSocket endpoints, compatible with EnvClient.
|
| 12 |
+
|
| 13 |
+
Endpoints:
|
| 14 |
+
- POST /reset: Reset the environment
|
| 15 |
+
- POST /step: Execute an action
|
| 16 |
+
- GET /state: Get current environment state
|
| 17 |
+
- GET /schema: Get action/observation schemas
|
| 18 |
+
- WS /ws: WebSocket endpoint for persistent sessions
|
| 19 |
+
|
| 20 |
+
Usage:
|
| 21 |
+
# Development (with auto-reload):
|
| 22 |
+
uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
|
| 23 |
+
|
| 24 |
+
# Production:
|
| 25 |
+
uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
|
| 26 |
+
|
| 27 |
+
# Or run directly:
|
| 28 |
+
python -m server.app
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
from pathlib import Path
|
| 32 |
+
|
| 33 |
+
from fastapi.responses import FileResponse
|
| 34 |
+
from fastapi.staticfiles import StaticFiles
|
| 35 |
+
|
| 36 |
+
try:
|
| 37 |
+
from openenv.core.env_server.http_server import create_app
|
| 38 |
+
except Exception as e: # pragma: no cover
|
| 39 |
+
raise ImportError(
|
| 40 |
+
"openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
|
| 41 |
+
) from e
|
| 42 |
+
|
| 43 |
+
# Prefer package-relative imports; fall back to flat imports when this module
# is loaded outside its parent package (e.g. ``uvicorn server.app:app`` or
# ``python -m server.app``).
try:
    from ..models import DcOpsAction, DcOpsObservation
    from .dc_ops_env_environment import DcOpsEnvironment
except ImportError:
    # An unresolved relative import raises plain ImportError ("attempted
    # relative import ..."), which ``except ModuleNotFoundError`` would not
    # catch. ImportError also covers ModuleNotFoundError, its subclass.
    from models import DcOpsAction, DcOpsObservation
    from server.dc_ops_env_environment import DcOpsEnvironment
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# Create the app with web interface and README integration
|
| 52 |
+
app = create_app(
|
| 53 |
+
DcOpsEnvironment,
|
| 54 |
+
DcOpsAction,
|
| 55 |
+
DcOpsObservation,
|
| 56 |
+
env_name="dc_ops_env",
|
| 57 |
+
max_concurrent_envs=4, # increase this number to allow more concurrent WebSocket sessions
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
# Mount custom DC-Ops dashboard UI at /web
|
| 61 |
+
_STATIC_DIR = Path(__file__).parent / "static"
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@app.get("/web")
async def web_ui():
    """Return the DC-Ops operations console page (static ``index.html``) as HTML."""
    index_page = _STATIC_DIR / "index.html"
    return FileResponse(index_page, media_type="text/html")
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
app.mount("/static", StaticFiles(directory=str(_STATIC_DIR)), name="static")
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def main(host: str = "0.0.0.0", port: int = 8000):
    """Start the server with uvicorn (entry point for running without Docker).

    Example invocations:
        uv run --project . server
        uv run --project . server --port 8001
        python -m dc_ops_env.server.app

    Args:
        host: Host address to bind to (default: "0.0.0.0")
        port: Port number to listen on (default: 8000)

    For production deployments, consider invoking uvicorn directly with
    multiple workers:
        uvicorn dc_ops_env.server.app:app --workers 4
    """
    # Imported here so uvicorn is only loaded when the server is started this way.
    from uvicorn import run as serve

    serve(app, host=host, port=port)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
if __name__ == "__main__":
    import argparse

    # Expose both of main()'s parameters on the command line. The defaults
    # mirror main()'s own, so running with no flags behaves as before.
    parser = argparse.ArgumentParser(description="Run the DC-Ops environment server.")
    parser.add_argument("--host", default="0.0.0.0", help="Host address to bind to")
    parser.add_argument("--port", type=int, default=8000, help="Port number to listen on")
    args = parser.parse_args()
    main(host=args.host, port=args.port)
|
server/dc_ops_env_environment.py
ADDED
|
@@ -0,0 +1,532 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
DC-Ops Environment Implementation.
|
| 9 |
+
|
| 10 |
+
Wires the thermal and power simulations into OpenEnv's Environment interface.
|
| 11 |
+
Each step:
|
| 12 |
+
1. Parse the agent's command
|
| 13 |
+
2. Apply mutations to simulation state
|
| 14 |
+
3. Advance simulation by game-time dt (default 60s)
|
| 15 |
+
4. Render dashboard observation
|
| 16 |
+
5. Compute reward (via multi-objective RewardFunction)
|
| 17 |
+
6. Check termination conditions
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
from typing import Any, Optional
|
| 23 |
+
from uuid import uuid4
|
| 24 |
+
|
| 25 |
+
from openenv.core.env_server.interfaces import Environment
|
| 26 |
+
from openenv.core.env_server.types import State
|
| 27 |
+
|
| 28 |
+
try:
|
| 29 |
+
from ..config import (
|
| 30 |
+
ASHRAE_CLASSES,
|
| 31 |
+
DatacenterConfig,
|
| 32 |
+
PowerConfig,
|
| 33 |
+
make_default_datacenter_config,
|
| 34 |
+
load_datacenter_config,
|
| 35 |
+
)
|
| 36 |
+
from ..models import DcOpsAction, DcOpsObservation
|
| 37 |
+
from ..actions.parser import AVAILABLE_ACTIONS, CommandResult, parse_command
|
| 38 |
+
from ..rendering.dashboard import render_dashboard
|
| 39 |
+
from ..simulation.thermal import ThermalAlarm, ThermalSimulation
|
| 40 |
+
from ..simulation.power import PowerAlarm, PowerSimulation
|
| 41 |
+
from ..scenarios.base import Scenario, ScenarioResult
|
| 42 |
+
from ..scenarios.registry import get_scenario, random_scenario
|
| 43 |
+
from ..rewards.reward_function import RewardFunction
|
| 44 |
+
except ImportError:
|
| 45 |
+
from config import (
|
| 46 |
+
ASHRAE_CLASSES,
|
| 47 |
+
DatacenterConfig,
|
| 48 |
+
PowerConfig,
|
| 49 |
+
make_default_datacenter_config,
|
| 50 |
+
load_datacenter_config,
|
| 51 |
+
)
|
| 52 |
+
from models import DcOpsAction, DcOpsObservation
|
| 53 |
+
from actions.parser import AVAILABLE_ACTIONS, CommandResult, parse_command
|
| 54 |
+
from rendering.dashboard import render_dashboard
|
| 55 |
+
from simulation.thermal import ThermalAlarm, ThermalSimulation
|
| 56 |
+
from simulation.power import PowerAlarm, PowerSimulation
|
| 57 |
+
from scenarios.base import Scenario, ScenarioResult
|
| 58 |
+
from scenarios.registry import get_scenario, random_scenario
|
| 59 |
+
from rewards.reward_function import RewardFunction
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# Default episode configuration
|
| 63 |
+
DEFAULT_STEP_BUDGET = 15
|
| 64 |
+
DEFAULT_GAME_TIME_PER_STEP_S = 60.0 # 1 minute of sim time per agent step
|
| 65 |
+
DEFAULT_SIM_DT_S = 1.0 # Physics integration timestep
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class DcOpsEnvironment(Environment):
|
| 69 |
+
"""Datacenter operations environment for LLM-based RL agents.
|
| 70 |
+
|
| 71 |
+
The agent observes a text-based monitoring dashboard and issues
|
| 72 |
+
natural-language operator commands. The environment simulates
|
| 73 |
+
physics-based thermal and power dynamics.
|
| 74 |
+
|
| 75 |
+
Episode flow:
|
| 76 |
+
1. reset() initializes the datacenter and optionally injects a fault
|
| 77 |
+
2. step() parses the command, advances simulation, returns dashboard
|
| 78 |
+
3. Episode ends on: budget exhaustion, critical failure, escalation, or resolution
|
| 79 |
+
"""
|
| 80 |
+
|
| 81 |
+
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
| 82 |
+
|
| 83 |
+
def __init__(self) -> None:
    """Create an environment with no simulation loaded; ``reset()`` sets up an episode."""
    # Episode bookkeeping
    self._state = State(episode_id=str(uuid4()), step_count=0)
    self._done: bool = False
    self._escalated: bool = False
    self._cumulative_reward: float = 0.0
    self._action_history: list[str] = []

    # Simulation, scenario and reward objects (all unset at construction)
    self._config: DatacenterConfig | None = None
    self._thermal_sim: ThermalSimulation | None = None
    self._power_sim: PowerSimulation | None = None
    self._scenario: Scenario | None = None
    self._reward_fn: RewardFunction | None = None

    # Episode parameters, initialized to the module-level defaults
    self._step_budget: int = DEFAULT_STEP_BUDGET
    self._game_time_per_step_s: float = DEFAULT_GAME_TIME_PER_STEP_S
    self._sim_dt_s: float = DEFAULT_SIM_DT_S

    # Scenario presentation
    self._alert: str = ""
    self._scenario_type: str = ""
|
| 99 |
+
|
| 100 |
+
def reset(
    self,
    seed: Optional[int] = None,
    episode_id: Optional[str] = None,
    **kwargs: Any,
) -> DcOpsObservation:
    """Start a new episode and return its initial observation.

    Kwargs:
        scenario (str | Scenario): Scenario ID (e.g. 'A1') or Scenario instance.
            When given, the scenario supplies the defaults for
            step_budget / game_time_per_step_s / scenario_type / alert and
            may modify the config; explicit kwargs still override those
            defaults. When absent, raw kwargs are used (backward compatible).
        random_scenario: If truthy and no scenario is given, one is picked
            via ``random_scenario`` using ``scenario_type``, ``difficulty``
            and ``seed``.
        difficulty: Passed to ``random_scenario`` (see above).
        config (DatacenterConfig): Custom datacenter configuration.
        config_name (str): Built-in config name ("default", "small", "large").
            Used when config is not provided (e.g. from WebSocket/HTTP JSON).
        step_budget (int): Max steps for the episode.
        game_time_per_step_s (float): Simulation time per step.
        scenario_type (str): Scenario category label.
        alert (str): Initial alert message.
        fault_injection (dict): Fault to inject when no scenario is used, e.g.
            {"type": "crac_fault", "unit_id": "CRAC-3", "fault": "compressor"}
    """
    # --- Fresh episode bookkeeping ---
    self._state = State(
        episode_id=episode_id or str(uuid4()),
        step_count=0,
    )
    self._done = False
    self._escalated = False
    self._cumulative_reward = 0.0
    self._action_history = []

    # --- Scenario: by ID, by instance, random on request, or none ---
    requested = kwargs.get("scenario")
    if isinstance(requested, str):
        self._scenario = get_scenario(requested)
    elif isinstance(requested, Scenario):
        self._scenario = requested
    elif requested is None and kwargs.get("random_scenario"):
        self._scenario = random_scenario(
            scenario_type=kwargs.get("scenario_type"),
            difficulty=kwargs.get("difficulty"),
            seed=seed,
        )
    else:
        self._scenario = None

    # Scenario instances may be reused, so clear their counters and flags.
    if self._scenario:
        self._scenario.reset_state()

    # --- Configuration: explicit object > named built-in > default ---
    supplied_config = kwargs.get("config")
    named_config = kwargs.get("config_name")
    if isinstance(supplied_config, DatacenterConfig):
        self._config = supplied_config
    elif named_config and isinstance(named_config, str) and named_config != "default":
        self._config = load_datacenter_config(named_config)
    else:
        self._config = make_default_datacenter_config()
    if self._scenario:
        # The scenario may adjust the base config.
        self._config = self._scenario.configure(self._config)

    # --- Episode parameters: scenario (or module) defaults, kwargs override ---
    if self._scenario:
        budget_default = self._scenario.step_budget
        game_time_default = self._scenario.game_time_per_step_s
        type_default = self._scenario.scenario_type
        alert_default = self._scenario.alert_message
    else:
        budget_default = DEFAULT_STEP_BUDGET
        game_time_default = DEFAULT_GAME_TIME_PER_STEP_S
        type_default = ""
        alert_default = ""
    self._step_budget = kwargs.get("step_budget", budget_default)
    self._game_time_per_step_s = kwargs.get("game_time_per_step_s", game_time_default)
    self._scenario_type = kwargs.get("scenario_type", type_default)
    self._alert = kwargs.get("alert", alert_default)

    self._sim_dt_s = self._config.simulation_dt_s

    # Reward weights depend on the scenario type.
    self._reward_fn = RewardFunction(scenario_type=self._scenario_type)

    # --- Simulations ---
    self._thermal_sim = ThermalSimulation(self._config)

    # A power simulation is created only when the config defines UPS units.
    power_cfg = self._config.power
    if power_cfg and power_cfg.ups_units:
        self._power_sim = PowerSimulation(
            power_cfg,
            it_load_kw=self._thermal_sim.state.total_it_load_kw,
        )
        # Wire power state into datacenter state.
        self._thermal_sim.state.power = self._power_sim.state
    else:
        self._power_sim = None

    # --- Faults and warmup ---
    if self._scenario:
        # Warm up FIRST, then inject the fault (so the DC is at steady state).
        self._warmup_simulation()
        self._scenario.inject_fault(self._thermal_sim, self._power_sim)
    else:
        # Raw-kwargs path: the optional fault is applied before warmup.
        raw_fault = kwargs.get("fault_injection")
        if raw_fault:
            self._apply_fault_injection(raw_fault)
        self._warmup_simulation()

    return self._make_observation(
        action_result="Environment initialized. Awaiting your command."
    )
|
| 208 |
+
|
| 209 |
+
def step(
    self,
    action: DcOpsAction,
    timeout_s: Optional[float] = None,
    **kwargs: Any,
) -> DcOpsObservation:
    """Run a single agent turn and return the resulting observation.

    Phases, in order:

    1. Parse and execute ``action.command`` against the simulations.
    2. Unless the command is an escalation, advance the simulation and
       refresh the active alert from the alarms that were raised.
    3. Let the scenario (if any) evaluate the step.
    4. Score the step with the reward function and add it to the
       cumulative reward.
    5. Unless escalating, check termination and scenario resolution
       (a resolution earns a speed bonus).

    An ``escalate`` command ends the episode immediately without
    advancing the simulation.  ``timeout_s`` and ``kwargs`` are accepted
    but not read by this method body.
    """
    if self._done:
        return self._make_observation(
            action_result="Episode already ended. Call reset().",
            reward=0.0,
        )

    self._state.step_count += 1
    command = action.command
    self._action_history.append(command)

    # Phase 1: parse and execute the command.
    parsed = parse_command(command, self._thermal_sim, self._power_sim)
    escalating = parsed.command_name == "escalate"

    # Phase 2: escalation stops the episode; anything else moves time forward.
    if escalating:
        self._escalated = True
        self._done = True
    else:
        thermal_alarms, power_alarms = self._advance_simulation()
        self._update_alert(thermal_alarms, power_alarms)

    # Phase 3: the scenario is evaluated before scoring so its result
    # (progress, procedure penalties) is available to the reward function.
    outcome: ScenarioResult | None = None
    if self._scenario:
        outcome = self._scenario.evaluate_step(
            self._thermal_sim,
            self._power_sim,
            command,
            self._action_history,
            self._state.step_count,
        )

    # Phase 4: score the step.  The escalation penalty comes from the
    # scenario procedure rules and action quality, so nothing extra is
    # added here for an escalation.
    step_reward = self._reward_fn.compute(
        self._thermal_sim,
        self._power_sim,
        parsed,
        command,
        self._action_history,
        outcome,
    ).total
    self._cumulative_reward += step_reward

    # Phase 5: termination and resolution only apply to non-escalation turns.
    if not escalating:
        self._check_termination(thermal_alarms, power_alarms)

        if outcome and outcome.resolved and not self._done:
            self._done = True
            # Speed bonus: the fraction of the step budget left unused.
            bonus = (self._step_budget - self._state.step_count) / self._step_budget
            step_reward += bonus
            self._cumulative_reward += bonus
            if outcome.resolution_message:
                self._alert = outcome.resolution_message

    return self._make_observation(
        action_result=parsed.message,
        reward=step_reward,
    )
|
| 305 |
+
|
| 306 |
+
@property
def state(self) -> State:
    """Return the environment's ``State`` object (the one holding ``step_count``)."""
    current = self._state
    return current
|
| 309 |
+
|
| 310 |
+
# -------------------------------------------------------------------
|
| 311 |
+
# Internal methods
|
| 312 |
+
# -------------------------------------------------------------------
|
| 313 |
+
def _warmup_simulation(self, warmup_steps: int = 120) -> None:
|
| 314 |
+
"""Run simulation for a brief warmup to reach quasi-steady-state."""
|
| 315 |
+
for _ in range(warmup_steps):
|
| 316 |
+
self._thermal_sim.step(self._sim_dt_s)
|
| 317 |
+
if self._power_sim:
|
| 318 |
+
it_load = self._thermal_sim.state.total_it_load_kw
|
| 319 |
+
self._power_sim.step(self._sim_dt_s, it_load)
|
| 320 |
+
|
| 321 |
+
def _advance_simulation(self) -> tuple[list[ThermalAlarm], list[PowerAlarm]]:
    """Advance the simulations by ``game_time_per_step_s`` seconds of game time.

    The game-time step is split into substeps of ``self._sim_dt_s`` seconds.
    On every substep the thermal simulation is stepped first, then (when a
    power simulation exists) the power simulation is stepped with the thermal
    simulation's current total IT load.

    Returns:
        ``(thermal_alarms, power_alarms)``: the alarms raised during all
        substeps, deduplicated with ``_dedupe_alarms_by_type``.
    """
    # Plain ``int(a / b)`` truncates quotients that floating-point error
    # leaves just below an integer (0.3 / 0.1 == 2.9999999999999996 -> 2),
    # silently dropping a substep.  Snap to the nearest integer when the
    # ratio is within a tiny relative tolerance of it; otherwise keep the
    # original truncation so partial substeps are still discarded.
    ratio = self._game_time_per_step_s / self._sim_dt_s
    nearest = round(ratio)
    if abs(ratio - nearest) <= 1e-9 * max(1.0, abs(ratio)):
        n_substeps = nearest
    else:
        n_substeps = int(ratio)

    all_thermal_alarms: list[ThermalAlarm] = []
    all_power_alarms: list[PowerAlarm] = []

    for _ in range(n_substeps):
        # Thermal first: the power step consumes the updated IT load.
        thermal_result = self._thermal_sim.step(self._sim_dt_s)
        all_thermal_alarms.extend(thermal_result.alarms)

        if self._power_sim:
            it_load = self._thermal_sim.state.total_it_load_kw
            power_result = self._power_sim.step(self._sim_dt_s, it_load)
            all_power_alarms.extend(power_result.alarms)

    # The same alarm may fire on many substeps; keep one per type.
    thermal_alarms = _dedupe_alarms_by_type(all_thermal_alarms)
    power_alarms = _dedupe_alarms_by_type(all_power_alarms)

    return thermal_alarms, power_alarms
|
| 343 |
+
|
| 344 |
+
def _update_alert(
|
| 345 |
+
self,
|
| 346 |
+
thermal_alarms: list[ThermalAlarm],
|
| 347 |
+
power_alarms: list[PowerAlarm],
|
| 348 |
+
) -> None:
|
| 349 |
+
"""Update the active alert string from current alarms."""
|
| 350 |
+
critical_messages: list[str] = []
|
| 351 |
+
|
| 352 |
+
for alarm in thermal_alarms:
|
| 353 |
+
if alarm.severity == "critical":
|
| 354 |
+
critical_messages.append(alarm.message)
|
| 355 |
+
|
| 356 |
+
for alarm in power_alarms:
|
| 357 |
+
if alarm.severity == "critical":
|
| 358 |
+
critical_messages.append(alarm.message)
|
| 359 |
+
|
| 360 |
+
if critical_messages:
|
| 361 |
+
self._alert = " | ".join(critical_messages[:3]) # Limit to 3 alerts
|
| 362 |
+
else:
|
| 363 |
+
# Check for warnings
|
| 364 |
+
warnings = []
|
| 365 |
+
for alarm in thermal_alarms:
|
| 366 |
+
if alarm.severity == "warning":
|
| 367 |
+
warnings.append(alarm.message)
|
| 368 |
+
for alarm in power_alarms:
|
| 369 |
+
if alarm.severity == "warning":
|
| 370 |
+
warnings.append(alarm.message)
|
| 371 |
+
if warnings:
|
| 372 |
+
self._alert = warnings[0]
|
| 373 |
+
else:
|
| 374 |
+
self._alert = ""
|
| 375 |
+
|
| 376 |
+
def _check_termination(
|
| 377 |
+
self,
|
| 378 |
+
thermal_alarms: list[ThermalAlarm],
|
| 379 |
+
power_alarms: list[PowerAlarm],
|
| 380 |
+
) -> None:
|
| 381 |
+
"""Check if episode should end."""
|
| 382 |
+
# Step budget exhausted
|
| 383 |
+
if self._state.step_count >= self._step_budget:
|
| 384 |
+
self._done = True
|
| 385 |
+
return
|
| 386 |
+
|
| 387 |
+
# Critical thermal failure: any rack above allowable max
|
| 388 |
+
for zone in self._thermal_sim.state.zones:
|
| 389 |
+
ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
|
| 390 |
+
if not ashrae:
|
| 391 |
+
continue
|
| 392 |
+
if zone.max_inlet_temp_c > ashrae.allowable_max_c + 5.0:
|
| 393 |
+
self._done = True
|
| 394 |
+
self._alert = (
|
| 395 |
+
f"CRITICAL: Zone {zone.zone_id} inlet temp "
|
| 396 |
+
f"{zone.max_inlet_temp_c:.1f}°C exceeds allowable max "
|
| 397 |
+
f"{ashrae.allowable_max_c:.1f}°C by >5°C. Emergency shutdown."
|
| 398 |
+
)
|
| 399 |
+
return
|
| 400 |
+
|
| 401 |
+
# UPS battery exhausted
|
| 402 |
+
if self._power_sim:
|
| 403 |
+
for ups in self._power_sim.state.ups_units:
|
| 404 |
+
if ups.mode.value == "fault" and ups.battery_soc <= 0:
|
| 405 |
+
self._done = True
|
| 406 |
+
self._alert = f"CRITICAL: {ups.unit_id} battery exhausted. Unprotected load."
|
| 407 |
+
return
|
| 408 |
+
|
| 409 |
+
def _apply_fault_injection(self, fault: dict) -> None:
|
| 410 |
+
"""Apply a fault injection to the simulation.
|
| 411 |
+
|
| 412 |
+
Supported fault types:
|
| 413 |
+
- crac_fault: {"type": "crac_fault", "unit_id": "CRAC-3", "fault": "compressor"}
|
| 414 |
+
- utility_loss: {"type": "utility_loss"}
|
| 415 |
+
- ups_fault: {"type": "ups_fault", "unit_id": "UPS-1"}
|
| 416 |
+
- rack_load_change: {"type": "rack_load_change", "rack_id": "A-01", "load_kw": 15.0}
|
| 417 |
+
- outside_temp: {"type": "outside_temp", "temp_c": 42.0}
|
| 418 |
+
"""
|
| 419 |
+
fault_type = fault.get("type", "")
|
| 420 |
+
|
| 421 |
+
if fault_type == "crac_fault":
|
| 422 |
+
from ..simulation.types import CRACFaultType
|
| 423 |
+
unit_id = fault.get("unit_id", "")
|
| 424 |
+
fault_name = fault.get("fault", "compressor")
|
| 425 |
+
try:
|
| 426 |
+
ft = CRACFaultType(fault_name)
|
| 427 |
+
except ValueError:
|
| 428 |
+
ft = CRACFaultType.COMPRESSOR
|
| 429 |
+
self._thermal_sim.inject_crac_fault(unit_id, ft)
|
| 430 |
+
|
| 431 |
+
elif fault_type == "utility_loss":
|
| 432 |
+
if self._power_sim:
|
| 433 |
+
self._power_sim.set_utility_available(False)
|
| 434 |
+
|
| 435 |
+
elif fault_type == "ups_fault":
|
| 436 |
+
if self._power_sim:
|
| 437 |
+
unit_id = fault.get("unit_id", "")
|
| 438 |
+
self._power_sim.inject_ups_fault(unit_id)
|
| 439 |
+
|
| 440 |
+
elif fault_type == "rack_load_change":
|
| 441 |
+
rack_id = fault.get("rack_id", "")
|
| 442 |
+
load_kw = fault.get("load_kw", 8.0)
|
| 443 |
+
self._thermal_sim.set_rack_load(rack_id, load_kw)
|
| 444 |
+
|
| 445 |
+
elif fault_type == "outside_temp":
|
| 446 |
+
temp_c = fault.get("temp_c", 35.0)
|
| 447 |
+
self._thermal_sim.set_outside_temp(temp_c)
|
| 448 |
+
|
| 449 |
+
def _make_observation(
    self,
    action_result: str = "",
    reward: float = 0.0,
) -> DcOpsObservation:
    """Assemble the observation returned to the agent.

    The observation carries the rendered dashboard, the active alert, the
    scenario type, the number of steps left in the budget (never negative),
    the given ``action_result`` and ``reward``, the ``done`` flag, the
    available actions, and a ``metadata`` dict of structured readings
    (facility totals, per-zone temperatures and load, plus power and
    scenario details when those exist).
    """
    dc_state = self._thermal_sim.state
    step_now = self._state.step_count

    dashboard = render_dashboard(
        dc_state,
        alert=self._alert,
        step=step_now,
        max_steps=self._step_budget,
        scenario_type=self._scenario_type,
    )

    steps_remaining = max(0, self._step_budget - step_now)

    # Structured readings for programmatic consumers.
    metadata = {
        "sim_time_s": dc_state.sim_time_s,
        "total_it_load_kw": dc_state.total_it_load_kw,
        "total_cooling_power_kw": dc_state.total_cooling_power_kw,
        "pue": dc_state.pue,
        "outside_temp_c": dc_state.outside_temp_c,
        "cumulative_reward": self._cumulative_reward,
        "zones": {
            zone.zone_id: {
                "cold_aisle_temp_c": zone.cold_aisle_temp_c,
                "hot_aisle_temp_c": zone.hot_aisle_temp_c,
                "max_inlet_temp_c": zone.max_inlet_temp_c,
                "total_it_load_kw": zone.total_it_load_kw,
            }
            for zone in dc_state.zones
        },
    }

    if self._power_sim:
        power = self._power_sim.state
        power_meta = {
            "utility_available": power.utility_available,
            "on_generator": power.on_generator,
            "total_ups_loss_kw": power.total_ups_loss_kw,
            "total_pdu_loss_kw": power.total_pdu_loss_kw,
        }
        # Per-UPS readings are stored next to the totals, keyed by unit id.
        for ups in power.ups_units:
            power_meta[ups.unit_id] = {
                "mode": ups.mode.value,
                "battery_soc": ups.battery_soc,
                "load_fraction": ups.load_fraction,
                "efficiency": ups.efficiency,
            }
        metadata["power"] = power_meta

    scenario = self._scenario
    if scenario:
        metadata["scenario"] = {
            "id": scenario.scenario_id,
            "name": scenario.name,
            "difficulty": scenario.difficulty,
        }

    # A scenario may restrict the action list; otherwise every action is offered.
    if scenario and scenario.available_actions is not None:
        actions = scenario.available_actions
    else:
        actions = AVAILABLE_ACTIONS

    return DcOpsObservation(
        dashboard=dashboard,
        available_actions=actions,
        alert=self._alert,
        scenario_type=self._scenario_type,
        steps_remaining=steps_remaining,
        action_result=action_result,
        done=self._done,
        reward=reward,
        metadata=metadata,
    )
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
def _dedupe_alarms_by_type(alarms: list) -> list:
|
| 527 |
+
"""Keep only the last alarm of each (component, alarm_type) pair."""
|
| 528 |
+
seen: dict[tuple[str, str], Any] = {}
|
| 529 |
+
for alarm in alarms:
|
| 530 |
+
key = (getattr(alarm, "component", ""), getattr(alarm, "alarm_type", ""))
|
| 531 |
+
seen[key] = alarm
|
| 532 |
+
return list(seen.values())
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv[core]>=0.2.0
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn>=0.24.0
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
|
server/static/index.html
ADDED
|
@@ -0,0 +1,911 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>DC-Ops | Datacenter Operations Console</title>
|
| 7 |
+
<style>
|
| 8 |
+
*,*::before,*::after{box-sizing:border-box;margin:0;padding:0}
|
| 9 |
+
:root{
|
| 10 |
+
--bg:#0a0e17;--bg-card:#111827;--bg-card-hover:#1a2332;
|
| 11 |
+
--border:#1e2d3d;--border-active:#3b82f6;
|
| 12 |
+
--text:#e2e8f0;--text-dim:#94a3b8;--text-muted:#64748b;
|
| 13 |
+
--accent:#3b82f6;--accent-hover:#2563eb;
|
| 14 |
+
--green:#22c55e;--green-dim:#166534;
|
| 15 |
+
--red:#ef4444;--red-dim:#991b1b;
|
| 16 |
+
--yellow:#eab308;--yellow-dim:#854d0e;
|
| 17 |
+
--orange:#f97316;
|
| 18 |
+
--cyan:#06b6d4;
|
| 19 |
+
--terminal-bg:#0d1117;
|
| 20 |
+
--font-mono:'JetBrains Mono','Fira Code','SF Mono','Cascadia Code',Consolas,monospace;
|
| 21 |
+
--font-sans:'Inter',-apple-system,BlinkMacSystemFont,'Segoe UI',Roboto,sans-serif;
|
| 22 |
+
--radius:8px;--radius-lg:12px;
|
| 23 |
+
}
|
| 24 |
+
html{font-size:14px}
|
| 25 |
+
body{background:var(--bg);color:var(--text);font-family:var(--font-sans);min-height:100vh;overflow-x:hidden}
|
| 26 |
+
|
| 27 |
+
/* Layout */
|
| 28 |
+
.app{display:grid;grid-template-rows:auto 1fr;height:100vh}
|
| 29 |
+
.header{background:var(--bg-card);border-bottom:1px solid var(--border);padding:0.75rem 1.5rem;display:flex;align-items:center;justify-content:space-between;gap:1rem;flex-wrap:wrap}
|
| 30 |
+
.header-left{display:flex;align-items:center;gap:0.75rem}
|
| 31 |
+
.logo{font-size:1.25rem;font-weight:700;letter-spacing:-0.02em}
|
| 32 |
+
.logo span{color:var(--accent)}
|
| 33 |
+
.status-badge{display:inline-flex;align-items:center;gap:0.375rem;padding:0.25rem 0.75rem;border-radius:999px;font-size:0.75rem;font-weight:500}
|
| 34 |
+
.status-badge.connected{background:var(--green-dim);color:var(--green)}
|
| 35 |
+
.status-badge.disconnected{background:var(--red-dim);color:var(--red)}
|
| 36 |
+
.status-badge.loading{background:var(--yellow-dim);color:var(--yellow)}
|
| 37 |
+
.status-dot{width:6px;height:6px;border-radius:50%;background:currentColor}
|
| 38 |
+
.status-badge.connected .status-dot{animation:pulse 2s infinite}
|
| 39 |
+
@keyframes pulse{0%,100%{opacity:1}50%{opacity:0.5}}
|
| 40 |
+
|
| 41 |
+
.main{display:grid;grid-template-columns:280px 1fr 300px;gap:0;overflow:hidden}
|
| 42 |
+
|
| 43 |
+
/* Sidebar - Scenario Browser */
|
| 44 |
+
.sidebar{background:var(--bg-card);border-right:1px solid var(--border);display:flex;flex-direction:column;overflow:hidden}
|
| 45 |
+
.sidebar-header{padding:0.875rem 1rem;border-bottom:1px solid var(--border);font-weight:600;font-size:0.8rem;text-transform:uppercase;letter-spacing:0.05em;color:var(--text-dim)}
|
| 46 |
+
.scenario-list{flex:1;overflow-y:auto;padding:0.5rem}
|
| 47 |
+
.scenario-group{margin-bottom:0.75rem}
|
| 48 |
+
.scenario-group-title{padding:0.375rem 0.75rem;font-size:0.65rem;font-weight:600;text-transform:uppercase;letter-spacing:0.1em;color:var(--text-muted)}
|
| 49 |
+
.scenario-card{padding:0.625rem 0.75rem;margin:0.25rem 0;border-radius:var(--radius);cursor:pointer;transition:all 0.15s;border:1px solid transparent}
|
| 50 |
+
.scenario-card:hover{background:var(--bg-card-hover);border-color:var(--border)}
|
| 51 |
+
.scenario-card.active{background:rgba(59,130,246,0.08);border-color:var(--accent)}
|
| 52 |
+
.scenario-card .sc-header{display:flex;align-items:center;justify-content:space-between;margin-bottom:0.25rem}
|
| 53 |
+
.scenario-card .sc-id{font-weight:700;font-family:var(--font-mono);font-size:0.8rem;color:var(--accent)}
|
| 54 |
+
.scenario-card .sc-diff{font-size:0.6rem;font-weight:700;padding:0.1rem 0.5rem;border-radius:999px;text-transform:uppercase;letter-spacing:0.05em}
|
| 55 |
+
.sc-diff.easy{background:var(--green-dim);color:var(--green)}
|
| 56 |
+
.sc-diff.medium{background:var(--yellow-dim);color:var(--yellow)}
|
| 57 |
+
.sc-diff.hard{background:var(--red-dim);color:var(--red)}
|
| 58 |
+
.scenario-card .sc-name{font-size:0.78rem;font-weight:500;color:var(--text);margin-bottom:0.125rem}
|
| 59 |
+
.scenario-card .sc-desc{font-size:0.68rem;color:var(--text-muted);line-height:1.4}
|
| 60 |
+
|
| 61 |
+
.sidebar-actions{padding:0.75rem;border-top:1px solid var(--border);display:flex;flex-direction:column;gap:0.5rem}
|
| 62 |
+
.config-select{width:100%;padding:0.5rem 0.625rem;background:var(--terminal-bg);border:1px solid var(--border);border-radius:var(--radius);color:var(--text);font-size:0.78rem;font-family:var(--font-sans);appearance:none;background-image:url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' viewBox='0 0 12 12'%3E%3Cpath fill='%2394a3b8' d='M3 5l3 3 3-3'/%3E%3C/svg%3E");background-repeat:no-repeat;background-position:right 0.5rem center;padding-right:1.5rem}
|
| 63 |
+
.config-select:focus{outline:none;border-color:var(--accent)}
|
| 64 |
+
.btn{padding:0.625rem 1rem;border-radius:var(--radius);border:none;cursor:pointer;font-weight:600;font-size:0.8rem;transition:all 0.15s;text-align:center;font-family:var(--font-sans);display:flex;align-items:center;justify-content:center;gap:0.5rem}
|
| 65 |
+
.btn-primary{background:var(--accent);color:white}
|
| 66 |
+
.btn-primary:hover:not(:disabled){background:var(--accent-hover)}
|
| 67 |
+
.btn-primary:disabled{opacity:0.5;cursor:not-allowed}
|
| 68 |
+
.btn-danger{background:var(--red-dim);color:var(--red);border:1px solid rgba(239,68,68,0.2)}
|
| 69 |
+
.btn-danger:hover{background:var(--red);color:white}
|
| 70 |
+
.btn-outline{background:transparent;color:var(--text-dim);border:1px solid var(--border)}
|
| 71 |
+
.btn-outline:hover{border-color:var(--text-dim);color:var(--text)}
|
| 72 |
+
|
| 73 |
+
/* Center Panel - Dashboard */
|
| 74 |
+
.center{display:flex;flex-direction:column;overflow:hidden;min-width:0}
|
| 75 |
+
.dashboard-container{flex:1;overflow-y:auto;padding:1rem}
|
| 76 |
+
.dashboard-box{background:var(--terminal-bg);border:1px solid var(--border);border-radius:var(--radius-lg);overflow:hidden}
|
| 77 |
+
.dashboard-title-bar{display:flex;align-items:center;justify-content:space-between;padding:0.5rem 1rem;background:rgba(255,255,255,0.03);border-bottom:1px solid var(--border)}
|
| 78 |
+
.dashboard-title-bar .dots{display:flex;gap:6px}
|
| 79 |
+
.dashboard-title-bar .dots span{width:10px;height:10px;border-radius:50%}
|
| 80 |
+
.dashboard-title-bar .dots span:nth-child(1){background:#ef4444}
|
| 81 |
+
.dashboard-title-bar .dots span:nth-child(2){background:#eab308}
|
| 82 |
+
.dashboard-title-bar .dots span:nth-child(3){background:#22c55e}
|
| 83 |
+
.dashboard-title-bar .title{font-size:0.72rem;color:var(--text-muted);font-family:var(--font-mono)}
|
| 84 |
+
.dashboard-output{padding:1rem;font-family:var(--font-mono);font-size:0.75rem;line-height:1.2;white-space:pre;overflow-x:auto;min-height:200px;color:var(--green)}
|
| 85 |
+
|
| 86 |
+
/* Action result box */
|
| 87 |
+
.action-result{margin-top:0.75rem;background:var(--terminal-bg);border:1px solid var(--border);border-radius:var(--radius);padding:0.625rem 0.875rem;font-family:var(--font-mono);font-size:0.75rem;max-height:100px;overflow-y:auto;transition:all 0.2s}
|
| 88 |
+
.action-result.error{color:var(--red);border-color:rgba(239,68,68,0.3)}
|
| 89 |
+
.action-result.success{color:var(--cyan);border-color:rgba(6,182,212,0.3)}
|
| 90 |
+
|
| 91 |
+
/* Welcome screen */
|
| 92 |
+
.welcome{display:flex;flex-direction:column;align-items:center;justify-content:center;text-align:center;padding:3rem 2rem;min-height:300px;white-space:normal}
|
| 93 |
+
.welcome h2{font-size:1.4rem;color:var(--text);margin-bottom:0.75rem;font-weight:700}
|
| 94 |
+
.welcome p{max-width:380px;line-height:1.7;font-size:0.85rem;color:var(--text-dim)}
|
| 95 |
+
.welcome .hint{margin-top:1.5rem;display:flex;align-items:center;gap:0.5rem;color:var(--accent);font-size:0.8rem;opacity:0.7}
|
| 96 |
+
.welcome .hint svg{width:20px;height:20px}
|
| 97 |
+
|
| 98 |
+
/* Command Bar */
|
| 99 |
+
.command-bar{padding:0.75rem 1rem;border-top:1px solid var(--border);background:var(--bg-card)}
|
| 100 |
+
.command-input-group{display:flex;gap:0.5rem}
|
| 101 |
+
.command-input{flex:1;padding:0.625rem 0.875rem;background:var(--terminal-bg);border:1px solid var(--border);border-radius:var(--radius);color:var(--text);font-family:var(--font-mono);font-size:0.8rem;min-width:0}
|
| 102 |
+
.command-input:focus{outline:none;border-color:var(--accent);box-shadow:0 0 0 3px rgba(59,130,246,0.12)}
|
| 103 |
+
.command-input::placeholder{color:var(--text-muted)}
|
| 104 |
+
.command-input:disabled{opacity:0.4}
|
| 105 |
+
.quick-actions{display:flex;gap:0.375rem;margin-top:0.5rem;flex-wrap:wrap}
|
| 106 |
+
.quick-btn{padding:0.2rem 0.5rem;background:rgba(255,255,255,0.03);border:1px solid var(--border);border-radius:999px;color:var(--text-dim);font-size:0.68rem;cursor:pointer;font-family:var(--font-mono);transition:all 0.15s;white-space:nowrap}
|
| 107 |
+
.quick-btn:hover:not(:disabled){border-color:var(--accent);color:var(--accent)}
|
| 108 |
+
.quick-btn:disabled{opacity:0.3;cursor:not-allowed}
|
| 109 |
+
|
| 110 |
+
/* Right Panel - Metrics */
|
| 111 |
+
.right-panel{background:var(--bg-card);border-left:1px solid var(--border);display:flex;flex-direction:column;overflow-y:auto}
|
| 112 |
+
.panel-section{padding:0.875rem;border-bottom:1px solid var(--border)}
|
| 113 |
+
.panel-section-title{font-size:0.65rem;font-weight:600;text-transform:uppercase;letter-spacing:0.1em;color:var(--text-muted);margin-bottom:0.625rem}
|
| 114 |
+
|
| 115 |
+
/* Metrics grid */
|
| 116 |
+
.metrics-grid{display:grid;grid-template-columns:1fr 1fr;gap:0.375rem}
|
| 117 |
+
.metric{background:var(--terminal-bg);padding:0.5rem 0.625rem;border-radius:var(--radius);border:1px solid var(--border)}
|
| 118 |
+
.metric-label{font-size:0.6rem;color:var(--text-muted);margin-bottom:0.2rem;text-transform:uppercase;letter-spacing:0.05em}
|
| 119 |
+
.metric-value{font-size:1rem;font-weight:700;font-family:var(--font-mono)}
|
| 120 |
+
.metric-value.good{color:var(--green)}
|
| 121 |
+
.metric-value.warn{color:var(--yellow)}
|
| 122 |
+
.metric-value.danger{color:var(--red)}
|
| 123 |
+
.metric-value.neutral{color:var(--text)}
|
| 124 |
+
|
| 125 |
+
/* Episode info */
|
| 126 |
+
.episode-info{display:flex;flex-direction:column;gap:0.375rem}
|
| 127 |
+
.episode-row{display:flex;justify-content:space-between;align-items:center;font-size:0.78rem}
|
| 128 |
+
.episode-row .label{color:var(--text-muted)}
|
| 129 |
+
.episode-row .value{font-family:var(--font-mono);font-weight:600}
|
| 130 |
+
|
| 131 |
+
/* Progress bar */
|
| 132 |
+
.progress-bar{height:5px;background:var(--terminal-bg);border-radius:999px;overflow:hidden;border:1px solid var(--border)}
|
| 133 |
+
.progress-fill{height:100%;border-radius:999px;transition:width 0.3s;background:var(--accent)}
|
| 134 |
+
.progress-fill.low{background:var(--green)}
|
| 135 |
+
.progress-fill.mid{background:var(--yellow)}
|
| 136 |
+
.progress-fill.high{background:var(--red)}
|
| 137 |
+
|
| 138 |
+
/* Power status */
|
| 139 |
+
.power-row{display:flex;justify-content:space-between;align-items:center;padding:0.375rem 0.5rem;background:var(--terminal-bg);border-radius:4px;font-size:0.72rem;font-family:var(--font-mono);border:1px solid var(--border);margin-bottom:0.375rem}
|
| 140 |
+
.power-row .pw-label{color:var(--text-dim)}
|
| 141 |
+
.power-row .pw-val{font-weight:600}
|
| 142 |
+
.power-row .pw-val.ok{color:var(--green)}
|
| 143 |
+
.power-row .pw-val.warn{color:var(--yellow)}
|
| 144 |
+
.power-row .pw-val.bad{color:var(--red)}
|
| 145 |
+
|
| 146 |
+
/* Reward history */
|
| 147 |
+
.reward-history{display:flex;flex-direction:column;gap:0.25rem;max-height:180px;overflow-y:auto}
|
| 148 |
+
.reward-entry{display:flex;justify-content:space-between;align-items:center;padding:0.3rem 0.5rem;background:var(--terminal-bg);border-radius:4px;font-size:0.7rem;font-family:var(--font-mono)}
|
| 149 |
+
.reward-entry .step{color:var(--text-muted);width:24px;flex-shrink:0}
|
| 150 |
+
.reward-entry .cmd{color:var(--text-dim);flex:1;margin:0 0.5rem;overflow:hidden;text-overflow:ellipsis;white-space:nowrap}
|
| 151 |
+
.reward-entry .rew{font-weight:700;flex-shrink:0}
|
| 152 |
+
.reward-entry .rew.pos{color:var(--green)}
|
| 153 |
+
.reward-entry .rew.neg{color:var(--red)}
|
| 154 |
+
.reward-entry .rew.zero{color:var(--text-muted)}
|
| 155 |
+
|
| 156 |
+
/* Zone temps bar chart */
|
| 157 |
+
.zone-bars{display:flex;flex-direction:column;gap:0.375rem}
|
| 158 |
+
.zone-bar-row{display:flex;align-items:center;gap:0.5rem;font-size:0.72rem}
|
| 159 |
+
.zone-bar-label{width:44px;color:var(--text-dim);font-family:var(--font-mono);flex-shrink:0}
|
| 160 |
+
.zone-bar-track{flex:1;height:14px;background:var(--terminal-bg);border-radius:3px;position:relative;overflow:hidden;border:1px solid var(--border)}
|
| 161 |
+
.zone-bar-fill{height:100%;border-radius:2px;transition:width 0.3s}
|
| 162 |
+
.zone-bar-fill.safe{background:linear-gradient(90deg,var(--green-dim),var(--green))}
|
| 163 |
+
.zone-bar-fill.warning{background:linear-gradient(90deg,var(--yellow-dim),var(--yellow))}
|
| 164 |
+
.zone-bar-fill.critical{background:linear-gradient(90deg,var(--red-dim),var(--red))}
|
| 165 |
+
.zone-bar-value{width:52px;text-align:right;font-family:var(--font-mono);font-weight:600;flex-shrink:0}
|
| 166 |
+
|
| 167 |
+
/* Episode done banner */
|
| 168 |
+
.episode-done-banner{padding:0.625rem 1rem;text-align:center;font-weight:600;font-size:0.8rem;border-radius:var(--radius);margin-bottom:0.75rem;display:none}
|
| 169 |
+
.episode-done-banner.show{display:block}
|
| 170 |
+
.episode-done-banner.resolved{background:var(--green-dim);color:var(--green);border:1px solid rgba(34,197,94,0.3)}
|
| 171 |
+
.episode-done-banner.failed{background:var(--red-dim);color:var(--red);border:1px solid rgba(239,68,68,0.3)}
|
| 172 |
+
.episode-done-banner.timeout{background:var(--yellow-dim);color:var(--yellow);border:1px solid rgba(234,179,8,0.3)}
|
| 173 |
+
|
| 174 |
+
/* No data placeholder */
|
| 175 |
+
.no-data{font-size:0.75rem;color:var(--text-muted);text-align:center;padding:0.75rem 0.5rem}
|
| 176 |
+
|
| 177 |
+
/* Spinner */
|
| 178 |
+
.spinner{display:inline-block;width:14px;height:14px;border:2px solid rgba(255,255,255,0.2);border-top-color:currentColor;border-radius:50%;animation:spin 0.5s linear infinite}
|
| 179 |
+
@keyframes spin{to{transform:rotate(360deg)}}
|
| 180 |
+
|
| 181 |
+
/* Scrollbar */
|
| 182 |
+
::-webkit-scrollbar{width:5px;height:5px}
|
| 183 |
+
::-webkit-scrollbar-track{background:transparent}
|
| 184 |
+
::-webkit-scrollbar-thumb{background:var(--border);border-radius:3px}
|
| 185 |
+
::-webkit-scrollbar-thumb:hover{background:var(--text-muted)}
|
| 186 |
+
|
| 187 |
+
/* Responsive - Tablet */
|
| 188 |
+
@media(max-width:1100px){
|
| 189 |
+
.main{grid-template-columns:1fr;grid-template-rows:auto 1fr auto}
|
| 190 |
+
.sidebar{border-right:none;border-bottom:1px solid var(--border);max-height:none}
|
| 191 |
+
.sidebar.collapsed{display:none}
|
| 192 |
+
.right-panel{border-left:none;border-top:1px solid var(--border);max-height:none}
|
| 193 |
+
.right-panel.collapsed{display:none}
|
| 194 |
+
.sidebar-header{display:none}
|
| 195 |
+
.scenario-list{display:flex;overflow-x:auto;overflow-y:hidden;padding:0.5rem;gap:0.5rem}
|
| 196 |
+
.scenario-group{display:flex;gap:0.5rem;margin:0;flex-shrink:0}
|
| 197 |
+
.scenario-group-title{writing-mode:vertical-lr;padding:0.5rem 0.25rem;font-size:0.6rem}
|
| 198 |
+
.scenario-card{min-width:160px;flex-shrink:0}
|
| 199 |
+
.sidebar-actions{flex-direction:row}
|
| 200 |
+
.config-select{width:auto;flex:1}
|
| 201 |
+
.right-panel .panel-section{padding:0.625rem 0.75rem}
|
| 202 |
+
.metrics-grid{grid-template-columns:repeat(4,1fr)}
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
/* Responsive - Mobile */
|
| 206 |
+
@media(max-width:640px){
|
| 207 |
+
.header{padding:0.5rem 0.75rem}
|
| 208 |
+
.logo{font-size:1rem}
|
| 209 |
+
.dashboard-output{font-size:0.62rem;padding:0.5rem;line-height:1.2}
|
| 210 |
+
.metrics-grid{grid-template-columns:1fr 1fr}
|
| 211 |
+
html{font-size:13px}
|
| 212 |
+
.command-input{font-size:0.75rem}
|
| 213 |
+
.scenario-card{min-width:140px}
|
| 214 |
+
.sidebar-actions{flex-direction:column}
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
/* Toggle buttons for mobile */
|
| 218 |
+
.mobile-toggles{display:none;gap:0.5rem}
|
| 219 |
+
@media(max-width:1100px){.mobile-toggles{display:flex}}
|
| 220 |
+
.toggle-btn{padding:0.25rem 0.625rem;background:transparent;border:1px solid var(--border);border-radius:var(--radius);color:var(--text-dim);font-size:0.7rem;cursor:pointer;font-family:var(--font-sans);transition:all 0.15s}
|
| 221 |
+
.toggle-btn.active{border-color:var(--accent);color:var(--accent);background:rgba(59,130,246,0.08)}
|
| 222 |
+
</style>
|
| 223 |
+
</head>
|
| 224 |
+
<body>
|
| 225 |
+
|
| 226 |
+
<div class="app">
|
| 227 |
+
<!-- Header -->
|
| 228 |
+
<header class="header">
|
| 229 |
+
<div class="header-left">
|
| 230 |
+
<div class="logo">DC<span>-Ops</span></div>
|
| 231 |
+
<div id="statusBadge" class="status-badge disconnected">
|
| 232 |
+
<span class="status-dot"></span>
|
| 233 |
+
<span id="statusText">Disconnected</span>
|
| 234 |
+
</div>
|
| 235 |
+
</div>
|
| 236 |
+
<div class="mobile-toggles">
|
| 237 |
+
<button class="toggle-btn active" id="toggleScenarios" onclick="togglePanel('sidebar')">Scenarios</button>
|
| 238 |
+
<button class="toggle-btn active" id="toggleMetrics" onclick="togglePanel('right-panel')">Metrics</button>
|
| 239 |
+
</div>
|
| 240 |
+
</header>
|
| 241 |
+
|
| 242 |
+
<!-- Main Layout -->
|
| 243 |
+
<div class="main">
|
| 244 |
+
<!-- Left: Scenario Browser -->
|
| 245 |
+
<aside class="sidebar" id="sidebar">
|
| 246 |
+
<div class="sidebar-header">Scenario Browser</div>
|
| 247 |
+
<div class="scenario-list" id="scenarioList">
|
| 248 |
+
<div class="scenario-group">
|
| 249 |
+
<div class="scenario-group-title">Thermal</div>
|
| 250 |
+
<div class="scenario-card" data-id="A1" onclick="selectScenario('A1')">
|
| 251 |
+
<div class="sc-header">
|
| 252 |
+
<span class="sc-id">A1</span>
|
| 253 |
+
<span class="sc-diff easy">Easy</span>
|
| 254 |
+
</div>
|
| 255 |
+
<div class="sc-name">Cooling Setpoint Optimization</div>
|
| 256 |
+
<div class="sc-desc">CRACs overcooling at 15°C. Optimize for efficiency while staying in ASHRAE range.</div>
|
| 257 |
+
</div>
|
| 258 |
+
<div class="scenario-card" data-id="A2" onclick="selectScenario('A2')">
|
| 259 |
+
<div class="sc-header">
|
| 260 |
+
<span class="sc-id">A2</span>
|
| 261 |
+
<span class="sc-diff medium">Medium</span>
|
| 262 |
+
</div>
|
| 263 |
+
<div class="sc-name">Thermal Event Response</div>
|
| 264 |
+
<div class="sc-desc">CRAC-3 compressor failure. Diagnose and stabilize all zones.</div>
|
| 265 |
+
</div>
|
| 266 |
+
<div class="scenario-card" data-id="A4" onclick="selectScenario('A4')">
|
| 267 |
+
<div class="sc-header">
|
| 268 |
+
<span class="sc-id">A4</span>
|
| 269 |
+
<span class="sc-diff hard">Hard</span>
|
| 270 |
+
</div>
|
| 271 |
+
<div class="sc-name">CRAC Failure Cascade</div>
|
| 272 |
+
<div class="sc-desc">CRAC-1 compressor + CRAC-3 fan failure. Manage cascading thermal event.</div>
|
| 273 |
+
</div>
|
| 274 |
+
</div>
|
| 275 |
+
<div class="scenario-group">
|
| 276 |
+
<div class="scenario-group-title">Power</div>
|
| 277 |
+
<div class="scenario-card" data-id="B1" onclick="selectScenario('B1')">
|
| 278 |
+
<div class="sc-header">
|
| 279 |
+
<span class="sc-id">B1</span>
|
| 280 |
+
<span class="sc-diff medium">Medium</span>
|
| 281 |
+
</div>
|
| 282 |
+
<div class="sc-name">UPS Alarm Response</div>
|
| 283 |
+
<div class="sc-desc">UPS transferred to battery after utility event. Diagnose and acknowledge.</div>
|
| 284 |
+
</div>
|
| 285 |
+
<div class="scenario-card" data-id="B3" onclick="selectScenario('B3')">
|
| 286 |
+
<div class="sc-header">
|
| 287 |
+
<span class="sc-id">B3</span>
|
| 288 |
+
<span class="sc-diff easy">Easy</span>
|
| 289 |
+
</div>
|
| 290 |
+
<div class="sc-name">Generator Test Protocol</div>
|
| 291 |
+
<div class="sc-desc">Routine monthly generator test. Follow 5-step protocol correctly.</div>
|
| 292 |
+
</div>
|
| 293 |
+
<div class="scenario-card" data-id="B4" onclick="selectScenario('B4')">
|
| 294 |
+
<div class="sc-header">
|
| 295 |
+
<span class="sc-id">B4</span>
|
| 296 |
+
<span class="sc-diff hard">Hard</span>
|
| 297 |
+
</div>
|
| 298 |
+
<div class="sc-name">Power Failure Cascade</div>
|
| 299 |
+
<div class="sc-desc">Utility loss + extended generator warmup. Manage battery and thermal.</div>
|
| 300 |
+
</div>
|
| 301 |
+
</div>
|
| 302 |
+
</div>
|
| 303 |
+
<div class="sidebar-actions">
|
| 304 |
+
<select id="configSelect" class="config-select">
|
| 305 |
+
<option value="default">Default Facility (2 zones, 160 kW)</option>
|
| 306 |
+
<option value="small">Small Facility (1 zone, 80 kW)</option>
|
| 307 |
+
<option value="large">Large Facility (4 zones, 600 kW)</option>
|
| 308 |
+
</select>
|
| 309 |
+
<button id="startBtn" class="btn btn-primary" onclick="startEpisode()" disabled>
|
| 310 |
+
Select a Scenario
|
| 311 |
+
</button>
|
| 312 |
+
<button id="resetBtn" class="btn btn-outline" onclick="resetEpisode()" style="display:none">
|
| 313 |
+
Reset Episode
|
| 314 |
+
</button>
|
| 315 |
+
</div>
|
| 316 |
+
</aside>
|
| 317 |
+
|
| 318 |
+
<!-- Center: Dashboard Display -->
|
| 319 |
+
<div class="center">
|
| 320 |
+
<div class="dashboard-container" id="dashboardContainer">
|
| 321 |
+
<div id="doneBanner" class="episode-done-banner"></div>
|
| 322 |
+
<div class="dashboard-box">
|
| 323 |
+
<div class="dashboard-title-bar">
|
| 324 |
+
<div class="dots"><span></span><span></span><span></span></div>
|
| 325 |
+
<div class="title" id="terminalTitle">dc-ops-console</div>
|
| 326 |
+
</div>
|
| 327 |
+
<div class="dashboard-output" id="dashboardOutput"><div class="welcome">
|
| 328 |
+
<h2>DC-Ops Operations Console</h2>
|
| 329 |
+
<p>Select a scenario from the panel to begin a datacenter operations episode. Issue commands and monitor the facility in real-time.</p>
|
| 330 |
+
<div class="hint">
|
| 331 |
+
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M19 12H5M12 19l-7-7 7-7"/></svg>
|
| 332 |
+
Pick a scenario to start
|
| 333 |
+
</div>
|
| 334 |
+
</div></div>
|
| 335 |
+
</div>
|
| 336 |
+
<div id="actionResult" class="action-result" style="display:none"></div>
|
| 337 |
+
</div>
|
| 338 |
+
|
| 339 |
+
<!-- Command Bar -->
|
| 340 |
+
<div class="command-bar">
|
| 341 |
+
<div class="command-input-group">
|
| 342 |
+
<input type="text" id="commandInput" class="command-input"
|
| 343 |
+
placeholder="Enter command (e.g., diagnose CRAC-3)"
|
| 344 |
+
disabled autocomplete="off"
|
| 345 |
+
onkeydown="if(event.key==='Enter'&&!event.shiftKey)sendCommand()">
|
| 346 |
+
<button id="sendBtn" class="btn btn-primary" onclick="sendCommand()" disabled>Send</button>
|
| 347 |
+
</div>
|
| 348 |
+
<div class="quick-actions" id="quickActions">
|
| 349 |
+
<button class="quick-btn" disabled onclick="quickCmd('check_status')">check_status</button>
|
| 350 |
+
<button class="quick-btn" disabled onclick="quickCmd('diagnose CRAC-1')">diagnose CRAC-1</button>
|
| 351 |
+
<button class="quick-btn" disabled onclick="quickCmd('diagnose CRAC-3')">diagnose CRAC-3</button>
|
| 352 |
+
<button class="quick-btn" disabled onclick="quickCmd('acknowledge_alarm')">ack_alarm</button>
|
| 353 |
+
<button class="quick-btn" disabled onclick="quickCmd('start_generator')">start_gen</button>
|
| 354 |
+
<button class="quick-btn" disabled onclick="quickCmd('wait')">wait</button>
|
| 355 |
+
<button class="quick-btn" disabled onclick="quickCmd('escalate')">escalate</button>
|
| 356 |
+
</div>
|
| 357 |
+
</div>
|
| 358 |
+
</div>
|
| 359 |
+
|
| 360 |
+
<!-- Right: Metrics Panel -->
|
| 361 |
+
<aside class="right-panel" id="right-panel">
|
| 362 |
+
<div class="panel-section">
|
| 363 |
+
<div class="panel-section-title">Episode</div>
|
| 364 |
+
<div class="episode-info">
|
| 365 |
+
<div class="episode-row">
|
| 366 |
+
<span class="label">Scenario</span>
|
| 367 |
+
<span class="value" id="metaScenario">--</span>
|
| 368 |
+
</div>
|
| 369 |
+
<div class="episode-row">
|
| 370 |
+
<span class="label">Step</span>
|
| 371 |
+
<span class="value"><span id="metaStep">0</span> / <span id="metaMaxSteps">--</span></span>
|
| 372 |
+
</div>
|
| 373 |
+
<div class="progress-bar">
|
| 374 |
+
<div class="progress-fill" id="stepProgress" style="width:0%"></div>
|
| 375 |
+
</div>
|
| 376 |
+
<div class="episode-row">
|
| 377 |
+
<span class="label">Total Reward</span>
|
| 378 |
+
<span class="value" id="metaCumReward" style="color:var(--text)">0.00</span>
|
| 379 |
+
</div>
|
| 380 |
+
</div>
|
| 381 |
+
</div>
|
| 382 |
+
|
| 383 |
+
<div class="panel-section">
|
| 384 |
+
<div class="panel-section-title">Key Metrics</div>
|
| 385 |
+
<div class="metrics-grid">
|
| 386 |
+
<div class="metric">
|
| 387 |
+
<div class="metric-label">PUE</div>
|
| 388 |
+
<div class="metric-value neutral" id="metricPUE">--</div>
|
| 389 |
+
</div>
|
| 390 |
+
<div class="metric">
|
| 391 |
+
<div class="metric-label">IT Load</div>
|
| 392 |
+
<div class="metric-value neutral" id="metricIT">--</div>
|
| 393 |
+
</div>
|
| 394 |
+
<div class="metric">
|
| 395 |
+
<div class="metric-label">Cooling</div>
|
| 396 |
+
<div class="metric-value neutral" id="metricCooling">--</div>
|
| 397 |
+
</div>
|
| 398 |
+
<div class="metric">
|
| 399 |
+
<div class="metric-label">Outside</div>
|
| 400 |
+
<div class="metric-value neutral" id="metricOutside">--</div>
|
| 401 |
+
</div>
|
| 402 |
+
</div>
|
| 403 |
+
</div>
|
| 404 |
+
|
| 405 |
+
<div class="panel-section">
|
| 406 |
+
<div class="panel-section-title">Zone Temperatures</div>
|
| 407 |
+
<div class="zone-bars" id="zoneBars">
|
| 408 |
+
<div class="no-data">No data</div>
|
| 409 |
+
</div>
|
| 410 |
+
</div>
|
| 411 |
+
|
| 412 |
+
<div class="panel-section">
|
| 413 |
+
<div class="panel-section-title">Power</div>
|
| 414 |
+
<div id="powerInfo">
|
| 415 |
+
<div class="no-data">No data</div>
|
| 416 |
+
</div>
|
| 417 |
+
</div>
|
| 418 |
+
|
| 419 |
+
<div class="panel-section">
|
| 420 |
+
<div class="panel-section-title">Reward History</div>
|
| 421 |
+
<div class="reward-history" id="rewardHistory">
|
| 422 |
+
<div class="no-data">No steps yet</div>
|
| 423 |
+
</div>
|
| 424 |
+
</div>
|
| 425 |
+
</aside>
|
| 426 |
+
</div>
|
| 427 |
+
</div>
|
| 428 |
+
|
| 429 |
+
<script>
|
| 430 |
+
// ─── State ───────────────────────────────────────────────────────────
|
| 431 |
+
let selectedScenario = null;
|
| 432 |
+
let episodeActive = false;
|
| 433 |
+
let stepCount = 0;
|
| 434 |
+
let maxSteps = 0;
|
| 435 |
+
let cumulativeReward = 0;
|
| 436 |
+
let rewardEntries = [];
|
| 437 |
+
let isProcessing = false;
|
| 438 |
+
let ws = null;
|
| 439 |
+
let pendingResolve = null; // For awaiting WS responses
|
| 440 |
+
const BASE_URL = window.location.origin;
|
| 441 |
+
|
| 442 |
+
// ─── Scenario metadata ──────────────────────────────────────────────
|
| 443 |
+
const SCENARIOS = {
|
| 444 |
+
A1: { name: 'Cooling Setpoint Optimization', type: 'thermal', diff: 'Easy' },
|
| 445 |
+
A2: { name: 'Thermal Event Response', type: 'thermal', diff: 'Medium' },
|
| 446 |
+
A4: { name: 'CRAC Failure Cascade', type: 'thermal', diff: 'Hard' },
|
| 447 |
+
B1: { name: 'UPS Alarm Response', type: 'power', diff: 'Medium' },
|
| 448 |
+
B3: { name: 'Generator Test Protocol', type: 'power', diff: 'Easy' },
|
| 449 |
+
B4: { name: 'Power Failure Cascade', type: 'power', diff: 'Hard' },
|
| 450 |
+
};
|
| 451 |
+
|
| 452 |
+
// ─── WebSocket connection ────────────────────────────────────────────
|
| 453 |
+
/**
 * Open a new WebSocket to `/ws` on the current host and publish it as `ws`.
 *
 * Uses `wss:` when the page was loaded over HTTPS and `ws:` otherwise.
 * Incoming messages are handed to the single resolver stored in
 * `pendingResolve` (one request in flight at a time).
 *
 * @returns {Promise<void>} Resolves when the socket opens; rejects with an
 *   Error when the socket reports an error.
 */
function connectWebSocket() {
  return new Promise((resolve, reject) => {
    const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
    const wsUrl = `${wsProtocol}//${window.location.host}/ws`;

    // Keep a local handle: events of a socket that has since been replaced
    // (closeWebSocket() followed by a new connect) arrive asynchronously and
    // must not touch the state of the connection that superseded it.
    const socket = new WebSocket(wsUrl);
    ws = socket;

    socket.onopen = () => {
      setStatus('connected');
      resolve();
    };

    socket.onmessage = (event) => {
      // Ignore frames from a superseded socket and frames nobody is waiting for.
      if (ws !== socket || !pendingResolve) return;

      let msg;
      try {
        msg = JSON.parse(event.data);
      } catch (err) {
        // Surface a malformed frame as an error reply instead of leaving the
        // caller waiting for the request timeout.
        msg = { type: 'error', data: { message: 'Malformed server message' } };
      }

      const resolver = pendingResolve;
      pendingResolve = null;
      resolver(msg);
    };

    socket.onerror = () => {
      if (ws === socket) setStatus('disconnected');
      reject(new Error('WebSocket connection failed'));
    };

    socket.onclose = () => {
      // A deliberately closed or replaced socket: the caller already updated
      // the UI, and `ws` may now point at a newer, live connection.
      if (ws !== socket) return;

      setStatus('disconnected');
      ws = null;
      if (episodeActive) {
        episodeActive = false;
        setControlsEnabled(false);
        showActionResult('WebSocket disconnected. Reset to reconnect.', 'error');
      }
    };
  });
}
|
| 489 |
+
|
| 490 |
+
/**
 * Send one JSON message over the open WebSocket and wait for the next reply.
 *
 * Only one request may be in flight at a time: the reply is routed through
 * the shared `pendingResolve` slot.
 *
 * @param {object} message Payload to serialise with JSON.stringify.
 * @returns {Promise<object>} Resolves with the parsed reply; rejects if the
 *   socket is not open, if sending fails, or if no reply arrives within 30 s.
 */
function wsSend(message) {
  return new Promise((resolve, reject) => {
    if (!ws || ws.readyState !== WebSocket.OPEN) {
      reject(new Error('WebSocket not connected'));
      return;
    }

    let timer = null;
    // Wrap resolve so a reply also cancels the timeout timer.
    const settle = (msg) => {
      clearTimeout(timer);
      resolve(msg);
    };

    pendingResolve = settle;
    try {
      ws.send(JSON.stringify(message));
    } catch (err) {
      // Do not leave a resolver behind for a request that was never sent.
      if (pendingResolve === settle) pendingResolve = null;
      reject(err);
      return;
    }

    // Timeout after 30s. Always reject (a no-op once settled) so the caller
    // is released even if the pending slot was cleared elsewhere meanwhile.
    timer = setTimeout(() => {
      if (pendingResolve === settle) pendingResolve = null;
      reject(new Error('WebSocket request timed out'));
    }, 30000);
  });
}
|
| 507 |
+
|
| 508 |
+
/**
 * Close the current WebSocket (if any) and release any request still waiting
 * for a reply.
 *
 * The waiting caller is answered with an error-typed message, the same shape
 * callers already check for (`resp.type === 'error'`, `resp.data.message`),
 * so its `finally` block runs and `isProcessing` is cleared instead of the
 * await hanging forever.
 */
function closeWebSocket() {
  if (ws) {
    ws.close();
    ws = null;
  }

  const abandoned = pendingResolve;
  pendingResolve = null;
  if (abandoned) {
    abandoned({
      type: 'error',
      data: { message: 'Connection closed before a response arrived' }
    });
  }
}
|
| 515 |
+
|
| 516 |
+
// ─── UI helpers ──────────────────────────────────────────────────────
|
| 517 |
+
/**
 * Mark a scenario card as the current selection and arm the start button.
 * Does nothing while an episode is running.
 *
 * @param {string} id Scenario key; must exist in SCENARIOS.
 */
function selectScenario(id) {
  if (episodeActive) {
    return;
  }
  selectedScenario = id;

  // Move the highlight from whichever card had it to the chosen one.
  for (const el of document.querySelectorAll('.scenario-card')) {
    el.classList.remove('active');
  }
  const chosen = document.querySelector(`.scenario-card[data-id="${id}"]`);
  if (chosen) {
    chosen.classList.add('active');
  }

  const startButton = document.getElementById('startBtn');
  startButton.disabled = false;
  startButton.textContent = `Start ${id}: ${SCENARIOS[id].name}`;
}
|
| 527 |
+
|
| 528 |
+
/**
 * Collapse or expand a side panel and flip the state of its header toggle.
 *
 * @param {string} id Element id of the panel ('sidebar' maps to the
 *   scenarios toggle; any other id maps to the metrics toggle).
 */
function togglePanel(id) {
  document.getElementById(id).classList.toggle('collapsed');

  let toggleId = 'toggleMetrics';
  if (id === 'sidebar') {
    toggleId = 'toggleScenarios';
  }
  document.getElementById(toggleId).classList.toggle('active');
}
|
| 534 |
+
|
| 535 |
+
/**
 * Enable or disable the command input, the send button and every quick button.
 *
 * @param {boolean} enabled True to make the controls usable.
 */
function setControlsEnabled(enabled) {
  const off = !enabled;
  document.getElementById('commandInput').disabled = off;
  document.getElementById('sendBtn').disabled = off;
  for (const button of document.querySelectorAll('.quick-btn')) {
    button.disabled = off;
  }
}
|
| 540 |
+
|
| 541 |
+
/**
 * Run a predefined command: put it in the command input and submit it.
 * Ignored when no episode is active or a request is already in progress.
 *
 * @param {string} cmd Command text to send.
 */
function quickCmd(cmd) {
  const ready = episodeActive && !isProcessing;
  if (ready) {
    document.getElementById('commandInput').value = cmd;
    sendCommand();
  }
}
|
| 546 |
+
|
| 547 |
+
/**
 * Show a message in the action-result box.
 *
 * @param {string} msg Text to display (set as plain text).
 * @param {string} type Modifier class appended after 'action-result'.
 */
function showActionResult(msg, type) {
  const box = document.getElementById('actionResult');
  box.className = `action-result ${type}`;
  box.textContent = msg;
  box.style.display = 'block';
}
|
| 553 |
+
|
| 554 |
+
/**
 * Update the header status badge.
 *
 * @param {string} state 'connected', 'loading', or anything else (shown as
 *   "Disconnected"); also used verbatim as the badge modifier class.
 */
function setStatus(state) {
  let label = 'Disconnected';
  if (state === 'connected') {
    label = 'Connected';
  } else if (state === 'loading') {
    label = 'Loading...';
  }

  document.getElementById('statusBadge').className = `status-badge ${state}`;
  document.getElementById('statusText').textContent = label;
}
|
| 561 |
+
|
| 562 |
+
// ─── Dashboard text parsing ──────────────────────────────────────────
|
| 563 |
+
/**
 * Extract numeric and status metrics from the plain-text dashboard.
 *
 * Every field is optional: a key is only set when its pattern matches.
 * `zones` is always present (possibly empty).
 *
 * @param {string} dashboard Dashboard text from the observation.
 * @returns {{pue?: number, itLoad?: number, cooling?: number, outside?: number,
 *   zones: Array<{id: string, cold: number, hot: number, inlet: number}>,
 *   utility?: string, generator?: string, ats?: string, ups?: string}}
 */
function parseDashboard(dashboard) {
  // Tolerate a missing/non-string dashboard instead of throwing on .match().
  const text = typeof dashboard === 'string' ? dashboard : '';
  const metrics = {};

  // PUE
  const pueMatch = text.match(/PUE:\s+([\d.]+)/);
  if (pueMatch) metrics.pue = parseFloat(pueMatch[1]);

  // IT Load
  const itMatch = text.match(/IT Load:\s+([\d.]+)\s*kW/);
  if (itMatch) metrics.itLoad = parseFloat(itMatch[1]);

  // Cooling
  const coolMatch = text.match(/Cooling:\s+([\d.]+)\s*kW/);
  if (coolMatch) metrics.cooling = parseFloat(coolMatch[1]);

  // Outside temp — may be below zero, so accept a leading minus sign.
  const outMatch = text.match(/Outside:\s+(-?[\d.]+)°C/);
  if (outMatch) metrics.outside = parseFloat(outMatch[1]);

  // Zone temperatures: "<zone id> <cold>°C <hot>°C <inlet>°C", one per zone.
  metrics.zones = [];
  const zoneRegex = /(zone_\w+)\s+(-?[\d.]+)°C\s+(-?[\d.]+)°C\s+(-?[\d.]+)°C/g;
  let zm;
  while ((zm = zoneRegex.exec(text)) !== null) {
    metrics.zones.push({
      id: zm[1],
      cold: parseFloat(zm[2]),
      hot: parseFloat(zm[3]),
      inlet: parseFloat(zm[4])
    });
  }

  // Power info
  const utilMatch = text.match(/Utility:\s+(\w+)/);
  if (utilMatch) metrics.utility = utilMatch[1];

  // Generator status runs up to the next "|" separator or end of line.
  const genMatch = text.match(/Gen:\s+([^\n|]+)/);
  if (genMatch) metrics.generator = genMatch[1].trim();

  const atsMatch = text.match(/ATS:\s+(\w+)/);
  if (atsMatch) metrics.ats = atsMatch[1];

  // UPS keeps the rest of its line; callers split it on "|".
  const upsMatch = text.match(/UPS:\s+(.+)/);
  if (upsMatch) metrics.ups = upsMatch[1].trim();

  return metrics;
}
|
| 610 |
+
|
| 611 |
+
// ─── Start episode ───────────────────────────────────────────────────
|
| 612 |
+
/**
 * Start a new episode for the selected scenario.
 *
 * Opens a fresh WebSocket session, sends a `reset` request (with the chosen
 * facility config when it is not 'default'), renders the first observation
 * and switches the UI into "episode running" mode. On any failure the error
 * is shown and the start button is restored.
 */
async function startEpisode() {
  if (isProcessing || !selectedScenario) return;
  isProcessing = true;

  const startButton = document.getElementById('startBtn');
  startButton.disabled = true;
  startButton.innerHTML = '<span class="spinner"></span> Starting...';
  setStatus('loading');

  try {
    // Each WebSocket gets its own env instance, so always begin with a new one.
    closeWebSocket();
    await connectWebSocket();

    const payload = { scenario: selectedScenario };
    const chosenConfig = document.getElementById('configSelect').value;
    if (chosenConfig && chosenConfig !== 'default') {
      payload.config_name = chosenConfig;
    }

    const reply = await wsSend({ type: 'reset', data: payload });
    if (reply.type === 'error') {
      throw new Error(reply.data?.message || 'Reset failed');
    }

    // reply: { type: "observation", data: { observation: {...}, reward: float, done: bool } }
    episodeActive = true;
    stepCount = 0;
    cumulativeReward = 0;
    rewardEntries = [];

    processResponse(reply.data);

    // Swap the start button for the reset button and clear leftovers from
    // any previous episode.
    setControlsEnabled(true);
    startButton.style.display = 'none';
    document.getElementById('resetBtn').style.display = 'block';
    document.getElementById('doneBanner').classList.remove('show');
    document.getElementById('rewardHistory').innerHTML = '<div class="no-data">No steps yet</div>';
    document.getElementById('actionResult').style.display = 'none';

    const scenarioInfo = SCENARIOS[selectedScenario];
    document.getElementById('metaScenario').textContent = `${selectedScenario} - ${scenarioInfo.name}`;
  } catch (e) {
    setStatus('disconnected');
    showActionResult('Failed to start: ' + e.message, 'error');
    startButton.disabled = false;
    startButton.textContent = `Start ${selectedScenario}: ${SCENARIOS[selectedScenario].name}`;
    closeWebSocket();
  } finally {
    isProcessing = false;
  }
}
|
| 668 |
+
|
| 669 |
+
// ─── Reset episode ───────────────────────────────────────────────────
|
| 670 |
+
/**
 * End the current episode: close the connection, lock the command controls
 * and put the sidebar buttons, banner, title and status badge back into
 * their idle state.
 */
function resetEpisode() {
  episodeActive = false;
  setControlsEnabled(false);
  closeWebSocket();

  let startLabel = 'Select a Scenario';
  if (selectedScenario) {
    startLabel = `Start ${selectedScenario}: ${SCENARIOS[selectedScenario].name}`;
  }

  const startButton = document.getElementById('startBtn');
  startButton.style.display = 'block';
  startButton.disabled = false;
  startButton.textContent = startLabel;

  document.getElementById('resetBtn').style.display = 'none';
  document.getElementById('doneBanner').classList.remove('show');
  document.getElementById('terminalTitle').textContent = 'dc-ops-console';
  setStatus('disconnected');
}
|
| 683 |
+
|
| 684 |
+
// ─── Send command ────────────────────────────────────────────────────
|
| 685 |
+
/**
 * Submit the text in the command input as one `step` request and render the
 * reply. Ignored when the input is blank, no episode is active, or another
 * request is already in progress.
 */
async function sendCommand() {
  const inputEl = document.getElementById('commandInput');
  const text = inputEl.value.trim();
  if (text === '' || !episodeActive || isProcessing) return;

  inputEl.value = '';
  isProcessing = true;
  setControlsEnabled(false);

  // The send button is switched back on and shows a spinner while waiting.
  const sendButton = document.getElementById('sendBtn');
  sendButton.disabled = false;
  sendButton.innerHTML = '<span class="spinner"></span>';

  try {
    // WebSocket step: { type: "step", data: { command: "...", reasoning: "" } }
    const request = { type: 'step', data: { command: text, reasoning: '' } };
    const reply = await wsSend(request);

    if (reply.type === 'error') {
      throw new Error(reply.data?.message || 'Step failed');
    }

    // Only successful steps count towards the step total.
    stepCount += 1;
    processResponse(reply.data, text);
  } catch (e) {
    showActionResult('Error: ' + e.message, 'error');
  } finally {
    isProcessing = false;
    sendButton.textContent = 'Send';
    if (episodeActive) {
      setControlsEnabled(true);
      inputEl.focus();
    }
  }
}
|
| 721 |
+
|
| 722 |
+
// ─── Process API response ────────────────────────────────────────────
|
| 723 |
+
/**
 * Render one server reply: dashboard text, action result, step progress,
 * cumulative reward, parsed metrics, reward history, title bar and, when the
 * episode is over, the end-of-episode banner.
 *
 * @param {{observation?: object, reward?: number|null, done?: boolean}} data
 *   Reply payload.
 * @param {string|null} [command=null] Command that produced this reply;
 *   null for the initial reset observation.
 */
function processResponse(data, command = null) {
  const obs = data.observation || {};
  const reward = data.reward || 0;
  const done = data.done || false;
  const dashboard = obs.dashboard || '';

  // ── Dashboard display ──
  if (dashboard) {
    document.getElementById('dashboardOutput').textContent = dashboard;
  }

  // Keep the newest output in view.
  const scroller = document.getElementById('dashboardContainer');
  scroller.scrollTop = scroller.scrollHeight;

  // ── Action result (only for replies to a command) ──
  if (obs.action_result && command) {
    const looksLikeError = /error|invalid|unknown|unrecognized|fail/i.test(obs.action_result);
    showActionResult(obs.action_result, looksLikeError ? 'error' : 'success');
  }

  // ── Parse metrics from dashboard text ──
  const metrics = parseDashboard(dashboard);

  // ── Steps: the budget is derived from steps taken plus steps remaining ──
  maxSteps = (obs.steps_remaining || 0) + stepCount;
  document.getElementById('metaStep').textContent = stepCount;
  document.getElementById('metaMaxSteps').textContent = maxSteps;

  const pct = maxSteps > 0 ? (stepCount / maxSteps) * 100 : 0;
  let progressLevel = 'high';
  if (pct < 50) {
    progressLevel = 'low';
  } else if (pct < 80) {
    progressLevel = 'mid';
  }
  const progressEl = document.getElementById('stepProgress');
  progressEl.style.width = pct + '%';
  progressEl.className = 'progress-fill ' + progressLevel;

  // ── Cumulative reward ──
  cumulativeReward += reward;
  let rewardColor = 'var(--text)';
  if (cumulativeReward > 0) {
    rewardColor = 'var(--green)';
  } else if (cumulativeReward < -0.5) {
    rewardColor = 'var(--red)';
  }
  const totalEl = document.getElementById('metaCumReward');
  totalEl.textContent = cumulativeReward.toFixed(2);
  totalEl.style.color = rewardColor;

  // ── Key metrics from parsed dashboard ──
  const showNeutral = (elementId, text) => {
    const el = document.getElementById(elementId);
    el.textContent = text;
    el.className = 'metric-value neutral';
  };

  if (metrics.pue !== undefined) {
    let pueLevel = 'danger';
    if (metrics.pue < 1.5) {
      pueLevel = 'good';
    } else if (metrics.pue < 1.8) {
      pueLevel = 'warn';
    }
    const pueEl = document.getElementById('metricPUE');
    pueEl.textContent = metrics.pue.toFixed(2);
    pueEl.className = 'metric-value ' + pueLevel;
  }
  if (metrics.itLoad !== undefined) {
    showNeutral('metricIT', metrics.itLoad.toFixed(0) + ' kW');
  }
  if (metrics.cooling !== undefined) {
    showNeutral('metricCooling', metrics.cooling.toFixed(0) + ' kW');
  }
  if (metrics.outside !== undefined) {
    showNeutral('metricOutside', metrics.outside.toFixed(0) + '°C');
  }

  // ── Zone temperature bars ──
  if (metrics.zones && metrics.zones.length > 0) {
    updateZoneBars(metrics.zones);
  }

  // ── Power info ──
  updatePowerInfo(metrics);

  // ── Reward history ──
  if (command) {
    rewardEntries.push({ step: stepCount, cmd: command, reward: reward });
    updateRewardHistory();
  }

  // ── Terminal title ──
  document.getElementById('terminalTitle').textContent =
    `dc-ops — ${selectedScenario} — step ${stepCount}/${maxSteps}`;

  // ── Episode done ──
  if (!done) return;

  episodeActive = false;
  setControlsEnabled(false);

  // The outcome is inferred from keywords in the alert text.
  const alertText = (obs.alert || '').toLowerCase();
  const mentions = (...words) => words.some((word) => alertText.includes(word));

  const banner = document.getElementById('doneBanner');
  banner.classList.add('show');
  if (mentions('resolved', 'success', 'complete')) {
    banner.className = 'episode-done-banner show resolved';
    banner.textContent = 'Scenario Resolved Successfully';
  } else if (mentions('critical', 'emergency', 'shutdown')) {
    banner.className = 'episode-done-banner show failed';
    banner.textContent = 'Episode Ended — Critical Failure';
  } else {
    banner.className = 'episode-done-banner show timeout';
    banner.textContent = `Episode Ended — ${stepCount >= maxSteps ? 'Budget exhausted' : 'Terminated'}`;
  }
}
|
| 824 |
+
|
| 825 |
+
// ─── Zone bars ───────────────────────────────────────────────────────
|
| 826 |
+
/**
 * Redraw the zone temperature bars, one row per zone, from inlet temperature.
 *
 * The bar width maps 15–45°C onto 0–100% (clamped). Colour bands:
 * up to 27°C safe, up to 35°C warning, above that critical.
 *
 * @param {Array<{id: string, inlet: number}>} zones Parsed zone readings.
 */
function updateZoneBars(zones) {
  const container = document.getElementById('zoneBars');
  container.innerHTML = '';

  zones.forEach((zone) => {
    const inlet = zone.inlet;
    const scaled = ((inlet - 15) / 30) * 100;
    const width = Math.max(0, Math.min(100, scaled));

    let band = 'critical';
    let colorVar = '--red';
    if (inlet <= 27) {
      band = 'safe';
      colorVar = '--green';
    } else if (inlet <= 35) {
      band = 'warning';
      colorVar = '--yellow';
    }

    const name = zone.id.replace('zone_', '').toUpperCase();

    const rowEl = document.createElement('div');
    rowEl.className = 'zone-bar-row';
    rowEl.innerHTML = `
      <span class="zone-bar-label">${name}</span>
      <div class="zone-bar-track">
        <div class="zone-bar-fill ${band}" style="width:${width}%"></div>
      </div>
      <span class="zone-bar-value" style="color:var(${colorVar})">${inlet.toFixed(1)}°C</span>`;
    container.appendChild(rowEl);
  });
}
|
| 846 |
+
|
| 847 |
+
// ─── Power info ──────────────────────────────────────────────────────
|
| 848 |
+
/**
 * Redraw the power panel (utility, generator, ATS and one row per UPS entry).
 *
 * Values come from free-form dashboard text, so rows are built as DOM nodes
 * with `textContent` rather than interpolated into HTML: a "<" or "&" in a
 * status string is shown literally instead of being parsed as markup.
 *
 * @param {{utility?: string, generator?: string, ats?: string, ups?: string}} metrics
 *   Parsed dashboard metrics; missing fields produce no row.
 */
function updatePowerInfo(metrics) {
  const container = document.getElementById('powerInfo');
  const rows = [];

  // Build one "label / value" row; `cls` is the severity class (ok|warn|bad).
  const addRow = (label, value, cls) => {
    const row = document.createElement('div');
    row.className = 'power-row';

    const labelEl = document.createElement('span');
    labelEl.className = 'pw-label';
    labelEl.textContent = label;

    const valueEl = document.createElement('span');
    valueEl.className = `pw-val ${cls}`;
    valueEl.textContent = value;

    row.appendChild(labelEl);
    row.appendChild(valueEl);
    rows.push(row);
  };

  if (metrics.utility) {
    addRow('Utility', metrics.utility, metrics.utility === 'NORMAL' ? 'ok' : 'bad');
  }
  if (metrics.generator) {
    // Any generator state other than OFF is flagged as a warning.
    addRow('Generator', metrics.generator, metrics.generator.startsWith('OFF') ? 'ok' : 'warn');
  }
  if (metrics.ats) {
    addRow('ATS', metrics.ats, metrics.ats === 'UTILITY' ? 'ok' : 'warn');
  }
  if (metrics.ups) {
    // The UPS line may carry several "|"-separated entries.
    const parts = metrics.ups.split('|').map((s) => s.trim()).filter(Boolean);
    for (const part of parts) {
      const cls = /FAULT/i.test(part) ? 'bad' : /BATTERY/i.test(part) ? 'warn' : 'ok';
      addRow('UPS', part, cls);
    }
  }

  if (rows.length === 0) {
    container.innerHTML = '<div class="no-data">No data</div>';
    return;
  }
  container.innerHTML = '';
  for (const row of rows) {
    container.appendChild(row);
  }
}
|
| 877 |
+
|
| 878 |
+
// ─── Reward history ──────────────────────────────────────────────────
|
| 879 |
+
/**
 * Re-render the reward history list (#rewardHistory), newest entry first.
 *
 * Each entry of `rewardEntries` supplies `step`, `cmd` and a numeric
 * `reward`; rewards within ±0.005 of zero are styled as neutral.
 */
function updateRewardHistory() {
  const container = document.getElementById('rewardHistory');
  container.innerHTML = '';

  // `cmd` is free text typed by the user and is placed both in an attribute
  // and in element content, so it has to be escaped: an unescaped quote would
  // terminate the title attribute and "<" would be parsed as a tag.
  const esc = (value) => String(value).replace(/[&<>"']/g, (ch) => ({
    '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;',
  }[ch]));

  for (let i = rewardEntries.length - 1; i >= 0; i--) {
    const e = rewardEntries[i];
    const cls = e.reward > 0.005 ? 'pos' : e.reward < -0.005 ? 'neg' : 'zero';
    const sign = e.reward >= 0 ? '+' : '';
    const cmd = esc(e.cmd);
    const div = document.createElement('div');
    div.className = 'reward-entry';
    div.innerHTML = `
      <span class="step">${esc(e.step)}</span>
      <span class="cmd" title="${cmd}">${cmd}</span>
      <span class="rew ${cls}">${sign}${e.reward.toFixed(3)}</span>`;
    container.appendChild(div);
  }
}
|
| 895 |
+
|
| 896 |
+
// ─── Health check ────────────────────────────────────────────────────
|
| 897 |
+
/**
 * Probe the server's /health endpoint and update the connection indicator.
 *
 * Any non-2xx response or network error is reported as "disconnected".
 */
async function checkHealth() {
  try {
    const { ok } = await fetch(`${BASE_URL}/health`);
    setStatus(ok ? 'connected' : 'disconnected');
  } catch (err) {
    // fetch() rejects on network failure; treat it like an unhealthy server.
    setStatus('disconnected');
  }
}
|
| 906 |
+
|
| 907 |
+
// ─── Init ────────────────────────────────────────────────────────────
|
| 908 |
+
checkHealth();
|
| 909 |
+
</script>
|
| 910 |
+
</body>
|
| 911 |
+
</html>
|
simulation/__init__.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Datacenter simulation engine."""
|
| 8 |
+
|
| 9 |
+
from .power import PowerAlarm, PowerSimulation, PowerStepResult
|
| 10 |
+
from .thermal import ThermalAlarm, ThermalSimulation, ThermalStepResult
|
| 11 |
+
from .types import (
|
| 12 |
+
ATSPosition,
|
| 13 |
+
ATSState,
|
| 14 |
+
CRACFaultType,
|
| 15 |
+
CRACState,
|
| 16 |
+
CRACStatus,
|
| 17 |
+
DatacenterState,
|
| 18 |
+
GeneratorState,
|
| 19 |
+
GensetState,
|
| 20 |
+
PDUState,
|
| 21 |
+
PowerState,
|
| 22 |
+
RackState,
|
| 23 |
+
UPSMode,
|
| 24 |
+
UPSState,
|
| 25 |
+
ZoneState,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
__all__ = [
|
| 29 |
+
"PowerAlarm",
|
| 30 |
+
"PowerSimulation",
|
| 31 |
+
"PowerStepResult",
|
| 32 |
+
"ThermalAlarm",
|
| 33 |
+
"ThermalSimulation",
|
| 34 |
+
"ThermalStepResult",
|
| 35 |
+
"ATSPosition",
|
| 36 |
+
"ATSState",
|
| 37 |
+
"CRACFaultType",
|
| 38 |
+
"CRACState",
|
| 39 |
+
"CRACStatus",
|
| 40 |
+
"DatacenterState",
|
| 41 |
+
"GeneratorState",
|
| 42 |
+
"GensetState",
|
| 43 |
+
"PDUState",
|
| 44 |
+
"PowerState",
|
| 45 |
+
"RackState",
|
| 46 |
+
"UPSMode",
|
| 47 |
+
"UPSState",
|
| 48 |
+
"ZoneState",
|
| 49 |
+
]
|
simulation/power.py
ADDED
|
@@ -0,0 +1,668 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Power subsystem simulation: UPS, PDU, Generator, ATS.
|
| 9 |
+
|
| 10 |
+
Models the electrical power chain from utility/generator through UPS and PDU
|
| 11 |
+
to IT loads. Tracks efficiency losses, battery state-of-charge, generator
|
| 12 |
+
fuel consumption, and automatic transfer switching.
|
| 13 |
+
|
| 14 |
+
Physics references:
|
| 15 |
+
- UPS quadratic loss model: APC White Paper 108
|
| 16 |
+
- PDU three-phase power: P = √3 × V_LL × I_L × PF
|
| 17 |
+
- Generator fuel: linear with load fraction + 10% idle
|
| 18 |
+
- ATS transfer: mechanical switch timing (50-200 ms)
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
import math
|
| 24 |
+
from dataclasses import dataclass, field
|
| 25 |
+
|
| 26 |
+
from ..config import (
|
| 27 |
+
ATSConfig,
|
| 28 |
+
GeneratorConfig,
|
| 29 |
+
PDUConfig,
|
| 30 |
+
PowerConfig,
|
| 31 |
+
UPSConfig,
|
| 32 |
+
)
|
| 33 |
+
from .types import (
|
| 34 |
+
ATSPosition,
|
| 35 |
+
ATSState,
|
| 36 |
+
GeneratorState,
|
| 37 |
+
GensetState,
|
| 38 |
+
PDUState,
|
| 39 |
+
PowerState,
|
| 40 |
+
UPSMode,
|
| 41 |
+
UPSState,
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# ---------------------------------------------------------------------------
|
| 46 |
+
# Power step result
|
| 47 |
+
# ---------------------------------------------------------------------------
|
| 48 |
+
@dataclass
class PowerAlarm:
    """Alarm record raised by a component of the power chain."""

    # Identifier of the raising component, e.g. "UPS-1", "PDU-A1", "GEN-1", "ATS-1".
    component: str
    # Machine-readable kind, e.g. "on_battery", "low_battery", "overload", "fuel_low".
    alarm_type: str
    # Either "warning" or "critical".
    severity: str
    # Human-readable description of the condition.
    message: str
    # Numeric value associated with the alarm (SOC, load %, fuel level, ...).
    value: float = 0.0
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@dataclass
class PowerStepResult:
    """Summary returned by one call to the power simulation's ``step``."""

    # Losses, in kW: UPS, PDU, and their sum.
    total_ups_loss_kw: float = 0.0
    total_pdu_loss_kw: float = 0.0
    total_power_overhead_kw: float = 0.0

    # Generator output (kW) and fuel left in the tank (litres).
    generator_output_kw: float = 0.0
    generator_fuel_remaining_liters: float = 0.0

    # Source status flags.
    utility_available: bool = True
    on_generator: bool = False
    power_available: bool = True

    # Alarms raised during the step; each instance gets its own list.
    alarms: list[PowerAlarm] = field(default_factory=list)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# ---------------------------------------------------------------------------
|
| 73 |
+
# Power simulation
|
| 74 |
+
# ---------------------------------------------------------------------------
|
| 75 |
+
class PowerSimulation:
|
| 76 |
+
"""Simulates the datacenter power distribution chain.
|
| 77 |
+
|
| 78 |
+
Power flow:
|
| 79 |
+
Utility/Generator → ATS → UPS(es) → PDU(s) → IT Load
|
| 80 |
+
|
| 81 |
+
Each step():
|
| 82 |
+
1. ATS: detect utility loss/restoration, manage transfer
|
| 83 |
+
2. Generator: state machine (off → start_delay → cranking → warming → ready → loaded)
|
| 84 |
+
3. UPS: compute efficiency, manage battery SOC
|
| 85 |
+
4. PDU: compute losses, check phase currents
|
| 86 |
+
"""
|
| 87 |
+
|
| 88 |
+
def __init__(self, power_config: PowerConfig, it_load_kw: float = 160.0) -> None:
    """Create a simulation whose initial state is derived from *power_config*.

    Args:
        power_config: Static description of the UPS units, PDUs, generator
            and ATS.
        it_load_kw: IT load remembered until the first ``step`` call
            supplies a new value.
    """
    self._config = power_config
    # The mutable runtime state is built once from the static configuration.
    self._state = self._init_state(power_config)
    self._it_load_kw = it_load_kw
|
| 92 |
+
|
| 93 |
+
@property
def state(self) -> PowerState:
    """The live power state object; it is returned by reference, not copied."""
    return self._state
|
| 96 |
+
|
| 97 |
+
@staticmethod
def _init_state(config: PowerConfig) -> PowerState:
    """Build the initial ``PowerState`` described by *config*.

    Every UPS starts with a full battery (SOC 1.0) and the generator starts
    with a full fuel tank; all other values are copied from the config.
    """
    ups_fleet = [
        UPSState(
            unit_id=cfg.unit_id,
            mode=UPSMode(cfg.initial_mode),
            rated_capacity_kw=cfg.rated_capacity_kw,
            loss_c0=cfg.loss_c0,
            loss_c1=cfg.loss_c1,
            loss_c2=cfg.loss_c2,
            battery_capacity_kwh=cfg.battery_capacity_kwh,
            battery_discharge_efficiency=cfg.battery_discharge_efficiency,
            battery_aging_factor=cfg.battery_aging_factor,
            recharge_rate_kw=cfg.recharge_rate_kw,
            battery_soc=1.0,
        )
        for cfg in config.ups_units
    ]

    pdu_fleet = [
        PDUState(
            pdu_id=cfg.pdu_id,
            voltage_ll_v=cfg.voltage_ll_v,
            max_current_per_phase_a=cfg.max_current_per_phase_a,
            num_phases=cfg.num_phases,
            breaker_rating_a=cfg.breaker_rating_a,
            efficiency=cfg.efficiency,
            continuous_derating=cfg.continuous_derating,
        )
        for cfg in config.pdus
    ]

    genset_cfg = config.generator
    ats_cfg = config.ats

    return PowerState(
        ups_units=ups_fleet,
        pdus=pdu_fleet,
        generator=GensetState(
            gen_id=genset_cfg.gen_id,
            rated_capacity_kw=genset_cfg.rated_capacity_kw,
            start_delay_s=genset_cfg.start_delay_s,
            crank_time_s=genset_cfg.crank_time_s,
            warmup_time_s=genset_cfg.warmup_time_s,
            cooldown_time_s=genset_cfg.cooldown_time_s,
            fuel_tank_liters=genset_cfg.fuel_tank_liters,
            # The tank starts full.
            fuel_level_liters=genset_cfg.fuel_tank_liters,
            consumption_lph_full=genset_cfg.consumption_lph_full,
        ),
        ats=ATSState(
            ats_id=ats_cfg.ats_id,
            transfer_time_ms=ats_cfg.transfer_time_ms,
            retransfer_delay_s=ats_cfg.retransfer_delay_s,
        ),
        utility_available=config.utility_available,
        utility_voltage_v=config.utility_voltage_v,
    )
|
| 158 |
+
|
| 159 |
+
def step(self, dt_s: float, it_load_kw: float) -> PowerStepResult:
    """Advance the whole power chain by one timestep.

    The sub-steps run in a fixed order: ATS, generator, UPS units, PDUs.
    Load-side power availability is sampled after the ATS and generator
    have been updated but before the UPS units are stepped.

    Args:
        dt_s: Timestep in seconds.
        it_load_kw: Total IT power demand in kW.

    Returns:
        PowerStepResult carrying losses, source status and the alarms
        raised during this step.
    """
    self._it_load_kw = it_load_kw
    raised: list[PowerAlarm] = []

    # Source side: transfer switch first, then the generator state machine.
    self._step_ats(dt_s, raised)
    self._step_generator(dt_s, raised)

    load_side_powered = self._state.power_available

    # Load side: UPS conversion losses, then PDU distribution losses.
    ups_loss_kw = self._step_ups_units(dt_s, it_load_kw, raised)
    pdu_loss_kw = self._step_pdus(it_load_kw, raised)

    snapshot = self._state
    genset = snapshot.generator
    return PowerStepResult(
        total_ups_loss_kw=ups_loss_kw,
        total_pdu_loss_kw=pdu_loss_kw,
        total_power_overhead_kw=ups_loss_kw + pdu_loss_kw,
        generator_output_kw=genset.output_power_kw,
        generator_fuel_remaining_liters=genset.fuel_level_liters,
        utility_available=snapshot.utility_available,
        on_generator=snapshot.on_generator,
        power_available=load_side_powered,
        alarms=raised,
    )
|
| 198 |
+
|
| 199 |
+
# -------------------------------------------------------------------
|
| 200 |
+
# ATS
|
| 201 |
+
# -------------------------------------------------------------------
|
| 202 |
+
def _step_ats(self, dt_s: float, alarms: list[PowerAlarm]) -> None:
    """Advance the automatic transfer switch by one timestep.

    Transitions handled:
      * UTILITY -> TRANSFERRING when utility is lost (also requests a
        generator start).
      * TRANSFERRING -> UTILITY or GENERATOR once the transfer time has
        elapsed and the corresponding source is usable; otherwise the ATS
        keeps waiting in TRANSFERRING.
      * GENERATOR -> TRANSFERRING after utility has been back continuously
        for ``retransfer_delay_s``.

    Args:
        dt_s: Timestep in seconds.
        alarms: List that receives any alarms raised here (mutated in place).
    """
    ats = self._state.ats
    gen = self._state.generator
    utility_ok = self._state.utility_available

    if ats.position == ATSPosition.UTILITY:
        if not utility_ok:
            # Utility lost — initiate transfer to generator
            ats.position = ATSPosition.TRANSFERRING
            ats.transfer_elapsed_ms = 0.0
            ats.retransfer_timer_s = 0.0
            # Request a generator start unless one is already starting or
            # running. A generator that is cooling down must be restarted
            # too: left alone it would finish its cooldown, switch off and
            # never come back, leaving the ATS waiting in TRANSFERRING.
            if gen.state in (GeneratorState.OFF, GeneratorState.COOLDOWN):
                gen.state = GeneratorState.START_DELAY
                gen.state_elapsed_s = 0.0
            alarms.append(PowerAlarm(
                component=ats.ats_id,
                alarm_type="utility_lost",
                severity="critical",
                message="Utility power lost, initiating transfer to generator",
            ))

    elif ats.position == ATSPosition.TRANSFERRING:
        ats.transfer_elapsed_ms += dt_s * 1000.0  # seconds → milliseconds
        if ats.transfer_elapsed_ms >= ats.transfer_time_ms:
            # Transfer complete
            if utility_ok:
                # Utility is available (restored mid-transfer, or this is a
                # retransfer) — connect to utility
                ats.position = ATSPosition.UTILITY
                ats.transfer_elapsed_ms = 0.0
            elif gen.is_available:
                ats.position = ATSPosition.GENERATOR
                ats.transfer_elapsed_ms = 0.0
                alarms.append(PowerAlarm(
                    component=ats.ats_id,
                    alarm_type="on_generator",
                    severity="warning",
                    message="Load transferred to generator",
                ))
            # else: stay transferring until generator is ready

    elif ats.position == ATSPosition.GENERATOR:
        if utility_ok:
            # Utility restored — wait retransfer delay before switching back
            ats.retransfer_timer_s += dt_s
            if ats.retransfer_timer_s >= ats.retransfer_delay_s:
                ats.position = ATSPosition.TRANSFERRING
                ats.transfer_elapsed_ms = 0.0
                alarms.append(PowerAlarm(
                    component=ats.ats_id,
                    alarm_type="retransfer",
                    severity="warning",
                    message="Utility restored, initiating retransfer",
                ))
        else:
            # Utility dropped again: the stability timer starts over.
            ats.retransfer_timer_s = 0.0
|
| 259 |
+
|
| 260 |
+
# -------------------------------------------------------------------
|
| 261 |
+
# Generator
|
| 262 |
+
# -------------------------------------------------------------------
|
| 263 |
+
def _step_generator(self, dt_s: float, alarms: list[PowerAlarm]) -> None:
    """Advance the generator state machine by one timestep.

    States: OFF -> START_DELAY -> CRANKING -> WARMING -> READY -> LOADED
    -> COOLDOWN -> OFF. Fuel is burned at 10% of the full-load rate while
    warming, ready or cooling down, and at the load-dependent rate while
    loaded. Running out of fuel while loaded shuts the engine down.

    Args:
        dt_s: Timestep in seconds.
        alarms: List that receives any alarms raised here (mutated in place).
    """
    gen = self._state.generator

    if gen.state == GeneratorState.OFF:
        gen.output_power_kw = 0.0
        gen.load_fraction = 0.0
        gen.fuel_consumption_lph = 0.0
        return

    gen.state_elapsed_s += dt_s

    if gen.state == GeneratorState.START_DELAY:
        if gen.state_elapsed_s >= gen.start_delay_s:
            gen.state = GeneratorState.CRANKING
            gen.state_elapsed_s = 0.0

    elif gen.state == GeneratorState.CRANKING:
        if gen.state_elapsed_s >= gen.crank_time_s:
            gen.state = GeneratorState.WARMING
            gen.state_elapsed_s = 0.0
            alarms.append(PowerAlarm(
                component=gen.gen_id,
                alarm_type="engine_started",
                severity="warning",
                message="Generator engine started, warming up",
            ))

    elif gen.state == GeneratorState.WARMING:
        # Idle fuel consumption during warmup
        gen.fuel_consumption_lph = gen.consumption_lph_full * 0.1
        self._consume_fuel(gen, dt_s)
        if gen.state_elapsed_s >= gen.warmup_time_s:
            gen.state = GeneratorState.READY
            gen.state_elapsed_s = 0.0
            alarms.append(PowerAlarm(
                component=gen.gen_id,
                alarm_type="ready",
                severity="warning",
                message="Generator ready to accept load",
            ))

    elif gen.state == GeneratorState.READY:
        gen.fuel_consumption_lph = gen.consumption_lph_full * 0.1
        self._consume_fuel(gen, dt_s)
        # If ATS has switched to generator, transition to loaded
        if self._state.ats.position == ATSPosition.GENERATOR:
            gen.state = GeneratorState.LOADED
            gen.state_elapsed_s = 0.0

    elif gen.state == GeneratorState.LOADED:
        # Output is capped at the rated capacity. A zero-rated generator
        # reports no load instead of dividing by zero.
        if gen.rated_capacity_kw > 0:
            gen.load_fraction = min(self._it_load_kw / gen.rated_capacity_kw, 1.0)
        else:
            gen.load_fraction = 0.0
        gen.output_power_kw = min(self._it_load_kw, gen.rated_capacity_kw)
        gen.fuel_consumption_lph = gen.compute_fuel_consumption_lph()
        self._consume_fuel(gen, dt_s)

        # Check fuel level
        if gen.fuel_level_liters <= 0:
            gen.fuel_level_liters = 0.0
            gen.state = GeneratorState.OFF
            gen.output_power_kw = 0.0
            gen.load_fraction = 0.0
            gen.fuel_consumption_lph = 0.0
            alarms.append(PowerAlarm(
                component=gen.gen_id,
                alarm_type="fuel_exhausted",
                severity="critical",
                message="Generator fuel exhausted — engine shutdown",
            ))
            # The engine is off: return so the cooldown transition below
            # cannot put a dry generator back into a running state.
            return

        if gen.fuel_remaining_hours < 2.0:
            alarms.append(PowerAlarm(
                component=gen.gen_id,
                alarm_type="fuel_low",
                severity="warning",
                message=f"Generator fuel low: {gen.fuel_level_liters:.0f}L "
                        f"(~{gen.fuel_remaining_hours:.1f}h remaining)",
                value=gen.fuel_level_liters,
            ))

        # If utility is back and ATS has switched away, go to cooldown
        if self._state.ats.position != ATSPosition.GENERATOR:
            gen.state = GeneratorState.COOLDOWN
            gen.state_elapsed_s = 0.0
            gen.output_power_kw = 0.0
            gen.load_fraction = 0.0

    elif gen.state == GeneratorState.COOLDOWN:
        gen.output_power_kw = 0.0
        gen.load_fraction = 0.0
        gen.fuel_consumption_lph = gen.consumption_lph_full * 0.1
        self._consume_fuel(gen, dt_s)
        if gen.state_elapsed_s >= gen.cooldown_time_s:
            gen.state = GeneratorState.OFF
            gen.state_elapsed_s = 0.0
            gen.fuel_consumption_lph = 0.0
            alarms.append(PowerAlarm(
                component=gen.gen_id,
                alarm_type="shutdown",
                severity="warning",
                message="Generator cooldown complete, engine off",
            ))
|
| 362 |
+
|
| 363 |
+
@staticmethod
def _consume_fuel(gen: GensetState, dt_s: float) -> None:
    """Burn fuel for *dt_s* seconds at the generator's current rate.

    The rate is read from ``gen.fuel_consumption_lph`` (litres per hour);
    the tank level is reduced in place and never drops below zero. Nothing
    happens when the rate is not positive.
    """
    rate_lph = gen.fuel_consumption_lph
    if not rate_lph > 0:
        return
    burned_liters = rate_lph * dt_s / 3600.0  # dt_s is in seconds → convert to hours
    gen.fuel_level_liters = max(0.0, gen.fuel_level_liters - burned_liters)
|
| 369 |
+
|
| 370 |
+
# -------------------------------------------------------------------
|
| 371 |
+
# UPS
|
| 372 |
+
# -------------------------------------------------------------------
|
| 373 |
+
def _step_ups_units(
    self, dt_s: float, it_load_kw: float, alarms: list[PowerAlarm]
) -> float:
    """Step every UPS and return the combined UPS loss in kW.

    The IT load is shared equally between all configured units; with no
    units configured the loss is 0.0.
    """
    units = self._state.ups_units
    if not units:
        return 0.0

    share_kw = it_load_kw / len(units)  # equal split across units
    loss_kw = 0.0
    for unit in units:
        loss_kw += self._step_single_ups(unit, dt_s, share_kw, alarms)
    return loss_kw
|
| 389 |
+
|
| 390 |
+
def _step_single_ups(
    self,
    ups: UPSState,
    dt_s: float,
    load_kw: float,
    alarms: list[PowerAlarm],
) -> float:
    """Step a single UPS unit. Returns loss in kW.

    Updates the unit in place: output/input power, load fraction, operating
    mode, efficiency, heat output and battery state of charge.

    Args:
        ups: The UPS unit to update (mutated in place).
        dt_s: Timestep in seconds.
        load_kw: Load assigned to this unit in kW.
        alarms: List that receives any alarms raised here (mutated in place).

    Returns:
        Conversion loss of this unit in kW; 0.0 in FAULT and BYPASS modes.
    """
    ups.output_power_kw = load_kw
    ups.load_fraction = load_kw / ups.rated_capacity_kw if ups.rated_capacity_kw > 0 else 0.0

    utility_ok = self._state.utility_available
    ats_ok = self._state.ats.load_powered

    # Mode transitions
    if ups.mode == UPSMode.FAULT:
        # Fault state: no output, no charging
        ups.efficiency = 0.0
        ups.heat_output_kw = 0.0
        ups.input_power_kw = 0.0
        ups.battery_power_kw = 0.0
        return 0.0

    if ups.mode == UPSMode.BYPASS:
        # Bypass: no UPS processing, minimal losses
        ups.efficiency = 1.0
        ups.heat_output_kw = 0.0
        ups.input_power_kw = load_kw
        ups.battery_power_kw = 0.0
        return 0.0

    # Check if we need to switch to battery
    # NOTE(review): the source counts as healthy only while *utility* is
    # available, so with the ATS on generator the UPS stays ON_BATTERY and
    # keeps discharging — confirm this is intended.
    source_ok = utility_ok and ats_ok
    if ups.mode == UPSMode.ON_BATTERY:
        if source_ok:
            # Source restored — switch back to normal mode
            ups.mode = UPSMode.DOUBLE_CONVERSION
            alarms.append(PowerAlarm(
                component=ups.unit_id,
                alarm_type="utility_restored",
                severity="warning",
                message=f"UPS {ups.unit_id} back on utility power",
            ))
    elif not source_ok and ups.mode in (
        UPSMode.DOUBLE_CONVERSION, UPSMode.LINE_INTERACTIVE, UPSMode.ECO
    ):
        ups.mode = UPSMode.ON_BATTERY
        alarms.append(PowerAlarm(
            component=ups.unit_id,
            alarm_type="on_battery",
            severity="critical",
            message=f"UPS {ups.unit_id} switched to battery",
            value=ups.battery_soc,
        ))

    # Compute efficiency based on mode
    if ups.mode == UPSMode.ECO:
        # Eco mode: ~99% efficiency (minimal processing)
        ups.efficiency = 0.99
    elif ups.mode == UPSMode.LINE_INTERACTIVE:
        # Line interactive: ~97% (some processing)
        # i.e. the modelled efficiency plus 3 points, capped at 97%.
        ups.efficiency = min(0.97, ups.compute_efficiency() + 0.03)
    else:
        # Double conversion or on_battery: full quadratic model
        ups.efficiency = ups.compute_efficiency()

    # Compute losses
    if ups.efficiency > 0:
        # loss = input - output, with input = output / efficiency
        ups_loss = load_kw * (1.0 / ups.efficiency - 1.0)
    else:
        # Fallback when the efficiency is not positive: rated capacity
        # times the constant loss coefficient.
        ups_loss = ups.rated_capacity_kw * ups.loss_c0
    ups.heat_output_kw = ups_loss
    ups.input_power_kw = load_kw + ups_loss

    # Battery management
    if ups.mode == UPSMode.ON_BATTERY:
        # Discharging: SOC decreases
        # P_discharge = P_output / η_discharge (battery must supply more than output)
        p_discharge = load_kw / ups.battery_discharge_efficiency if ups.battery_discharge_efficiency > 0 else load_kw
        ups.battery_power_kw = p_discharge
        energy_used_kwh = p_discharge * dt_s / 3600.0  # kW × seconds → kWh
        # Usable capacity is the nominal capacity scaled by the aging factor.
        effective_capacity = ups.battery_capacity_kwh * ups.battery_aging_factor
        if effective_capacity > 0:
            ups.battery_soc -= energy_used_kwh / effective_capacity
            ups.battery_soc = max(0.0, ups.battery_soc)
        ups.battery_time_remaining_s = ups.compute_battery_time_remaining_s()
        ups.input_power_kw = 0.0  # Not drawing from mains

        # Battery alarms
        if ups.battery_soc <= 0.0:
            # An empty battery puts the unit into FAULT; subsequent steps
            # take the FAULT early return at the top of this method.
            ups.mode = UPSMode.FAULT
            alarms.append(PowerAlarm(
                component=ups.unit_id,
                alarm_type="battery_exhausted",
                severity="critical",
                message=f"UPS {ups.unit_id} battery exhausted — load unprotected",
            ))
        elif ups.battery_soc < 0.10:
            alarms.append(PowerAlarm(
                component=ups.unit_id,
                alarm_type="battery_critical",
                severity="critical",
                message=f"UPS {ups.unit_id} battery critical: {ups.battery_soc*100:.0f}%",
                value=ups.battery_soc,
            ))
        elif ups.battery_soc < 0.25:
            alarms.append(PowerAlarm(
                component=ups.unit_id,
                alarm_type="battery_low",
                severity="warning",
                message=f"UPS {ups.unit_id} battery low: {ups.battery_soc*100:.0f}%",
                value=ups.battery_soc,
            ))
    else:
        # On mains — charge battery if not full
        ups.battery_power_kw = 0.0
        ups.battery_time_remaining_s = float("inf")
        if ups.battery_soc < 1.0:
            # Charge power is the configured recharge rate, limited to 10%
            # of the unit's rated capacity.
            charge_kw = min(ups.recharge_rate_kw, ups.rated_capacity_kw * 0.1)
            energy_charged_kwh = charge_kw * dt_s / 3600.0  # kW × seconds → kWh
            effective_capacity = ups.battery_capacity_kwh * ups.battery_aging_factor
            if effective_capacity > 0:
                ups.battery_soc += energy_charged_kwh / effective_capacity
                ups.battery_soc = min(1.0, ups.battery_soc)
            ups.battery_power_kw = -charge_kw  # Negative = charging
            ups.input_power_kw += charge_kw  # Charging draws additional power

    # Overload alarm
    if ups.load_fraction > 1.0:
        alarms.append(PowerAlarm(
            component=ups.unit_id,
            alarm_type="overload",
            severity="critical",
            message=f"UPS {ups.unit_id} overloaded at {ups.load_fraction*100:.0f}%",
            value=ups.load_fraction,
        ))

    return ups_loss
|
| 528 |
+
|
| 529 |
+
# -------------------------------------------------------------------
|
| 530 |
+
# PDU
|
| 531 |
+
# -------------------------------------------------------------------
|
| 532 |
+
def _step_pdus(
    self, it_load_kw: float, alarms: list[PowerAlarm]
) -> float:
    """Step every PDU and return the combined PDU loss in kW.

    The IT load is shared equally between all configured PDUs; with no
    PDUs configured the loss is 0.0.
    """
    units = self._state.pdus
    if not units:
        return 0.0

    share_kw = it_load_kw / len(units)  # equal split across PDUs
    loss_kw = 0.0
    for unit in units:
        loss_kw += self._step_single_pdu(unit, share_kw, alarms)
    return loss_kw
|
| 548 |
+
|
| 549 |
+
def _step_single_pdu(
    self,
    pdu: PDUState,
    load_kw: float,
    alarms: list[PowerAlarm],
) -> float:
    """Step a single PDU. Returns loss in kW.

    Updates the PDU in place: input/output power, heat output, per-phase
    currents, load fraction, phase imbalance and the overload /
    breaker-tripped flags.

    Args:
        pdu: The PDU to update (mutated in place).
        load_kw: Load assigned to this PDU in kW.
        alarms: List that receives any alarms raised here (mutated in place).

    Returns:
        Distribution loss of this PDU in kW.
    """
    pdu.output_power_kw = load_kw
    pdu.input_power_kw = load_kw / pdu.efficiency if pdu.efficiency > 0 else load_kw
    pdu_loss = pdu.input_power_kw - pdu.output_power_kw
    pdu.heat_output_kw = pdu_loss

    # Compute per-phase currents (assume balanced load across phases).
    # Three-phase: P = √3 × V_LL × I_L × PF (assume PF = 1.0 for IT loads
    # with PFC). I_L is already the current carried by EACH phase conductor,
    # so it must not be divided by the number of phases again.
    # Otherwise the single-phase relation P = V × I is used.
    if pdu.voltage_ll_v > 0:
        watts = load_kw * 1000.0
        if pdu.num_phases == 3:
            per_phase = watts / (math.sqrt(3) * pdu.voltage_ll_v)
        else:
            per_phase = watts / pdu.voltage_ll_v
        pdu.phase_currents_a = [per_phase] * pdu.num_phases
    else:
        pdu.phase_currents_a = [0.0] * pdu.num_phases

    # Load fraction of derated capacity
    derated = pdu.derated_capacity_kw
    pdu.load_fraction = load_kw / derated if derated > 0 else 0.0

    # Phase imbalance (0 for balanced load — will be nonzero when
    # individual rack loads are modeled)
    pdu.phase_imbalance_pct = pdu.compute_phase_imbalance()

    # Check overload
    max_phase_current = max(pdu.phase_currents_a) if pdu.phase_currents_a else 0.0
    if max_phase_current > pdu.max_current_per_phase_a:
        pdu.overload = True
        alarms.append(PowerAlarm(
            component=pdu.pdu_id,
            alarm_type="phase_overcurrent",
            severity="critical",
            message=f"PDU {pdu.pdu_id} phase overcurrent: "
                    f"{max_phase_current:.1f}A > {pdu.max_current_per_phase_a:.0f}A",
            value=max_phase_current,
        ))
    else:
        pdu.overload = False

    # Breaker trip check (per-branch, simplified as aggregate).
    # A non-positive derating factor falls back to the plain breaker rating
    # instead of dividing by zero.
    if pdu.continuous_derating > 0:
        trip_threshold_a = pdu.breaker_rating_a / pdu.continuous_derating
    else:
        trip_threshold_a = pdu.breaker_rating_a
    if max_phase_current > trip_threshold_a:
        pdu.breaker_tripped = True
        alarms.append(PowerAlarm(
            component=pdu.pdu_id,
            alarm_type="breaker_trip",
            severity="critical",
            message=f"PDU {pdu.pdu_id} breaker tripped",
            value=max_phase_current,
        ))

    # Warn on high utilization (suppressed when already flagged as overload)
    if pdu.load_fraction > 0.80 and not pdu.overload:
        alarms.append(PowerAlarm(
            component=pdu.pdu_id,
            alarm_type="high_utilization",
            severity="warning",
            message=f"PDU {pdu.pdu_id} at {pdu.load_fraction*100:.0f}% of derated capacity",
            value=pdu.load_fraction,
        ))

    return pdu_loss
|
| 615 |
+
|
| 616 |
+
# -------------------------------------------------------------------
|
| 617 |
+
# Mutation helpers (for agent actions)
|
| 618 |
+
# -------------------------------------------------------------------
|
| 619 |
+
def set_utility_available(self, available: bool) -> None:
    """Record whether utility power is currently available.

    Intended for scenario injection: the flag is written straight onto
    the power state without any further side effects here.

    Args:
        available: New value for the state's ``utility_available`` flag.
    """
    power_state = self._state
    power_state.utility_available = available
|
| 622 |
+
|
| 623 |
+
def set_ups_mode(self, unit_id: str, mode: UPSMode) -> bool:
    """Force the operating mode of the UPS identified by ``unit_id``.

    Only the first UPS whose ``unit_id`` matches is modified.

    Args:
        unit_id: Identifier of the UPS to change.
        mode: Mode to assign to that UPS.

    Returns:
        True if a matching UPS was found and updated, False otherwise.
    """
    target = next(
        (unit for unit in self._state.ups_units if unit.unit_id == unit_id),
        None,
    )
    if target is None:
        return False
    target.mode = mode
    return True
|
| 630 |
+
|
| 631 |
+
def inject_ups_fault(self, unit_id: str) -> bool:
    """Switch the named UPS into ``UPSMode.FAULT``.

    Delegates to ``set_ups_mode``.

    Returns:
        True if the UPS was found, False otherwise.
    """
    was_found = self.set_ups_mode(unit_id, UPSMode.FAULT)
    return was_found
|
| 634 |
+
|
| 635 |
+
def clear_ups_fault(self, unit_id: str) -> bool:
    """Return a faulted UPS to double-conversion mode.

    A UPS is only touched when its id matches AND it is currently in
    ``UPSMode.FAULT``; units in any other mode are left alone.

    Returns:
        True if a matching faulted UPS was restored, False otherwise.
    """
    for unit in self._state.ups_units:
        if unit.unit_id != unit_id or unit.mode != UPSMode.FAULT:
            continue
        unit.mode = UPSMode.DOUBLE_CONVERSION
        return True
    return False
|
| 642 |
+
|
| 643 |
+
def start_generator(self) -> None:
    """Manually request a generator start.

    Has an effect only when the generator is ``OFF``: it then enters
    ``START_DELAY`` with its state timer reset. In any other state the
    call is a no-op.
    """
    generator = self._state.generator
    if generator.state != GeneratorState.OFF:
        return
    generator.state = GeneratorState.START_DELAY
    generator.state_elapsed_s = 0.0
|
| 649 |
+
|
| 650 |
+
def stop_generator(self) -> None:
    """Begin generator cooldown/shutdown.

    Applies only when the generator is ``READY`` or ``LOADED``. It is
    moved to ``COOLDOWN``, the state timer is reset, and its output power
    and load fraction are zeroed. In any other state the call is a no-op.
    """
    generator = self._state.generator
    if generator.state not in (GeneratorState.READY, GeneratorState.LOADED):
        return
    generator.state = GeneratorState.COOLDOWN
    generator.state_elapsed_s = 0.0
    generator.output_power_kw = 0.0
    generator.load_fraction = 0.0
|
| 658 |
+
|
| 659 |
+
def refuel_generator(self, liters: float | None = None) -> None:
    """Add fuel to the generator tank.

    Args:
        liters: Amount to add. ``None`` (the default) fills the tank
            completely. Otherwise the amount is added to the current
            level and the result is capped at the tank capacity.
    """
    generator = self._state.generator
    tank_capacity = generator.fuel_tank_liters
    if liters is not None:
        topped_up = generator.fuel_level_liters + liters
        generator.fuel_level_liters = min(topped_up, tank_capacity)
        return
    generator.fuel_level_liters = tank_capacity
|
simulation/thermal.py
ADDED
|
@@ -0,0 +1,515 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
RC thermal network simulation for datacenter zones.
|
| 9 |
+
|
| 10 |
+
Physics model (lumped-capacitance, per zone):
|
| 11 |
+
|
| 12 |
+
C_zone × dT_zone/dt = Q_IT - Q_cooling + Q_envelope + Q_internal
|
| 13 |
+
|
| 14 |
+
Where:
|
| 15 |
+
C_zone = C_air + C_equipment [J/K]
|
| 16 |
+
Q_IT = sum of rack IT loads × 1000 [W]
|
| 17 |
+
Q_cool = sum of CRAC cooling outputs × 1000 [W]
|
| 18 |
+
Q_env = (T_outside - T_zone) / R_envelope [W]
|
| 19 |
+
Q_int = UPS losses + PDU losses + lighting [W]
|
| 20 |
+
|
| 21 |
+
Cold aisle temperature accounts for hot-air recirculation:
|
| 22 |
+
T_cold_effective = (1-r) × T_supply_weighted + r × T_hot_aisle
|
| 23 |
+
|
| 24 |
+
where r is the recirculation factor (0 = perfect containment).
|
| 25 |
+
|
| 26 |
+
Hot aisle temperature from server energy balance:
|
| 27 |
+
T_hot = T_cold + Q_IT / (m_dot_rack × c_p)
|
| 28 |
+
|
| 29 |
+
Integration: Forward Euler with configurable dt (default 1.0 s).
|
| 30 |
+
Target: <1 ms per step for a 20-rack, 4-CRAC datacenter.
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
from __future__ import annotations
|
| 34 |
+
|
| 35 |
+
from dataclasses import dataclass, field
|
| 36 |
+
|
| 37 |
+
from ..config import (
|
| 38 |
+
AIR_DENSITY_KG_M3,
|
| 39 |
+
AIR_SPECIFIC_HEAT_J_KGK,
|
| 40 |
+
ASHRAE_CLASSES,
|
| 41 |
+
DatacenterConfig,
|
| 42 |
+
RackConfig,
|
| 43 |
+
CRACConfig,
|
| 44 |
+
ZoneConfig,
|
| 45 |
+
cfm_to_m3s,
|
| 46 |
+
make_default_datacenter_config,
|
| 47 |
+
)
|
| 48 |
+
from .types import (
|
| 49 |
+
CRACFaultType,
|
| 50 |
+
CRACState,
|
| 51 |
+
CRACStatus,
|
| 52 |
+
DatacenterState,
|
| 53 |
+
RackState,
|
| 54 |
+
ZoneState,
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@dataclass
class ThermalAlarm:
    """An active thermal alarm.

    One instance describes a single rack whose inlet temperature exceeded
    a limit of its zone's ASHRAE class during a simulation step.
    """
    rack_id: str  # Rack whose inlet temperature raised the alarm
    zone_id: str  # Zone that contains the rack
    inlet_temp_c: float  # Rack inlet temperature at the time of the alarm
    threshold_c: float  # The limit that was exceeded
    severity: str  # "warning" (recommended exceeded) or "critical" (allowable exceeded)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
@dataclass
class ThermalStepResult:
    """Result of a single simulation step."""
    state: DatacenterState  # State object after the step
    alarms: list[ThermalAlarm] = field(default_factory=list)  # Alarms raised in this step
    total_cooling_output_kw: float = 0.0  # Summed CRAC cooling output over all zones
    total_cooling_power_kw: float = 0.0  # Summed CRAC power consumption over all zones
    energy_consumed_kwh: float = 0.0  # Energy consumed in this step
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class ThermalSimulation:
|
| 79 |
+
"""Multi-zone RC thermal network simulation.
|
| 80 |
+
|
| 81 |
+
Owns the DatacenterState and advances it forward in time.
|
| 82 |
+
Each call to step() integrates the thermal ODEs by dt seconds.
|
| 83 |
+
"""
|
| 84 |
+
|
| 85 |
+
def __init__(self, config: DatacenterConfig | None = None):
    """Create a simulation from ``config``.

    Args:
        config: Datacenter configuration. When omitted, the default
            configuration from ``make_default_datacenter_config()`` is
            used.
    """
    resolved = make_default_datacenter_config() if config is None else config
    self._config = resolved
    self._state = self._build_initial_state(resolved)
    self._dt = resolved.simulation_dt_s
|
| 91 |
+
|
| 92 |
+
@property
def state(self) -> DatacenterState:
    """The live DatacenterState owned by this simulation (not a copy)."""
    return self._state
|
| 95 |
+
|
| 96 |
+
@property
def config(self) -> DatacenterConfig:
    """The DatacenterConfig this simulation was constructed with."""
    return self._config
|
| 99 |
+
|
| 100 |
+
@property
def dt(self) -> float:
    """Default integration time step in seconds, used when step() gets no dt."""
    return self._dt
|
| 103 |
+
|
| 104 |
+
# ------------------------------------------------------------------
|
| 105 |
+
# Initialization
|
| 106 |
+
# ------------------------------------------------------------------
|
| 107 |
+
|
| 108 |
+
@staticmethod
def _build_initial_state(config: DatacenterConfig) -> DatacenterState:
    """Construct the initial DatacenterState from configuration.

    Builds one ZoneState per configured zone (racks and CRAC units
    included), wraps them in a DatacenterState, then runs 300 settling
    steps of 1 s so the initial temperatures are physically consistent.
    ``sim_time_s`` is not advanced by settling because ``_integrate_step``
    is called directly rather than ``step``.

    Args:
        config: Datacenter configuration to build the state from.

    Returns:
        The settled initial state.
    """
    zones: list[ZoneState] = []
    for zc in config.zones:
        racks = ThermalSimulation._build_racks(zc, zc.initial_cold_aisle_temp_c)
        cracs = ThermalSimulation._build_cracs(zc)
        zone = ZoneState(
            zone_id=zc.zone_id,
            cold_aisle_temp_c=zc.initial_cold_aisle_temp_c,
            hot_aisle_temp_c=zc.initial_cold_aisle_temp_c + 15.0,  # Initial estimate
            humidity_rh=zc.initial_humidity_rh,
            recirculation_factor=zc.recirculation_factor,
            racks=racks,
            crac_units=cracs,
            air_volume_m3=zc.air_volume_m3,
            envelope_r_kw=zc.envelope_r_kw,
            ashrae_class=zc.ashrae_class,
        )
        zones.append(zone)

    state = DatacenterState(
        zones=zones,
        outside_temp_c=config.outside_temp_c,
        outside_humidity_rh=config.outside_humidity_rh,
        lighting_power_kw=config.lighting_w_per_m2 * config.floor_area_m2 / 1000.0,
        ups_loss_fraction=config.ups_loss_fraction,
        pdu_loss_fraction=config.pdu_loss_fraction,
        sim_time_s=0.0,
    )

    # Run a few settling steps so initial temps are physically consistent.
    # A bare instance is used (bypassing __init__) to avoid recursing back
    # into this method. It carries the caller's config rather than a
    # freshly built default one, so the settling simulation matches the
    # state it is settling.
    sim = ThermalSimulation.__new__(ThermalSimulation)
    sim._state = state
    sim._config = config
    sim._dt = 1.0
    for _ in range(300):
        sim._integrate_step(1.0)

    return state
|
| 148 |
+
|
| 149 |
+
@staticmethod
def _build_racks(zone_config: ZoneConfig, initial_temp_c: float) -> list[RackState]:
    """Create the RackState list for one zone.

    Each rack starts with its inlet at ``initial_temp_c`` and its outlet
    15 °C above that (a placeholder that settling corrects). Airflow is
    the rack's CFM-per-kW ratio times its IT load, converted to m³/s.
    Thermal mass is the number of 2U servers times the per-server
    thermal mass.

    Args:
        zone_config: Zone whose ``racks`` are instantiated.
        initial_temp_c: Starting inlet temperature for every rack.

    Returns:
        One RackState per configured rack, in configuration order.
    """
    return [
        RackState(
            rack_id=rack_cfg.rack_id,
            row=rack_cfg.row,
            position=rack_cfg.position,
            it_load_kw=rack_cfg.it_load_kw,
            inlet_temp_c=initial_temp_c,
            outlet_temp_c=initial_temp_c + 15.0,  # Will be corrected by settling
            airflow_m3s=cfm_to_m3s(rack_cfg.airflow_cfm_per_kw * rack_cfg.it_load_kw),
            thermal_mass_jk=rack_cfg.num_servers_2u * rack_cfg.server_thermal_mass_jk,
        )
        for rack_cfg in zone_config.racks
    ]
|
| 169 |
+
|
| 170 |
+
@staticmethod
def _build_cracs(zone_config: ZoneConfig) -> list[CRACState]:
    """Create the CRACState list for one zone.

    Every unit starts with its supply temperature equal to its initial
    setpoint. The maximum airflow is converted from CFM to m³/s; all
    other parameters are copied from the unit's configuration.

    Args:
        zone_config: Zone whose ``crac_units`` are instantiated.

    Returns:
        One CRACState per configured unit, in configuration order.
    """
    return [
        CRACState(
            unit_id=unit_cfg.unit_id,
            setpoint_c=unit_cfg.initial_setpoint_c,
            supply_temp_c=unit_cfg.initial_setpoint_c,
            fan_speed_pct=unit_cfg.initial_fan_speed_pct,
            max_airflow_m3s=cfm_to_m3s(unit_cfg.max_airflow_cfm),
            rated_capacity_kw=unit_cfg.rated_capacity_kw,
            rated_return_temp_c=unit_cfg.rated_return_temp_c,
            capacity_slope_per_c=unit_cfg.capacity_slope_per_c,
            fan_rated_power_kw=unit_cfg.fan_rated_power_kw,
            cop_rated=unit_cfg.cop_rated,
            cop_degradation_per_c=unit_cfg.cop_degradation_per_c,
            supply_temp_lag_s=unit_cfg.supply_temp_lag_s,
        )
        for unit_cfg in zone_config.crac_units
    ]
|
| 190 |
+
|
| 191 |
+
# ------------------------------------------------------------------
|
| 192 |
+
# Simulation step
|
| 193 |
+
# ------------------------------------------------------------------
|
| 194 |
+
|
| 195 |
+
def step(self, dt: float | None = None) -> ThermalStepResult:
    """Advance the simulation by one time step.

    Args:
        dt: Step length in seconds. Falls back to the simulation's
            default step when ``None``.

    Returns:
        A ThermalStepResult with the updated state, alarms, and energy
        metrics. The simulation clock is advanced by the step length
        after integration.
    """
    step_s = self._dt if dt is None else dt
    outcome = self._integrate_step(step_s)
    self._state.sim_time_s += step_s
    return outcome
|
| 206 |
+
|
| 207 |
+
def step_n(self, n: int, dt: float | None = None) -> ThermalStepResult:
    """Run ``n`` consecutive steps and return the result of the last one.

    Args:
        n: Number of steps to take.
        dt: Step length in seconds passed to each ``step`` call.

    Returns:
        The ThermalStepResult of the final step. If no step is taken
        (``n`` <= 0), a result with the current state and default
        metrics is returned.
    """
    latest = ThermalStepResult(state=self._state)
    step_indices = range(n)
    for _index in step_indices:
        latest = self.step(dt)
    return latest
|
| 213 |
+
|
| 214 |
+
def _integrate_step(self, dt: float) -> ThermalStepResult:
    """Core integration: one Forward Euler step across all zones.

    Does NOT advance ``sim_time_s``; ``step`` does that after calling here.

    Physics model — **cold aisle energy balance** (not total-zone):

    The cold aisle is a mixing volume. Heat flows into/out of it:
        q_crac    = Σ m_dot_i × c_p × (T_supply_i − T_cold)      [per-CRAC supply mixing]
        q_recirc  = r × max(m_dot_rack, m_dot_crac) × c_p × ΔT_server
                                                                [containment leakage]
        q_natural = (1 − m_dot_crac/m_dot_rack) × m_dot_rack × c_p × ΔT_server
                    (only when m_dot_crac < m_dot_rack)         [uncaptured exhaust]
        q_env     = (T_outside − T_cold) / R_envelope           [building heat gain]
        q_int     = UPS losses + PDU losses + lighting          [internal gains]

    IT heat does NOT appear directly — servers move cold air to the hot
    aisle, raising T_hot. IT heat affects the cold aisle only through
    recirculation / natural return (hot air leaking back) and indirectly
    via CRAC return temperature.

    Hot aisle temperature (algebraic, not ODE):
        T_hot = T_cold + Q_IT / (m_dot_rack × c_p)
        (a fixed 50 °C rise is used when there is no rack airflow)

    CRAC return air temperature accounts for bypass airflow:
        When CRAC airflow > rack airflow, excess cold air bypasses servers
        and returns directly to the CRAC at T_cold, lowering the effective
        return air temperature and thus CRAC cooling output.
        T_return = (1 − bypass) × T_hot + bypass × T_cold

    Args:
        dt: Integration step length in seconds.

    Returns:
        ThermalStepResult holding the (mutated in place) state, the ASHRAE
        alarms raised this step, total CRAC cooling output and power, and
        the facility energy consumed during the step in kWh.
    """
    state = self._state
    alarms: list[ThermalAlarm] = []
    total_cooling_output_kw = 0.0
    total_cooling_power_kw = 0.0
    total_power_kw = 0.0  # Accumulates IT load over all zones

    for zone in state.zones:
        # 1. Update CRAC supply temperatures (first-order lag toward setpoint)
        for crac in zone.crac_units:
            crac.update_supply_temp(dt)

        # 2. Airflow quantities
        q_it_w = zone.total_it_load_kw * 1000.0
        m_dot_rack = zone.total_rack_airflow_m3s * AIR_DENSITY_KG_M3  # kg/s
        m_dot_crac = zone.total_crac_airflow_m3s * AIR_DENSITY_KG_M3  # kg/s

        # Server temperature rise [°C]
        if m_dot_rack > 0:
            dt_server = q_it_w / (m_dot_rack * AIR_SPECIFIC_HEAT_J_KGK)
        else:
            dt_server = 50.0  # No airflow → extreme rise
        # Hot-aisle estimate based on the cold-aisle temp BEFORE this step's update
        t_hot = zone.cold_aisle_temp_c + dt_server

        # 3. Bypass fraction: excess CRAC airflow that bypasses servers
        if m_dot_crac > 0 and m_dot_rack > 0:
            bypass_frac = max(0.0, 1.0 - m_dot_rack / m_dot_crac)
        else:
            bypass_frac = 0.0

        # CRAC return air temp (mixed hot exhaust + bypassed cold air)
        t_return = (1.0 - bypass_frac) * t_hot + bypass_frac * zone.cold_aisle_temp_c

        # 4. CRAC cooling output (based on bypass-corrected return temp)
        # NOTE(review): q_cooling_extracted_w and zone_cooling_power_kw are
        # accumulated below but never read afterwards in this method.
        q_cooling_extracted_w = 0.0
        zone_cooling_power_kw = 0.0
        for crac in zone.crac_units:
            q_crac_kw = crac.compute_cooling_output_kw(t_return)
            q_cooling_extracted_w += q_crac_kw * 1000.0
            total_cooling_output_kw += q_crac_kw

            p_crac_kw = crac.compute_power_consumption_kw(q_crac_kw, state.outside_temp_c)
            zone_cooling_power_kw += p_crac_kw
            total_cooling_power_kw += p_crac_kw

        # 5. Cold aisle energy balance [all in Watts]

        # CRAC supply mixing: each CRAC injects air into the cold aisle.
        # Running CRACs inject air at their supply temp (near setpoint).
        # Compressor-faulted CRACs with fans running inject air at the
        # return air temp (air passes through the inactive coil unconditioned).
        q_crac_mixing_w = 0.0
        for crac in zone.crac_units:
            crac_flow = crac.current_airflow_m3s * AIR_DENSITY_KG_M3
            if crac_flow <= 0:
                continue  # Unit moves no air, so it contributes nothing to mixing
            if crac.fault_type in (CRACFaultType.COMPRESSOR, CRACFaultType.REFRIGERANT_LEAK):
                effective_supply = t_return  # No cooling — just recirculating
            else:
                effective_supply = crac.supply_temp_c
            q_crac_mixing_w += crac_flow * AIR_SPECIFIC_HEAT_J_KGK * (
                effective_supply - zone.cold_aisle_temp_c
            )

        # Hot air entering cold aisle from two mechanisms:
        #
        # (a) Containment recirculation: fraction r of air leaks through
        #     containment gaps regardless of CRAC flow balance.
        #     Uses max(m_dot_rack, m_dot_crac) — recirculation is driven
        #     by pressure differentials from whichever airflow is dominant.
        #     When CRACs are off, server fans still drive leakage.
        r = zone.recirculation_factor
        m_dot_dominant = max(m_dot_rack, m_dot_crac)
        q_recirc_w = r * m_dot_dominant * AIR_SPECIFIC_HEAT_J_KGK * dt_server

        # (b) Natural return: when CRAC airflow < rack airflow, servers
        #     exhaust more hot air than CRACs can capture. The uncaptured
        #     fraction returns to the cold aisle via natural convection.
        #     When CRACs are completely off, ALL server exhaust returns
        #     (= Q_IT returns to cold aisle as heat).
        if m_dot_rack > 0 and m_dot_crac < m_dot_rack:
            natural_return_frac = 1.0 - m_dot_crac / m_dot_rack
            q_natural_return_w = (
                natural_return_frac * m_dot_rack * AIR_SPECIFIC_HEAT_J_KGK * dt_server
            )
        else:
            q_natural_return_w = 0.0

        # Envelope heat gain (skipped when the envelope resistance is not positive)
        if zone.envelope_r_kw > 0:
            q_envelope_w = (state.outside_temp_c - zone.cold_aisle_temp_c) / zone.envelope_r_kw
        else:
            q_envelope_w = 0.0

        # Internal gains (UPS/PDU losses + lighting)
        q_ups_w = zone.total_it_load_kw * state.ups_loss_fraction * 1000.0
        q_pdu_w = zone.total_it_load_kw * state.pdu_loss_fraction * 1000.0
        # Lighting heat is split evenly across zones
        num_zones = len(state.zones) if state.zones else 1
        q_lighting_w = state.lighting_power_kw * 1000.0 / num_zones
        q_internal_w = q_ups_w + q_pdu_w + q_lighting_w

        # 6. Net heat into cold aisle [W]
        q_net_w = (
            q_crac_mixing_w + q_recirc_w + q_natural_return_w
            + q_envelope_w + q_internal_w
        )

        # 7. Forward Euler integration
        c_total = zone.compute_thermal_capacitance_jk()
        if c_total > 0:
            dT = q_net_w * dt / c_total
            zone.cold_aisle_temp_c += dT

        # 8. Update hot aisle (algebraic: T_hot = T_cold + server ΔT)
        if m_dot_rack > 0:
            zone.hot_aisle_temp_c = (
                zone.cold_aisle_temp_c
                + q_it_w / (m_dot_rack * AIR_SPECIFIC_HEAT_J_KGK)
            )
        else:
            zone.hot_aisle_temp_c = zone.cold_aisle_temp_c + 50.0

        # 9. Update individual rack inlet/outlet temperatures
        # (every rack in the zone sees the same cold-aisle inlet temperature)
        for rack in zone.racks:
            rack.inlet_temp_c = zone.cold_aisle_temp_c
            rack.outlet_temp_c = rack.compute_outlet_temp()

        # 10. Check ASHRAE alarms
        # Zones whose ashrae_class is not in ASHRAE_CLASSES raise no alarms.
        ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
        if ashrae:
            for rack in zone.racks:
                if rack.inlet_temp_c > ashrae.allowable_max_c:
                    alarms.append(ThermalAlarm(
                        rack_id=rack.rack_id,
                        zone_id=zone.zone_id,
                        inlet_temp_c=rack.inlet_temp_c,
                        threshold_c=ashrae.allowable_max_c,
                        severity="critical",
                    ))
                elif rack.inlet_temp_c > ashrae.recommended_max_c:
                    alarms.append(ThermalAlarm(
                        rack_id=rack.rack_id,
                        zone_id=zone.zone_id,
                        inlet_temp_c=rack.inlet_temp_c,
                        threshold_c=ashrae.recommended_max_c,
                        severity="warning",
                    ))

        total_power_kw += zone.total_it_load_kw

    # Energy consumed in this step [kWh]
    # Facility power = IT load + cooling power + UPS/PDU losses + lighting
    total_facility_kw = total_power_kw + total_cooling_power_kw + (
        total_power_kw * (state.ups_loss_fraction + state.pdu_loss_fraction)
        + state.lighting_power_kw
    )
    energy_kwh = total_facility_kw * dt / 3600.0

    return ThermalStepResult(
        state=state,
        alarms=alarms,
        total_cooling_output_kw=total_cooling_output_kw,
        total_cooling_power_kw=total_cooling_power_kw,
        energy_consumed_kwh=energy_kwh,
    )
|
| 402 |
+
|
| 403 |
+
@staticmethod
|
| 404 |
+
def _compute_weighted_supply_temp(zone: ZoneState) -> float | None:
|
| 405 |
+
"""Flow-weighted average of CRAC supply temperatures.
|
| 406 |
+
|
| 407 |
+
T_supply_weighted = Σ(T_supply_i × m_dot_i) / Σ(m_dot_i)
|
| 408 |
+
|
| 409 |
+
Returns None if no CRACs are producing airflow.
|
| 410 |
+
"""
|
| 411 |
+
total_flow = 0.0
|
| 412 |
+
weighted_temp = 0.0
|
| 413 |
+
for crac in zone.crac_units:
|
| 414 |
+
flow = crac.current_airflow_m3s
|
| 415 |
+
if flow > 0:
|
| 416 |
+
weighted_temp += crac.supply_temp_c * flow
|
| 417 |
+
total_flow += flow
|
| 418 |
+
|
| 419 |
+
if total_flow <= 0:
|
| 420 |
+
return None
|
| 421 |
+
return weighted_temp / total_flow
|
| 422 |
+
|
| 423 |
+
# ------------------------------------------------------------------
|
| 424 |
+
# Mutation helpers (used by action parser in later phases)
|
| 425 |
+
# ------------------------------------------------------------------
|
| 426 |
+
|
| 427 |
+
def set_crac_setpoint(self, unit_id: str, setpoint_c: float) -> bool:
|
| 428 |
+
"""Adjust a CRAC unit's supply air temperature setpoint. Returns success."""
|
| 429 |
+
crac = self._find_crac(unit_id)
|
| 430 |
+
if crac is None:
|
| 431 |
+
return False
|
| 432 |
+
crac.setpoint_c = setpoint_c
|
| 433 |
+
return True
|
| 434 |
+
|
| 435 |
+
def set_crac_fan_speed(self, unit_id: str, speed_pct: float) -> bool:
|
| 436 |
+
"""Set CRAC fan speed (0-100%). Returns success."""
|
| 437 |
+
crac = self._find_crac(unit_id)
|
| 438 |
+
if crac is None:
|
| 439 |
+
return False
|
| 440 |
+
crac.fan_speed_pct = max(0.0, min(100.0, speed_pct))
|
| 441 |
+
return True
|
| 442 |
+
|
| 443 |
+
def set_crac_status(self, unit_id: str, status: CRACStatus) -> bool:
|
| 444 |
+
"""Change CRAC operating status. Returns success."""
|
| 445 |
+
crac = self._find_crac(unit_id)
|
| 446 |
+
if crac is None:
|
| 447 |
+
return False
|
| 448 |
+
crac.status = status
|
| 449 |
+
return True
|
| 450 |
+
|
| 451 |
+
def inject_crac_fault(
    self, unit_id: str, fault_type: CRACFaultType
) -> bool:
    """Put a CRAC unit into the FAULT status with the given fault type.

    Returns:
        True if the unit exists and was updated, False otherwise.
    """
    unit = self._find_crac(unit_id)
    found = unit is not None
    if found:
        unit.status = CRACStatus.FAULT
        unit.fault_type = fault_type
    return found
|
| 461 |
+
|
| 462 |
+
def clear_crac_fault(self, unit_id: str) -> bool:
    """Reset a CRAC unit to RUNNING with no fault.

    The reset is unconditional: the unit's previous status is not checked.

    Returns:
        True if the unit exists and was reset, False otherwise.
    """
    unit = self._find_crac(unit_id)
    found = unit is not None
    if found:
        unit.status = CRACStatus.RUNNING
        unit.fault_type = CRACFaultType.NONE
    return found
|
| 470 |
+
|
| 471 |
+
def set_rack_load(self, rack_id: str, load_kw: float) -> bool:
    """Change a rack's IT load and rescale its airflow to match.

    The load is clamped to be non-negative. Airflow is recomputed as
    CFM-per-kW × load using the ratio configured for this rack, which is
    the same ratio used when the rack was first built. The default
    ``RackConfig`` ratio is used only when the rack has no entry in the
    simulation config.

    Args:
        rack_id: Identifier of the rack to modify.
        load_kw: New IT load in kW (negative values are treated as 0).

    Returns:
        True if the rack exists and was updated, False otherwise.
    """
    rack = self._find_rack(rack_id)
    if rack is None:
        return False
    rack.it_load_kw = max(0.0, load_kw)

    # Update airflow proportionally (servers spin fans with load), using
    # the rack's own configured ratio so a non-default CFM/kW survives
    # load changes.
    cfm_per_kw = next(
        (
            rack_cfg.airflow_cfm_per_kw
            for zone_cfg in self._config.zones
            for rack_cfg in zone_cfg.racks
            if rack_cfg.rack_id == rack_id
        ),
        None,
    )
    if cfm_per_kw is None:
        cfm_per_kw = RackConfig().airflow_cfm_per_kw
    rack.airflow_m3s = cfm_to_m3s(cfm_per_kw * rack.it_load_kw)
    return True
|
| 482 |
+
|
| 483 |
+
def set_outside_temp(self, temp_c: float) -> None:
|
| 484 |
+
"""Set outside temperature."""
|
| 485 |
+
self._state.outside_temp_c = temp_c
|
| 486 |
+
|
| 487 |
+
def _find_crac(self, unit_id: str) -> CRACState | None:
|
| 488 |
+
for zone in self._state.zones:
|
| 489 |
+
for crac in zone.crac_units:
|
| 490 |
+
if crac.unit_id == unit_id:
|
| 491 |
+
return crac
|
| 492 |
+
return None
|
| 493 |
+
|
| 494 |
+
def _find_rack(self, rack_id: str) -> RackState | None:
|
| 495 |
+
for zone in self._state.zones:
|
| 496 |
+
for rack in zone.racks:
|
| 497 |
+
if rack.rack_id == rack_id:
|
| 498 |
+
return rack
|
| 499 |
+
return None
|
| 500 |
+
|
| 501 |
+
def find_zone_for_crac(self, unit_id: str) -> ZoneState | None:
|
| 502 |
+
"""Find the zone containing a given CRAC unit."""
|
| 503 |
+
for zone in self._state.zones:
|
| 504 |
+
for crac in zone.crac_units:
|
| 505 |
+
if crac.unit_id == unit_id:
|
| 506 |
+
return zone
|
| 507 |
+
return None
|
| 508 |
+
|
| 509 |
+
def find_zone_for_rack(self, rack_id: str) -> ZoneState | None:
|
| 510 |
+
"""Find the zone containing a given rack."""
|
| 511 |
+
for zone in self._state.zones:
|
| 512 |
+
for rack in zone.racks:
|
| 513 |
+
if rack.rack_id == rack_id:
|
| 514 |
+
return zone
|
| 515 |
+
return None
|
simulation/types.py
ADDED
|
@@ -0,0 +1,598 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Runtime state dataclasses for the datacenter simulation.
|
| 9 |
+
|
| 10 |
+
These are plain dataclasses (not Pydantic) for performance on the simulation
|
| 11 |
+
hot path. Pydantic models are only used at the API boundary (models.py).
|
| 12 |
+
|
| 13 |
+
All values in SI units:
|
| 14 |
+
- Temperature: °C
|
| 15 |
+
- Power/Heat: kW (for readability; converted to W in physics calculations)
|
| 16 |
+
- Airflow: m³/s
|
| 17 |
+
- Thermal capacitance: J/K
|
| 18 |
+
- Thermal resistance: K/W
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
from dataclasses import dataclass, field
|
| 24 |
+
from enum import Enum
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class CRACStatus(Enum):
    """Operating status of a CRAC unit."""
    RUNNING = "running"
    STANDBY = "standby"
    FAULT = "fault"
    MAINTENANCE = "maintenance"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class CRACFaultType(Enum):
    """Kind of fault affecting a CRAC unit (NONE when the unit is healthy)."""
    NONE = "none"
    COMPRESSOR = "compressor"
    FAN = "fan"
    REFRIGERANT_LEAK = "refrigerant_leak"
    SENSOR = "sensor"
    ELECTRICAL = "electrical"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# ---------------------------------------------------------------------------
|
| 44 |
+
# Power subsystem enums
|
| 45 |
+
# ---------------------------------------------------------------------------
|
| 46 |
+
class UPSMode(Enum):
    """Operating modes of a UPS unit."""

    # Normal operation: AC→DC→AC with full protection.
    DOUBLE_CONVERSION = "double_conversion"
    # Reduced losses at the cost of a slower transfer.
    LINE_INTERACTIVE = "line_interactive"
    # Bypass with monitoring; minimal losses.
    ECO = "eco"
    # Manual bypass; no protection.
    BYPASS = "bypass"
    # Utility lost; the battery is discharging.
    ON_BATTERY = "on_battery"
    # UPS fault; load is on raw utility or dead.
    FAULT = "fault"
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class GeneratorState(Enum):
    """States of the diesel generator state machine."""

    # Not running.
    OFF = "off"
    # Programmed delay before cranking.
    START_DELAY = "start_delay"
    # Engine cranking.
    CRANKING = "cranking"
    # Warm-up period before load acceptance.
    WARMING = "warming"
    # Running and ready to accept load.
    READY = "ready"
    # Running under load.
    LOADED = "loaded"
    # Unloaded cool-down before shutdown.
    COOLDOWN = "cooldown"
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class ATSPosition(Enum):
    """Positions of the automatic transfer switch."""

    UTILITY = "utility"
    GENERATOR = "generator"
    # Mid-transfer: the load is momentarily interrupted.
    TRANSFERRING = "transferring"
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
@dataclass
class RackState:
    """Mutable per-rack simulation state (identity, load, temperatures, airflow)."""

    # Identity / placement
    rack_id: str
    row: str
    position: int

    # Current IT power draw in kW (treated as ≈ heat dissipation)
    it_load_kw: float

    # Cold-aisle (server intake) and hot-aisle (server exhaust) temperatures, °C
    inlet_temp_c: float
    outlet_temp_c: float

    # Total server-fan airflow through this rack, m³/s
    airflow_m3s: float

    # Equipment thermal capacitance, J/K
    thermal_mass_jk: float

    def compute_outlet_temp(self) -> float:
        """Return the exhaust temperature [°C] implied by Q = m_dot * c_p * dT.

        The rack's IT load (kW, converted to W) is assumed to be carried away
        entirely by the air stream. When the mass flow is zero or negative the
        energy balance has no finite answer, so a fixed +50 °C rise over the
        inlet temperature is returned as a danger signal instead.
        """
        # Function-scope import, as in the rest of this module's methods.
        from ..config import AIR_DENSITY_KG_M3, AIR_SPECIFIC_HEAT_J_KGK

        mass_flow_kg_s = self.airflow_m3s * AIR_DENSITY_KG_M3
        if mass_flow_kg_s <= 0:
            return self.inlet_temp_c + 50.0

        heat_w = self.it_load_kw * 1000.0  # kW → W
        temp_rise = heat_w / (mass_flow_kg_s * AIR_SPECIFIC_HEAT_J_KGK)
        return self.inlet_temp_c + temp_rise
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
@dataclass
class CRACState:
    """Mutable simulation state and rated specs of one CRAC/CRAH cooling unit."""

    unit_id: str

    # --- Operating status ---
    status: CRACStatus = CRACStatus.RUNNING
    fault_type: CRACFaultType = CRACFaultType.NONE

    # --- Setpoints and actuals ---
    setpoint_c: float = 18.0  # Desired supply air temperature
    supply_temp_c: float = 18.0  # Actual supply air temperature (lags setpoint)
    fan_speed_pct: float = 100.0  # 0-100

    # --- Rated specifications (from config, immutable during episode) ---
    max_airflow_m3s: float = 0.0  # At 100% fan speed
    rated_capacity_kw: float = 70.0  # At rated return temp
    rated_return_temp_c: float = 24.0  # Return temp for rated capacity
    capacity_slope_per_c: float = 0.03  # Fractional capacity change per °C
    fan_rated_power_kw: float = 5.0
    cop_rated: float = 3.5
    cop_degradation_per_c: float = 0.04
    supply_temp_lag_s: float = 30.0  # First-order lag time constant

    @property
    def current_airflow_m3s(self) -> float:
        """Airflow currently delivered [m³/s].

        Zero unless the unit is RUNNING and has no FAN fault; otherwise the
        rated maximum scaled linearly by the fan speed percentage.
        """
        fans_moving = (
            self.status == CRACStatus.RUNNING
            and self.fault_type != CRACFaultType.FAN
        )
        if not fans_moving:
            return 0.0
        return self.max_airflow_m3s * (self.fan_speed_pct / 100.0)

    def compute_cooling_output_kw(self, return_air_temp_c: float) -> float:
        """Return the cooling actually delivered [kW] for a given return-air temp.

        Two ceilings apply and the smaller one wins:

        * refrigeration capacity:
              Q_cap = Q_rated × [1 + α × (T_return - T_rated)], floored at 0
        * what the air stream can carry:
              Q_air = m_dot × c_p × (T_return - T_supply)

        The result is 0 when the unit is not RUNNING, has a COMPRESSOR or
        REFRIGERANT_LEAK fault, moves no air, or when the return air is not
        warmer than the supply air.
        """
        from ..config import AIR_DENSITY_KG_M3, AIR_SPECIFIC_HEAT_J_KGK

        no_refrigeration = self.fault_type in (
            CRACFaultType.COMPRESSOR,
            CRACFaultType.REFRIGERANT_LEAK,
        )
        if self.status != CRACStatus.RUNNING or no_refrigeration:
            return 0.0

        # Ceiling 1: refrigeration-cycle capacity, derated/uprated by return temp.
        offset_from_rated = return_air_temp_c - self.rated_return_temp_c
        capacity_limit_kw = self.rated_capacity_kw * (
            1.0 + self.capacity_slope_per_c * offset_from_rated
        )
        capacity_limit_kw = max(capacity_limit_kw, 0.0)

        # Ceiling 2: sensible heat the moving air can remove.
        mass_flow_kg_s = self.current_airflow_m3s * AIR_DENSITY_KG_M3
        if mass_flow_kg_s <= 0:
            return 0.0
        air_delta_t = return_air_temp_c - self.supply_temp_c
        if air_delta_t <= 0:
            return 0.0
        airflow_limit_kw = (
            mass_flow_kg_s * AIR_SPECIFIC_HEAT_J_KGK * air_delta_t / 1000.0
        )  # W → kW

        return min(capacity_limit_kw, airflow_limit_kw)

    def compute_power_consumption_kw(
        self, cooling_output_kw: float, outside_temp_c: float
    ) -> float:
        """Return the unit's electrical draw [kW]: fan power plus compressor power.

        * Fan: affinity law, P_fan = P_rated × (speed/100)³.
        * Compressor: Q_cooling / COP, where COP is scaled down linearly for
          each °C of outside temperature above 35 °C (never below 30% of rated).
          Compressor power is 0 with a COMPRESSOR or REFRIGERANT_LEAK fault, or
          when either COP or the cooling output is not positive.

        A unit that is not RUNNING draws nothing.
        """
        if self.status != CRACStatus.RUNNING:
            return 0.0

        # Fan power (power ∝ speed³)
        fan_kw = self.fan_rated_power_kw * ((self.fan_speed_pct / 100.0) ** 3)

        # Effective COP after hot-weather degradation
        effective_cop = self.cop_rated
        if outside_temp_c > 35.0:
            effective_cop *= max(
                0.3, 1.0 - self.cop_degradation_per_c * (outside_temp_c - 35.0)
            )

        compressor_out = self.fault_type in (
            CRACFaultType.COMPRESSOR,
            CRACFaultType.REFRIGERANT_LEAK,
        )
        if not compressor_out and effective_cop > 0 and cooling_output_kw > 0:
            compressor_kw = cooling_output_kw / effective_cop
        else:
            compressor_kw = 0.0

        return fan_kw + compressor_kw

    def update_supply_temp(self, dt_s: float) -> None:
        """Advance the supply temperature toward the setpoint by one time step.

        First-order lag with time constant τ = ``supply_temp_lag_s``:

            T_supply(t+dt) = T_supply(t) + (T_set - T_supply(t)) × (1 - e^(-dt/τ))

        which for small dt/τ is approximately T += (T_set - T) × dt/τ.

        The supply temperature is left untouched when the unit is not RUNNING
        or has a COMPRESSOR fault. A non-positive τ snaps it straight to the
        setpoint.
        """
        import math

        if self.status != CRACStatus.RUNNING:
            return
        if self.fault_type == CRACFaultType.COMPRESSOR:
            # No cooling available: this method does not move the supply temp.
            return
        if self.supply_temp_lag_s <= 0:
            self.supply_temp_c = self.setpoint_c
            return

        blend = 1.0 - math.exp(-dt_s / self.supply_temp_lag_s)
        self.supply_temp_c += (self.setpoint_c - self.supply_temp_c) * blend
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
@dataclass
class ZoneState:
    """Mutable simulation state of one thermal zone (a section of the datacenter)."""

    zone_id: str

    # Aisle temperatures, °C
    cold_aisle_temp_c: float = 20.0
    hot_aisle_temp_c: float = 35.0

    # Relative humidity as a fraction 0-1 (tracked, not yet fully modeled
    # psychrometrically)
    humidity_rh: float = 0.45

    # Containment / recirculation; 0 = perfect containment
    recirculation_factor: float = 0.08

    # Equipment housed in this zone
    racks: list[RackState] = field(default_factory=list)
    crac_units: list[CRACState] = field(default_factory=list)

    # Zone thermal properties
    air_volume_m3: float = 500.0
    envelope_r_kw: float = 0.02  # Thermal resistance to outside [K/W]

    # ASHRAE class for this zone
    ashrae_class: str = "A2"

    @property
    def total_it_load_kw(self) -> float:
        """Sum of the IT load [kW] of every rack in the zone."""
        return sum(rack.it_load_kw for rack in self.racks)

    @property
    def total_rack_airflow_m3s(self) -> float:
        """Sum of the server-fan airflow [m³/s] of every rack in the zone."""
        return sum(rack.airflow_m3s for rack in self.racks)

    @property
    def total_crac_airflow_m3s(self) -> float:
        """Sum of the airflow [m³/s] currently delivered by the zone's CRAC units."""
        return sum(unit.current_airflow_m3s for unit in self.crac_units)

    @property
    def max_inlet_temp_c(self) -> float:
        """Hottest rack inlet temperature, or the cold-aisle temp with no racks."""
        return max(
            (rack.inlet_temp_c for rack in self.racks),
            default=self.cold_aisle_temp_c,
        )

    def compute_thermal_capacitance_jk(self) -> float:
        """Return the zone's total thermal capacitance [J/K].

        C_total = C_air + C_equipment, where

            C_air       = ρ × V × c_p            (air filling the zone volume)
            C_equipment = Σ rack thermal masses
        """
        from ..config import AIR_DENSITY_KG_M3, AIR_SPECIFIC_HEAT_J_KGK

        equipment_jk = sum(rack.thermal_mass_jk for rack in self.racks)
        air_jk = AIR_DENSITY_KG_M3 * self.air_volume_m3 * AIR_SPECIFIC_HEAT_J_KGK
        return air_jk + equipment_jk
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
# ---------------------------------------------------------------------------
|
| 294 |
+
# Power subsystem state
|
| 295 |
+
# ---------------------------------------------------------------------------
|
| 296 |
+
@dataclass
class UPSState:
    """Mutable simulation state and rated specs of one UPS unit."""

    unit_id: str

    # --- Operating mode ---
    mode: UPSMode = UPSMode.DOUBLE_CONVERSION

    # --- Load ---
    input_power_kw: float = 0.0  # Power entering UPS from utility/generator
    output_power_kw: float = 0.0  # Power delivered to IT load
    load_fraction: float = 0.0  # output / rated_capacity (0-1)

    # --- Efficiency and losses ---
    efficiency: float = 0.97  # Current operating efficiency
    heat_output_kw: float = 0.0  # Waste heat = input - output

    # --- Battery ---
    battery_soc: float = 1.0  # State of charge (0-1)
    battery_power_kw: float = 0.0  # Positive = discharging, negative = charging
    battery_time_remaining_s: float = 0.0  # Estimated time at current draw

    # --- Rated specs (from config, immutable during episode) ---
    rated_capacity_kw: float = 500.0
    loss_c0: float = 0.013
    loss_c1: float = 0.006
    loss_c2: float = 0.011
    battery_capacity_kwh: float = 8.3
    battery_discharge_efficiency: float = 0.90
    battery_aging_factor: float = 0.85
    recharge_rate_kw: float = 5.0

    def compute_efficiency(self) -> float:
        """Return efficiency from the quadratic loss model.

            η(x) = x / (x + c_0 + c_1·x + c_2·x²),   x = load_fraction

        Below 1% load (x < 0.01), or if the denominator is not positive, the
        efficiency is reported as 0.0.
        """
        load = self.load_fraction
        if load < 0.01:
            return 0.0
        power_in_pu = load + self.loss_c0 + self.loss_c1 * load + self.loss_c2 * load * load
        if power_in_pu <= 0:
            return 0.0
        return load / power_in_pu

    def compute_losses_kw(self) -> float:
        """Return power lost in the UPS at the current load [kW].

            P_loss = P_output × (1/η - 1)

        When the efficiency is reported as 0 (see ``compute_efficiency``), the
        fixed no-load term ``rated_capacity_kw × loss_c0`` is returned instead.
        """
        efficiency_now = self.compute_efficiency()
        if efficiency_now <= 0:
            # No-load losses: just transformer/control board idle draw
            return self.rated_capacity_kw * self.loss_c0
        return self.output_power_kw * (1.0 / efficiency_now - 1.0)

    def compute_battery_time_remaining_s(self) -> float:
        """Estimate battery runtime [s] at the current discharge rate.

            t = (SOC × E_battery × η_discharge × aging) / P_discharge

        Returns ``inf`` when the battery is not discharging
        (``battery_power_kw`` ≤ 0).
        """
        discharge_kw = self.battery_power_kw
        if discharge_kw <= 0:
            return float("inf")
        usable_kwh = (
            self.battery_soc
            * self.battery_capacity_kwh
            * self.battery_discharge_efficiency
            * self.battery_aging_factor
        )
        return usable_kwh / discharge_kw * 3600.0  # hours → seconds
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
@dataclass
class PDUState:
    """Mutable simulation state and rated specs of a three-phase PDU."""

    pdu_id: str

    # Per-phase currents [A]
    phase_currents_a: list[float] = field(default_factory=lambda: [0.0, 0.0, 0.0])

    # --- Power ---
    input_power_kw: float = 0.0
    output_power_kw: float = 0.0
    heat_output_kw: float = 0.0  # Transformer losses

    # --- Utilization ---
    load_fraction: float = 0.0  # Of derated capacity
    phase_imbalance_pct: float = 0.0  # Max deviation from average phase current

    # --- Alarms ---
    breaker_tripped: bool = False
    overload: bool = False

    # --- Rated specs (from config) ---
    voltage_ll_v: float = 208.0
    max_current_per_phase_a: float = 24.0
    num_phases: int = 3
    breaker_rating_a: float = 20.0
    efficiency: float = 0.98
    continuous_derating: float = 0.80

    @property
    def nameplate_capacity_kw(self) -> float:
        """Nameplate capacity [kW]: P = √3 × V_LL × I_phase / 1000.

        ``num_phases`` does not enter this formula.
        """
        import math

        return math.sqrt(3) * self.voltage_ll_v * self.max_current_per_phase_a / 1000.0

    @property
    def derated_capacity_kw(self) -> float:
        """Nameplate capacity scaled by the continuous derating (NEC 80% rule)."""
        return self.nameplate_capacity_kw * self.continuous_derating

    def compute_phase_imbalance(self) -> float:
        """Return phase imbalance as a percentage deviation from the average.

            imbalance = max(|I_phase - I_avg|) / I_avg × 100

        Returns 0.0 when there are no phase readings or the average current
        is not positive (no load).
        """
        currents = self.phase_currents_a
        if not currents:
            return 0.0
        mean_a = sum(currents) / len(currents)
        if mean_a <= 0:
            return 0.0
        worst_dev_a = max(abs(amps - mean_a) for amps in currents)
        return worst_dev_a / mean_a * 100.0

    def compute_heat_output_kw(self) -> float:
        """Return PDU transformer losses [kW]: input power × (1 - efficiency)."""
        return self.input_power_kw * (1.0 - self.efficiency)
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
@dataclass
class GensetState:
    """Mutable simulation state and rated specs of a diesel standby generator."""

    gen_id: str

    # --- State machine ---
    state: GeneratorState = GeneratorState.OFF
    state_elapsed_s: float = 0.0  # Time in current state

    # --- Output ---
    output_power_kw: float = 0.0
    load_fraction: float = 0.0  # output / rated_capacity

    # --- Fuel ---
    fuel_level_liters: float = 2000.0
    fuel_consumption_lph: float = 0.0  # Current consumption rate

    # --- Timing specs (from config) ---
    rated_capacity_kw: float = 750.0
    start_delay_s: float = 4.0
    crank_time_s: float = 5.0
    warmup_time_s: float = 8.0
    cooldown_time_s: float = 300.0
    fuel_tank_liters: float = 2000.0
    consumption_lph_full: float = 180.0

    @property
    def is_available(self) -> bool:
        """True when the generator can carry load (state READY or LOADED)."""
        return self.state in (GeneratorState.READY, GeneratorState.LOADED)

    @property
    def fuel_remaining_hours(self) -> float:
        """Hours of fuel left at the current burn rate (``inf`` if not burning)."""
        burn_lph = self.fuel_consumption_lph
        if burn_lph <= 0:
            return float("inf")
        return self.fuel_level_liters / burn_lph

    def compute_fuel_consumption_lph(self) -> float:
        """Return the fuel burn rate [L/h] for the current state and load.

        No fuel is burned while OFF, in START_DELAY, or CRANKING. In every
        other state the rate is linear in load with a 10% idle floor:

            consumption = full × (0.1 + 0.9 × load_fraction)
        """
        not_burning = (
            GeneratorState.OFF,
            GeneratorState.CRANKING,
            GeneratorState.START_DELAY,
        )
        if self.state in not_burning:
            return 0.0
        return self.consumption_lph_full * (0.1 + 0.9 * self.load_fraction)
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
@dataclass
class ATSState:
    """Mutable simulation state and specs of an Automatic Transfer Switch."""

    ats_id: str

    # Current switch position and progress through an in-flight transfer
    position: ATSPosition = ATSPosition.UTILITY
    transfer_elapsed_ms: float = 0.0

    # Specs (from config)
    transfer_time_ms: float = 100.0
    retransfer_delay_s: float = 300.0

    # Retransfer-delay timer; counts up when utility is restored
    retransfer_timer_s: float = 0.0

    @property
    def load_powered(self) -> bool:
        """True unless the switch is mid-transfer (the only gap in load power)."""
        mid_transfer = self.position == ATSPosition.TRANSFERRING
        return not mid_transfer
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
@dataclass
class PowerState:
    """Aggregate of the power subsystem: UPS units, PDUs, generator, ATS, utility."""

    ups_units: list[UPSState] = field(default_factory=list)
    pdus: list[PDUState] = field(default_factory=list)
    generator: GensetState = field(default_factory=lambda: GensetState(gen_id="GEN-1"))
    ats: ATSState = field(default_factory=lambda: ATSState(ats_id="ATS-1"))

    # Utility feed
    utility_available: bool = True
    utility_voltage_v: float = 480.0

    @property
    def total_ups_loss_kw(self) -> float:
        """Sum of the waste heat [kW] reported by every UPS unit."""
        return sum(ups.heat_output_kw for ups in self.ups_units)

    @property
    def total_pdu_loss_kw(self) -> float:
        """Sum of the waste heat [kW] reported by every PDU."""
        return sum(pdu.heat_output_kw for pdu in self.pdus)

    @property
    def total_power_overhead_kw(self) -> float:
        """Total electrical overhead of power distribution: UPS + PDU losses."""
        return self.total_ups_loss_kw + self.total_pdu_loss_kw

    @property
    def on_generator(self) -> bool:
        """True when the ATS is switched to the generator."""
        return self.ats.position == ATSPosition.GENERATOR

    @property
    def power_available(self) -> bool:
        """Whether load-side power is available from any source.

        False during an ATS transfer gap. Otherwise it follows the source the
        ATS is connected to: utility availability on UTILITY, generator
        availability on GENERATOR, and False for any other position.
        """
        switch = self.ats
        if not switch.load_powered:
            return False
        if switch.position == ATSPosition.UTILITY:
            return self.utility_available
        if switch.position == ATSPosition.GENERATOR:
            return self.generator.is_available
        return False
|
| 541 |
+
|
| 542 |
+
|
| 543 |
+
@dataclass
class DatacenterState:
    """Top-level simulation state: zones, power subsystem, environment, clock."""

    zones: list[ZoneState] = field(default_factory=list)

    # Power subsystem (None = use stub loss fractions for backward compat)
    power: PowerState | None = None

    # Outdoor environment
    outside_temp_c: float = 35.0
    outside_humidity_rh: float = 0.40

    # Facility overhead: total lighting load
    lighting_power_kw: float = 5.0

    # Stub power-distribution losses as fractions of IT load; used only when
    # the power subsystem is not initialized
    ups_loss_fraction: float = 0.05
    pdu_loss_fraction: float = 0.02

    # Simulation clock
    sim_time_s: float = 0.0

    @property
    def total_it_load_kw(self) -> float:
        """Sum of the IT load [kW] across all zones."""
        return sum(zone.total_it_load_kw for zone in self.zones)

    @property
    def total_cooling_power_kw(self) -> float:
        """Electrical power [kW] drawn by every CRAC unit in every zone.

        Each unit's cooling output is evaluated against its zone's hot-aisle
        temperature, then converted to electrical draw at the current outside
        temperature.
        """
        cooling_kw = 0.0
        for zone in self.zones:
            return_temp_c = zone.hot_aisle_temp_c
            for unit in zone.crac_units:
                delivered_kw = unit.compute_cooling_output_kw(return_temp_c)
                cooling_kw += unit.compute_power_consumption_kw(
                    delivered_kw, self.outside_temp_c
                )
        return cooling_kw

    @property
    def pue(self) -> float:
        """Dynamic PUE = Total Facility Power / IT Power.

        Facility power is IT + cooling + distribution losses + lighting.
        Distribution losses come from the power subsystem when one is
        attached; otherwise from the stub loss fractions applied to IT load.
        Returns 1.0 when there is no positive IT load.
        """
        it_kw = self.total_it_load_kw
        if it_kw <= 0:
            return 1.0

        cooling_kw = self.total_cooling_power_kw

        if self.power is None:
            distribution_kw = it_kw * (self.ups_loss_fraction + self.pdu_loss_fraction)
        else:
            distribution_kw = self.power.total_power_overhead_kw

        facility_kw = it_kw + cooling_kw + distribution_kw + self.lighting_power_kw
        return facility_kw / it_kw
|
tests/__init__.py
ADDED
|
File without changes
|
tests/test_environment.py
ADDED
|
@@ -0,0 +1,439 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Tests for the DC-Ops environment, action parser, and dashboard renderer.
|
| 8 |
+
|
| 9 |
+
Validates:
|
| 10 |
+
- OpenEnv interface contract (reset/step/state)
|
| 11 |
+
- Action parsing (valid and invalid commands)
|
| 12 |
+
- Dashboard rendering output format
|
| 13 |
+
- Episode termination conditions
|
| 14 |
+
- Fault injection
|
| 15 |
+
- Reward computation
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
|
| 20 |
+
import pytest
|
| 21 |
+
|
| 22 |
+
from dc_ops_env.models import DcOpsAction, DcOpsObservation
|
| 23 |
+
from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# ===========================================================================
|
| 27 |
+
# OpenEnv Interface Contract
|
| 28 |
+
# ===========================================================================
|
| 29 |
+
class TestOpenEnvContract:
    """Verify the environment satisfies the OpenEnv Environment ABC."""

    def test_reset_returns_observation(self) -> None:
        """reset() yields a non-terminal observation with zero reward."""
        env = DcOpsEnvironment()
        obs = env.reset()
        assert isinstance(obs, DcOpsObservation)
        assert obs.done is False
        assert obs.reward == 0.0

    def test_reset_has_dashboard(self) -> None:
        """The initial observation carries a rendered dashboard."""
        env = DcOpsEnvironment()
        obs = env.reset()
        assert len(obs.dashboard) > 100
        assert "DC-OPS MONITORING DASHBOARD" in obs.dashboard

    def test_reset_has_available_actions(self) -> None:
        """The initial observation advertises more than five actions."""
        env = DcOpsEnvironment()
        obs = env.reset()
        assert len(obs.available_actions) > 5

    def test_step_returns_observation(self) -> None:
        """step() yields a non-terminal observation for a benign command."""
        env = DcOpsEnvironment()
        env.reset()
        obs = env.step(DcOpsAction(command="check_status"))
        assert isinstance(obs, DcOpsObservation)
        assert obs.done is False

    def test_step_advances_step_count(self) -> None:
        """Each step() call increments state.step_count by one."""
        env = DcOpsEnvironment()
        env.reset()
        assert env.state.step_count == 0
        env.step(DcOpsAction(command="wait"))
        assert env.state.step_count == 1
        env.step(DcOpsAction(command="wait"))
        assert env.state.step_count == 2

    def test_state_has_episode_id(self) -> None:
        """After reset() the state exposes a non-empty episode id."""
        env = DcOpsEnvironment()
        env.reset()
        assert env.state.episode_id is not None
        assert len(env.state.episode_id) > 0

    def test_reset_changes_episode_id(self) -> None:
        """Two consecutive resets produce different episode ids."""
        env = DcOpsEnvironment()
        # Only the episode ids matter here; the returned observations are
        # intentionally not bound to names.
        env.reset()
        first_episode = env.state.episode_id
        env.reset()
        second_episode = env.state.episode_id
        assert first_episode != second_episode

    def test_observation_metadata_populated(self) -> None:
        """Metadata includes IT load, PUE and zones; load matches the default."""
        env = DcOpsEnvironment()
        obs = env.reset()
        assert "total_it_load_kw" in obs.metadata
        assert "pue" in obs.metadata
        assert "zones" in obs.metadata
        assert obs.metadata["total_it_load_kw"] == pytest.approx(160.0, rel=0.01)

    def test_observation_has_power_metadata(self) -> None:
        """Metadata includes a power section reporting utility as available."""
        env = DcOpsEnvironment()
        obs = env.reset()
        assert "power" in obs.metadata
        assert obs.metadata["power"]["utility_available"] is True
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# ===========================================================================
|
| 96 |
+
# Action Parser Tests
|
| 97 |
+
# ===========================================================================
|
| 98 |
+
class TestActionParser:
|
| 99 |
+
"""Test command parsing and execution."""
|
| 100 |
+
|
| 101 |
+
def test_diagnose_crac(self) -> None:
|
| 102 |
+
env = DcOpsEnvironment()
|
| 103 |
+
env.reset()
|
| 104 |
+
obs = env.step(DcOpsAction(command="diagnose CRAC-1"))
|
| 105 |
+
assert "Diagnostic Report" in obs.action_result
|
| 106 |
+
assert "CRAC-1" in obs.action_result
|
| 107 |
+
assert obs.reward > -0.5 # Valid action should not be heavily penalized
|
| 108 |
+
|
| 109 |
+
def test_diagnose_ups(self) -> None:
|
| 110 |
+
env = DcOpsEnvironment()
|
| 111 |
+
env.reset()
|
| 112 |
+
obs = env.step(DcOpsAction(command="diagnose UPS-1"))
|
| 113 |
+
assert "Diagnostic Report" in obs.action_result
|
| 114 |
+
assert "UPS-1" in obs.action_result
|
| 115 |
+
|
| 116 |
+
def test_diagnose_nonexistent(self) -> None:
|
| 117 |
+
env = DcOpsEnvironment()
|
| 118 |
+
env.reset()
|
| 119 |
+
obs = env.step(DcOpsAction(command="diagnose CRAC-99"))
|
| 120 |
+
assert "not found" in obs.action_result
|
| 121 |
+
|
| 122 |
+
def test_adjust_setpoint_valid(self) -> None:
|
| 123 |
+
env = DcOpsEnvironment()
|
| 124 |
+
env.reset()
|
| 125 |
+
obs = env.step(DcOpsAction(command="adjust_setpoint CRAC-1 22"))
|
| 126 |
+
assert "adjusted" in obs.action_result.lower()
|
| 127 |
+
assert "22.0" in obs.action_result
|
| 128 |
+
|
| 129 |
+
def test_adjust_setpoint_out_of_range(self) -> None:
|
| 130 |
+
env = DcOpsEnvironment()
|
| 131 |
+
env.reset()
|
| 132 |
+
obs = env.step(DcOpsAction(command="adjust_setpoint CRAC-1 50"))
|
| 133 |
+
assert "out of safe range" in obs.action_result.lower() or "out of" in obs.action_result.lower()
|
| 134 |
+
|
| 135 |
+
def test_set_fan_speed(self) -> None:
|
| 136 |
+
env = DcOpsEnvironment()
|
| 137 |
+
env.reset()
|
| 138 |
+
obs = env.step(DcOpsAction(command="set_fan_speed CRAC-2 80"))
|
| 139 |
+
assert "fan speed" in obs.action_result.lower()
|
| 140 |
+
assert "80" in obs.action_result
|
| 141 |
+
|
| 142 |
+
def test_set_rack_load(self) -> None:
|
| 143 |
+
env = DcOpsEnvironment()
|
| 144 |
+
env.reset()
|
| 145 |
+
obs = env.step(DcOpsAction(command="set_rack_load A-01 12"))
|
| 146 |
+
assert "12.0" in obs.action_result
|
| 147 |
+
|
| 148 |
+
def test_start_generator(self) -> None:
|
| 149 |
+
env = DcOpsEnvironment()
|
| 150 |
+
env.reset()
|
| 151 |
+
obs = env.step(DcOpsAction(command="start_generator"))
|
| 152 |
+
assert "generator" in obs.action_result.lower()
|
| 153 |
+
|
| 154 |
+
def test_wait_command(self) -> None:
|
| 155 |
+
env = DcOpsEnvironment()
|
| 156 |
+
env.reset()
|
| 157 |
+
obs = env.step(DcOpsAction(command="wait"))
|
| 158 |
+
assert "no action" in obs.action_result.lower()
|
| 159 |
+
|
| 160 |
+
def test_check_status(self) -> None:
    """The check_status command produces a result mentioning "status"."""
    dc_env = DcOpsEnvironment()
    dc_env.reset()
    action = DcOpsAction(command="check_status")
    message = dc_env.step(action).action_result.lower()
    assert "status" in message
|
| 165 |
+
|
| 166 |
+
def test_invalid_command(self) -> None:
    """An unrecognised command is reported as "unknown" (case-insensitive)."""
    dc_env = DcOpsEnvironment()
    dc_env.reset()
    action = DcOpsAction(command="fly_to_the_moon")
    message = dc_env.step(action).action_result.lower()
    assert "unknown" in message
|
| 171 |
+
|
| 172 |
+
def test_empty_command(self) -> None:
    """An empty command string is reported as "empty" (case-insensitive)."""
    dc_env = DcOpsEnvironment()
    dc_env.reset()
    action = DcOpsAction(command="")
    message = dc_env.step(action).action_result.lower()
    assert "empty" in message
|
| 177 |
+
|
| 178 |
+
def test_case_insensitive(self) -> None:
    """An upper-case DIAGNOSE verb still yields a "Diagnostic Report"."""
    dc_env = DcOpsEnvironment()
    dc_env.reset()
    action = DcOpsAction(command="DIAGNOSE CRAC-1")
    result = dc_env.step(action)
    assert "Diagnostic Report" in result.action_result
|
| 183 |
+
|
| 184 |
+
def test_start_stop_crac(self) -> None:
    """Stopping CRAC-2 reports "standby"; starting it again reports "started"."""
    dc_env = DcOpsEnvironment()
    dc_env.reset()

    stopped = dc_env.step(DcOpsAction(command="stop_crac CRAC-2"))
    assert "standby" in stopped.action_result.lower()

    restarted = dc_env.step(DcOpsAction(command="start_crac CRAC-2"))
    assert "started" in restarted.action_result.lower()
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
# ===========================================================================
|
| 195 |
+
# Dashboard Rendering Tests
|
| 196 |
+
# ===========================================================================
|
| 197 |
+
class TestDashboardRendering:
    """Checks on the text dashboard carried by each observation."""

    def test_dashboard_has_cooling_section(self) -> None:
        """The reset dashboard has a cooling section that lists CRAC-1."""
        dc_env = DcOpsEnvironment()
        dashboard = dc_env.reset().dashboard
        assert "COOLING UNITS" in dashboard
        assert "CRAC-1" in dashboard

    def test_dashboard_has_zone_temps(self) -> None:
        """The reset dashboard has a zone temperature section with zone_a."""
        dc_env = DcOpsEnvironment()
        dashboard = dc_env.reset().dashboard
        assert "ZONE TEMPERATURES" in dashboard
        assert "zone_a" in dashboard

    def test_dashboard_has_rack_temps(self) -> None:
        """The reset dashboard has a rack temperature section."""
        dc_env = DcOpsEnvironment()
        initial = dc_env.reset()
        assert "RACK TEMPERATURES" in initial.dashboard

    def test_dashboard_has_power(self) -> None:
        """The reset dashboard mentions both POWER and PUE."""
        dc_env = DcOpsEnvironment()
        dashboard = dc_env.reset().dashboard
        assert "POWER" in dashboard
        assert "PUE" in dashboard

    def test_dashboard_has_environment(self) -> None:
        """The reset dashboard has an environment section showing 35.0°C."""
        dc_env = DcOpsEnvironment()
        dashboard = dc_env.reset().dashboard
        assert "ENVIRONMENT" in dashboard
        assert "35.0°C" in dashboard

    def test_dashboard_shows_alert(self) -> None:
        """An alert passed to reset() is rendered on the dashboard."""
        dc_env = DcOpsEnvironment()
        dashboard = dc_env.reset(alert="Test alert message").dashboard
        assert "ALERT" in dashboard
        assert "Test alert message" in dashboard

    def test_dashboard_shows_step_count(self) -> None:
        """The step counter reads 0/15 after reset and 1/15 after one step."""
        dc_env = DcOpsEnvironment()
        at_reset = dc_env.reset()
        assert "Step: 0/15" in at_reset.dashboard

        after_one = dc_env.step(DcOpsAction(command="wait"))
        assert "Step: 1/15" in after_one.dashboard

    def test_dashboard_shows_ups_status(self) -> None:
        """Both UPS-1 and UPS-2 appear on the reset dashboard."""
        dc_env = DcOpsEnvironment()
        dashboard = dc_env.reset().dashboard
        assert "UPS-1" in dashboard
        assert "UPS-2" in dashboard
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
# ===========================================================================
|
| 251 |
+
# Episode Termination Tests
|
| 252 |
+
# ===========================================================================
|
| 253 |
+
class TestEpisodeTermination:
    """Checks on the conditions under which an episode reports done."""

    def test_step_budget_exhaustion(self) -> None:
        """With step_budget=3 the episode is done only on the third step."""
        dc_env = DcOpsEnvironment()
        dc_env.reset(step_budget=3)

        # Steps 1 and 2 stay open; step 3 uses up the budget (3/3).
        for expected_done in (False, False, True):
            result = dc_env.step(DcOpsAction(command="wait"))
            assert result.done is expected_done

    def test_escalation_terminates(self) -> None:
        """Escalating ends the episode with a negative reward."""
        dc_env = DcOpsEnvironment()
        dc_env.reset()
        result = dc_env.step(DcOpsAction(command="escalate"))
        assert result.done is True
        # Escalating is penalised.
        assert result.reward < 0

    def test_step_after_done_is_noop(self) -> None:
        """A step after termination stays done and reports "already ended"."""
        dc_env = DcOpsEnvironment()
        dc_env.reset()
        terminal = dc_env.step(DcOpsAction(command="escalate"))
        assert terminal.done is True

        follow_up = dc_env.step(DcOpsAction(command="wait"))
        assert follow_up.done is True
        assert "already ended" in follow_up.action_result.lower()
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
# ===========================================================================
|
| 288 |
+
# Fault Injection Tests
|
| 289 |
+
# ===========================================================================
|
| 290 |
+
class TestFaultInjection:
    """Checks on faults and alerts supplied through reset()."""

    def test_crac_fault_injection(self) -> None:
        """A compressor fault injected on CRAC-3 is visible on the dashboard."""
        fault = {
            "type": "crac_fault",
            "unit_id": "CRAC-3",
            "fault": "compressor",
        }
        dc_env = DcOpsEnvironment()
        dashboard = dc_env.reset(fault_injection=fault).dashboard
        # Either marker counts as the dashboard showing the fault.
        assert any(marker in dashboard for marker in ("COMPRESSOR", "FAULT"))

    def test_utility_loss_injection(self) -> None:
        """A utility loss shows up on the dashboard as DOWN or BATTERY."""
        dc_env = DcOpsEnvironment()
        dashboard = dc_env.reset(fault_injection={"type": "utility_loss"}).dashboard
        assert any(marker in dashboard for marker in ("DOWN", "BATTERY"))

    def test_outside_temp_injection(self) -> None:
        """An injected outside temperature of 45.0 is rendered as "45.0°C"."""
        fault = {"type": "outside_temp", "temp_c": 45.0}
        dc_env = DcOpsEnvironment()
        initial = dc_env.reset(fault_injection=fault)
        assert "45.0°C" in initial.dashboard

    def test_alert_in_observation(self) -> None:
        """The alert and scenario_type given to reset() are echoed back."""
        dc_env = DcOpsEnvironment()
        initial = dc_env.reset(
            alert="HIGH TEMPERATURE in Zone B",
            scenario_type="thermal",
        )
        assert initial.alert == "HIGH TEMPERATURE in Zone B"
        assert initial.scenario_type == "thermal"
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
# ===========================================================================
|
| 330 |
+
# Reward Tests
|
| 331 |
+
# ===========================================================================
|
| 332 |
+
class TestReward:
    """Checks on the reward values and reward metadata of observations."""

    def test_valid_action_positive_component(self) -> None:
        """A valid first action earns more reward than an invalid one."""
        dc_env = DcOpsEnvironment()

        dc_env.reset()
        reward_valid = dc_env.step(DcOpsAction(command="check_status")).reward

        dc_env.reset()
        reward_invalid = dc_env.step(DcOpsAction(command="nonsense_command")).reward

        # The valid action must come out ahead of the invalid one.
        assert reward_valid > reward_invalid

    def test_pue_affects_reward(self) -> None:
        """PUE is exposed in the reset metadata and is greater than 1."""
        dc_env = DcOpsEnvironment()
        initial = dc_env.reset()
        # Only the presence and plausibility of PUE is verified here.
        assert initial.metadata["pue"] > 1.0

    def test_cumulative_reward_tracked(self) -> None:
        """Step metadata carries a cumulative_reward entry."""
        dc_env = DcOpsEnvironment()
        dc_env.reset()

        first = dc_env.step(DcOpsAction(command="wait"))
        assert "cumulative_reward" in first.metadata
        total_after_one = first.metadata["cumulative_reward"]

        second = dc_env.step(DcOpsAction(command="wait"))
        total_after_two = second.metadata["cumulative_reward"]

        # At least one of the two running totals has to be non-zero.
        assert total_after_two != 0 or total_after_one != 0
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
# ===========================================================================
|
| 372 |
+
# Simulation Integration Tests
|
| 373 |
+
# ===========================================================================
|
| 374 |
+
class TestSimulationIntegration:
    """Checks that stepping the environment moves the simulation forward."""

    def test_simulation_time_advances(self) -> None:
        """One step with default settings advances sim time by about 60 s."""
        dc_env = DcOpsEnvironment()
        time_before = dc_env.reset().metadata["sim_time_s"]
        time_after = dc_env.step(DcOpsAction(command="wait")).metadata["sim_time_s"]

        # Default: 60s per step
        assert time_after - time_before == pytest.approx(60.0, rel=0.01)

    def test_custom_game_time_per_step(self) -> None:
        """With game_time_per_step_s=120.0 one step advances about 120 s."""
        dc_env = DcOpsEnvironment()
        time_before = dc_env.reset(game_time_per_step_s=120.0).metadata["sim_time_s"]
        time_after = dc_env.step(DcOpsAction(command="wait")).metadata["sim_time_s"]

        assert time_after - time_before == pytest.approx(120.0, rel=0.01)

    def test_setpoint_change_affects_temperature(self) -> None:
        """Raising two CRAC setpoints to 25 warms zone_a's cold aisle."""
        dc_env = DcOpsEnvironment()
        latest = dc_env.reset()
        cold_before = latest.metadata["zones"]["zone_a"]["cold_aisle_temp_c"]

        # Raise the setpoint significantly on CRAC-1 and CRAC-2.
        for unit in ("CRAC-1", "CRAC-2"):
            dc_env.step(DcOpsAction(command=f"adjust_setpoint {unit} 25"))

        # Let a few steps pass so the temperature can respond.
        for _ in range(3):
            latest = dc_env.step(DcOpsAction(command="wait"))

        cold_after = latest.metadata["zones"]["zone_a"]["cold_aisle_temp_c"]

        # The cold aisle must have warmed by more than 0.5.
        failure_note = f"Expected temp increase: {cold_before:.1f} → {cold_after:.1f}"
        assert cold_after > cold_before + 0.5, failure_note
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
# ===========================================================================
|
| 422 |
+
# Performance Test
|
| 423 |
+
# ===========================================================================
|
| 424 |
+
class TestPerformance:
    """Wall-clock guard on a full environment episode."""

    def test_episode_performance(self) -> None:
        """A reset plus 15 wait steps must finish in under 5 seconds."""
        import time

        dc_env = DcOpsEnvironment()

        # Construction is excluded from the timing; reset and steps are not.
        began = time.perf_counter()
        dc_env.reset()
        for _ in range(15):
            dc_env.step(DcOpsAction(command="wait"))
        elapsed = time.perf_counter() - began

        assert elapsed < 5.0, f"Episode took {elapsed:.2f}s, should be < 5s"
|
tests/test_integration.py
ADDED
|
@@ -0,0 +1,535 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Integration tests: full episode playback, config loading, cross-facility.
|
| 8 |
+
|
| 9 |
+
Validates:
|
| 10 |
+
- Known-good action sequences resolve each scenario
|
| 11 |
+
- Reward signals are well-behaved across full episodes
|
| 12 |
+
- YAML config loading produces valid, runnable environments
|
| 13 |
+
- Different facility sizes work correctly
|
| 14 |
+
- Episode metrics (PUE, temps, rewards) are in expected ranges
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import time
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
|
| 22 |
+
import pytest
|
| 23 |
+
|
| 24 |
+
from dc_ops_env.config import (
|
| 25 |
+
BUILTIN_CONFIGS,
|
| 26 |
+
DatacenterConfig,
|
| 27 |
+
load_datacenter_config,
|
| 28 |
+
make_default_datacenter_config,
|
| 29 |
+
)
|
| 30 |
+
from dc_ops_env.models import DcOpsAction, DcOpsObservation
|
| 31 |
+
from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
|
| 32 |
+
from dc_ops_env.scenarios.registry import registered_scenario_ids
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ===========================================================================
|
| 36 |
+
# Config Loading Tests
|
| 37 |
+
# ===========================================================================
|
| 38 |
+
class TestConfigLoading:
    """Checks on loading datacenter configs by name and by path."""

    def test_builtin_configs_exist(self) -> None:
        """Every path registered in BUILTIN_CONFIGS exists on disk."""
        for name, path in BUILTIN_CONFIGS.items():
            assert path.exists(), f"Built-in config '{name}' not found at {path}"

    @pytest.mark.parametrize("config_name", ["default", "small", "large"])
    def test_load_builtin(self, config_name: str) -> None:
        """Each built-in config loads and has populated zones."""
        loaded = load_datacenter_config(config_name)
        assert isinstance(loaded, DatacenterConfig)
        assert len(loaded.zones) > 0
        # Every zone needs at least one rack and one CRAC unit.
        for zone in loaded.zones:
            assert len(zone.racks) > 0
            assert len(zone.crac_units) > 0

    def test_load_by_path(self) -> None:
        """Loading the default config through its path yields the same facility."""
        default_path = BUILTIN_CONFIGS["default"]
        loaded = load_datacenter_config(default_path)
        assert loaded.name == "DC-OPS Default Facility"

    def test_load_nonexistent_raises(self) -> None:
        """A path that does not exist raises FileNotFoundError."""
        with pytest.raises(FileNotFoundError):
            load_datacenter_config("/nonexistent/path.yaml")

    def test_default_yaml_matches_programmatic(self) -> None:
        """The YAML default agrees with make_default_datacenter_config()."""
        from_yaml = load_datacenter_config("default")
        from_code = make_default_datacenter_config()

        assert from_yaml.name == from_code.name
        assert len(from_yaml.zones) == len(from_code.zones)
        assert from_yaml.outside_temp_c == from_code.outside_temp_c

        # Rack totals must agree ...
        rack_total_yaml = sum(len(zone.racks) for zone in from_yaml.zones)
        rack_total_code = sum(len(zone.racks) for zone in from_code.zones)
        assert rack_total_yaml == rack_total_code

        # ... and so must CRAC totals.
        crac_total_yaml = sum(len(zone.crac_units) for zone in from_yaml.zones)
        crac_total_code = sum(len(zone.crac_units) for zone in from_code.zones)
        assert crac_total_yaml == crac_total_code

    def test_small_facility_dimensions(self) -> None:
        """The small facility has 1 zone, 10 racks, 80 kW of IT load and 1 UPS."""
        loaded = load_datacenter_config("small")
        assert len(loaded.zones) == 1
        rack_count = sum(len(zone.racks) for zone in loaded.zones)
        assert rack_count == 10
        it_load_kw = sum(rack.it_load_kw for zone in loaded.zones for rack in zone.racks)
        assert it_load_kw == pytest.approx(80.0)
        assert len(loaded.power.ups_units) == 1

    def test_large_facility_dimensions(self) -> None:
        """The large facility has 4 zones, 60 racks, 600 kW of IT load and 4 UPS."""
        loaded = load_datacenter_config("large")
        assert len(loaded.zones) == 4
        rack_count = sum(len(zone.racks) for zone in loaded.zones)
        assert rack_count == 60
        it_load_kw = sum(rack.it_load_kw for zone in loaded.zones for rack in zone.racks)
        assert it_load_kw == pytest.approx(600.0)
        assert len(loaded.power.ups_units) == 4

    def test_large_facility_has_h1_zone(self) -> None:
        """The large facility has exactly one H1 zone, with 20 kW racks."""
        loaded = load_datacenter_config("large")
        high_density = [zone for zone in loaded.zones if zone.ashrae_class == "H1"]
        assert len(high_density) == 1
        # Every rack in the H1 zone carries the higher per-rack load.
        for rack in high_density[0].racks:
            assert rack.it_load_kw == 20.0
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# ===========================================================================
|
| 116 |
+
# Config-to-Environment Tests
|
| 117 |
+
# ===========================================================================
|
| 118 |
+
class TestConfigToEnvironment:
    """Checks that loaded configs can drive a DcOpsEnvironment."""

    @pytest.mark.parametrize("config_name", ["default", "small", "large"])
    def test_env_runs_with_config(self, config_name: str) -> None:
        """Each built-in config resets cleanly and accepts one step."""
        dc_env = DcOpsEnvironment()
        initial = dc_env.reset(config=load_datacenter_config(config_name))
        assert isinstance(initial, DcOpsObservation)
        assert initial.done is False

        stepped = dc_env.step(DcOpsAction(command="check_status"))
        assert isinstance(stepped, DcOpsObservation)

    def test_small_facility_pue(self) -> None:
        """The small facility's PUE at reset lies strictly between 1.1 and 2.5."""
        dc_env = DcOpsEnvironment()
        initial = dc_env.reset(config=load_datacenter_config("small"))
        pue = initial.metadata["pue"]
        assert 1.1 < pue < 2.5, f"PUE {pue} out of realistic range"

    def test_large_facility_total_load(self) -> None:
        """The large facility reports about 600 kW of total IT load at reset."""
        dc_env = DcOpsEnvironment()
        initial = dc_env.reset(config=load_datacenter_config("large"))
        reported_kw = initial.metadata["total_it_load_kw"]
        assert reported_kw == pytest.approx(600.0, rel=0.01)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
# ===========================================================================
|
| 151 |
+
# Full Episode Playback: Thermal Scenarios
|
| 152 |
+
# ===========================================================================
|
| 153 |
+
class TestEpisodePlaybackThermal:
    """Scripted action sequences played against the thermal scenarios."""

    def test_a1_optimal_episode(self) -> None:
        """A1 (Cooling Setpoint Optimization): raising setpoints lowers PUE.

        Sequence: check_status, set all four CRACs to 24, then wait up to
        five steps. The final PUE must be below the PUE seen at reset.
        """
        dc_env = DcOpsEnvironment()
        latest = dc_env.reset(scenario="A1")
        pue_at_reset = latest.metadata["pue"]

        # Status check first (procedure bonus).
        latest = dc_env.step(DcOpsAction(command="check_status"))
        assert not latest.done

        # Raise the setpoint on CRAC-1 .. CRAC-4 to 24.
        for unit_number in range(1, 5):
            latest = dc_env.step(
                DcOpsAction(command=f"adjust_setpoint CRAC-{unit_number} 24")
            )

        # Give the temperatures time to converge.
        for _ in range(5):
            latest = dc_env.step(DcOpsAction(command="wait"))
            if latest.done:
                break

        pue_at_end = latest.metadata["pue"]
        # Lower PUE is better.
        assert pue_at_end < pue_at_reset, (
            f"PUE should improve: {pue_at_reset:.2f} → {pue_at_end:.2f}"
        )

    def test_a2_optimal_episode(self) -> None:
        """A2 (Thermal Event Response): diagnose CRAC-3, lean on the others.

        Sequence: diagnose CRAC-3, set the other three fans to 100, set
        their setpoints to 16, then wait up to eight steps.
        """
        other_units = ("CRAC-1", "CRAC-2", "CRAC-4")
        dc_env = DcOpsEnvironment()
        latest = dc_env.reset(scenario="A2")

        # The diagnosis of CRAC-3 must mention the compressor.
        latest = dc_env.step(DcOpsAction(command="diagnose CRAC-3"))
        report = latest.action_result
        assert "COMPRESSOR" in report or "compressor" in report.lower()

        # All fan changes come first ...
        for unit in other_units:
            latest = dc_env.step(DcOpsAction(command=f"set_fan_speed {unit} 100"))

        # ... followed by the setpoint changes on the same units.
        for unit in other_units:
            latest = dc_env.step(DcOpsAction(command=f"adjust_setpoint {unit} 16"))

        # Wait for stabilization.
        for _ in range(8):
            latest = dc_env.step(DcOpsAction(command="wait"))
            if latest.done:
                break

        # Full resolution is not required; the running total only has to
        # stay above -5.
        assert latest.metadata["cumulative_reward"] > -5.0

    def test_a4_episode_with_load_shedding(self) -> None:
        """A4 (CRAC Failure Cascade): diagnose, compensate, then shed load.

        Described in the original test as the hardest thermal scenario,
        with two CRACs down.
        """
        scripted_commands = [
            # Diagnose both failed units.
            "diagnose CRAC-1",
            "diagnose CRAC-3",
            # Push CRAC-2 and CRAC-4 to the maximum.
            "set_fan_speed CRAC-2 100",
            "set_fan_speed CRAC-4 100",
            "adjust_setpoint CRAC-2 15",
            "adjust_setpoint CRAC-4 15",
        ]
        # Shed load on the hottest racks.
        scripted_commands.extend(
            f"set_rack_load {rack_id} 4" for rack_id in ("A-01", "A-02", "B-01", "B-02")
        )

        dc_env = DcOpsEnvironment()
        latest = dc_env.reset(scenario="A4")
        for command in scripted_commands:
            latest = dc_env.step(DcOpsAction(command=command))

        # Wait and monitor.
        for _ in range(10):
            latest = dc_env.step(DcOpsAction(command="wait"))
            if latest.done:
                break

        # Resolution is optional here; the running total must stay above -10.
        assert latest.metadata["cumulative_reward"] > -10.0
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
# ===========================================================================
|
| 249 |
+
# Full Episode Playback: Power Scenarios
|
| 250 |
+
# ===========================================================================
|
| 251 |
+
class TestEpisodePlaybackPower:
    """Scripted action sequences played against the power scenarios."""

    def test_b1_optimal_episode(self) -> None:
        """B1 (UPS Alarm Response): diagnose UPS-1, then acknowledge.

        The episode must end on the second step with a reward above 0.5.
        """
        dc_env = DcOpsEnvironment()
        dc_env.reset(scenario="B1")

        # Step 1: diagnose the UPS; the episode stays open.
        diagnosed = dc_env.step(DcOpsAction(command="diagnose UPS-1"))
        assert not diagnosed.done

        # Step 2: acknowledging the alarm resolves the scenario.
        acknowledged = dc_env.step(DcOpsAction(command="acknowledge_alarm"))
        assert acknowledged.done, "B1 should resolve after diagnose + acknowledge"

        # Speed bonus: (10 - 2) / 10 = 0.8
        assert acknowledged.reward > 0.5, "Should have significant speed bonus"

    def test_b3_optimal_episode(self) -> None:
        """B3 (Generator Test Protocol): run the test sequence in order.

        check_status, start_generator, wait, diagnose GEN-1,
        stop_generator, wait, acknowledge_alarm.
        """
        # (command, whether the episode is asserted to still be open)
        protocol = (
            ("check_status", True),
            ("start_generator", True),
            # Wait for generator to start (30s game time per step, gen startup ~17s)
            ("wait", False),
            ("diagnose GEN-1", True),
            ("stop_generator", True),
            # Wait for cooldown
            ("wait", False),
        )

        dc_env = DcOpsEnvironment()
        dc_env.reset(scenario="B3")

        for command, must_stay_open in protocol:
            latest = dc_env.step(DcOpsAction(command=command))
            if must_stay_open:
                assert not latest.done

        final = dc_env.step(DcOpsAction(command="acknowledge_alarm"))
        assert final.done, "B3 should resolve after full protocol"

    def test_b4_episode_with_load_shedding(self) -> None:
        """B4 (Power Failure Cascade): diagnose, shed load, then wait.

        Per the original description the generator starts automatically on
        utility loss while the agent sheds load to extend battery life.
        """
        scripted_commands = (
            # Understand the situation.
            "diagnose UPS-1",
            "diagnose UPS-2",
            # Shed non-critical load to extend battery.
            "set_rack_load A-01 4",
            "set_rack_load B-01 4",
            # Check generator status.
            "diagnose GEN-1",
        )

        dc_env = DcOpsEnvironment()
        latest = dc_env.reset(scenario="B4")
        for command in scripted_commands:
            latest = dc_env.step(DcOpsAction(command=command))

        # Wait for the generator to come online and stabilize.
        for _ in range(14):
            latest = dc_env.step(DcOpsAction(command="wait"))
            if latest.done:
                break

        # Resolution is optional here; the running total must stay above -10.
        assert latest.metadata["cumulative_reward"] > -10.0
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
# ===========================================================================
|
| 334 |
+
# Reward Signal Quality
|
| 335 |
+
# ===========================================================================
|
| 336 |
+
class TestRewardSignalQuality:
    """Sanity checks on the reward signal over whole episodes."""

    def test_rewards_bounded_per_step(self) -> None:
        """Each per-step reward in an all-wait A2 episode lies in [-2, 2]."""
        dc_env = DcOpsEnvironment()
        dc_env.reset(scenario="A2")

        for _ in range(15):
            latest = dc_env.step(DcOpsAction(command="wait"))
            # Base reward is [-1, 1], speed bonus can add up to 1.0
            assert -2.0 <= latest.reward <= 2.0, f"Reward {latest.reward} out of bounds"
            if latest.done:
                break

    def test_good_actions_beat_bad_actions(self) -> None:
        """Diagnose + acknowledge on B1 out-earns an episode of only waiting."""
        dc_env = DcOpsEnvironment()

        # Good episode: diagnose, then acknowledge.
        dc_env.reset(scenario="B1")
        dc_env.step(DcOpsAction(command="diagnose UPS-1"))
        good_final = dc_env.step(DcOpsAction(command="acknowledge_alarm"))
        r_good = good_final.metadata["cumulative_reward"]

        # Bad episode: up to ten waits and nothing else.
        dc_env.reset(scenario="B1")
        for _ in range(10):
            bad_final = dc_env.step(DcOpsAction(command="wait"))
            if bad_final.done:
                break
        r_bad = bad_final.metadata["cumulative_reward"]

        assert r_good > r_bad, f"Good ({r_good:.2f}) should beat bad ({r_bad:.2f})"

    def test_procedure_bonus_visible(self) -> None:
        """Opening with check_status yields a higher cumulative reward.

        Two three-step A1 episodes are compared. They differ only in the
        first action: check_status in one, wait in the other.
        """
        dc_env = DcOpsEnvironment()

        def play(opening_command: str) -> float:
            """Run opening, adjust_setpoint, wait; return the cumulative reward."""
            dc_env.reset(scenario="A1")
            dc_env.step(DcOpsAction(command=opening_command))
            dc_env.step(DcOpsAction(command="adjust_setpoint CRAC-1 22"))
            final = dc_env.step(DcOpsAction(command="wait"))
            return final.metadata["cumulative_reward"]

        # With procedure: check_status → adjust_setpoint → wait
        r_with = play("check_status")
        # Without procedure: wait → adjust_setpoint → wait (no check_status)
        r_without = play("wait")

        assert r_with > r_without, (
            f"Procedure bonus not visible: with={r_with:.3f} vs without={r_without:.3f}"
        )

    @pytest.mark.parametrize("scenario_id", registered_scenario_ids())
    def test_no_nan_rewards(self, scenario_id: str) -> None:
        """Rewards in every registered scenario are neither NaN nor infinite."""
        import math

        dc_env = DcOpsEnvironment()
        dc_env.reset(scenario=scenario_id)

        # Up to five status checks, stopping early once the episode ends.
        for _ in range(5):
            reward = dc_env.step(DcOpsAction(command="check_status"))
            assert not math.isnan(reward.reward), f"NaN reward in {scenario_id}"
            assert not math.isinf(reward.reward), f"Inf reward in {scenario_id}"
            if reward.done:
                break
|
| 411 |
+
|
| 412 |
+
|
| 413 |
+
# ===========================================================================
|
| 414 |
+
# Cross-Facility Scenario Tests
|
| 415 |
+
# ===========================================================================
|
| 416 |
+
class TestCrossFacility:
    """Exercise the environment against non-default facility configurations."""

    def test_scenario_with_small_facility(self) -> None:
        """The small facility config supports basic commands.

        No scenario is loaded here: the environment is reset with only the
        "small" config and a five-step budget, then ``check_status`` and
        ``diagnose CRAC-1`` are issued and their result text is inspected.
        """
        small_cfg = load_datacenter_config("small")
        environment = DcOpsEnvironment()

        initial_obs = environment.reset(config=small_cfg, step_budget=5)
        assert initial_obs.done is False

        # Each command's result text must contain the expected marker.
        status_obs = environment.step(DcOpsAction(command="check_status"))
        assert "status" in status_obs.action_result.lower()

        diagnose_obs = environment.step(DcOpsAction(command="diagnose CRAC-1"))
        assert "Diagnostic Report" in diagnose_obs.action_result

    def test_large_facility_steady_state(self) -> None:
        """The large facility's initial observation reports plausible figures."""
        large_cfg = load_datacenter_config("large")
        environment = DcOpsEnvironment()
        initial_obs = environment.reset(config=large_cfg, step_budget=10)

        pue = initial_obs.metadata["pue"]
        assert 1.1 < pue < 3.0, f"Large facility PUE {pue} unrealistic"

        # Read both power figures before asserting on either of them.
        cooling_kw = initial_obs.metadata["total_cooling_power_kw"]
        it_load_kw = initial_obs.metadata["total_it_load_kw"]
        assert cooling_kw > 0
        assert it_load_kw > 0
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
# ===========================================================================
|
| 450 |
+
# Episode Metrics & Physics Consistency
|
| 451 |
+
# ===========================================================================
|
| 452 |
+
class TestEpisodeMetrics:
    """Physical plausibility checks on the metrics reported each step."""

    def test_pue_always_above_one(self) -> None:
        """Reported PUE must never drop below 1.0 while waiting on scenario A1."""
        environment = DcOpsEnvironment()
        environment.reset(scenario="A1")

        for _step in range(10):
            step_obs = environment.step(DcOpsAction(command="wait"))
            assert step_obs.metadata["pue"] >= 1.0
            if step_obs.done:
                break

    def test_higher_load_raises_temperature(self) -> None:
        """Raising the load on three racks must raise zone_a's cold-aisle temperature."""
        environment = DcOpsEnvironment()
        baseline_obs = environment.reset()
        temp_before = baseline_obs.metadata["zones"]["zone_a"]["cold_aisle_temp_c"]

        # Raise the load on three racks, one command per rack.
        for command in (
            "set_rack_load A-01 15",
            "set_rack_load A-02 15",
            "set_rack_load A-03 15",
        ):
            environment.step(DcOpsAction(command=command))

        # Give the thermal model seven steps to respond.
        for _step in range(7):
            latest_obs = environment.step(DcOpsAction(command="wait"))

        temp_after = latest_obs.metadata["zones"]["zone_a"]["cold_aisle_temp_c"]
        assert temp_after > temp_before, (
            f"Temp should rise with more load: {temp_before:.1f} → {temp_after:.1f}"
        )

    def test_sim_time_monotonically_increases(self) -> None:
        """``sim_time_s`` must strictly increase on every step."""
        environment = DcOpsEnvironment()
        last_seen = environment.reset().metadata["sim_time_s"]

        for _step in range(5):
            step_obs = environment.step(DcOpsAction(command="wait"))
            assert step_obs.metadata["sim_time_s"] > last_seen
            last_seen = step_obs.metadata["sim_time_s"]
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
# ===========================================================================
|
| 499 |
+
# Performance Tests
|
| 500 |
+
# ===========================================================================
|
| 501 |
+
class TestIntegrationPerformance:
    """Wall-clock guards for episodes across facility sizes and scenarios."""

    @pytest.mark.parametrize("config_name", ["default", "small", "large"])
    def test_episode_completes_fast(self, config_name: str) -> None:
        """A reset plus ten ``wait`` steps must finish in under 10 s of wall time."""
        facility_cfg = load_datacenter_config(config_name)
        environment = DcOpsEnvironment()

        # Config loading and construction are excluded from the timed region.
        began = time.perf_counter()
        environment.reset(config=facility_cfg, step_budget=10)
        for _step in range(10):
            environment.step(DcOpsAction(command="wait"))
        elapsed = time.perf_counter() - began

        assert elapsed < 10.0, (
            f"{config_name} facility 10-step episode took {elapsed:.2f}s, should be <10s"
        )

    def test_all_scenarios_full_episode_under_10s(self) -> None:
        """Running every registered scenario back to back must stay fast.

        NOTE(review): the method name says 10 s but the assertion allows
        15.0 s; confirm which limit is intended.
        """
        environment = DcOpsEnvironment()
        began = time.perf_counter()

        for scenario_id in registered_scenario_ids():
            environment.reset(scenario=scenario_id)
            # 20 steps is, per the original note, the largest budget used by
            # any scenario; the loop stops sooner when the episode ends.
            for _step in range(20):
                step_obs = environment.step(DcOpsAction(command="wait"))
                if step_obs.done:
                    break

        total_elapsed = time.perf_counter() - began
        assert total_elapsed < 15.0, (
            f"All {len(registered_scenario_ids())} scenarios took {total_elapsed:.2f}s"
        )
|
tests/test_power.py
ADDED
|
@@ -0,0 +1,743 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Tests for the power subsystem simulation.
|
| 8 |
+
|
| 9 |
+
Validates:
|
| 10 |
+
- UPS quadratic efficiency model against published data
|
| 11 |
+
- UPS battery discharge/charge dynamics
|
| 12 |
+
- PDU loss calculations and three-phase current distribution
|
| 13 |
+
- Generator state machine and fuel consumption
|
| 14 |
+
- ATS transfer timing
|
| 15 |
+
- Full utility-loss → generator-takeover scenario
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
|
| 20 |
+
import math
|
| 21 |
+
|
| 22 |
+
import pytest
|
| 23 |
+
|
| 24 |
+
from dc_ops_env.config import (
|
| 25 |
+
ATSConfig,
|
| 26 |
+
GeneratorConfig,
|
| 27 |
+
PDUConfig,
|
| 28 |
+
PowerConfig,
|
| 29 |
+
UPSConfig,
|
| 30 |
+
)
|
| 31 |
+
from dc_ops_env.simulation.power import PowerAlarm, PowerSimulation, PowerStepResult
|
| 32 |
+
from dc_ops_env.simulation.types import (
|
| 33 |
+
ATSPosition,
|
| 34 |
+
GeneratorState,
|
| 35 |
+
UPSMode,
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
# Helpers
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
def make_simple_power_config(
    num_ups: int = 1,
    num_pdus: int = 1,
    ups_capacity_kw: float = 500.0,
) -> PowerConfig:
    """Build a small ``PowerConfig`` for unit tests.

    Args:
        num_ups: Number of UPS units to create, named ``UPS-1``, ``UPS-2``, ...
        num_pdus: Number of PDUs to create, named ``PDU-1``, ``PDU-2``, ...
        ups_capacity_kw: Rated capacity applied to every UPS unit.

    Returns:
        A config holding the requested UPS units and PDUs plus a generator
        and an ATS built from their default settings.
    """
    ups_units = [
        UPSConfig(unit_id=f"UPS-{number}", rated_capacity_kw=ups_capacity_kw)
        for number in range(1, num_ups + 1)
    ]
    pdus = [PDUConfig(pdu_id=f"PDU-{number}") for number in range(1, num_pdus + 1)]
    return PowerConfig(
        ups_units=ups_units,
        pdus=pdus,
        generator=GeneratorConfig(),
        ats=ATSConfig(),
    )
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# ===========================================================================
|
| 63 |
+
# UPS Efficiency Tests
|
| 64 |
+
# ===========================================================================
|
| 65 |
+
class TestUPSEfficiency:
    """Validate the UPS quadratic loss model against reference data.

    Reference quoted by the original author (APC WP-108, 500 kVA
    double-conversion UPS):
        25% load  → ~90.5%
        50% load  → ~93.6%
        75% load  → ~94.0%
        100% load → ~93.9%

    NOTE(review): the bands asserted below follow the loss formula quoted in
    ``test_efficiency_at_25_percent`` (about 94.3% / 96.4% / 96.9% / 97.1%),
    which is several points above this table — confirm which source the
    model is meant to track.
    """

    @staticmethod
    def _ups_after_one_step(load_kw: float, mode: UPSMode | None = None):
        """Return UPS-1's state after one 1 s step at ``load_kw``.

        A fresh 500 kW single-UPS simulation is built for every call so that
        measurements never share state. When ``mode`` is given it is applied
        to ``UPS-1`` before stepping.
        """
        sim = PowerSimulation(make_simple_power_config(), it_load_kw=load_kw)
        if mode is not None:
            sim.set_ups_mode("UPS-1", mode)
        sim.step(1.0, load_kw)
        return sim.state.ups_units[0]

    def test_efficiency_at_25_percent(self) -> None:
        """Efficiency at 25% load: η = 0.25/(0.25+0.013+0.006×0.25+0.011×0.0625) ≈ 94.3%."""
        ups = self._ups_after_one_step(125.0)  # 125/500 = 25%
        assert 0.93 <= ups.efficiency <= 0.96, f"η={ups.efficiency:.3f}"

    def test_efficiency_at_50_percent(self) -> None:
        """Efficiency at 50% load: η ≈ 96.4%."""
        ups = self._ups_after_one_step(250.0)
        assert 0.95 <= ups.efficiency <= 0.97, f"η={ups.efficiency:.3f}"

    def test_efficiency_at_75_percent(self) -> None:
        """Efficiency at 75% load: η ≈ 96.9%."""
        ups = self._ups_after_one_step(375.0)
        assert 0.96 <= ups.efficiency <= 0.98, f"η={ups.efficiency:.3f}"

    def test_efficiency_at_100_percent(self) -> None:
        """Efficiency at 100% load: η ≈ 97.1%."""
        ups = self._ups_after_one_step(500.0)
        assert 0.96 <= ups.efficiency <= 0.98, f"η={ups.efficiency:.3f}"

    def test_efficiency_peak_around_75_percent(self) -> None:
        """The best sampled efficiency must occur at 50% load or above.

        Only the lower bound is checked: with the loss formula quoted in
        ``test_efficiency_at_25_percent`` efficiency keeps rising through
        100% load, so the peak is not pinned to the 50–75% region.
        """
        efficiencies = {}
        for load_pct in [10, 25, 50, 75, 100]:
            load_kw = 500.0 * load_pct / 100.0
            efficiencies[load_pct] = self._ups_after_one_step(load_kw).efficiency

        # Light load (10% / 25%) must not be the most efficient operating point.
        peak_pct = max(efficiencies, key=efficiencies.get)
        assert peak_pct >= 50, f"Peak at {peak_pct}%, efficiencies: {efficiencies}"

    def test_losses_are_positive(self) -> None:
        """UPS losses should always be positive (waste heat)."""
        ups = self._ups_after_one_step(160.0)
        assert ups.heat_output_kw > 0, "UPS must produce waste heat"

    def test_eco_mode_higher_efficiency(self) -> None:
        """Eco mode should have higher efficiency than double conversion."""
        # Default mode; the failure message labels it "DC" (double conversion).
        eta_dc = self._ups_after_one_step(160.0).efficiency
        eta_eco = self._ups_after_one_step(160.0, UPSMode.ECO).efficiency

        assert eta_eco > eta_dc, f"Eco {eta_eco:.3f} should > DC {eta_dc:.3f}"
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
# ===========================================================================
|
| 146 |
+
# UPS Battery Tests
|
| 147 |
+
# ===========================================================================
|
| 148 |
+
class TestUPSBattery:
    """Validate battery discharge and charge dynamics."""

    def test_battery_discharge_on_utility_loss(self) -> None:
        """Battery SOC should decrease, but not collapse, after 60 s without utility."""
        config = make_simple_power_config()
        sim = PowerSimulation(config, it_load_kw=160.0)

        # A fresh simulation starts with a full battery.
        assert sim.state.ups_units[0].battery_soc == 1.0

        # Kill utility
        sim.set_utility_available(False)

        # Run for 60 seconds at 1 s per step
        for _ in range(60):
            sim.step(1.0, 160.0)

        ups = sim.state.ups_units[0]
        assert ups.mode == UPSMode.ON_BATTERY
        assert ups.battery_soc < 1.0, "SOC should decrease on battery"
        assert ups.battery_soc > 0.5, "SOC shouldn't drop too fast in 60s"

    def test_battery_runtime_estimation(self) -> None:
        """Battery time remaining estimate should be reasonable.

        8.3 kWh battery, 0.9 discharge eff, 0.85 aging, 160 kW load:
            usable = 8.3 × 0.9 × 0.85 = 6.35 kWh
            At 160 kW: ~143 seconds (~2.4 min)
        """
        config = make_simple_power_config()
        sim = PowerSimulation(config, it_load_kw=160.0)
        sim.set_utility_available(False)
        sim.step(1.0, 160.0)

        ups = sim.state.ups_units[0]
        assert ups.mode == UPSMode.ON_BATTERY
        assert 60 < ups.battery_time_remaining_s < 300, \
            f"Runtime {ups.battery_time_remaining_s:.0f}s should be 1-5 min for 160kW"

    def test_battery_exhaustion(self) -> None:
        """Battery should eventually exhaust and UPS should fault."""
        config = make_simple_power_config()
        sim = PowerSimulation(config, it_load_kw=160.0)
        sim.set_utility_available(False)

        # Run until battery dies (should be ~2-3 min)
        max_steps = 600  # 10 min max at 1 s per step
        exhausted = False
        for _ in range(max_steps):
            # Only the resulting UPS state matters; the step result is not inspected.
            sim.step(1.0, 160.0)
            if sim.state.ups_units[0].mode == UPSMode.FAULT:
                exhausted = True
                break

        assert exhausted, "Battery should exhaust within 10 minutes at 160 kW"
        assert sim.state.ups_units[0].battery_soc == 0.0

    def test_battery_recharge_after_utility_restored(self) -> None:
        """Battery should recharge when utility is restored."""
        config = make_simple_power_config()
        sim = PowerSimulation(config, it_load_kw=80.0)

        # Discharge for 30 seconds
        sim.set_utility_available(False)
        for _ in range(30):
            sim.step(1.0, 80.0)
        soc_after_discharge = sim.state.ups_units[0].battery_soc

        # Restore utility
        sim.set_utility_available(True)
        for _ in range(300):  # 5 min recharge
            sim.step(1.0, 80.0)

        soc_after_recharge = sim.state.ups_units[0].battery_soc
        assert soc_after_recharge > soc_after_discharge, \
            f"SOC should increase: {soc_after_discharge:.3f} → {soc_after_recharge:.3f}"

    def test_battery_low_alarm(self) -> None:
        """Draining the battery must raise a low or critical battery alarm.

        Alarms are collected while discharging until SOC falls below 10% (or
        600 steps elapse); either ``battery_low`` or ``battery_critical``
        satisfies the check. The original note places the low-battery
        threshold at 25% SOC — presumably the simulation's setting; verify
        against the power model.
        """
        config = make_simple_power_config()
        sim = PowerSimulation(config, it_load_kw=160.0)
        sim.set_utility_available(False)

        all_alarms: list[PowerAlarm] = []
        for _ in range(600):
            result = sim.step(1.0, 160.0)
            all_alarms.extend(result.alarms)
            if sim.state.ups_units[0].battery_soc < 0.10:
                break

        alarm_types = [a.alarm_type for a in all_alarms]
        assert "battery_low" in alarm_types or "battery_critical" in alarm_types, \
            f"Should have low battery alarm, got: {alarm_types}"
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
# ===========================================================================
|
| 245 |
+
# PDU Tests
|
| 246 |
+
# ===========================================================================
|
| 247 |
+
class TestPDU:
    """Validate PDU power distribution and loss calculations."""

    def test_pdu_losses_at_nominal(self) -> None:
        """PDU losses should be ~2% of load (98% efficiency)."""
        config = make_simple_power_config(num_pdus=1)
        sim = PowerSimulation(config, it_load_kw=5.0)
        # Only the resulting PDU state matters; the step result is not inspected.
        sim.step(1.0, 5.0)

        pdu = sim.state.pdus[0]
        expected_loss = 5.0 * (1.0 / 0.98 - 1.0)  # ~0.102 kW
        assert abs(pdu.heat_output_kw - expected_loss) < 0.01, \
            f"PDU loss {pdu.heat_output_kw:.3f} kW, expected {expected_loss:.3f}"

    def test_phase_current_calculation(self) -> None:
        """Phase currents should match P = √3 × V_LL × I_L formula.

        5 kW load at 208V: I_total = 5000 / (√3 × 208) = 13.88 A
        Per phase (balanced): 13.88 / 3 = 4.63 A
        """
        config = make_simple_power_config(num_pdus=1)
        sim = PowerSimulation(config, it_load_kw=5.0)
        sim.step(1.0, 5.0)

        pdu = sim.state.pdus[0]
        expected_total = 5000.0 / (math.sqrt(3) * 208.0)
        expected_per_phase = expected_total / 3.0

        for i, current in enumerate(pdu.phase_currents_a):
            assert abs(current - expected_per_phase) < 0.1, \
                f"Phase {i} current {current:.2f}A, expected {expected_per_phase:.2f}A"

    def test_pdu_nameplate_capacity(self) -> None:
        """Nameplate capacity = √3 × 208V × 24A ≈ 8.65 kW."""
        config = make_simple_power_config(num_pdus=1)
        sim = PowerSimulation(config, it_load_kw=1.0)
        sim.step(1.0, 1.0)

        pdu = sim.state.pdus[0]
        expected = math.sqrt(3) * 208.0 * 24.0 / 1000.0
        assert abs(pdu.nameplate_capacity_kw - expected) < 0.01

    def test_pdu_derated_capacity(self) -> None:
        """Derated capacity = nameplate × 0.80."""
        config = make_simple_power_config(num_pdus=1)
        sim = PowerSimulation(config, it_load_kw=1.0)
        sim.step(1.0, 1.0)

        pdu = sim.state.pdus[0]
        expected = pdu.nameplate_capacity_kw * 0.80
        assert abs(pdu.derated_capacity_kw - expected) < 0.01

    def test_pdu_overcurrent_alarm(self) -> None:
        """Overloading a PDU beyond phase current limit should trigger alarm.

        Using the same per-phase convention as
        ``test_phase_current_calculation``:
            per_phase = (P × 1000) / (√3 × 208) / 3
        Exceeding a 24 A per-phase limit therefore needs
            P > 24 × 3 × √3 × 208 / 1000 ≈ 25.9 kW
        so a 27 kW load (≈ 25.0 A per phase) is applied.
        """
        config = make_simple_power_config(num_pdus=1)
        sim = PowerSimulation(config, it_load_kw=27.0)
        result = sim.step(1.0, 27.0)

        alarm_types = [a.alarm_type for a in result.alarms]
        assert "phase_overcurrent" in alarm_types, f"Expected overcurrent alarm, got {alarm_types}"

    def test_multiple_pdus_share_load(self) -> None:
        """Load should be distributed evenly across PDUs (20 kW over 4 → 5 kW each)."""
        config = make_simple_power_config(num_pdus=4)
        sim = PowerSimulation(config, it_load_kw=20.0)
        sim.step(1.0, 20.0)

        for pdu in sim.state.pdus:
            assert abs(pdu.output_power_kw - 5.0) < 0.01
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
# ===========================================================================
|
| 327 |
+
# Generator Tests
|
| 328 |
+
# ===========================================================================
|
| 329 |
+
class TestGenerator:
|
| 330 |
+
"""Validate generator state machine and fuel consumption."""
|
| 331 |
+
|
| 332 |
+
def test_generator_startup_sequence(self) -> None:
|
| 333 |
+
"""Generator should progress: OFF → START_DELAY → CRANKING → WARMING → READY."""
|
| 334 |
+
config = make_simple_power_config()
|
| 335 |
+
sim = PowerSimulation(config, it_load_kw=160.0)
|
| 336 |
+
gen = sim.state.generator
|
| 337 |
+
|
| 338 |
+
assert gen.state == GeneratorState.OFF
|
| 339 |
+
|
| 340 |
+
# Start generator
|
| 341 |
+
sim.start_generator()
|
| 342 |
+
assert gen.state == GeneratorState.START_DELAY
|
| 343 |
+
|
| 344 |
+
# Run through start delay (4s)
|
| 345 |
+
for _ in range(5):
|
| 346 |
+
sim.step(1.0, 160.0)
|
| 347 |
+
assert gen.state == GeneratorState.CRANKING
|
| 348 |
+
|
| 349 |
+
# Run through cranking (5s)
|
| 350 |
+
for _ in range(6):
|
| 351 |
+
sim.step(1.0, 160.0)
|
| 352 |
+
assert gen.state == GeneratorState.WARMING
|
| 353 |
+
|
| 354 |
+
# Run through warmup (8s)
|
| 355 |
+
for _ in range(9):
|
| 356 |
+
sim.step(1.0, 160.0)
|
| 357 |
+
assert gen.state == GeneratorState.READY
|
| 358 |
+
|
| 359 |
+
def test_generator_total_startup_time(self) -> None:
|
| 360 |
+
"""Total startup time should be ~17s (4 + 5 + 8)."""
|
| 361 |
+
config = make_simple_power_config()
|
| 362 |
+
sim = PowerSimulation(config, it_load_kw=160.0)
|
| 363 |
+
sim.start_generator()
|
| 364 |
+
|
| 365 |
+
# Run until ready
|
| 366 |
+
steps = 0
|
| 367 |
+
for steps in range(1, 100):
|
| 368 |
+
sim.step(1.0, 160.0)
|
| 369 |
+
if sim.state.generator.is_available:
|
| 370 |
+
break
|
| 371 |
+
|
| 372 |
+
# 4s delay + 5s crank + 8s warmup = 17s, allow ±2s
|
| 373 |
+
assert 15 <= steps <= 20, f"Startup took {steps}s, expected ~17s"
|
| 374 |
+
|
| 375 |
+
def test_fuel_consumption_under_load(self) -> None:
|
| 376 |
+
"""Fuel should be consumed when generator is loaded."""
|
| 377 |
+
config = make_simple_power_config()
|
| 378 |
+
sim = PowerSimulation(config, it_load_kw=160.0)
|
| 379 |
+
gen = sim.state.generator
|
| 380 |
+
initial_fuel = gen.fuel_level_liters
|
| 381 |
+
|
| 382 |
+
# Trigger utility loss to get generator running and loaded
|
| 383 |
+
sim.set_utility_available(False)
|
| 384 |
+
|
| 385 |
+
# Run for 30 seconds (enough for startup + some loaded time)
|
| 386 |
+
for _ in range(30):
|
| 387 |
+
sim.step(1.0, 160.0)
|
| 388 |
+
|
| 389 |
+
assert gen.fuel_level_liters < initial_fuel, "Fuel should be consumed"
|
| 390 |
+
|
| 391 |
+
def test_fuel_consumption_rate(self) -> None:
|
| 392 |
+
"""Fuel rate = full_rate × (0.1 + 0.9 × load_fraction).
|
| 393 |
+
|
| 394 |
+
At 160kW / 750kW = 21.3% load:
|
| 395 |
+
rate = 180 × (0.1 + 0.9 × 0.213) = 180 × 0.292 = 52.6 L/hr
|
| 396 |
+
In 1 hour: ~52.6 liters consumed
|
| 397 |
+
"""
|
| 398 |
+
config = make_simple_power_config()
|
| 399 |
+
sim = PowerSimulation(config, it_load_kw=160.0)
|
| 400 |
+
|
| 401 |
+
# Must disable utility so ATS stays on generator for full hour
|
| 402 |
+
sim.set_utility_available(False)
|
| 403 |
+
|
| 404 |
+
# Manually put generator into loaded state for cleaner test
|
| 405 |
+
gen = sim.state.generator
|
| 406 |
+
gen.state = GeneratorState.LOADED
|
| 407 |
+
gen.load_fraction = 160.0 / 750.0
|
| 408 |
+
gen.output_power_kw = 160.0
|
| 409 |
+
sim.state.ats.position = ATSPosition.GENERATOR
|
| 410 |
+
|
| 411 |
+
initial_fuel = gen.fuel_level_liters
|
| 412 |
+
|
| 413 |
+
# Run for 1 hour
|
| 414 |
+
for _ in range(3600):
|
| 415 |
+
sim.step(1.0, 160.0)
|
| 416 |
+
|
| 417 |
+
consumed = initial_fuel - gen.fuel_level_liters
|
| 418 |
+
expected_rate = 180.0 * (0.1 + 0.9 * (160.0 / 750.0))
|
| 419 |
+
# Allow 10% tolerance
|
| 420 |
+
assert abs(consumed - expected_rate) < expected_rate * 0.15, \
|
| 421 |
+
f"Consumed {consumed:.1f}L/hr, expected ~{expected_rate:.1f}L/hr"
|
| 422 |
+
|
| 423 |
+
def test_generator_cooldown(self) -> None:
|
| 424 |
+
"""Generator should cool down for 5 minutes before shutdown."""
|
| 425 |
+
config = make_simple_power_config()
|
| 426 |
+
sim = PowerSimulation(config, it_load_kw=160.0)
|
| 427 |
+
|
| 428 |
+
# Get generator running
|
| 429 |
+
gen = sim.state.generator
|
| 430 |
+
gen.state = GeneratorState.LOADED
|
| 431 |
+
gen.output_power_kw = 160.0
|
| 432 |
+
|
| 433 |
+
# Stop generator
|
| 434 |
+
sim.stop_generator()
|
| 435 |
+
assert gen.state == GeneratorState.COOLDOWN
|
| 436 |
+
|
| 437 |
+
# Run cooldown (300s)
|
| 438 |
+
for i in range(299):
|
| 439 |
+
sim.step(1.0, 160.0)
|
| 440 |
+
assert gen.state == GeneratorState.COOLDOWN, f"Still cooling at {i+1}s"
|
| 441 |
+
|
| 442 |
+
# Should transition to OFF after 300s
|
| 443 |
+
sim.step(1.0, 160.0)
|
| 444 |
+
assert gen.state == GeneratorState.OFF
|
| 445 |
+
|
| 446 |
+
def test_fuel_exhaustion(self) -> None:
    """Check that a loaded generator with a 1 L tank goes OFF and reports ``fuel_exhausted``."""
    load_kw = 160.0
    sim = PowerSimulation(
        PowerConfig(
            ups_units=[UPSConfig(unit_id="UPS-1")],
            pdus=[PDUConfig(pdu_id="PDU-1")],
            generator=GeneratorConfig(fuel_tank_liters=1.0),  # Very small tank
            ats=ATSConfig(),
        ),
        it_load_kw=load_kw,
    )

    # Put the plant directly on generator power with utility unavailable.
    generator = sim.state.generator
    generator.state = GeneratorState.LOADED
    generator.load_fraction = 160.0 / 750.0
    generator.output_power_kw = load_kw
    sim.state.ats.position = ATSPosition.GENERATOR
    sim.set_utility_available(False)

    # Run until fuel runs out (1L / 52.6 L/hr ≈ 68 seconds); 200 steps is
    # the upper bound before the test gives up.
    all_alarms: list[PowerAlarm] = []
    for _ in range(200):
        step_result = sim.step(1.0, load_kw)
        all_alarms.extend(step_result.alarms)
        if generator.state == GeneratorState.OFF:
            break

    assert generator.state == GeneratorState.OFF
    assert generator.fuel_level_liters == 0.0
    seen_alarm_types = [alarm.alarm_type for alarm in all_alarms]
    assert "fuel_exhausted" in seen_alarm_types
|
| 475 |
+
|
| 476 |
+
|
| 477 |
+
# ===========================================================================
|
| 478 |
+
# ATS Tests
|
| 479 |
+
# ===========================================================================
|
| 480 |
+
class TestATS:
    """Check how the Automatic Transfer Switch reacts to utility loss and return."""

    @staticmethod
    def _advance(sim: "PowerSimulation", seconds: int) -> None:
        """Step *sim* ``seconds`` times, 1.0 s per step at 160 kW IT load."""
        for _ in range(seconds):
            sim.step(1.0, 160.0)

    def test_ats_starts_on_utility(self) -> None:
        """A freshly built simulation has the ATS in the UTILITY position."""
        sim = PowerSimulation(make_simple_power_config(), it_load_kw=160.0)
        assert sim.state.ats.position == ATSPosition.UTILITY

    def test_ats_transfers_on_utility_loss(self) -> None:
        """One very short step after utility loss puts the ATS in TRANSFERRING."""
        sim = PowerSimulation(make_simple_power_config(), it_load_kw=160.0)
        sim.set_utility_available(False)

        # Tiny step to trigger detection
        sim.step(0.001, 160.0)

        assert sim.state.ats.position == ATSPosition.TRANSFERRING

    def test_ats_waits_for_generator(self) -> None:
        """Five seconds after utility loss the ATS is still TRANSFERRING."""
        sim = PowerSimulation(make_simple_power_config(), it_load_kw=160.0)
        sim.set_utility_available(False)

        # Five seconds in, the generator is expected to still be starting up.
        self._advance(sim, 5)

        # With no generator available yet, the switch cannot complete.
        assert not sim.state.generator.is_available
        assert sim.state.ats.position == ATSPosition.TRANSFERRING

    def test_ats_completes_transfer_to_generator(self) -> None:
        """After 25 s without utility the ATS is on GENERATOR and the generator is LOADED."""
        sim = PowerSimulation(make_simple_power_config(), it_load_kw=160.0)
        sim.set_utility_available(False)

        # Run long enough for generator startup (~17s) + transfer
        self._advance(sim, 25)

        power_state = sim.state
        assert power_state.ats.position == ATSPosition.GENERATOR
        assert power_state.generator.state == GeneratorState.LOADED

    def test_ats_retransfer_delay(self) -> None:
        """ATS holds GENERATOR for 200 s after utility returns; after 350 s it has left it."""
        sim = PowerSimulation(make_simple_power_config(), it_load_kw=160.0)

        # Drop utility and wait until the load is carried by the generator.
        sim.set_utility_available(False)
        self._advance(sim, 25)
        assert sim.state.ats.position == ATSPosition.GENERATOR

        # Utility comes back.
        sim.set_utility_available(True)

        # 200 s is inside the 300 s retransfer delay: no switch yet.
        self._advance(sim, 200)
        assert sim.state.ats.position == ATSPosition.GENERATOR

        # Another 150 s takes the total to 350 s, past the delay.
        self._advance(sim, 150)

        # Either mid-transfer or already back on utility is acceptable.
        ats_pos = sim.state.ats.position
        acceptable = (ATSPosition.TRANSFERRING, ATSPosition.UTILITY)
        assert ats_pos in acceptable, \
            f"Expected transfer back, got {ats_pos}"
|
| 556 |
+
|
| 557 |
+
|
| 558 |
+
# ===========================================================================
|
| 559 |
+
# Full Scenario Tests
|
| 560 |
+
# ===========================================================================
|
| 561 |
+
class TestUtilityLossScenario:
    """End-to-end utility loss and recovery scenario."""

    def test_full_utility_loss_and_recovery(self) -> None:
        """Complete scenario: utility loss → battery bridge → generator → recovery.

        What the test drives (1 s steps at 160 kW IT load):
            t=0:        Utility fails
            t=1-29s:    Watch for UPS on battery, generator available and
                        ATS on generator (each must be seen at least once)
            t=30s:      One more step must report ``on_generator`` with the
                        generator LOADED; utility is then restored
            t=31-380s:  350 further steps, i.e. past the 300 s retransfer delay
            end:        ATS must be on UTILITY or TRANSFERRING back to it
        """
        config = make_simple_power_config()
        sim = PowerSimulation(config, it_load_kw=160.0)

        # Phase 1: Utility loss
        sim.set_utility_available(False)

        # Latching flags: each becomes True once its condition has been
        # observed on any step of the start-up window.
        ups_on_battery = False
        gen_ready = False
        on_generator = False

        for _ in range(29):
            sim.step(1.0, 160.0)
            if sim.state.ups_units[0].mode == UPSMode.ON_BATTERY:
                ups_on_battery = True
            if sim.state.generator.is_available:
                gen_ready = True
            if sim.state.ats.position == ATSPosition.GENERATOR:
                on_generator = True

        assert ups_on_battery, "UPS should have been on battery"
        assert gen_ready, "Generator should be ready by 30s"
        assert on_generator, "Should be on generator by 30s"

        # Phase 2: Running on generator
        result = sim.step(1.0, 160.0)
        assert result.on_generator
        assert sim.state.generator.state == GeneratorState.LOADED

        # Phase 3: Utility restored
        sim.set_utility_available(True)

        # Run past retransfer delay (300s)
        for _ in range(350):
            sim.step(1.0, 160.0)

        # Should be back on utility (or transferring)
        assert sim.state.ats.position in (ATSPosition.UTILITY, ATSPosition.TRANSFERRING)

    def test_power_available_during_transfer(self) -> None:
        """UPS should bridge the gap during ATS transfer."""
        config = make_simple_power_config()
        sim = PowerSimulation(config, it_load_kw=160.0)

        # Initial: power available
        result = sim.step(1.0, 160.0)
        assert result.power_available

        # During utility loss, UPS provides power. The per-step results are
        # not inspected; only the UPS state after five seconds is checked.
        sim.set_utility_available(False)
        for _ in range(5):
            sim.step(1.0, 160.0)

        # UPS is on battery, still providing power
        assert sim.state.ups_units[0].mode == UPSMode.ON_BATTERY
        # The IT load is still being served
        assert sim.state.ups_units[0].output_power_kw > 0
|
| 632 |
+
|
| 633 |
+
|
| 634 |
+
# ===========================================================================
|
| 635 |
+
# Integration with DatacenterState Tests
|
| 636 |
+
# ===========================================================================
|
| 637 |
+
class TestPowerStateIntegration:
    """Check how ``DatacenterState`` behaves with and without a ``PowerState``."""

    def test_datacenter_state_with_power(self) -> None:
        """With a ``PowerState`` attached but no zones, ``pue`` is exactly 1.0."""
        from dc_ops_env.simulation.types import DatacenterState, PowerState, UPSState, PDUState

        power_state = PowerState(
            ups_units=[UPSState(unit_id="UPS-1", heat_output_kw=5.0)],
            pdus=[PDUState(pdu_id="PDU-1", heat_output_kw=1.0)],
        )
        datacenter = DatacenterState(power=power_state, lighting_power_kw=5.0)

        # No zones were supplied, so there is no IT load.
        assert datacenter.pue == 1.0

    def test_datacenter_state_without_power_uses_stubs(self) -> None:
        """Built only from loss fractions, ``DatacenterState.power`` is ``None``."""
        from dc_ops_env.simulation.types import DatacenterState

        datacenter = DatacenterState(ups_loss_fraction=0.05, pdu_loss_fraction=0.02)

        # Only the absence of a PowerState is checked here; the stub loss
        # fractions themselves are not asserted.
        assert datacenter.power is None
|
| 665 |
+
|
| 666 |
+
|
| 667 |
+
# ===========================================================================
|
| 668 |
+
# Performance Test
|
| 669 |
+
# ===========================================================================
|
| 670 |
+
class TestPerformance:
    """Ensure power simulation is fast enough for RL training."""

    def test_steps_per_second(self) -> None:
        """Power sim should sustain >10,000 steps/sec.

        A single wall-clock measurement is easily distorted by scheduler
        noise or cold caches. The simulation is therefore warmed up first,
        and the fastest of several timed batches is compared against the
        threshold.
        """
        import time

        config = make_simple_power_config(num_ups=2, num_pdus=20)
        sim = PowerSimulation(config, it_load_kw=160.0)

        n_steps = 5000
        n_batches = 3
        warmup_steps = 500

        # Warm-up: untimed steps so one-off start-up costs are excluded.
        for _ in range(warmup_steps):
            sim.step(1.0, 160.0)

        # Keep the best batch: the minimum is the measurement least
        # affected by interference from other processes.
        best_elapsed = float("inf")
        for _batch in range(n_batches):
            start = time.perf_counter()
            for _ in range(n_steps):
                sim.step(1.0, 160.0)
            best_elapsed = min(best_elapsed, time.perf_counter() - start)

        steps_per_sec = n_steps / best_elapsed
        assert steps_per_sec > 10_000, \
            f"Only {steps_per_sec:.0f} steps/sec, need >10,000"
|
| 689 |
+
|
| 690 |
+
|
| 691 |
+
# ===========================================================================
|
| 692 |
+
# Mutation Helper Tests
|
| 693 |
+
# ===========================================================================
|
| 694 |
+
class TestMutationHelpers:
    """Exercise the ``PowerSimulation`` convenience methods used to inject scenarios."""

    def test_set_utility_available(self) -> None:
        """``set_utility_available(False)`` flips ``utility_available`` from True to False."""
        sim = PowerSimulation(make_simple_power_config(), it_load_kw=160.0)
        power_state = sim.state
        assert power_state.utility_available is True

        sim.set_utility_available(False)
        assert sim.state.utility_available is False

    def test_set_ups_mode(self) -> None:
        """``set_ups_mode`` is truthy for a known unit and falsy for an unknown one."""
        sim = PowerSimulation(make_simple_power_config(), it_load_kw=160.0)

        accepted = sim.set_ups_mode("UPS-1", UPSMode.ECO)
        assert accepted
        assert sim.state.ups_units[0].mode == UPSMode.ECO

        rejected = sim.set_ups_mode("UPS-999", UPSMode.ECO)
        assert not rejected

    def test_inject_and_clear_ups_fault(self) -> None:
        """Injecting a fault sets FAULT mode; clearing it gives DOUBLE_CONVERSION."""
        sim = PowerSimulation(make_simple_power_config(), it_load_kw=160.0)

        injected = sim.inject_ups_fault("UPS-1")
        assert injected
        assert sim.state.ups_units[0].mode == UPSMode.FAULT

        cleared = sim.clear_ups_fault("UPS-1")
        assert cleared
        assert sim.state.ups_units[0].mode == UPSMode.DOUBLE_CONVERSION

    def test_start_stop_generator(self) -> None:
        """Start enters START_DELAY, 20 s later it is available, stop enters COOLDOWN."""
        sim = PowerSimulation(make_simple_power_config(), it_load_kw=160.0)

        sim.start_generator()
        assert sim.state.generator.state == GeneratorState.START_DELAY

        # Run to READY
        elapsed = 0
        while elapsed < 20:
            sim.step(1.0, 160.0)
            elapsed += 1
        assert sim.state.generator.is_available

        sim.stop_generator()
        assert sim.state.generator.state == GeneratorState.COOLDOWN

    def test_refuel_generator(self) -> None:
        """A 200 L refuel adds 200 L; a refuel with no argument fills the tank."""
        sim = PowerSimulation(make_simple_power_config(), it_load_kw=160.0)
        generator = sim.state.generator
        generator.fuel_level_liters = 500.0

        # Partial refuel: 500 + 200.
        sim.refuel_generator(200.0)
        assert generator.fuel_level_liters == 700.0

        # Full tank
        sim.refuel_generator()
        assert generator.fuel_level_liters == generator.fuel_tank_liters
|
tests/test_rewards.py
ADDED
|
@@ -0,0 +1,650 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Tests for the multi-objective reward function.
|
| 8 |
+
|
| 9 |
+
Validates:
|
| 10 |
+
- softplus numerical stability
|
| 11 |
+
- Individual reward component behavior and bounds
|
| 12 |
+
- Weight profiles sum to 1.0
|
| 13 |
+
- Delta-based progress tracking
|
| 14 |
+
- Action quality heuristics
|
| 15 |
+
- End-to-end reward computation
|
| 16 |
+
- Integration with the full environment
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
|
| 21 |
+
import math
|
| 22 |
+
|
| 23 |
+
import pytest
|
| 24 |
+
|
| 25 |
+
from dc_ops_env.rewards.reward_function import (
|
| 26 |
+
RewardComponents,
|
| 27 |
+
RewardFunction,
|
| 28 |
+
RewardWeights,
|
| 29 |
+
WEIGHT_PROFILES,
|
| 30 |
+
softplus,
|
| 31 |
+
)
|
| 32 |
+
from dc_ops_env.config import (
|
| 33 |
+
ASHRAE_CLASSES,
|
| 34 |
+
make_default_datacenter_config,
|
| 35 |
+
)
|
| 36 |
+
from dc_ops_env.simulation.thermal import ThermalSimulation
|
| 37 |
+
from dc_ops_env.simulation.power import PowerSimulation
|
| 38 |
+
from dc_ops_env.simulation.types import UPSMode
|
| 39 |
+
from dc_ops_env.actions.parser import CommandResult
|
| 40 |
+
from dc_ops_env.scenarios.base import ScenarioResult
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# ===========================================================================
|
| 44 |
+
# Helpers
|
| 45 |
+
# ===========================================================================
|
| 46 |
+
def _make_thermal_sim(setpoint_c: float = 20.0) -> ThermalSimulation:
    """Build a ``ThermalSimulation`` from the default config and pre-run it.

    Every CRAC unit in every zone gets ``setpoint_c`` as its initial
    setpoint. The simulation is then advanced 120 steps of 1.0 each before
    being returned, which is intended to let it settle.
    """
    dc_config = make_default_datacenter_config()

    # Flatten zones -> CRAC units so the setpoint is applied in one pass.
    all_cracs = (crac for zone in dc_config.zones for crac in zone.crac_units)
    for crac in all_cracs:
        crac.initial_setpoint_c = setpoint_c

    thermal = ThermalSimulation(dc_config)

    # Warm-up run.
    warmup_steps = 120
    for _ in range(warmup_steps):
        thermal.step(1.0)
    return thermal
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def _make_power_sim(
    utility_available: bool = True,
) -> PowerSimulation:
    """Build a ``PowerSimulation`` from the default datacenter config at 160 kW IT load.

    When ``utility_available`` is False, utility is switched off and the
    simulation is advanced five 1.0 s steps before being returned.
    """
    it_load = 160.0  # Default total IT load
    dc_config = make_default_datacenter_config()
    power_sim = PowerSimulation(dc_config.power, it_load_kw=it_load)

    if utility_available:
        return power_sim

    power_sim.set_utility_available(False)
    # Step a bit so UPS transitions to battery
    steps_remaining = 5
    while steps_remaining:
        power_sim.step(1.0, it_load)
        steps_remaining -= 1
    return power_sim
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def _ok_cmd(name: str = "check_status") -> CommandResult:
    """Return a successful ``CommandResult`` with message "OK" for command *name*."""
    fields = {"success": True, "message": "OK", "command_name": name}
    return CommandResult(**fields)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def _fail_cmd() -> CommandResult:
    """Return a failed ``CommandResult`` with an empty command name."""
    fields = {"success": False, "message": "Unknown command", "command_name": ""}
    return CommandResult(**fields)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# ===========================================================================
|
| 83 |
+
# softplus Unit Tests
|
| 84 |
+
# ===========================================================================
|
| 85 |
+
class TestSoftplus:
    """Check ``softplus`` against reference values, clamps, and basic shape properties."""

    def test_softplus_positive(self) -> None:
        """softplus(1) matches log(1 + e^1) to within 1e-10."""
        reference = math.log1p(math.exp(1.0))
        assert softplus(1.0) == pytest.approx(reference, abs=1e-10)

    def test_softplus_zero(self) -> None:
        """softplus(0) matches log(2) to within 1e-10."""
        reference = math.log(2.0)
        assert softplus(0.0) == pytest.approx(reference, abs=1e-10)

    def test_softplus_negative(self) -> None:
        """softplus(-5) matches log(1 + e^-5) to within 1e-10."""
        reference = math.log1p(math.exp(-5.0))
        assert softplus(-5.0) == pytest.approx(reference, abs=1e-10)

    def test_softplus_large_positive_clamp(self) -> None:
        """Large positive inputs (25, 100) come back unchanged."""
        for x in (25.0, 100.0):
            assert softplus(x) == x

    def test_softplus_large_negative_clamp(self) -> None:
        """Large negative inputs (-25, -100) come back as exactly 0.0."""
        for x in (-25.0, -100.0):
            assert softplus(x) == 0.0

    def test_softplus_monotonic(self) -> None:
        """Outputs for increasing inputs are strictly increasing."""
        inputs = [-10, -5, -1, 0, 1, 5, 10, 15]
        outputs = [softplus(x) for x in inputs]
        # Compare each output with its successor.
        for lower, upper in zip(outputs, outputs[1:]):
            assert lower < upper

    def test_softplus_always_nonnegative(self) -> None:
        """softplus never returns a negative value on the sampled inputs."""
        samples = [-20, -10, -1, 0, 1, 10]
        for sample in samples:
            value = softplus(sample)
            assert value >= 0.0
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
# ===========================================================================
|
| 120 |
+
# Weight Profile Tests
|
| 121 |
+
# ===========================================================================
|
| 122 |
+
class TestWeightProfiles:
    """Check that each entry in ``WEIGHT_PROFILES`` is normalised and sensibly weighted."""

    @pytest.mark.parametrize("profile_name", ["thermal", "power", "default"])
    def test_weights_sum_to_one(self, profile_name: str) -> None:
        """The six component weights of a profile add up to 1.0 (±1e-6)."""
        w = WEIGHT_PROFILES[profile_name]
        component_names = (
            "thermal_safety",
            "power_safety",
            "efficiency",
            "scenario_progress",
            "procedure",
            "action_quality",
        )
        # Accumulate left to right in the listed order.
        total = 0.0
        for component in component_names:
            total += getattr(w, component)
        assert total == pytest.approx(1.0, abs=1e-6)

    @pytest.mark.parametrize("profile_name", ["thermal", "power", "default"])
    def test_weights_nonnegative(self, profile_name: str) -> None:
        """No component weight of a profile is negative."""
        w = WEIGHT_PROFILES[profile_name]
        for component in (
            "thermal_safety",
            "power_safety",
            "efficiency",
            "scenario_progress",
            "procedure",
            "action_quality",
        ):
            assert getattr(w, component) >= 0

    def test_thermal_profile_emphasizes_thermal(self) -> None:
        """In the "thermal" profile, thermal safety is at least as heavy as power safety and efficiency."""
        profile = WEIGHT_PROFILES["thermal"]
        dominant = profile.thermal_safety
        assert dominant >= profile.power_safety
        assert dominant >= profile.efficiency

    def test_power_profile_emphasizes_power(self) -> None:
        """In the "power" profile, power safety is at least as heavy as thermal safety and efficiency."""
        profile = WEIGHT_PROFILES["power"]
        dominant = profile.power_safety
        assert dominant >= profile.thermal_safety
        assert dominant >= profile.efficiency

    def test_unknown_profile_falls_back_to_default(self) -> None:
        """An unrecognised ``scenario_type`` still yields a ``RewardComponents`` result."""
        rf = RewardFunction(scenario_type="unknown_type")
        thermal_sim = _make_thermal_sim()

        # compute() must run without raising for the unknown scenario type.
        command = _ok_cmd()
        components = rf.compute(
            thermal_sim,
            None,
            command,
            "check_status",
            ["check_status"],
            None,
        )
        assert isinstance(components, RewardComponents)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
# ===========================================================================
|
| 165 |
+
# Thermal Safety Component Tests
|
| 166 |
+
# ===========================================================================
|
| 167 |
+
class TestThermalSafety:
    """Check the range and ordering of ``RewardFunction._thermal_safety``."""

    def test_safe_temps_near_zero(self) -> None:
        """At a 20 °C setpoint the penalty lies within [-0.3, 0]."""
        sim = _make_thermal_sim(setpoint_c=20.0)
        penalty = RewardFunction._thermal_safety(sim)
        # Comfortable temperatures: small penalty, never a bonus.
        assert -0.3 <= penalty <= 0.0

    def test_returns_negative_or_zero(self) -> None:
        """The component is never positive for setpoints of 15, 20 and 24 °C."""
        setpoints = (15.0, 20.0, 24.0)
        for setpoint in setpoints:
            penalty = RewardFunction._thermal_safety(_make_thermal_sim(setpoint_c=setpoint))
            assert penalty <= 0.0

    def test_higher_setpoint_more_penalty(self) -> None:
        """A 24 °C setpoint scores no better than a 15 °C setpoint."""
        cool_sim = _make_thermal_sim(15.0)
        r_low = RewardFunction._thermal_safety(cool_sim)
        warm_sim = _make_thermal_sim(24.0)
        r_high = RewardFunction._thermal_safety(warm_sim)
        # The warmer room must be equal or more negative.
        assert r_high <= r_low

    def test_bounded_to_neg_one(self) -> None:
        """With every rack inlet forced to 50 °C the result stays within [-1, 0]."""
        sim = _make_thermal_sim(setpoint_c=15.0)

        # Overwrite all rack inlet temperatures with an extreme value.
        every_rack = (rack for zone in sim.state.zones for rack in zone.racks)
        for rack in every_rack:
            rack.inlet_temp_c = 50.0

        penalty = RewardFunction._thermal_safety(sim)
        assert -1.0 <= penalty <= 0.0
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
# ===========================================================================
|
| 204 |
+
# Power Safety Component Tests
|
| 205 |
+
# ===========================================================================
|
| 206 |
+
class TestPowerSafety:
    """Check the UPS battery and fault penalty from ``RewardFunction._power_safety``."""

    def test_no_power_sim_returns_zero(self) -> None:
        """Passing ``None`` instead of a power simulation gives exactly 0.0."""
        penalty = RewardFunction._power_safety(None)
        assert penalty == 0.0

    def test_utility_available_near_zero(self) -> None:
        """On utility power the penalty lies within [-0.15, 0]."""
        sim = _make_power_sim(utility_available=True)
        penalty = RewardFunction._power_safety(sim)
        assert -0.15 <= penalty <= 0.0

    def test_on_battery_gives_penalty(self) -> None:
        """With utility lost the penalty is strictly negative."""
        sim = _make_power_sim(utility_available=False)
        penalty = RewardFunction._power_safety(sim)
        assert penalty < 0.0

    def test_low_soc_increases_penalty(self) -> None:
        """A battery SOC of 0.3 is penalised more than 0.8 when utility is lost."""
        penalties = []
        # Build a fresh simulation per SOC level so the two cases are independent.
        for soc in (0.3, 0.8):
            sim = _make_power_sim(utility_available=False)
            for ups in sim.state.ups_units:
                ups.battery_soc = soc
            penalties.append(RewardFunction._power_safety(sim))

        r_low, r_high = penalties
        assert r_low < r_high

    def test_fault_mode_heavy_penalty(self) -> None:
        """Every UPS in FAULT mode gives a penalty below -0.7."""
        sim = _make_power_sim(utility_available=True)
        for ups in sim.state.ups_units:
            ups.mode = UPSMode.FAULT
        penalty = RewardFunction._power_safety(sim)
        assert penalty < -0.7

    def test_bounded(self) -> None:
        """Faulted UPS units with empty batteries still score within [-1, 0]."""
        sim = _make_power_sim(utility_available=False)
        # Combine fault mode and zero charge on every unit.
        for ups in sim.state.ups_units:
            ups.mode = UPSMode.FAULT
            ups.battery_soc = 0.0
        penalty = RewardFunction._power_safety(sim)
        assert -1.0 <= penalty <= 0.0
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
# ===========================================================================
|
| 259 |
+
# Efficiency Component Tests
|
| 260 |
+
# ===========================================================================
|
| 261 |
+
class TestEfficiency:
    """Validate PUE-based efficiency penalty."""

    def test_low_pue_near_zero_penalty(self) -> None:
        """At a 20 °C setpoint the efficiency penalty should stay within [-0.5, 0]."""
        thermal_sim = _make_thermal_sim(20.0)
        r = RewardFunction._efficiency(thermal_sim, None)
        pue = thermal_sim.state.pue
        # PUE is typically 1.4-1.8 in our sim, so some penalty is expected.
        # The PUE is included in the failure message to make an
        # out-of-range result diagnosable.
        assert -0.5 <= r <= 0.0, f"efficiency reward {r} out of range at PUE {pue}"

    def test_returns_negative_or_zero(self) -> None:
        """The efficiency component is never positive."""
        thermal_sim = _make_thermal_sim(20.0)
        r = RewardFunction._efficiency(thermal_sim, None)
        assert r <= 0.0

    def test_bounded(self) -> None:
        """Even extreme PUE should be bounded."""
        thermal_sim = _make_thermal_sim(15.0)
        # Force extreme PUE by manipulating state
        # NOTE(review): this sets a private ``_pue`` attribute. If ``pue`` is
        # computed on access rather than read from ``_pue``, the assignment
        # has no effect and the test only covers the normal PUE. Confirm
        # against DatacenterState.
        thermal_sim.state._pue = 5.0
        r = RewardFunction._efficiency(thermal_sim, None)
        assert -1.0 <= r <= 0.0
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
# ===========================================================================
|
| 287 |
+
# Scenario Progress Component Tests
|
| 288 |
+
# ===========================================================================
|
| 289 |
+
class TestScenarioProgress:
    """Check that ``_scenario_progress`` rewards the change in progress between calls."""

    def test_no_scenario_returns_zero(self) -> None:
        """Without a scenario result the component is exactly 0.0."""
        reward_fn = RewardFunction()
        assert reward_fn._scenario_progress(None) == 0.0

    def test_first_step_progress(self) -> None:
        """The first call with progress 0.5 returns 0.5."""
        reward_fn = RewardFunction()
        first = reward_fn._scenario_progress(ScenarioResult(progress=0.5))
        assert first == pytest.approx(0.5)

    def test_delta_tracking(self) -> None:
        """Successive calls return the difference from the previous progress value."""
        reward_fn = RewardFunction()

        # (reported progress, expected reward) fed to the same instance in order:
        # 0 -> 0.3, no change, then 0.3 -> 0.7.
        sequence = [(0.3, 0.3), (0.3, 0.0), (0.7, 0.4)]
        for progress, expected in sequence:
            reward = reward_fn._scenario_progress(ScenarioResult(progress=progress))
            assert reward == pytest.approx(expected)

    def test_negative_delta_penalized(self) -> None:
        """Dropping from 0.8 to 0.5 yields -0.3."""
        reward_fn = RewardFunction()
        reward_fn._scenario_progress(ScenarioResult(progress=0.8))
        regression = reward_fn._scenario_progress(ScenarioResult(progress=0.5))
        assert regression == pytest.approx(-0.3)

    def test_bounded(self) -> None:
        """A jump straight to full progress stays within [-1, 1]."""
        reward_fn = RewardFunction()
        reward = reward_fn._scenario_progress(ScenarioResult(progress=1.0))
        assert -1.0 <= reward <= 1.0

    def test_reset_clears_state(self) -> None:
        """After ``reset()`` the delta is measured from zero again."""
        reward_fn = RewardFunction()
        reward_fn._scenario_progress(ScenarioResult(progress=0.5))

        reward_fn.reset()

        # Measured from 0 rather than from the earlier 0.5.
        after_reset = reward_fn._scenario_progress(ScenarioResult(progress=0.3))
        assert after_reset == pytest.approx(0.3)
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
# ===========================================================================
|
| 340 |
+
# Procedure Component Tests
|
| 341 |
+
# ===========================================================================
|
| 342 |
+
class TestProcedure:
    """Checks for the procedural-correctness reward component."""

    def test_no_scenario_returns_zero(self) -> None:
        """``None`` in place of a scenario result yields exactly 0.0."""
        assert RewardFunction._procedure(None) == 0.0

    def test_positive_procedure_reward(self) -> None:
        """An in-range positive procedure reward is returned unchanged."""
        result = ScenarioResult(procedure_reward=0.3)
        reward = RewardFunction._procedure(result)
        assert reward == pytest.approx(0.3)

    def test_negative_procedure_reward(self) -> None:
        """An in-range negative procedure reward is returned unchanged."""
        result = ScenarioResult(procedure_reward=-0.2)
        reward = RewardFunction._procedure(result)
        assert reward == pytest.approx(-0.2)

    def test_clamped_to_bounds(self) -> None:
        """Out-of-range procedure rewards come back as exactly +1.0 / -1.0."""
        upper = RewardFunction._procedure(ScenarioResult(procedure_reward=5.0))
        assert upper == 1.0
        lower = RewardFunction._procedure(ScenarioResult(procedure_reward=-5.0))
        assert lower == -1.0
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
# ===========================================================================
|
| 364 |
+
# Action Quality Component Tests
|
| 365 |
+
# ===========================================================================
|
| 366 |
+
class TestActionQuality:
    """Checks for the contextual action-quality reward component."""

    @staticmethod
    def _overheat_racks(thermal_sim) -> None:
        """Set every rack inlet 2.0 °C above its zone's recommended maximum.

        Zones whose ``ashrae_class`` has no entry in ``ASHRAE_CLASSES`` are
        left untouched.
        """
        for zone in thermal_sim.state.zones:
            ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
            if not ashrae:
                continue
            for rack in zone.racks:
                rack.inlet_temp_c = ashrae.recommended_max_c + 2.0

    def test_invalid_command_penalty(self) -> None:
        """A failed command result scores -0.5."""
        thermal_sim = _make_thermal_sim()
        score = RewardFunction._action_quality(
            _fail_cmd(), "nonsense", ["nonsense"], thermal_sim, None,
        )
        assert score == pytest.approx(-0.5)

    def test_diagnose_rewarded(self) -> None:
        """A successful ``diagnose`` scores 0.3."""
        thermal_sim = _make_thermal_sim()
        command = "diagnose CRAC-1"
        score = RewardFunction._action_quality(
            _ok_cmd("diagnose"), command, [command], thermal_sim, None,
        )
        assert score == pytest.approx(0.3)

    def test_check_status_rewarded(self) -> None:
        """A successful ``check_status`` scores 0.3."""
        thermal_sim = _make_thermal_sim()
        command = "check_status"
        score = RewardFunction._action_quality(
            _ok_cmd("check_status"), command, [command], thermal_sim, None,
        )
        assert score == pytest.approx(0.3)

    def test_intervention_rewarded(self) -> None:
        """A successful ``adjust_setpoint`` scores 0.2."""
        thermal_sim = _make_thermal_sim()
        command = "adjust_setpoint CRAC-1 22"
        score = RewardFunction._action_quality(
            _ok_cmd("adjust_setpoint"), command, [command], thermal_sim, None,
        )
        assert score == pytest.approx(0.2)

    def test_acknowledge_rewarded(self) -> None:
        """A successful ``acknowledge_alarm`` scores 0.1."""
        thermal_sim = _make_thermal_sim()
        command = "acknowledge_alarm"
        score = RewardFunction._action_quality(
            _ok_cmd("acknowledge_alarm"), command, [command], thermal_sim, None,
        )
        assert score == pytest.approx(0.1)

    def test_repeated_command_penalized(self) -> None:
        """Issuing the same non-whitelisted command twice in a row scores -0.2."""
        thermal_sim = _make_thermal_sim()
        # adjust_setpoint is used here because, unlike check_status, it is
        # not exempt from the repetition penalty.
        command = "adjust_setpoint CRAC-1 20"
        history = [command, command]
        score = RewardFunction._action_quality(
            _ok_cmd("adjust_setpoint"), command, history, thermal_sim, None,
        )
        assert score == pytest.approx(-0.2)

    def test_repeated_whitelisted_not_penalized(self) -> None:
        """A repeated ``check_status`` keeps its normal 0.3 score."""
        thermal_sim = _make_thermal_sim()
        command = "check_status"
        history = [command, command]
        score = RewardFunction._action_quality(
            _ok_cmd("check_status"), command, history, thermal_sim, None,
        )
        # Same value a single check_status receives.
        assert score == pytest.approx(0.3)

    def test_wait_no_concern_neutral(self) -> None:
        """``wait`` with a 20.0 thermal sim and no power sim scores 0.0."""
        thermal_sim = _make_thermal_sim(20.0)  # safe temperatures
        score = RewardFunction._action_quality(
            _ok_cmd("wait"), "wait", ["wait"], thermal_sim, None,
        )
        assert score == pytest.approx(0.0)

    def test_wait_during_concern_penalized(self) -> None:
        """``wait`` while rack inlets exceed the recommended max scores -0.2."""
        thermal_sim = _make_thermal_sim(20.0)
        # Create the thermal concern by pushing inlets over the limit.
        self._overheat_racks(thermal_sim)
        score = RewardFunction._action_quality(
            _ok_cmd("wait"), "wait", ["wait"], thermal_sim, None,
        )
        assert score == pytest.approx(-0.2)

    def test_wait_during_battery_with_gen_starting(self) -> None:
        """``wait`` with utility power unavailable scores 0.1."""
        thermal_sim = _make_thermal_sim(20.0)
        power_sim = _make_power_sim(utility_available=False)
        # The generator is expected to be in its startup sequence here
        # (auto-started by the ATS), which makes waiting a reasonable choice.
        score = RewardFunction._action_quality(
            _ok_cmd("wait"), "wait", ["wait"], thermal_sim, power_sim,
        )
        assert score == pytest.approx(0.1)

    def test_wait_during_thermal_concern_penalized(self) -> None:
        """``wait`` during a thermal concern without a power sim scores -0.2."""
        thermal_sim = _make_thermal_sim(20.0)
        self._overheat_racks(thermal_sim)
        score = RewardFunction._action_quality(
            _ok_cmd("wait"), "wait", ["wait"], thermal_sim, None,
        )
        assert score == pytest.approx(-0.2)
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
# ===========================================================================
|
| 476 |
+
# Full Compute Tests
|
| 477 |
+
# ===========================================================================
|
| 478 |
+
class TestRewardCompute:
    """Checks for the full ``RewardFunction.compute`` pipeline."""

    def test_compute_returns_components(self) -> None:
        """``compute`` returns a ``RewardComponents`` exposing the expected fields."""
        reward_fn = RewardFunction(scenario_type="thermal")
        thermal_sim = _make_thermal_sim()
        history = ["check_status"]
        components = reward_fn.compute(
            thermal_sim, None, _ok_cmd(), "check_status", history, None,
        )
        assert isinstance(components, RewardComponents)
        assert hasattr(components, "total")
        assert hasattr(components, "thermal_safety")

    def test_total_bounded(self) -> None:
        """The total for an ordinary step lies in [-1, 1]."""
        reward_fn = RewardFunction(scenario_type="thermal")
        thermal_sim = _make_thermal_sim()
        history = ["check_status"]
        components = reward_fn.compute(
            thermal_sim, None, _ok_cmd(), "check_status", history, None,
        )
        assert -1.0 <= components.total <= 1.0

    def test_total_bounded_worst_case(self) -> None:
        """With every input pushed negative the total is still >= -1."""
        reward_fn = RewardFunction(scenario_type="thermal")
        thermal_sim = _make_thermal_sim()
        # Drive all rack inlets to an extreme temperature.
        for zone in thermal_sim.state.zones:
            for rack in zone.racks:
                rack.inlet_temp_c = 50.0
        worst_result = ScenarioResult(procedure_reward=-1.0, progress=0.0)
        components = reward_fn.compute(
            thermal_sim, None, _fail_cmd(), "nonsense", ["nonsense"], worst_result,
        )
        assert components.total >= -1.0

    def test_valid_action_better_than_invalid(self) -> None:
        """Under identical conditions a valid command outscores an invalid one."""
        reward_fn = RewardFunction(scenario_type="default")
        thermal_sim = _make_thermal_sim()

        c_valid = reward_fn.compute(
            thermal_sim, None, _ok_cmd(), "check_status", ["check_status"], None,
        )
        # Reset between the two computations, as the original test does.
        reward_fn.reset()
        c_invalid = reward_fn.compute(
            thermal_sim, None, _fail_cmd(), "nonsense", ["nonsense"], None,
        )
        assert c_valid.total > c_invalid.total

    def test_progress_delta_affects_total(self) -> None:
        """A step that advances progress has a larger progress component."""
        reward_fn = RewardFunction(scenario_type="thermal")
        thermal_sim = _make_thermal_sim()

        with_progress = reward_fn.compute(
            thermal_sim, None, _ok_cmd("diagnose"), "diagnose CRAC-1",
            ["diagnose CRAC-1"],
            ScenarioResult(progress=0.5),
        )

        without_progress = reward_fn.compute(
            thermal_sim, None, _ok_cmd("diagnose"), "diagnose CRAC-2",
            ["diagnose CRAC-1", "diagnose CRAC-2"],
            ScenarioResult(progress=0.5),  # unchanged since the previous step
        )

        # Only the scenario_progress component is compared, not the total.
        assert with_progress.scenario_progress > without_progress.scenario_progress

    def test_with_power_sim(self) -> None:
        """``compute`` accepts a thermal and a power sim together."""
        reward_fn = RewardFunction(scenario_type="power")
        thermal_sim = _make_thermal_sim()
        power_sim = _make_power_sim(utility_available=True)

        components = reward_fn.compute(
            thermal_sim, power_sim, _ok_cmd(), "check_status",
            ["check_status"], None,
        )
        assert -1.0 <= components.total <= 1.0

    def test_custom_weights(self) -> None:
        """Explicit weights are used when passed to the constructor."""
        only_action_quality = RewardWeights(
            thermal_safety=0.0, power_safety=0.0, efficiency=0.0,
            scenario_progress=0.0, procedure=0.0, action_quality=1.0,
        )
        reward_fn = RewardFunction(weights=only_action_quality)
        thermal_sim = _make_thermal_sim()

        components = reward_fn.compute(
            thermal_sim, None, _ok_cmd("diagnose"), "diagnose CRAC-1",
            ["diagnose CRAC-1"], None,
        )
        # action_quality is the only non-zero weight, so it alone makes up
        # the total (within a 0.01 tolerance).
        assert components.total == pytest.approx(components.action_quality, abs=0.01)
|
| 580 |
+
|
| 581 |
+
|
| 582 |
+
# ===========================================================================
|
| 583 |
+
# Integration with Full Environment
|
| 584 |
+
# ===========================================================================
|
| 585 |
+
class TestRewardIntegration:
    """Checks that rewards behave sensibly when driven through the environment."""

    def test_scenario_reward_uses_reward_function(self) -> None:
        """A step inside scenario A1 yields a non-zero float reward."""
        from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
        from dc_ops_env.models import DcOpsAction

        environment = DcOpsEnvironment()
        environment.reset(scenario="A1")  # cooling setpoint optimization
        observation = environment.step(DcOpsAction(command="check_status"))
        # Intended as a signal that the RewardFunction path is in use.
        assert isinstance(observation.reward, float)
        assert observation.reward != 0.0

    def test_escalation_has_penalty(self) -> None:
        """Escalating ends the episode and pays less than ``check_status``."""
        from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
        from dc_ops_env.models import DcOpsAction

        # Baseline: an ordinary first action in scenario A2.
        baseline_env = DcOpsEnvironment()
        baseline_env.reset(scenario="A2")
        obs_normal = baseline_env.step(DcOpsAction(command="check_status"))

        # Same scenario in a fresh environment, but escalate immediately.
        escalation_env = DcOpsEnvironment()
        escalation_env.reset(scenario="A2")
        obs_esc = escalation_env.step(DcOpsAction(command="escalate"))

        assert obs_esc.done is True
        assert obs_esc.reward < obs_normal.reward

    def test_scenario_resolution_has_speed_bonus(self) -> None:
        """Resolving B1 in two steps produces a reward above 0.5."""
        from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
        from dc_ops_env.models import DcOpsAction

        environment = DcOpsEnvironment()
        environment.reset(scenario="B1")  # UPS alarm response

        # B1 solution: diagnose the UPS, then acknowledge the alarm.
        environment.step(DcOpsAction(command="diagnose UPS-1"))
        final_obs = environment.step(DcOpsAction(command="acknowledge_alarm"))

        assert final_obs.done is True
        # Per the original author's note: speed bonus =
        # (budget - steps) / budget = (10 - 2) / 10 = 0.8, added to the base
        # reward, so the total should be clearly positive.
        assert final_obs.reward > 0.5

    def test_reward_function_reset_on_env_reset(self) -> None:
        """A second episode on the same environment still yields a float reward."""
        from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
        from dc_ops_env.models import DcOpsAction

        environment = DcOpsEnvironment()

        # First episode.
        environment.reset(scenario="A1")
        environment.step(DcOpsAction(command="check_status"))

        # Second episode: the progress delta is expected to start fresh.
        environment.reset(scenario="A1")
        observation = environment.step(DcOpsAction(command="check_status"))
        assert isinstance(observation.reward, float)
|
tests/test_scenarios.py
ADDED
|
@@ -0,0 +1,415 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Tests for the scenario framework.
|
| 8 |
+
|
| 9 |
+
Validates:
|
| 10 |
+
- Scenario registry (registration, lookup, filtering)
|
| 11 |
+
- Scenario base class (procedure checking)
|
| 12 |
+
- Each scenario: initialization, fault injection, resolution detection
|
| 13 |
+
- Scenario integration with the environment
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
import pytest
|
| 19 |
+
|
| 20 |
+
from dc_ops_env.models import DcOpsAction, DcOpsObservation
|
| 21 |
+
from dc_ops_env.scenarios import (
|
| 22 |
+
Scenario,
|
| 23 |
+
ScenarioResult,
|
| 24 |
+
get_scenario,
|
| 25 |
+
list_scenarios,
|
| 26 |
+
random_scenario,
|
| 27 |
+
registered_scenario_ids,
|
| 28 |
+
)
|
| 29 |
+
from dc_ops_env.scenarios.base import ProcedureRule
|
| 30 |
+
from dc_ops_env.server.dc_ops_env_environment import DcOpsEnvironment
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# ===========================================================================
|
| 34 |
+
# Registry Tests
|
| 35 |
+
# ===========================================================================
|
| 36 |
+
class TestRegistry:
    """Checks for scenario registration, lookup and filtering."""

    def test_all_scenarios_registered(self) -> None:
        """Every expected scenario id is present in the registry."""
        registered = registered_scenario_ids()
        for scenario_id in ("A1", "A2", "A4", "B1", "B3", "B4"):
            assert scenario_id in registered

    def test_get_scenario_by_id(self) -> None:
        """Looking up ``A1`` returns the scenario with matching id and name."""
        scenario = get_scenario("A1")
        assert scenario.scenario_id == "A1"
        assert scenario.name == "Cooling Setpoint Optimization"

    def test_get_scenario_unknown_raises(self) -> None:
        """An unregistered id raises ``KeyError`` mentioning 'Unknown scenario'."""
        with pytest.raises(KeyError, match="Unknown scenario"):
            get_scenario("Z99")

    def test_list_by_type(self) -> None:
        """Filtering by type returns exactly three scenarios of that type."""
        # thermal: A1, A2, A4 -- power: B1, B3, B4
        for kind in ("thermal", "power"):
            matches = list_scenarios(scenario_type=kind)
            assert all(s.scenario_type == kind for s in matches)
            assert len(matches) == 3

    def test_list_by_difficulty(self) -> None:
        """Filtering by difficulty returns at least two matching scenarios."""
        # easy: A1, B3 -- hard: A4, B4
        for level in ("easy", "hard"):
            matches = list_scenarios(difficulty=level)
            assert all(s.difficulty == level for s in matches)
            assert len(matches) >= 2

    def test_random_scenario(self) -> None:
        """A seeded random pick is a ``Scenario`` instance."""
        picked = random_scenario(seed=42)
        assert isinstance(picked, Scenario)

    def test_random_scenario_filtered(self) -> None:
        """A filtered random pick honours both the type and difficulty filters."""
        picked = random_scenario(scenario_type="thermal", difficulty="easy", seed=42)
        assert picked.scenario_type == "thermal"
        assert picked.difficulty == "easy"

    def test_random_scenario_no_match_raises(self) -> None:
        """Filters matching nothing raise ``ValueError`` ('No scenarios match')."""
        with pytest.raises(ValueError, match="No scenarios match"):
            random_scenario(scenario_type="network")
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# ===========================================================================
|
| 90 |
+
# Procedure Checking Tests
|
| 91 |
+
# ===========================================================================
|
| 92 |
+
class TestProcedureChecking:
    """Checks for the procedural-correctness reward mechanism of scenarios."""

    def test_procedure_bonus_when_satisfied(self) -> None:
        """In A2, adjusting a setpoint after a diagnose earns a positive reward."""
        scenario = get_scenario("A2")
        command = "adjust_setpoint CRAC-4 20"
        # The history contains the diagnose step before the adjustment.
        history = ["diagnose CRAC-3", command]
        reward = scenario.check_procedure(command, history)
        assert reward > 0, f"Expected bonus, got {reward}"

    def test_procedure_penalty_when_not_satisfied(self) -> None:
        """In A2, adjusting a setpoint with no prior diagnose is penalized."""
        scenario = get_scenario("A2")
        command = "adjust_setpoint CRAC-4 20"
        # The adjustment is the only entry: nothing was diagnosed first.
        history = [command]
        reward = scenario.check_procedure(command, history)
        assert reward < 0, f"Expected penalty, got {reward}"

    def test_no_procedure_rules_returns_zero(self) -> None:
        """A command expected to match no procedure rule yields exactly 0.0.

        A1 does define rules; the test relies on ``wait`` not being a
        trigger command for any of them.
        """
        scenario = get_scenario("A1")
        reward = scenario.check_procedure("wait", ["wait"])
        assert reward == 0.0
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
# ===========================================================================
|
| 119 |
+
# A1: Cooling Setpoint Optimization Tests
|
| 120 |
+
# ===========================================================================
|
| 121 |
+
class TestA1CoolingSetpoint:
    """Lifecycle checks for scenario A1."""

    def test_initialization(self) -> None:
        """Reset reports a thermal scenario whose alert mentions setpoint or PUE."""
        environment = DcOpsEnvironment()
        initial = environment.reset(scenario="A1")
        assert initial.scenario_type == "thermal"
        assert "setpoint" in initial.alert.lower() or "PUE" in initial.alert

    def test_initial_pue_is_high(self) -> None:
        """The PUE reported right after reset is above 1.5."""
        environment = DcOpsEnvironment()
        initial = environment.reset(scenario="A1")
        pue = initial.metadata["pue"]
        # The scenario starts from 15°C setpoints, which should inflate PUE.
        assert pue > 1.5, f"Initial PUE {pue:.2f} should be > 1.5"

    def test_raising_setpoint_improves_pue(self) -> None:
        """Raising all four CRAC setpoints to 22 lowers the reported PUE."""
        environment = DcOpsEnvironment()
        obs = environment.reset(scenario="A1")
        pue_before = obs.metadata["pue"]

        # 22°C is within the ASHRAE A2 recommended range.
        for crac_id in ["CRAC-1", "CRAC-2", "CRAC-3", "CRAC-4"]:
            environment.step(DcOpsAction(command=f"adjust_setpoint {crac_id} 22"))

        # Give the thermal model three steps to converge.
        for _ in range(3):
            obs = environment.step(DcOpsAction(command="wait"))

        pue_after = obs.metadata["pue"]
        assert pue_after < pue_before, (
            f"PUE should decrease: {pue_before:.3f} → {pue_after:.3f}"
        )
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
# ===========================================================================
|
| 158 |
+
# A2: Thermal Event Response Tests
|
| 159 |
+
# ===========================================================================
|
| 160 |
+
class TestA2ThermalEvent:
    """Lifecycle checks for scenario A2."""

    def test_initialization(self) -> None:
        """Reset reports a thermal scenario whose alert names CRAC-3."""
        environment = DcOpsEnvironment()
        initial = environment.reset(scenario="A2")
        assert initial.scenario_type == "thermal"
        assert "CRAC-3" in initial.alert

    def test_crac_fault_visible_in_dashboard(self) -> None:
        """The initial dashboard text contains COMPRESSOR or FAULT."""
        environment = DcOpsEnvironment()
        initial = environment.reset(scenario="A2")
        dashboard = initial.dashboard
        assert "COMPRESSOR" in dashboard or "FAULT" in dashboard

    def test_diagnose_reveals_fault(self) -> None:
        """Diagnosing CRAC-3 reports a detected compressor fault."""
        environment = DcOpsEnvironment()
        environment.reset(scenario="A2")
        diagnosis = environment.step(DcOpsAction(command="diagnose CRAC-3"))
        assert "compressor" in diagnosis.action_result.lower()
        assert "FAULT DETECTED" in diagnosis.action_result

    def test_procedure_bonus_for_diagnose_first(self) -> None:
        """Adjusting after a diagnose pays more than adjusting straight away."""
        adjust = "adjust_setpoint CRAC-4 20"

        # Run 1: diagnose, then adjust; the adjust step's reward is compared.
        careful_env = DcOpsEnvironment()
        careful_env.reset(scenario="A2")
        careful_env.step(DcOpsAction(command="diagnose CRAC-3"))
        r_with_diagnose = careful_env.step(DcOpsAction(command=adjust)).reward

        # Run 2: a fresh environment where the adjust comes first.
        hasty_env = DcOpsEnvironment()
        hasty_env.reset(scenario="A2")
        r_without_diagnose = hasty_env.step(DcOpsAction(command=adjust)).reward

        assert r_with_diagnose > r_without_diagnose, (
            f"Diagnose-first should yield higher reward: {r_with_diagnose:.3f} vs {r_without_diagnose:.3f}"
        )
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
# ===========================================================================
|
| 201 |
+
# A4: CRAC Failure Cascade Tests
|
| 202 |
+
# ===========================================================================
|
| 203 |
+
class TestA4CRACCascade:
    """Lifecycle checks for scenario A4."""

    def test_initialization(self) -> None:
        """Reset reports a thermal scenario whose alert names CRAC-1 and CRAC-3."""
        environment = DcOpsEnvironment()
        initial = environment.reset(scenario="A4")
        assert initial.scenario_type == "thermal"
        for unit in ("CRAC-1", "CRAC-3"):
            assert unit in initial.alert

    def test_two_cracs_faulted(self) -> None:
        """CRAC-1 diagnoses as a compressor issue and CRAC-3 as a fan issue."""
        environment = DcOpsEnvironment()
        environment.reset(scenario="A4")

        first = environment.step(DcOpsAction(command="diagnose CRAC-1"))
        assert "compressor" in first.action_result.lower()

        second = environment.step(DcOpsAction(command="diagnose CRAC-3"))
        assert "fan" in second.action_result.lower()

    def test_cascade_has_faster_time(self) -> None:
        """A4 advances 30 seconds of game time per step."""
        scenario = get_scenario("A4")
        assert scenario.game_time_per_step_s == 30.0

    def test_harder_than_a2(self) -> None:
        """A4's step budget is at least A2's, and A4 is rated 'hard'."""
        easier = get_scenario("A2")
        harder = get_scenario("A4")
        assert harder.step_budget >= easier.step_budget
        assert harder.difficulty == "hard"
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
# ===========================================================================
|
| 237 |
+
# B1: UPS Alarm Response Tests
|
| 238 |
+
# ===========================================================================
|
| 239 |
+
class TestB1UPSAlarm:
    """Lifecycle checks for scenario B1."""

    def test_initialization(self) -> None:
        """Reset reports a power scenario whose alert mentions the UPS."""
        environment = DcOpsEnvironment()
        initial = environment.reset(scenario="B1")
        assert initial.scenario_type == "power"
        assert "UPS" in initial.alert

    def test_battery_partially_drained(self) -> None:
        """UPS-1's battery state of charge is below 1.0 after reset."""
        environment = DcOpsEnvironment()
        initial = environment.reset(scenario="B1")
        # The scenario models the aftermath of a brief outage.
        ups_soc = initial.metadata["power"]["UPS-1"]["battery_soc"]
        assert ups_soc < 1.0, f"Battery should be partially drained, SOC={ups_soc}"

    def test_resolution_requires_diagnose_and_ack(self) -> None:
        """B1 is done only after both the diagnose and the acknowledge step."""
        environment = DcOpsEnvironment()
        environment.reset(scenario="B1")

        after_diagnose = environment.step(DcOpsAction(command="diagnose UPS-1"))
        assert after_diagnose.done is False  # the diagnose alone is not enough

        after_ack = environment.step(DcOpsAction(command="acknowledge_alarm"))
        assert after_ack.done is True  # the acknowledge completes the scenario
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
# ===========================================================================
|
| 268 |
+
# B3: Generator Test Protocol Tests
|
| 269 |
+
# ===========================================================================
|
| 270 |
+
class TestB3GeneratorTest:
    """Lifecycle checks for scenario B3."""

    def test_initialization(self) -> None:
        """Reset reports a power scenario whose alert mentions the generator."""
        environment = DcOpsEnvironment()
        initial = environment.reset(scenario="B3")
        assert initial.scenario_type == "power"
        assert "generator" in initial.alert.lower()

    def test_correct_protocol_resolves(self) -> None:
        """Running the full test protocol in order ends the episode."""
        environment = DcOpsEnvironment()
        environment.reset(scenario="B3")

        protocol = [
            "check_status",
            "start_generator",
            # Two waits give the generator time to start up.
            "wait",
            "wait",
            "diagnose GEN-1",
            "stop_generator",
            "acknowledge_alarm",
        ]
        last_obs = None
        for command in protocol:
            last_obs = environment.step(DcOpsAction(command=command))

        assert last_obs.done is True

    def test_uses_30s_steps(self) -> None:
        """B3 advances 30 seconds of game time per step."""
        scenario = get_scenario("B3")
        assert scenario.game_time_per_step_s == 30.0
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
# ===========================================================================
|
| 303 |
+
# B4: Power Failure Cascade Tests
|
| 304 |
+
# ===========================================================================
|
| 305 |
+
class TestB4PowerCascade:
    """Lifecycle checks for scenario B4."""

    def test_initialization(self) -> None:
        """Reset reports a power scenario whose alert mentions utility or power."""
        environment = DcOpsEnvironment()
        initial = environment.reset(scenario="B4")
        assert initial.scenario_type == "power"
        alert_text = initial.alert.lower()
        assert "utility" in alert_text or "power" in alert_text

    def test_utility_is_down(self) -> None:
        """The power metadata reports the utility feed as unavailable."""
        environment = DcOpsEnvironment()
        initial = environment.reset(scenario="B4")
        assert initial.metadata["power"]["utility_available"] is False

    def test_ups_on_battery(self) -> None:
        """UPS-1 reports ``on_battery`` or ``double_conversion`` after reset."""
        environment = DcOpsEnvironment()
        initial = environment.reset(scenario="B4")
        # After warmup and fault injection the UPS is expected to be on
        # battery; double_conversion is accepted as well.
        ups_mode = initial.metadata["power"]["UPS-1"]["mode"]
        assert ups_mode in ("on_battery", "double_conversion"), f"UPS mode: {ups_mode}"

    def test_fast_time_progression(self) -> None:
        """B4 advances 15 seconds per step and is rated 'hard'."""
        scenario = get_scenario("B4")
        assert scenario.game_time_per_step_s == 15.0
        assert scenario.difficulty == "hard"
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
# ===========================================================================
|
| 334 |
+
# Environment Scenario Integration Tests
|
| 335 |
+
# ===========================================================================
|
| 336 |
+
class TestScenarioIntegration:
    """How ``DcOpsEnvironment.reset``/``step`` behave when driven by a scenario."""

    def test_scenario_by_id_string(self) -> None:
        """A scenario can be selected by passing its id string to reset."""
        environment = DcOpsEnvironment()
        observation = environment.reset(scenario="A1")
        scenario_meta = observation.metadata["scenario"]
        assert scenario_meta["id"] == "A1"

    def test_scenario_by_instance(self) -> None:
        """A scenario object obtained from ``get_scenario`` is accepted by reset."""
        environment = DcOpsEnvironment()
        scenario = get_scenario("B3")
        observation = environment.reset(scenario=scenario)
        scenario_meta = observation.metadata["scenario"]
        assert scenario_meta["id"] == "B3"

    def test_scenario_step_budget_used(self) -> None:
        """Without overrides, the initial ``steps_remaining`` is A1's budget of 10."""
        environment = DcOpsEnvironment()
        observation = environment.reset(scenario="A1")
        a1_budget = 10
        assert observation.steps_remaining == a1_budget

    def test_scenario_kwargs_override(self) -> None:
        """An explicit ``step_budget`` keyword wins over the scenario's own budget."""
        environment = DcOpsEnvironment()
        observation = environment.reset(scenario="A1", step_budget=5)
        assert observation.steps_remaining == 5

    def test_no_scenario_backward_compat(self) -> None:
        """Reset with no scenario still works: no scenario metadata, empty type."""
        environment = DcOpsEnvironment()
        observation = environment.reset()
        assert "scenario" not in observation.metadata
        assert observation.scenario_type == ""

    def test_scenario_resolution_ends_episode(self) -> None:
        """Resolving B1 (diagnose, then acknowledge) must finish the episode."""
        environment = DcOpsEnvironment()
        environment.reset(scenario="B1")

        # The two-command sequence that resolves B1; only the final
        # observation is inspected.
        for command in ("diagnose UPS-1", "acknowledge_alarm"):
            observation = environment.step(DcOpsAction(command=command))
        assert observation.done is True

    def test_speed_bonus_on_resolution(self) -> None:
        """Resolving B1 in 2 of its 10 budgeted steps should earn a speed bonus."""
        environment = DcOpsEnvironment()
        environment.reset(scenario="B1")  # Budget: 10

        # Step 1 diagnoses, step 2 acknowledges.
        for command in ("diagnose UPS-1", "acknowledge_alarm"):
            observation = environment.step(DcOpsAction(command=command))

        # Per the original note the bonus is (10 - 2) / 10 = 0.8, and the
        # final reward is expected to include it — hence the 0.5 floor.
        assert observation.reward > 0.5, (
            f"Expected speed bonus in reward, got {observation.reward:.3f}"
        )

    def test_random_scenario_via_reset(self) -> None:
        """``reset(random_scenario=True)`` selects one of the registered scenarios."""
        environment = DcOpsEnvironment()
        observation = environment.reset(random_scenario=True, seed=42)
        assert "scenario" in observation.metadata
        chosen_id = observation.metadata["scenario"]["id"]
        assert chosen_id in registered_scenario_ids()
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
# ===========================================================================
|
| 399 |
+
# All Scenarios Smoke Test
|
| 400 |
+
# ===========================================================================
|
| 401 |
+
class TestAllScenariosSmoke:
    """Smoke coverage: each registered scenario resets cleanly and survives 3 steps."""

    @pytest.mark.parametrize("scenario_id", registered_scenario_ids())
    def test_scenario_runs(self, scenario_id: str) -> None:
        """Reset the given scenario, sanity-check the observation, then wait 3 times."""
        environment = DcOpsEnvironment()

        initial = environment.reset(scenario=scenario_id)
        assert isinstance(initial, DcOpsObservation)
        assert initial.done is False
        # The dashboard must be longer than 100 characters.
        assert len(initial.dashboard) > 100

        # Three "wait" steps, each of which must return a proper observation.
        steps_to_run = 3
        for _ in range(steps_to_run):
            stepped = environment.step(DcOpsAction(command="wait"))
            assert isinstance(stepped, DcOpsObservation)
|
tests/test_thermal.py
ADDED
|
@@ -0,0 +1,499 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Physics validation tests for the thermal simulation.
|
| 9 |
+
|
| 10 |
+
These tests verify that the simulation produces physically plausible behavior:
|
| 11 |
+
1. Steady-state temperatures are in expected ranges
|
| 12 |
+
2. CRAC failure causes predictable temperature rise rates
|
| 13 |
+
3. Total cooling loss leads to thermal runaway (~5°C/min reference; tests accept 0.5-8°C/min)
|
| 14 |
+
4. Setpoint changes propagate with correct time constants
|
| 15 |
+
5. Energy conservation holds
|
| 16 |
+
6. PUE is in realistic range
|
| 17 |
+
7. Recirculation raises cold aisle temperature
|
| 18 |
+
8. Performance target: 1000 steps < 1 second
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import time
|
| 22 |
+
|
| 23 |
+
import pytest
|
| 24 |
+
|
| 25 |
+
from dc_ops_env.config import (
|
| 26 |
+
ASHRAE_CLASSES,
|
| 27 |
+
CRACConfig,
|
| 28 |
+
DatacenterConfig,
|
| 29 |
+
RackConfig,
|
| 30 |
+
ZoneConfig,
|
| 31 |
+
make_default_datacenter_config,
|
| 32 |
+
)
|
| 33 |
+
from dc_ops_env.simulation.thermal import ThermalSimulation
|
| 34 |
+
from dc_ops_env.simulation.types import CRACFaultType, CRACStatus
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@pytest.fixture
def default_sim() -> ThermalSimulation:
    """Simulation constructed with no explicit config.

    Described in the original fixture as the default datacenter:
    2 zones × 10 racks × 2 CRACs, 160 kW total IT.
    """
    simulation = ThermalSimulation()
    return simulation
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@pytest.fixture
def single_zone_sim() -> ThermalSimulation:
    """One-zone facility with a single CRAC, for tests that need isolated behaviour."""
    # Five racks at 8 kW each → 40 kW of IT load.
    rack_configs = []
    for position in range(1, 6):
        rack_configs.append(
            RackConfig(
                rack_id=f"A-{position:02d}",
                row="A",
                position=position,
                it_load_kw=8.0,
            )
        )

    # A single 70 kW CRAC is the zone's only cooling unit.
    cooling_units = [CRACConfig(unit_id="CRAC-1", rated_capacity_kw=70.0)]

    zone = ZoneConfig(
        zone_id="zone_a",
        racks=rack_configs,
        crac_units=cooling_units,
        air_volume_m3=300.0,
        recirculation_factor=0.05,
    )
    facility = DatacenterConfig(
        name="Test Single Zone",
        zones=[zone],
        outside_temp_c=35.0,
        floor_area_m2=300.0,
    )
    return ThermalSimulation(facility)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class TestSteadyState:
    """Plausibility checks on the state the simulation settles into with no faults."""

    def test_cold_aisle_in_ashrae_range(self, default_sim: ThermalSimulation):
        """Each cold aisle ends within 2 °C of its zone's ASHRAE recommended band."""
        margin_c = 2.0
        # 600 steps (10 minutes per the original note) to let the model settle.
        default_sim.step_n(600)
        for zone in default_sim.state.zones:
            limits = ASHRAE_CLASSES[zone.ashrae_class]
            lowest_allowed = limits.recommended_min_c - margin_c
            highest_allowed = limits.recommended_max_c + margin_c
            assert zone.cold_aisle_temp_c >= lowest_allowed, (
                f"Zone {zone.zone_id}: cold aisle {zone.cold_aisle_temp_c:.1f}°C "
                f"below ASHRAE min {limits.recommended_min_c}°C"
            )
            assert zone.cold_aisle_temp_c <= highest_allowed, (
                f"Zone {zone.zone_id}: cold aisle {zone.cold_aisle_temp_c:.1f}°C "
                f"above ASHRAE max {limits.recommended_max_c}°C"
            )

    def test_hot_aisle_warmer_than_cold(self, default_sim: ThermalSimulation):
        """In every zone the hot aisle must end up strictly warmer than the cold aisle."""
        default_sim.step_n(300)
        for zone in default_sim.state.zones:
            hot_is_warmer = zone.hot_aisle_temp_c > zone.cold_aisle_temp_c
            assert hot_is_warmer, (
                f"Zone {zone.zone_id}: hot aisle {zone.hot_aisle_temp_c:.1f}°C "
                f"not warmer than cold aisle {zone.cold_aisle_temp_c:.1f}°C"
            )

    def test_hot_cold_delta_reasonable(self, default_sim: ThermalSimulation):
        """Hot/cold aisle difference must fall strictly between 5 and 25 °C.

        The original estimate for 8 kW/rack at ~160 CFM/kW airflow is
        ΔT ≈ 8000 / (0.605 × 1005) ≈ 13 °C; the asserted band is wider.
        """
        default_sim.step_n(300)
        for zone in default_sim.state.zones:
            delta = zone.hot_aisle_temp_c - zone.cold_aisle_temp_c
            assert 5.0 < delta < 25.0, (
                f"Zone {zone.zone_id}: ΔT = {delta:.1f}°C outside expected range 5-25°C"
            )

    def test_pue_realistic(self, default_sim: ThermalSimulation):
        """After 300 steps the reported PUE lies in the closed range 1.1-2.0."""
        default_sim.step_n(300)
        reported_pue = default_sim.state.pue
        assert 1.1 <= reported_pue <= 2.0, (
            f"PUE {reported_pue:.2f} outside realistic range 1.1-2.0"
        )

    def test_rack_inlet_equals_cold_aisle(self, default_sim: ThermalSimulation):
        """Every rack inlet matches its zone's cold aisle temperature to within 0.01 °C."""
        default_sim.step_n(300)
        zone_rack_pairs = (
            (zone, rack)
            for zone in default_sim.state.zones
            for rack in zone.racks
        )
        for zone, rack in zone_rack_pairs:
            mismatch = abs(rack.inlet_temp_c - zone.cold_aisle_temp_c)
            assert mismatch < 0.01, (
                f"Rack {rack.rack_id}: inlet {rack.inlet_temp_c:.2f}°C "
                f"!= zone cold {zone.cold_aisle_temp_c:.2f}°C"
            )

    def test_rack_outlet_consistent_with_load(self, default_sim: ThermalSimulation):
        """Rack outlet-inlet rise agrees with Q = m_dot × c_p × ΔT to within 0.1 °C."""
        from dc_ops_env.config import AIR_DENSITY_KG_M3, AIR_SPECIFIC_HEAT_J_KGK

        default_sim.step_n(300)
        all_racks = (
            rack for zone in default_sim.state.zones for rack in zone.racks
        )
        for rack in all_racks:
            # Mass flow from volumetric airflow, then the ΔT that would carry
            # away the rack's IT load (kW converted to W).
            m_dot = rack.airflow_m3s * AIR_DENSITY_KG_M3
            expected_dt = (rack.it_load_kw * 1000.0) / (m_dot * AIR_SPECIFIC_HEAT_J_KGK)
            actual_dt = rack.outlet_temp_c - rack.inlet_temp_c
            assert abs(actual_dt - expected_dt) < 0.1, (
                f"Rack {rack.rack_id}: ΔT {actual_dt:.2f}°C vs expected {expected_dt:.2f}°C"
            )
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
class TestCRACFailure:
    """Thermal response to a compressor fault injected on CRAC-1."""

    def test_single_crac_failure_temp_rises(self, default_sim: ThermalSimulation):
        """A compressor fault on CRAC-1 must warm zone 0's cold aisle by more than 0.5 °C.

        Rationale carried over from the original: the default facility is
        provisioned N+1 (2 CRACs rated 70 kW each for an 80 kW IT load), and
        a faulted unit keeps its fans running, pushing unconditioned
        return-temperature air into the cold aisle.
        """
        # Let the model settle before taking the baseline.
        default_sim.step_n(300)
        baseline_c = default_sim.state.zones[0].cold_aisle_temp_c

        default_sim.inject_crac_fault("CRAC-1", CRACFaultType.COMPRESSOR)

        # 600 steps (10 minutes at dt=1s per the original note) — N+1
        # systems need longer to show the effect.
        default_sim.step_n(600)
        faulted_c = default_sim.state.zones[0].cold_aisle_temp_c

        required_c = baseline_c + 0.5
        assert faulted_c > required_c, (
            f"Temperature should rise after CRAC failure: {baseline_c:.1f} → {faulted_c:.1f}°C"
        )

    def test_single_crac_failure_other_zone_unaffected(self, default_sim: ThermalSimulation):
        """A fault on CRAC-1 may shift the second zone's cold aisle by less than 2 °C."""
        default_sim.step_n(300)
        zone_b_baseline = default_sim.state.zones[1].cold_aisle_temp_c

        default_sim.inject_crac_fault("CRAC-1", CRACFaultType.COMPRESSOR)
        default_sim.step_n(300)
        zone_b_faulted = default_sim.state.zones[1].cold_aisle_temp_c

        # Original rationale: zone B has its own CRACs, so only a small drift
        # (shared outside temperature / lighting) is tolerated.
        drift = abs(zone_b_faulted - zone_b_baseline)
        assert drift < 2.0, (
            f"Zone B temp changed too much: {zone_b_baseline:.1f} → {zone_b_faulted:.1f}°C"
        )

    def test_crac_recovery(self, default_sim: ThermalSimulation):
        """Clearing the fault must cool zone 0's cold aisle by more than 0.3 °C."""
        default_sim.step_n(300)

        # Fault phase: 600 steps with the compressor fault active.
        default_sim.inject_crac_fault("CRAC-1", CRACFaultType.COMPRESSOR)
        default_sim.step_n(600)
        peak_c = default_sim.state.zones[0].cold_aisle_temp_c

        # Recovery phase: 600 further steps after the fault is cleared.
        default_sim.clear_crac_fault("CRAC-1")
        default_sim.step_n(600)
        recovered_c = default_sim.state.zones[0].cold_aisle_temp_c

        assert recovered_c < peak_c - 0.3, (
            f"Temperature should drop after fault cleared: "
            f"{peak_c:.1f} → {recovered_c:.1f}°C"
        )
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
class TestTotalCoolingLoss:
    """Behaviour of the single-zone facility once its only CRAC is faulted."""

    def test_temp_rise_rate_approximately_5c_per_minute(self, single_zone_sim: ThermalSimulation):
        """Cold aisle rise rate after cooling loss must be between 0.5 and 8 °C/min.

        Reference cited by the original: Active Power WP-105 and Electronics
        Cooling literature, which put the initial rate at ~5 °C/min or more
        for standard IT densities.

        Estimate for this fixture (from the original): 40 kW IT against
        ~5 × 20 × 11.1 kJ/K = 1110 kJ/K of equipment thermal mass plus
        ~360 kJ/K of air ≈ 1470 kJ/K, so

            dT/dt = 40,000 W / 1,470,000 J/K ≈ 0.027 °C/s ≈ 1.6 °C/min

        Envelope gain at 35 °C outside pushes that slightly higher;
        higher-density or lower-mass zones can reach 5 °C/min.
        """
        single_zone_sim.step_n(300)  # settle
        start_c = single_zone_sim.state.zones[0].cold_aisle_temp_c

        # Fault the zone's only CRAC.
        single_zone_sim.inject_crac_fault("CRAC-1", CRACFaultType.COMPRESSOR)

        # 120 steps, treated as 2 minutes by the rate computation below.
        single_zone_sim.step_n(120)
        end_c = single_zone_sim.state.zones[0].cold_aisle_temp_c

        rise_rate_per_min = (end_c - start_c) / 2.0  # °C/min
        # Accepted band is wide because the rate depends on thermal mass.
        assert rise_rate_per_min > 0.5, (
            f"Temperature rise too slow: {rise_rate_per_min:.2f} °C/min"
        )
        assert rise_rate_per_min < 8.0, (
            f"Temperature rise too fast: {rise_rate_per_min:.2f} °C/min"
        )

    def test_reaches_critical_in_reasonable_time(self, single_zone_sim: ThermalSimulation):
        """Cold aisle must exceed the ASHRAE A2 allowable max in under 25 minutes."""
        single_zone_sim.step_n(300)  # settle

        single_zone_sim.inject_crac_fault("CRAC-1", CRACFaultType.COMPRESSOR)

        max_temp = ASHRAE_CLASSES["A2"].allowable_max_c  # 35°C per the original note

        # Step one at a time for up to 1800 steps (30 minutes), recording
        # when the limit is first exceeded. None means it never was.
        time_to_critical_min = None
        for steps_taken in range(1, 1801):
            single_zone_sim.step()
            if single_zone_sim.state.zones[0].cold_aisle_temp_c > max_temp:
                time_to_critical_min = steps_taken / 60.0
                break

        assert time_to_critical_min is not None, (
            f"Never reached {max_temp}°C in 30 min. "
            f"Final temp: {single_zone_sim.state.zones[0].cold_aisle_temp_c:.1f}°C"
        )
        assert time_to_critical_min < 25.0, (
            f"Took {time_to_critical_min:.1f} min to reach critical — too slow"
        )
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
class TestSetpointChanges:
    """How the cold aisle and CRAC supply temperature follow setpoint changes."""

    def test_setpoint_increase_raises_cold_aisle(self, single_zone_sim: ThermalSimulation):
        """Setting CRAC-1 to 23.0 °C must warm the cold aisle by more than 2 °C."""
        single_zone_sim.step_n(300)
        baseline_c = single_zone_sim.state.zones[0].cold_aisle_temp_c

        # The original describes 23.0 °C as a 5 °C increase over the
        # starting setpoint.
        single_zone_sim.set_crac_setpoint("CRAC-1", 23.0)
        single_zone_sim.step_n(300)
        raised_c = single_zone_sim.state.zones[0].cold_aisle_temp_c

        assert raised_c > baseline_c + 2.0, (
            f"Cold aisle should rise with higher setpoint: {baseline_c:.1f} → {raised_c:.1f}°C"
        )

    def test_setpoint_decrease_lowers_cold_aisle(self, single_zone_sim: ThermalSimulation):
        """Setting CRAC-1 to 14.0 °C must cool the cold aisle by more than 1 °C."""
        single_zone_sim.step_n(300)
        baseline_c = single_zone_sim.state.zones[0].cold_aisle_temp_c

        single_zone_sim.set_crac_setpoint("CRAC-1", 14.0)
        single_zone_sim.step_n(300)
        lowered_c = single_zone_sim.state.zones[0].cold_aisle_temp_c

        assert lowered_c < baseline_c - 1.0, (
            f"Cold aisle should drop with lower setpoint: {baseline_c:.1f} → {lowered_c:.1f}°C"
        )

    def test_supply_temp_lag(self, single_zone_sim: ThermalSimulation):
        """Supply temperature follows a setpoint step with a ~30 s time constant."""
        single_zone_sim.step_n(300)

        # The CRAC object is captured before the step change and read again
        # afterwards, so this relies on the simulation updating it in place.
        crac = single_zone_sim.state.zones[0].crac_units[0]
        supply_at_step = crac.supply_temp_c

        # Apply a +10 °C step relative to the current supply temperature.
        step_size_c = 10.0
        single_zone_sim.set_crac_setpoint("CRAC-1", supply_at_step + step_size_c)

        # After one time constant (30 steps) a first-order response has
        # covered ~63.2% of the step.
        single_zone_sim.step_n(30)
        expected_63pct = supply_at_step + step_size_c * 0.632
        observed = crac.supply_temp_c

        # ±1.5 °C tolerance around the first-order prediction.
        assert abs(observed - expected_63pct) < 1.5, (
            f"After 1τ, supply temp {observed:.1f}°C, expected ~{expected_63pct:.1f}°C"
        )
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
class TestRecirculation:
    """Effect of the zone recirculation factor on the settled cold aisle temperature."""

    def test_higher_recirculation_raises_cold_aisle(self):
        """Cold aisle temperature must increase strictly across r = 0.0, 0.15, 0.30."""

        def build_config(recirculation: float) -> DatacenterConfig:
            # Five default racks and one default CRAC; only the recirculation
            # factor differs between the configurations.
            racks = [
                RackConfig(rack_id=f"A-{slot}", row="A", position=slot)
                for slot in range(1, 6)
            ]
            cracs = [CRACConfig(unit_id="CRAC-1")]
            zone = ZoneConfig(
                zone_id="zone_a", racks=racks, crac_units=cracs,
                recirculation_factor=recirculation, air_volume_m3=300.0,
            )
            return DatacenterConfig(zones=[zone], floor_area_m2=300.0)

        # Build every configuration first, then simulate each for 600 steps.
        configs = [build_config(r) for r in (0.0, 0.15, 0.30)]

        settled = []
        for cfg in configs:
            sim = ThermalSimulation(cfg)
            sim.step_n(600)
            settled.append(sim.state.zones[0].cold_aisle_temp_c)

        none_c, moderate_c, heavy_c = settled

        # Each step up in recirculation must give a warmer cold aisle.
        assert moderate_c > none_c, (
            f"r=0.15 ({moderate_c:.1f}°C) should be warmer than r=0.0 ({none_c:.1f}°C)"
        )
        assert heavy_c > moderate_c, (
            f"r=0.30 ({heavy_c:.1f}°C) should be warmer than r=0.15 ({moderate_c:.1f}°C)"
        )
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
class TestFanSpeedEffects:
    """Effect of CRAC fan speed on zone temperature, plus the fan power formula."""

    def test_reduced_fan_speed_raises_temp(self, single_zone_sim: ThermalSimulation):
        """Dropping CRAC-1 to 50% fan speed must warm the cold aisle by more than 0.3 °C.

        Original rationale: at half speed the CRAC airflow halves and the
        cooling injection rate (m_dot × c_p × ΔT) falls with it, moving the
        equilibrium cold aisle temperature upward. For a well-provisioned
        CRAC the expected shift is modest (~0.5-1.5 °C).
        """
        single_zone_sim.step_n(300)
        baseline_c = single_zone_sim.state.zones[0].cold_aisle_temp_c

        single_zone_sim.set_crac_fan_speed("CRAC-1", 50.0)
        # Longer run so the zone can approach its new equilibrium.
        single_zone_sim.step_n(600)
        slowed_c = single_zone_sim.state.zones[0].cold_aisle_temp_c

        assert slowed_c > baseline_c + 0.3, (
            f"Reduced fan speed should raise temp: {baseline_c:.1f} → {slowed_c:.1f}°C"
        )

    def test_fan_power_cubic_law(self, single_zone_sim: ThermalSimulation):
        """Fan power follows the cubic law P ∝ speed³ (0.5³ = 0.125 of rated at 50%).

        NOTE(review): both sides of the comparison are computed here from the
        same formula, so this checks the stored ``fan_speed_pct`` rather than
        the simulation's own fan power (the original notes that fan power is
        part of compute_power_consumption).
        """
        crac = single_zone_sim.state.zones[0].crac_units[0]
        rated_kw = crac.fan_rated_power_kw

        # Set the speed directly on the CRAC state object.
        crac.fan_speed_pct = 50.0

        cubic_prediction_kw = rated_kw * (0.5 ** 3)
        speed_fraction = crac.fan_speed_pct / 100.0
        formula_kw = rated_kw * speed_fraction ** 3

        assert abs(formula_kw - cubic_prediction_kw) < 0.01
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
class TestOutsideTemperature:
    """Effect of the outside temperature on total cooling power."""

    def test_hotter_outside_increases_cooling_power(self):
        """Cooling power must rise strictly across outside temps of 20, 35 and 45 °C."""

        def settled_cooling_power(outside_c: float) -> float:
            # Same five-rack, one-CRAC zone each time; only the outside
            # temperature varies. Simulate 600 steps before reading power.
            racks = [
                RackConfig(rack_id=f"A-{slot}", row="A", position=slot)
                for slot in range(1, 6)
            ]
            cracs = [CRACConfig(unit_id="CRAC-1")]
            zone = ZoneConfig(
                zone_id="zone_a", racks=racks, crac_units=cracs,
                air_volume_m3=300.0,
            )
            cfg = DatacenterConfig(
                zones=[zone],
                outside_temp_c=outside_c,
                floor_area_m2=300.0,
            )
            sim = ThermalSimulation(cfg)
            sim.step_n(600)
            return sim.state.total_cooling_power_kw

        mild_kw, hot_kw, extreme_kw = (
            settled_cooling_power(t_out) for t_out in (20.0, 35.0, 45.0)
        )

        # Original rationale: a hotter condenser side degrades COP, so more
        # power is needed.
        assert hot_kw > mild_kw, (
            f"Cooling power at 35°C ({hot_kw:.1f} kW) should exceed "
            f"at 20°C ({mild_kw:.1f} kW)"
        )
        assert extreme_kw > hot_kw, (
            f"Cooling power at 45°C ({extreme_kw:.1f} kW) should exceed "
            f"at 35°C ({hot_kw:.1f} kW)"
        )
|
| 412 |
+
|
| 413 |
+
|
| 414 |
+
class TestEnergyConservation:
    """Consistency checks on the simulation's energy bookkeeping."""

    def test_energy_positive(self, default_sim: ThermalSimulation):
        """Each of the first 100 steps must report a strictly positive energy use."""
        steps_checked = 0
        while steps_checked < 100:
            step_result = default_sim.step()
            assert step_result.energy_consumed_kwh > 0, "Energy per step must be positive"
            steps_checked += 1

    def test_cooling_output_matches_heat_at_steady_state(
        self,
        default_sim: ThermalSimulation,
    ):
        """After settling, total cooling output is between 0.5× and 2× the IT load.

        Per the original: CRAC-extracted heat includes bypass airflow (cold
        air that skips the servers and returns to the CRACs at T_cold rather
        than T_hot), so extraction should roughly cover IT load plus
        internal gains.
        """
        default_sim.step_n(600)
        step_result = default_sim.step()

        it_load_kw = default_sim.state.total_it_load_kw
        cooling_kw = step_result.total_cooling_output_kw

        # Expected: cooling ≈ IT load plus overhead (UPS/PDU/lighting losses
        # and envelope gain, ≈ 10-20% of IT per the original note). A
        # non-positive IT load gives a ratio of 0, which fails the check.
        if it_load_kw > 0:
            ratio = cooling_kw / it_load_kw
        else:
            ratio = 0
        assert 0.5 < ratio < 2.0, (
            f"Cooling/IT ratio {ratio:.2f} outside plausible range. "
            f"Cooling: {cooling_kw:.1f} kW, IT: {it_load_kw:.1f} kW"
        )
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
class TestPerformance:
    """Speed target for the simulation: under 1 ms per step on average."""

    def test_1000_steps_under_1_second(self, default_sim: ThermalSimulation):
        """1000 steps of the default (20-rack per the original) facility take under 1 s."""
        began = time.perf_counter()
        default_sim.step_n(1000)
        finished = time.perf_counter()
        elapsed = finished - began

        assert elapsed < 1.0, (
            f"1000 steps took {elapsed:.3f}s — exceeds 1s target"
        )

        # Throughput report; pytest shows it only when output capture is off.
        steps_per_sec = 1000.0 / elapsed
        print(f"\nPerformance: {steps_per_sec:.0f} steps/sec ({elapsed*1000:.1f} ms for 1000 steps)")
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
class TestMutationHelpers:
    """The simulation's mutation helpers report success and update the targeted state."""

    def test_set_crac_setpoint(self, default_sim: ThermalSimulation):
        """Setting CRAC-1's setpoint returns truthy and stores the new value."""
        accepted = default_sim.set_crac_setpoint("CRAC-1", 22.0)
        assert accepted
        unit = default_sim._find_crac("CRAC-1")
        assert unit is not None
        assert unit.setpoint_c == 22.0

    def test_set_invalid_crac(self, default_sim: ThermalSimulation):
        """An unknown CRAC id yields a falsy result."""
        accepted = default_sim.set_crac_setpoint("CRAC-99", 22.0)
        assert not accepted

    def test_set_fan_speed_clamped(self, default_sim: ThermalSimulation):
        """A requested fan speed of 150% is accepted but stored as 100%."""
        accepted = default_sim.set_crac_fan_speed("CRAC-1", 150.0)
        assert accepted
        unit = default_sim._find_crac("CRAC-1")
        assert unit is not None
        assert unit.fan_speed_pct == 100.0

    def test_inject_and_clear_fault(self, default_sim: ThermalSimulation):
        """A FAN fault marks the unit faulted with zero airflow; clearing restores RUNNING."""
        injected = default_sim.inject_crac_fault("CRAC-2", CRACFaultType.FAN)
        assert injected
        unit = default_sim._find_crac("CRAC-2")
        assert unit is not None
        assert unit.status == CRACStatus.FAULT
        assert unit.fault_type == CRACFaultType.FAN
        assert unit.current_airflow_m3s == 0.0

        # The same unit object is inspected again after the fault is cleared.
        cleared = default_sim.clear_crac_fault("CRAC-2")
        assert cleared
        assert unit.status == CRACStatus.RUNNING
        assert unit.fault_type == CRACFaultType.NONE

    def test_set_rack_load(self, default_sim: ThermalSimulation):
        """Changing rack A-01's load stores the new value and leaves airflow positive."""
        accepted = default_sim.set_rack_load("A-01", 12.0)
        assert accepted
        target_rack = default_sim._find_rack("A-01")
        assert target_rack is not None
        assert target_rack.it_load_kw == 12.0
        # The original notes that airflow is updated proportionally with load;
        # only positivity is checked here.
        assert target_rack.airflow_m3s > 0
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|