dc_ops_env / simulation /thermal.py
Melikshah's picture
Upload folder using huggingface_hub
91495a2 verified
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
RC thermal network simulation for datacenter zones.
Physics model (lumped-capacitance, per zone):
C_zone × dT_zone/dt = Q_IT - Q_cool + Q_env + Q_int
Where:
C_zone = C_air + C_equipment [J/K]
Q_IT = sum of rack IT loads × 1000 [W]
Q_cool = sum of CRAC cooling outputs × 1000 [W]
Q_env = (T_outside - T_zone) / R_envelope [W]
Q_int = UPS losses + PDU losses + lighting [W]
Note: ThermalSimulation._integrate_step applies this balance to the cold
aisle only; IT heat reaches it through recirculated and uncaptured exhaust.
Cold aisle temperature accounts for hot-air recirculation:
T_cold_effective = (1-r) × T_supply_weighted + r × T_hot_aisle
where r is the recirculation factor (0 = perfect containment).
Hot aisle temperature from server energy balance:
T_hot = T_cold + Q_IT / (m_dot_rack × c_p)
Integration: Forward Euler with configurable dt (default 1.0 s).
Target: <1 ms per step for a 20-rack, 4-CRAC datacenter.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from ..config import (
AIR_DENSITY_KG_M3,
AIR_SPECIFIC_HEAT_J_KGK,
ASHRAE_CLASSES,
DatacenterConfig,
RackConfig,
CRACConfig,
ZoneConfig,
cfm_to_m3s,
make_default_datacenter_config,
)
from .types import (
CRACFaultType,
CRACState,
CRACStatus,
DatacenterState,
RackState,
ZoneState,
)
@dataclass
class ThermalAlarm:
    """One rack whose inlet temperature exceeded a configured limit."""

    # Identifier of the rack that raised the alarm.
    rack_id: str
    # Identifier of the zone that contains the rack.
    zone_id: str
    # Rack inlet temperature at the time of the alarm [°C].
    inlet_temp_c: float
    # The limit that was exceeded [°C].
    threshold_c: float
    # "warning" (recommended limit exceeded) or
    # "critical" (allowable limit exceeded).
    severity: str
@dataclass
class ThermalStepResult:
    """Outcome of one integration step of the thermal simulation."""

    # Datacenter state after the step.
    state: DatacenterState
    # Alarms raised during the step; a fresh empty list by default.
    alarms: list[ThermalAlarm] = field(default_factory=list)
    # Sum of CRAC cooling outputs over all zones [kW].
    total_cooling_output_kw: float = 0.0
    # Sum of CRAC power consumption over all zones [kW].
    total_cooling_power_kw: float = 0.0
    # Energy consumed in this step [kWh].
    energy_consumed_kwh: float = 0.0
class ThermalSimulation:
"""Multi-zone RC thermal network simulation.
Owns the DatacenterState and advances it forward in time.
Each call to step() integrates the thermal ODEs by dt seconds.
"""
def __init__(self, config: DatacenterConfig | None = None):
    """Create the simulation and build its initial state.

    When ``config`` is None the result of
    ``make_default_datacenter_config()`` is used. The default step size is
    taken from ``config.simulation_dt_s``.
    """
    resolved = make_default_datacenter_config() if config is None else config
    self._config = resolved
    self._state = self._build_initial_state(resolved)
    self._dt = resolved.simulation_dt_s
@property
def state(self) -> DatacenterState:
    """The DatacenterState object owned by this simulation."""
    return self._state
@property
def config(self) -> DatacenterConfig:
    """The DatacenterConfig this simulation was created with."""
    return self._config
@property
def dt(self) -> float:
    """Default integration step used by ``step()`` when none is given."""
    return self._dt
# ------------------------------------------------------------------
# Initialization
# ------------------------------------------------------------------
@staticmethod
def _build_initial_state(config: DatacenterConfig) -> DatacenterState:
    """Construct the initial DatacenterState from configuration.

    Zones, racks and CRAC units are created from ``config``. The thermal
    model is then integrated for a fixed settling period so the returned
    temperatures are consistent with the configured loads. Settling calls
    ``_integrate_step`` directly, so ``sim_time_s`` stays at 0.0.
    """
    # First guess for the cold→hot aisle rise; settling overwrites it.
    initial_hot_aisle_rise_c = 15.0
    # Settling period: 300 steps of 1 s.
    settle_steps = 300
    settle_dt_s = 1.0

    zones: list[ZoneState] = []
    for zc in config.zones:
        racks = ThermalSimulation._build_racks(zc, zc.initial_cold_aisle_temp_c)
        cracs = ThermalSimulation._build_cracs(zc)
        zones.append(
            ZoneState(
                zone_id=zc.zone_id,
                cold_aisle_temp_c=zc.initial_cold_aisle_temp_c,
                hot_aisle_temp_c=zc.initial_cold_aisle_temp_c + initial_hot_aisle_rise_c,
                humidity_rh=zc.initial_humidity_rh,
                recirculation_factor=zc.recirculation_factor,
                racks=racks,
                crac_units=cracs,
                air_volume_m3=zc.air_volume_m3,
                envelope_r_kw=zc.envelope_r_kw,
                ashrae_class=zc.ashrae_class,
            )
        )

    state = DatacenterState(
        zones=zones,
        outside_temp_c=config.outside_temp_c,
        outside_humidity_rh=config.outside_humidity_rh,
        lighting_power_kw=config.lighting_w_per_m2 * config.floor_area_m2 / 1000.0,
        ups_loss_fraction=config.ups_loss_fraction,
        pdu_loss_fraction=config.pdu_loss_fraction,
        sim_time_s=0.0,
    )

    # Bare instance (no __init__, which would recurse into this method)
    # used only to drive the integrator over the new state.
    settler = ThermalSimulation.__new__(ThermalSimulation)
    settler._state = state
    # Attach the caller's config rather than building a default one, so the
    # helper mirrors the simulation that will own this state.
    settler._config = config
    settler._dt = settle_dt_s
    for _ in range(settle_steps):
        settler._integrate_step(settle_dt_s)
    return state
@staticmethod
def _build_racks(zone_config: ZoneConfig, initial_temp_c: float) -> list[RackState]:
    """Create one RackState per rack configured in ``zone_config``.

    Every rack starts with its inlet at ``initial_temp_c`` and its outlet
    15 °C above that. Airflow is ``airflow_cfm_per_kw × it_load_kw``
    converted with ``cfm_to_m3s``; thermal mass is the per-server mass
    times the number of 2U servers.
    """

    def make_rack(rc: RackConfig) -> RackState:
        # Airflow scales with the configured IT load of the rack.
        flow_m3s = cfm_to_m3s(rc.airflow_cfm_per_kw * rc.it_load_kw)
        mass_jk = rc.num_servers_2u * rc.server_thermal_mass_jk
        return RackState(
            rack_id=rc.rack_id,
            row=rc.row,
            position=rc.position,
            it_load_kw=rc.it_load_kw,
            inlet_temp_c=initial_temp_c,
            outlet_temp_c=initial_temp_c + 15.0,  # Will be corrected by settling
            airflow_m3s=flow_m3s,
            thermal_mass_jk=mass_jk,
        )

    return [make_rack(rc) for rc in zone_config.racks]
@staticmethod
def _build_cracs(zone_config: ZoneConfig) -> list[CRACState]:
    """Create one CRACState per CRAC unit configured in ``zone_config``.

    Supply temperature starts at the initial setpoint, and the maximum
    airflow is converted from CFM with ``cfm_to_m3s``. All other fields are
    copied straight from the unit's configuration.
    """
    return [
        CRACState(
            unit_id=cc.unit_id,
            setpoint_c=cc.initial_setpoint_c,
            supply_temp_c=cc.initial_setpoint_c,
            fan_speed_pct=cc.initial_fan_speed_pct,
            max_airflow_m3s=cfm_to_m3s(cc.max_airflow_cfm),
            rated_capacity_kw=cc.rated_capacity_kw,
            rated_return_temp_c=cc.rated_return_temp_c,
            capacity_slope_per_c=cc.capacity_slope_per_c,
            fan_rated_power_kw=cc.fan_rated_power_kw,
            cop_rated=cc.cop_rated,
            cop_degradation_per_c=cc.cop_degradation_per_c,
            supply_temp_lag_s=cc.supply_temp_lag_s,
        )
        for cc in zone_config.crac_units
    ]
# ------------------------------------------------------------------
# Simulation step
# ------------------------------------------------------------------
def step(self, dt: float | None = None) -> ThermalStepResult:
    """Advance the simulation by ``dt`` seconds.

    When ``dt`` is None the simulation's default step is used. The state is
    integrated first and ``sim_time_s`` is advanced afterwards. Returns the
    ThermalStepResult with updated state, alarms, and energy metrics.
    """
    step_s = self._dt if dt is None else dt
    outcome = self._integrate_step(step_s)
    self._state.sim_time_s += step_s
    return outcome
def step_n(self, n: int, dt: float | None = None) -> ThermalStepResult:
    """Run ``step(dt)`` ``n`` times and return the result of the last call.

    When no step runs (``n`` <= 0), a result holding only the current state
    is returned.
    """
    # Placeholder returned unchanged when the loop body never executes.
    latest = ThermalStepResult(state=self._state)
    for _ in range(n):
        latest = self.step(dt)
    return latest
def _integrate_step(self, dt: float) -> ThermalStepResult:
    """Core integration: one Forward Euler step of ``dt`` seconds per zone.

    Mutates ``self._state`` in place (CRAC supply temperatures, aisle
    temperatures, rack inlet/outlet temperatures) and returns the alarms
    and energy metrics for the step. Does not advance ``sim_time_s``;
    ``step()`` does that.

    Physics model — **cold aisle energy balance** (not total-zone).
    The cold aisle is a mixing volume; heat flows into/out of it [W]:

        q_crac    = Σ_i m_dot_i × c_p × (T_supply_i − T_cold)
        q_recirc  = r × max(m_dot_rack, m_dot_crac) × c_p × ΔT_server
        q_natural = (1 − m_dot_crac / m_dot_rack) × m_dot_rack × c_p × ΔT_server
                    (only when m_dot_crac < m_dot_rack)
        q_env     = (T_outside − T_cold) / R_envelope
        q_int     = UPS losses + PDU losses + per-zone lighting share

    For a CRAC whose fault type is COMPRESSOR or REFRIGERANT_LEAK,
    T_supply_i is replaced by the return air temperature.

    IT heat does NOT appear directly — servers move cold air to the hot
    aisle, raising T_hot. IT heat affects the cold aisle only through
    recirculation / natural return and indirectly via CRAC return
    temperature.

    Hot aisle temperature (algebraic, not ODE):
        T_hot = T_cold + ΔT_server,  ΔT_server = Q_IT / (m_dot_rack × c_p)
        (ΔT_server is fixed at 50 °C when there is no rack airflow)

    CRAC return air temperature accounts for bypass airflow: when CRAC
    airflow exceeds rack airflow, the excess returns at T_cold.
        bypass   = max(0, 1 − m_dot_rack / m_dot_crac)
        T_return = (1 − bypass) × T_hot + bypass × T_cold
    """
    state = self._state
    c_p = AIR_SPECIFIC_HEAT_J_KGK
    alarms: list[ThermalAlarm] = []
    total_cooling_output_kw = 0.0
    total_cooling_power_kw = 0.0
    total_it_kw = 0.0

    # Lighting is split evenly across zones; the share does not depend on
    # the zone, so compute it once instead of once per zone.
    num_zones = len(state.zones) if state.zones else 1
    q_lighting_w = state.lighting_power_kw * 1000.0 / num_zones

    for zone in state.zones:
        # 1. Update CRAC supply temperatures (first-order lag toward setpoint)
        for crac in zone.crac_units:
            crac.update_supply_temp(dt)

        # 2. Airflow quantities
        zone_it_kw = zone.total_it_load_kw
        q_it_w = zone_it_kw * 1000.0
        m_dot_rack = zone.total_rack_airflow_m3s * AIR_DENSITY_KG_M3  # kg/s
        m_dot_crac = zone.total_crac_airflow_m3s * AIR_DENSITY_KG_M3  # kg/s

        # Server temperature rise [°C]
        if m_dot_rack > 0:
            dt_server = q_it_w / (m_dot_rack * c_p)
        else:
            dt_server = 50.0  # No airflow → extreme rise
        t_cold = zone.cold_aisle_temp_c
        t_hot = t_cold + dt_server

        # 3. Bypass fraction: excess CRAC airflow that bypasses servers
        if m_dot_crac > 0 and m_dot_rack > 0:
            bypass_frac = max(0.0, 1.0 - m_dot_rack / m_dot_crac)
        else:
            bypass_frac = 0.0
        # CRAC return air temp (mixed hot exhaust + bypassed cold air)
        t_return = (1.0 - bypass_frac) * t_hot + bypass_frac * t_cold

        # 4. CRAC cooling output (based on bypass-corrected return temp)
        for crac in zone.crac_units:
            q_crac_kw = crac.compute_cooling_output_kw(t_return)
            total_cooling_output_kw += q_crac_kw
            total_cooling_power_kw += crac.compute_power_consumption_kw(
                q_crac_kw, state.outside_temp_c
            )

        # 5. Cold aisle energy balance [all in Watts]
        # CRAC supply mixing: each CRAC with airflow injects air into the
        # cold aisle at its supply temp. Units with a compressor or
        # refrigerant-leak fault inject it at the return air temp instead
        # (air passes through the inactive coil unconditioned).
        q_crac_mixing_w = 0.0
        for crac in zone.crac_units:
            crac_flow = crac.current_airflow_m3s * AIR_DENSITY_KG_M3
            if crac_flow <= 0:
                continue
            if crac.fault_type in (CRACFaultType.COMPRESSOR, CRACFaultType.REFRIGERANT_LEAK):
                effective_supply = t_return  # No cooling — just recirculating
            else:
                effective_supply = crac.supply_temp_c
            q_crac_mixing_w += crac_flow * c_p * (effective_supply - t_cold)

        # Hot air entering the cold aisle from two mechanisms:
        #
        # (a) Containment recirculation: fraction r of air leaks through
        #     containment gaps regardless of CRAC flow balance. Uses the
        #     larger of the two mass flows, so server fans still drive
        #     leakage when the CRACs are off.
        m_dot_dominant = max(m_dot_rack, m_dot_crac)
        q_recirc_w = zone.recirculation_factor * m_dot_dominant * c_p * dt_server

        # (b) Natural return: when CRAC airflow < rack airflow, the
        #     uncaptured fraction of server exhaust returns to the cold
        #     aisle. With CRACs completely off, all of Q_IT returns.
        if m_dot_rack > 0 and m_dot_crac < m_dot_rack:
            natural_return_frac = 1.0 - m_dot_crac / m_dot_rack
            q_natural_return_w = natural_return_frac * m_dot_rack * c_p * dt_server
        else:
            q_natural_return_w = 0.0

        # Envelope heat gain (skipped for a non-positive resistance)
        if zone.envelope_r_kw > 0:
            q_envelope_w = (state.outside_temp_c - t_cold) / zone.envelope_r_kw
        else:
            q_envelope_w = 0.0

        # Internal gains (UPS/PDU losses + lighting)
        q_ups_w = zone_it_kw * state.ups_loss_fraction * 1000.0
        q_pdu_w = zone_it_kw * state.pdu_loss_fraction * 1000.0
        q_internal_w = q_ups_w + q_pdu_w + q_lighting_w

        # 6. Net heat into cold aisle [W]
        q_net_w = (
            q_crac_mixing_w + q_recirc_w + q_natural_return_w
            + q_envelope_w + q_internal_w
        )

        # 7. Forward Euler integration (temperature held if capacitance <= 0)
        c_total = zone.compute_thermal_capacitance_jk()
        if c_total > 0:
            zone.cold_aisle_temp_c += q_net_w * dt / c_total

        # 8. Update hot aisle (algebraic: T_hot = T_cold + server ΔT)
        zone.hot_aisle_temp_c = zone.cold_aisle_temp_c + dt_server

        # 9. Update individual rack inlet/outlet temperatures
        for rack in zone.racks:
            rack.inlet_temp_c = zone.cold_aisle_temp_c
            rack.outlet_temp_c = rack.compute_outlet_temp()

        # 10. Check ASHRAE alarms (allowable limit takes precedence)
        ashrae = ASHRAE_CLASSES.get(zone.ashrae_class)
        if ashrae:
            for rack in zone.racks:
                if rack.inlet_temp_c > ashrae.allowable_max_c:
                    threshold_c, severity = ashrae.allowable_max_c, "critical"
                elif rack.inlet_temp_c > ashrae.recommended_max_c:
                    threshold_c, severity = ashrae.recommended_max_c, "warning"
                else:
                    continue
                alarms.append(ThermalAlarm(
                    rack_id=rack.rack_id,
                    zone_id=zone.zone_id,
                    inlet_temp_c=rack.inlet_temp_c,
                    threshold_c=threshold_c,
                    severity=severity,
                ))

        total_it_kw += zone_it_kw

    # Energy consumed in this step [kWh]:
    # IT load + cooling power + UPS/PDU losses + lighting.
    total_facility_kw = total_it_kw + total_cooling_power_kw + (
        total_it_kw * (state.ups_loss_fraction + state.pdu_loss_fraction)
        + state.lighting_power_kw
    )
    energy_kwh = total_facility_kw * dt / 3600.0

    return ThermalStepResult(
        state=state,
        alarms=alarms,
        total_cooling_output_kw=total_cooling_output_kw,
        total_cooling_power_kw=total_cooling_power_kw,
        energy_consumed_kwh=energy_kwh,
    )
@staticmethod
def _compute_weighted_supply_temp(zone: ZoneState) -> float | None:
    """Flow-weighted average of CRAC supply temperatures.

    T_supply_weighted = Σ(T_supply_i × flow_i) / Σ(flow_i), taken over the
    CRAC units whose current airflow is positive.
    Returns None if no CRACs are producing airflow.
    """
    flow_sum = 0.0
    temp_flow_sum = 0.0
    for unit in zone.crac_units:
        unit_flow = unit.current_airflow_m3s
        if not unit_flow > 0:
            continue  # Units without airflow contribute nothing.
        temp_flow_sum += unit.supply_temp_c * unit_flow
        flow_sum += unit_flow
    return None if flow_sum <= 0 else temp_flow_sum / flow_sum
# ------------------------------------------------------------------
# Mutation helpers (used by action parser in later phases)
# ------------------------------------------------------------------
def set_crac_setpoint(self, unit_id: str, setpoint_c: float) -> bool:
    """Set the supply air temperature setpoint of a CRAC unit.

    Returns True when the unit exists, False otherwise (nothing is changed).
    """
    unit = self._find_crac(unit_id)
    found = unit is not None
    if found:
        unit.setpoint_c = setpoint_c
    return found
def set_crac_fan_speed(self, unit_id: str, speed_pct: float) -> bool:
    """Set a CRAC unit's fan speed, clamped to the range 0-100 %.

    Returns True when the unit exists, False otherwise (nothing is changed).
    """
    unit = self._find_crac(unit_id)
    if unit is not None:
        capped = min(100.0, speed_pct)
        unit.fan_speed_pct = max(0.0, capped)
        return True
    return False
def set_crac_status(self, unit_id: str, status: CRACStatus) -> bool:
    """Assign a new operating status to a CRAC unit.

    Returns True when the unit exists, False otherwise (nothing is changed).
    """
    unit = self._find_crac(unit_id)
    if unit is not None:
        unit.status = status
    return unit is not None
def inject_crac_fault(
    self, unit_id: str, fault_type: CRACFaultType
) -> bool:
    """Put a CRAC unit into FAULT status with the given fault type.

    Returns True when the unit exists, False otherwise (nothing is changed).
    """
    unit = self._find_crac(unit_id)
    if unit is not None:
        unit.status = CRACStatus.FAULT
        unit.fault_type = fault_type
        return True
    return False
def clear_crac_fault(self, unit_id: str) -> bool:
    """Reset a CRAC unit to RUNNING status with no fault type.

    Returns True when the unit exists, False otherwise (nothing is changed).
    """
    unit = self._find_crac(unit_id)
    known = unit is not None
    if known:
        unit.status = CRACStatus.RUNNING
        unit.fault_type = CRACFaultType.NONE
    return known
def set_rack_load(self, rack_id: str, load_kw: float) -> bool:
    """Change a rack's IT load and rescale its airflow. Returns success.

    Negative loads are clamped to 0. Airflow is recomputed as
    ``airflow_cfm_per_kw × load`` (servers spin fans with load) using the
    ratio configured for this rack — the same one ``_build_racks`` used.
    The ``RackConfig`` default ratio is used only when the rack has no
    entry in the simulation's configuration.
    """
    rack = self._find_rack(rack_id)
    if rack is None:
        return False
    rack.it_load_kw = max(0.0, load_kw)

    # Look up the rack's own cfm/kW ratio; matching is case-insensitive to
    # agree with _find_rack.
    target = rack.rack_id.lower()
    cfm_per_kw = next(
        (
            rc.airflow_cfm_per_kw
            for zc in self._config.zones
            for rc in zc.racks
            if rc.rack_id.lower() == target
        ),
        None,
    )
    if cfm_per_kw is None:
        cfm_per_kw = RackConfig().airflow_cfm_per_kw
    rack.airflow_m3s = cfm_to_m3s(cfm_per_kw * rack.it_load_kw)
    return True
def set_outside_temp(self, temp_c: float) -> None:
    """Overwrite the outside temperature stored in the simulation state."""
    current = self._state
    current.outside_temp_c = temp_c
def _find_crac(self, unit_id: str) -> CRACState | None:
    """Return the first CRAC unit whose ID matches, ignoring case.

    Zones are searched in order; None is returned when nothing matches.
    """
    wanted = unit_id.lower()
    return next(
        (
            unit
            for zone in self._state.zones
            for unit in zone.crac_units
            if unit.unit_id.lower() == wanted
        ),
        None,
    )
def _find_rack(self, rack_id: str) -> RackState | None:
    """Return the first rack whose ID matches, ignoring case.

    Zones are searched in order; None is returned when nothing matches.
    """
    wanted = rack_id.lower()
    candidates = (rack for zone in self._state.zones for rack in zone.racks)
    for candidate in candidates:
        if candidate.rack_id.lower() == wanted:
            return candidate
    return None
def find_zone_for_crac(self, unit_id: str) -> ZoneState | None:
    """Find the zone containing a given CRAC unit.

    IDs are compared case-insensitively, matching ``_find_crac``, so any ID
    accepted by the CRAC mutators also resolves to its zone here. Returns
    None when no zone contains the unit.
    """
    target = unit_id.lower()
    for zone in self._state.zones:
        if any(crac.unit_id.lower() == target for crac in zone.crac_units):
            return zone
    return None
def find_zone_for_rack(self, rack_id: str) -> ZoneState | None:
    """Find the zone containing a given rack.

    IDs are compared case-insensitively, matching ``_find_rack``, so any ID
    accepted by ``set_rack_load`` also resolves to its zone here. Returns
    None when no zone contains the rack.
    """
    target = rack_id.lower()
    for zone in self._state.zones:
        if any(rack.rack_id.lower() == target for rack in zone.racks):
            return zone
    return None