""" Data models for the NOC Agent environment. Defines all Pydantic models for actions, observations, and system metrics used by both the Gymnasium training environment and the OpenEnv server. """ from __future__ import annotations from enum import Enum import numpy as np from openenv.core.env_server.types import Action, Observation from pydantic import BaseModel, Field class ActionType(str, Enum): """Discrete actions available to the NOC agent.""" DO_NOTHING = "do_nothing" RESTART_SERVICE = "restart_service" THROTTLE_CPU = "throttle_cpu" CLEAR_CACHE = "clear_cache" REROUTE_TRAFFIC = "reroute_traffic" SCALE_UP = "scale_up" class IncidentType(str, Enum): """Supported incident types the simulator can inject.""" CPU_OVERLOAD = "cpu_overload" MEMORY_LEAK = "memory_leak" NETWORK_CONGESTION = "network_congestion" # Ordered list used to map integer indices to ActionType (for Gymnasium Discrete space) ACTION_INDEX: list[ActionType] = list(ActionType) class SystemMetrics(BaseModel): """ Normalised system health metrics. All values are in [0.0, 1.0] unless noted. Higher values indicate more stress (worse health) except service_healthy. """ cpu_usage: float = Field(..., ge=0.0, le=1.0, description="CPU utilisation (0=idle, 1=fully saturated)") memory_usage: float = Field(..., ge=0.0, le=1.0, description="RAM utilisation") latency: float = Field(..., ge=0.0, le=1.0, description="Network latency normalised over 500 ms") packet_loss: float = Field(..., ge=0.0, le=1.0, description="Fraction of packets dropped") service_healthy: float = Field(..., ge=0.0, le=1.0, description="1.0 = healthy, 0.0 = down") error_rate: float = Field(..., ge=0.0, le=1.0, description="Fraction of requests returning errors") def to_array(self) -> np.ndarray: """Return metrics as a flat float32 numpy array for the Gymnasium observation space.""" return np.array( [ self.cpu_usage, self.memory_usage, self.latency, self.packet_loss, self.service_healthy, self.error_rate, ], dtype=np.float32, ) @classmethod def from_array(cls, arr: np.ndarray) -> "SystemMetrics": """Reconstruct from a flat numpy array (must have 6 elements).""" return cls( cpu_usage=float(arr[0]), memory_usage=float(arr[1]), latency=float(arr[2]), packet_loss=float(arr[3]), service_healthy=float(arr[4]), error_rate=float(arr[5]), ) @property def health_score(self) -> float: """Aggregate health score in [0, 1]. 1.0 = fully healthy.""" stress = ( self.cpu_usage * 0.25 + self.memory_usage * 0.25 + self.latency * 0.20 + self.packet_loss * 0.15 + (1.0 - self.service_healthy) * 0.10 + self.error_rate * 0.05 ) return max(0.0, 1.0 - stress) @property def is_critical(self) -> bool: """True if any metric has exceeded crash thresholds.""" return ( self.cpu_usage >= 0.98 or self.memory_usage >= 0.98 or self.error_rate >= 0.90 ) @property def is_resolved(self) -> bool: """True when all metrics are comfortably below healthy thresholds.""" return ( self.cpu_usage < 0.65 and self.memory_usage < 0.65 and self.latency < 0.20 and self.packet_loss < 0.05 and self.service_healthy >= 1.0 and self.error_rate < 0.10 ) # --------------------------------------------------------------------------- # OpenEnv action / observation (used by the server and HTTP client) # --------------------------------------------------------------------------- class NOCAction(Action): """Action sent by a client to the NOC environment server.""" action_type: ActionType = Field(..., description="Discrete action to apply") class NOCObservation(Observation): """Observation returned by the NOC environment server after each step.""" metrics: SystemMetrics = Field(..., description="Current normalised system metrics") incident_type: IncidentType = Field(..., description="Active incident in this episode") step: int = Field(default=0, description="Current step within the episode") explanation: str = Field(default="", description="Post-hoc explanation for last agent action")