"""
Data models for the NOC Agent environment.
Defines all Pydantic models for actions, observations, and system metrics
used by both the Gymnasium training environment and the OpenEnv server.
"""
from __future__ import annotations
from enum import Enum
import numpy as np
from openenv.core.env_server.types import Action, Observation
from pydantic import BaseModel, Field
class ActionType(str, Enum):
    """Closed set of discrete actions the NOC agent may choose from.

    Members mix in ``str``, so each one compares equal to its string value.
    Declaration order is significant: ``ACTION_INDEX`` is built by iterating
    this enum, so reordering members changes the integer-to-action mapping.
    """

    # Take no action this step.
    DO_NOTHING = "do_nothing"

    # Remediation actions.
    RESTART_SERVICE = "restart_service"
    THROTTLE_CPU = "throttle_cpu"
    CLEAR_CACHE = "clear_cache"
    REROUTE_TRAFFIC = "reroute_traffic"
    SCALE_UP = "scale_up"
class IncidentType(str, Enum):
    """Kinds of incident that the simulator is able to inject.

    Members mix in ``str``, so each one compares equal to its string value.
    """

    CPU_OVERLOAD = "cpu_overload"
    MEMORY_LEAK = "memory_leak"
    NETWORK_CONGESTION = "network_congestion"
# Integer index -> ActionType lookup for the Gymnasium Discrete action space.
# Position i holds the i-th member of ActionType in declaration order, so the
# mapping changes if the enum's members are reordered.
ACTION_INDEX: list[ActionType] = [*ActionType]
class SystemMetrics(BaseModel):
    """
    Normalised system health metrics.

    Every field is validated to lie in [0.0, 1.0].
    Higher values indicate more stress (worse health) for every field except
    ``service_healthy``, where 1.0 means healthy and 0.0 means down.
    """

    cpu_usage: float = Field(..., ge=0.0, le=1.0, description="CPU utilisation (0=idle, 1=fully saturated)")
    memory_usage: float = Field(..., ge=0.0, le=1.0, description="RAM utilisation")
    latency: float = Field(..., ge=0.0, le=1.0, description="Network latency normalised over 500 ms")
    packet_loss: float = Field(..., ge=0.0, le=1.0, description="Fraction of packets dropped")
    service_healthy: float = Field(..., ge=0.0, le=1.0, description="1.0 = healthy, 0.0 = down")
    error_rate: float = Field(..., ge=0.0, le=1.0, description="Fraction of requests returning errors")

    def to_array(self) -> np.ndarray:
        """Return metrics as a flat float32 numpy array for the Gymnasium observation space.

        Element order is: cpu_usage, memory_usage, latency, packet_loss,
        service_healthy, error_rate. ``from_array`` reads the same order.
        """
        return np.array(
            [
                self.cpu_usage,
                self.memory_usage,
                self.latency,
                self.packet_loss,
                self.service_healthy,
                self.error_rate,
            ],
            dtype=np.float32,
        )

    @classmethod
    def from_array(cls, arr: np.ndarray) -> "SystemMetrics":
        """Reconstruct from a numpy array holding exactly 6 elements.

        The input is flattened first, so a ``(6,)`` vector and a ``(1, 6)``
        single-row batch are both accepted. Elements are read in the order
        produced by ``to_array``.

        Raises:
            ValueError: if the input does not contain exactly 6 elements.
                Field-level validation of the individual values is performed
                by the model constructor.
        """
        flat = np.asarray(arr, dtype=np.float64).reshape(-1)
        # Enforce the documented 6-element contract instead of silently
        # ignoring trailing values or failing with an opaque index/type error.
        if flat.size != 6:
            raise ValueError(
                f"SystemMetrics.from_array expects exactly 6 elements, got {flat.size}"
            )
        return cls(
            cpu_usage=float(flat[0]),
            memory_usage=float(flat[1]),
            latency=float(flat[2]),
            packet_loss=float(flat[3]),
            service_healthy=float(flat[4]),
            error_rate=float(flat[5]),
        )

    @property
    def health_score(self) -> float:
        """Aggregate health score in [0, 1]. 1.0 = fully healthy."""
        # Weighted stress; the weights sum to 1.0. service_healthy is inverted
        # because it is the only field where a higher value is better.
        stress = (
            self.cpu_usage * 0.25
            + self.memory_usage * 0.25
            + self.latency * 0.20
            + self.packet_loss * 0.15
            + (1.0 - self.service_healthy) * 0.10
            + self.error_rate * 0.05
        )
        return max(0.0, 1.0 - stress)

    @property
    def is_critical(self) -> bool:
        """True if CPU, memory, or error rate has reached its crash threshold.

        Thresholds are inclusive: cpu_usage >= 0.98, memory_usage >= 0.98,
        error_rate >= 0.90. The other metrics are not considered here.
        """
        return (
            self.cpu_usage >= 0.98
            or self.memory_usage >= 0.98
            or self.error_rate >= 0.90
        )

    @property
    def is_resolved(self) -> bool:
        """True when all metrics are comfortably below healthy thresholds.

        Requires service_healthy to be at its maximum (1.0) and every stress
        metric to be strictly below its limit.
        """
        return (
            self.cpu_usage < 0.65
            and self.memory_usage < 0.65
            and self.latency < 0.20
            and self.packet_loss < 0.05
            and self.service_healthy >= 1.0
            and self.error_rate < 0.10
        )
# ---------------------------------------------------------------------------
# OpenEnv action / observation (used by the server and HTTP client)
# ---------------------------------------------------------------------------
class NOCAction(Action):
    """Request payload a client submits to the NOC environment server."""

    # Required field: Ellipsis as the default marks it as having no default.
    action_type: ActionType = Field(
        default=...,
        description="Discrete action to apply",
    )
class NOCObservation(Observation):
    """Per-step observation the NOC environment server sends back."""

    # Required fields (Ellipsis default means no default value).
    metrics: SystemMetrics = Field(
        default=...,
        description="Current normalised system metrics",
    )
    incident_type: IncidentType = Field(
        default=...,
        description="Active incident in this episode",
    )

    # Optional fields with defaults.
    step: int = Field(
        default=0,
        description="Current step within the episode",
    )
    explanation: str = Field(
        default="",
        description="Post-hoc explanation for last agent action",
    )
|