File size: 6,761 Bytes
07e80ad 12439fa dd24a31 77ede9e cf2697b 77ede9e dd24a31 12439fa dd24a31 07e80ad 12439fa dd24a31 12439fa dd24a31 12439fa dd24a31 12439fa 5144b7e 12439fa dd24a31 12439fa dd24a31 12439fa dd24a31 07e80ad dd24a31 12439fa 4b5c463 12439fa bba6f8a dd24a31 bba6f8a dd24a31 12439fa dd24a31 12439fa dd24a31 12439fa 4b5c463 52a986a b5e5650 52a986a 12439fa 77ede9e d6439a8 12439fa dd24a31 12439fa dd24a31 12439fa dd24a31 bba6f8a 279ccf2 dd24a31 279ccf2 dd24a31 12439fa dd24a31 12439fa dd24a31 12439fa dd24a31 12439fa dd24a31 3a871a0 4b5c463 77ede9e dfe5268 6ad7bd8 52a986a 77ede9e 12439fa dd24a31 12439fa 77ede9e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 | from enum import Enum
from typing import Annotated, Literal, Optional
from pydantic import BaseModel, Field
class EnvironmentMode(str, Enum):
    """Environment mode tag carried by ``ClusterObservation.mode``.

    Members also subclass ``str``, so each one compares equal to its
    lowercase string value (``EnvironmentMode.LIVE == "live"``) and can be
    looked up from that value (``EnvironmentMode("live")``).

    NOTE(review): what operationally distinguishes the modes is not defined
    in this module -- confirm against the environment implementation.
    """

    SIMULATED = "simulated"
    HYBRID = "hybrid"
    LIVE = "live"
    AWS = "aws"
# ---------------------------------------------------------------------------
# SRE Action Schema (Control Plane)
# ---------------------------------------------------------------------------
class ActionType(str, Enum):
    """Kinds of management action selectable via ``SREAction.action_type``.

    Each member's value is identical to its name, and because members
    subclass ``str`` they compare equal to that string
    (``ActionType.NO_OP == "NO_OP"``).  The meaning of the ``parameter``
    that accompanies each action is described on ``SREAction``.
    """

    NO_OP = "NO_OP"
    SCALE_UP = "SCALE_UP"
    SCALE_DOWN = "SCALE_DOWN"
    REROUTE_TRAFFIC = "REROUTE_TRAFFIC"
    SHED_LOAD = "SHED_LOAD"
class SREAction(BaseModel):
    """
    Management action issued by the SRE agent.
    * SCALE_UP: Increment capacity on target_node_id by parameter (1-5 units).
    * SCALE_DOWN: Decrement capacity on target_node_id by parameter (1-5 units).
    * REROUTE_TRAFFIC: Shift 'parameter' [0, 1] of incoming traffic AWAY from
      target_node_id and redistribute across healthy peers.
    * SHED_LOAD: Drop 'parameter' [0, 1] of incoming traffic targeting target_node_id for 1 tick.
    """

    # Which action to perform; required.
    action_type: ActionType
    # Identifier of the node the action applies to; required.
    target_node_id: str
    # Magnitude of the action.  The only validation applied here is
    # 0.0 <= parameter <= 10.0; the per-action ranges listed in the docstring
    # (1-5 units, [0, 1] fractions) are NOT enforced by this model.
    # NOTE(review): presumably the consumer of the action clamps or rejects
    # out-of-range values -- confirm before relying on it.
    parameter: float = Field(default=0.0, ge=0.0, le=10.0)
# ---------------------------------------------------------------------------
# Observation Schema (Data Plane)
# ---------------------------------------------------------------------------
class NodeStatus(str, Enum):
    """Status values reported in ``NodeObservation.status``.

    Each member's value is identical to its name, and because members
    subclass ``str`` they compare equal to that string
    (``NodeStatus.FAILED == "FAILED"``).
    """

    HEALTHY = "HEALTHY"
    DEGRADED = "DEGRADED"
    FAILED = "FAILED"
class NodeObservation(BaseModel):
    """Telemetry for a single service instance (node)."""

    # Identity and coarse state.  ``node_id`` and ``status`` are required;
    # every other field on this model has a default.
    node_id: str
    status: NodeStatus
    is_vip: bool = False

    # All numerical telemetry is normalized to [0, 1] for RL stability.
    queue_depth: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description=(
            "Normalized queue depth [0.0, 1.0]. Represents the % of theoretical max queue."
        ),
    )
    # Despite the ``_ms`` suffix this value is validated to [0, 1]; per the
    # description it is a fraction of the 1000ms SLA limit, not milliseconds.
    latency_ms: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Normalized processing latency [0.0, 1.0] relative to 1000ms SLA limit.",
    )
    incoming_request_rate: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Normalized incoming request rate [0.0, 1.0] for this node (requests per tick).",
    )
    cpu_utilization: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Estimated CPU load [0.0, 1.0].",
    )

    # The next three fields are only bounded below (>= 0); they are not
    # normalized to [0, 1].
    importance_weight: float = Field(
        default=1.0,
        ge=0.0,
        description="Business criticality weight. VIP nodes have higher impact on scoring.",
    )
    # The description mentions a 0-5 range, but only the lower bound is
    # validated here.
    capacity: float = Field(
        default=0.0,
        ge=0.0,
        description="Current capacity units provisioned for this node (0-5).",
    )
    pending_capacity: float = Field(
        default=0.0,
        ge=0.0,
        description="Capacity units being booted (will be live after boot delay).",
    )

    # Signed change, hence the [-1, 1] bounds rather than [0, 1].
    queue_delta: float = Field(
        default=0.0,
        ge=-1.0,
        le=1.0,
        description="Normalized queue depth change from previous tick (-1 to +1).",
    )
    sla_proximity: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="How close this node is to SLA violation (0=safe, 1=violating).",
    )
    outflow_rate: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Normalised rate of requests dispatched downstream [0, 1].",
    )

    # Neighbouring node ids; ``default_factory=list`` gives every instance
    # its own empty list.
    # NOTE(review): presumably these hold ``node_id`` values of other
    # observations -- nothing here validates that.
    upstream_nodes: list[str] = Field(default_factory=list)
    downstream_nodes: list[str] = Field(default_factory=list)
    upstream_pressure: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Mean queue depth of upstream parent nodes (normalised).",
    )

    # Unbounded: no ge/le constraint is applied to the per-node reward.
    node_reward: float = Field(
        default=0.0,
        description="Per-node reward contribution for credit assignment.",
    )

    # Episode interaction fields (handled by framework)
    done: bool = False
    reward: float = 0.0
class ClusterObservation(BaseModel):
    """System-wide telemetry representing the 'dashboard' for the agent."""

    # Required identification / progress fields (no defaults).
    cluster_id: str
    task_id: str
    step: int
    max_steps: int
    mode: EnvironmentMode = EnvironmentMode.SIMULATED
    # Required (no default) and validated to the inclusive range 0..10.
    active_nodes: int = Field(ge=0, le=10)

    # Cluster-wide aggregates, each validated to [0, 1].
    average_latency_ms: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Cluster-wide average latency (normalized [0.0, 1.0]).",
    )
    error_rate: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Cluster-wide fraction of dropped/failed requests [0.0, 1.0].",
    )
    total_queue_backlog: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description=(
            "Normalized sum of queue_depth across all nodes [0.0, 1.0]."
        ),
    )

    # Only bounded below; not normalized.
    current_cost_per_hour: float = Field(
        default=0.0,
        ge=0.0,
        description="Infrastructure cost in USD/hr based on provisioned capacity.",
    )

    # The following metric and counters carry no ge/le constraints, so
    # validation accepts any value of the annotated type.
    lyapunov_energy: float = Field(
        default=0.0,
        description="Stability metric (Sum of squares of queue depths). Low is good.",
    )
    sla_violations: int = Field(
        default=0,
        description="Cumulative count of SLA violations this episode.",
    )
    invalid_action_count: int = Field(
        default=0,
        description="Number of forbidden actions (e.g. SHED_LOAD on critical nodes).",
    )
    vip_failure_count: int = Field(
        default=0,
        description="Number of failed VIP nodes in the current observation.",
    )

    # New fields for Prometheus/Kubernetes integration
    # NOTE(review): units/epoch of ``metric_timestamp`` and the set of valid
    # ``action_ack_status`` / ``executor_error_code`` strings are not defined
    # in this module -- confirm with the producer of these values.
    metric_timestamp: float = 0.0
    data_freshness_ms: int = 0
    action_ack_status: str = "success"
    action_id: str = ""
    executor_latency_ms: float = Field(default=0.0, ge=0.0)
    executor_error_code: str = ""

    # Reward before and after scaling; only the normalized value is
    # constrained (to [0, 1]).  ``reward_scale_version`` labels the scaling
    # scheme and defaults to "sigmoid-v1".
    raw_reward: float = 0.0
    normalized_reward: float = Field(default=0.0, ge=0.0, le=1.0)
    reward_scale_version: str = "sigmoid-v1"

    # Reward components breakdown
    reward_drift: float = Field(
        default=0.0,
        description="Lyapunov drift component of the reward.",
    )
    reward_cost: float = Field(
        default=0.0,
        description="Infrastructure cost component of the reward.",
    )
    reward_sla: float = Field(
        default=0.0,
        description="SLA penalty component of the reward.",
    )
    reward_barrier: float = Field(
        default=0.0,
        description="Barrier function penalty component of the reward.",
    )

    # NOTE(review): ``choke_level`` is unconstrained and undocumented in the
    # original; its meaning cannot be determined from this module.
    choke_level: float = 0.0

    # Per-node telemetry; required (no default).
    nodes: list[NodeObservation]

    # Episode interaction fields (handled by framework)
    done: bool = False
    reward: float = 0.0
|