| from enum import Enum |
| from typing import Annotated, Literal, Optional |
| from pydantic import BaseModel, Field |
|
|
class EnvironmentMode(str, Enum):
    """Closed set of environment modes.

    Members also subclass ``str``, so each one compares equal to its raw
    string value (e.g. ``EnvironmentMode.LIVE == "live"``).
    """

    SIMULATED = "simulated"
    HYBRID = "hybrid"
    LIVE = "live"
    AWS = "aws"
|
|
| |
| |
| |
|
|
class ActionType(str, Enum):
    """Kinds of management action an ``SREAction`` can carry.

    Each member's value is identical to its name, and members subclass
    ``str`` so they compare equal to that plain string.
    """

    NO_OP = "NO_OP"
    SCALE_UP = "SCALE_UP"
    SCALE_DOWN = "SCALE_DOWN"
    REROUTE_TRAFFIC = "REROUTE_TRAFFIC"
    SHED_LOAD = "SHED_LOAD"
|
|
class SREAction(BaseModel):
    """
    One management command issued by the SRE agent against a single node.

    The meaning of ``parameter`` depends on ``action_type``:

    * SCALE_UP: add ``parameter`` capacity units (1-5) to ``target_node_id``.
    * SCALE_DOWN: remove ``parameter`` capacity units (1-5) from
      ``target_node_id``.
    * REROUTE_TRAFFIC: move the fraction ``parameter`` in [0, 1] of incoming
      traffic AWAY from ``target_node_id`` and spread it across healthy peers.
    * SHED_LOAD: drop the fraction ``parameter`` in [0, 1] of incoming traffic
      aimed at ``target_node_id`` for 1 tick.
    """

    action_type: ActionType
    target_node_id: str
    # The model itself only enforces the outer envelope [0, 10]. The narrower
    # per-action ranges listed in the docstring are not validated here.
    # NOTE(review): presumably they are enforced by whatever applies the
    # action -- confirm.
    parameter: float = Field(0.0, ge=0.0, le=10.0)
|
|
| |
| |
| |
|
|
class NodeStatus(str, Enum):
    """Health states reported for a node.

    Each member's value is identical to its name, and members subclass
    ``str`` so they compare equal to that plain string.
    """

    HEALTHY = "HEALTHY"
    DEGRADED = "DEGRADED"
    FAILED = "FAILED"
|
|
class NodeObservation(BaseModel):
    """Telemetry snapshot for one service instance (node).

    Only ``node_id`` and ``status`` are required; every other field has a
    default. Most load metrics are validated to the normalized range
    [0.0, 1.0] by their ``Field`` bounds.
    """

    # --- Identity -----------------------------------------------------------
    node_id: str
    status: NodeStatus
    is_vip: bool = False

    # --- Normalized load metrics, each bounded to [0.0, 1.0] ----------------
    queue_depth: float = Field(
        0.0,
        ge=0.0,
        le=1.0,
        description="Normalized queue depth [0.0, 1.0]. Represents the % of theoretical max queue.",
    )
    latency_ms: float = Field(
        0.0,
        ge=0.0,
        le=1.0,
        description="Normalized processing latency [0.0, 1.0] relative to 1000ms SLA limit.",
    )
    incoming_request_rate: float = Field(
        0.0,
        ge=0.0,
        le=1.0,
        description=(
            "Normalized incoming request rate [0.0, 1.0] for this node (requests per tick)."
        ),
    )
    cpu_utilization: float = Field(
        0.0,
        ge=0.0,
        le=1.0,
        description="Estimated CPU load [0.0, 1.0].",
    )

    # --- Weighting and capacity (non-negative, no upper bound enforced) -----
    importance_weight: float = Field(
        1.0,
        ge=0.0,
        description="Business criticality weight. VIP nodes have higher impact on scoring.",
    )
    # NOTE(review): the description mentions a 0-5 range, but only the lower
    # bound is validated here.
    capacity: float = Field(
        0.0,
        ge=0.0,
        description="Current capacity units provisioned for this node (0-5).",
    )
    pending_capacity: float = Field(
        0.0,
        ge=0.0,
        description="Capacity units being booted (will be live after boot delay).",
    )

    # --- Derived signals ----------------------------------------------------
    # queue_delta is the one signed metric: it is bounded to [-1.0, 1.0].
    queue_delta: float = Field(
        0.0,
        ge=-1.0,
        le=1.0,
        description="Normalized queue depth change from previous tick (-1 to +1).",
    )
    sla_proximity: float = Field(
        0.0,
        ge=0.0,
        le=1.0,
        description="How close this node is to SLA violation (0=safe, 1=violating).",
    )

    # --- Topology and flow --------------------------------------------------
    outflow_rate: float = Field(
        0.0,
        ge=0.0,
        le=1.0,
        description="Normalised rate of requests dispatched downstream [0, 1].",
    )
    # default_factory gives every instance its own fresh list.
    upstream_nodes: list[str] = Field(default_factory=list)
    downstream_nodes: list[str] = Field(default_factory=list)
    upstream_pressure: float = Field(
        0.0,
        ge=0.0,
        le=1.0,
        description="Mean queue depth of upstream parent nodes (normalised).",
    )

    # --- Reward -------------------------------------------------------------
    # Unbounded: no ge/le constraint is applied.
    node_reward: float = Field(
        0.0,
        description="Per-node reward contribution for credit assignment.",
    )

    # Plain defaults with no validation constraints.
    done: bool = False
    reward: float = 0.0
|
|
class ClusterObservation(BaseModel):
    """Cluster-wide telemetry: the agent's 'dashboard' view of the system.

    Required fields are ``cluster_id``, ``task_id``, ``step``, ``max_steps``,
    ``active_nodes`` and ``nodes``; everything else has a default.
    """

    # --- Episode identity and progress --------------------------------------
    cluster_id: str
    task_id: str
    step: int
    max_steps: int

    mode: EnvironmentMode = EnvironmentMode.SIMULATED

    # Required (no default); validated to the inclusive range 0..10.
    active_nodes: int = Field(ge=0, le=10)

    # --- Normalized cluster metrics, each bounded to [0.0, 1.0] -------------
    average_latency_ms: float = Field(
        0.0,
        ge=0.0,
        le=1.0,
        description="Cluster-wide average latency (normalized [0.0, 1.0]).",
    )
    error_rate: float = Field(
        0.0,
        ge=0.0,
        le=1.0,
        description="Cluster-wide fraction of dropped/failed requests [0.0, 1.0].",
    )
    total_queue_backlog: float = Field(
        0.0,
        ge=0.0,
        le=1.0,
        description="Normalized sum of queue_depth across all nodes [0.0, 1.0].",
    )

    # --- Cost and stability -------------------------------------------------
    current_cost_per_hour: float = Field(
        0.0,
        ge=0.0,
        description="Infrastructure cost in USD/hr based on provisioned capacity.",
    )
    # NOTE(review): described as a sum of squares, yet no lower bound is
    # enforced on this field.
    lyapunov_energy: float = Field(
        0.0,
        description="Stability metric (Sum of squares of queue depths). Low is good.",
    )

    # --- Counters -----------------------------------------------------------
    # NOTE(review): these counts carry no ge=0 constraint, so negative values
    # would pass validation.
    sla_violations: int = Field(
        0,
        description="Cumulative count of SLA violations this episode.",
    )
    invalid_action_count: int = Field(
        0,
        description="Number of forbidden actions (e.g. SHED_LOAD on critical nodes).",
    )
    vip_failure_count: int = Field(
        0,
        description="Number of failed VIP nodes in the current observation.",
    )

    # --- Metric freshness and action acknowledgement ------------------------
    # Units and allowed values are not stated beyond the field names; only
    # executor_latency_ms is validated (non-negative).
    metric_timestamp: float = 0.0
    data_freshness_ms: int = 0
    action_ack_status: str = "success"
    action_id: str = ""
    executor_latency_ms: float = Field(0.0, ge=0.0)
    executor_error_code: str = ""

    # --- Reward and its breakdown -------------------------------------------
    raw_reward: float = 0.0
    normalized_reward: float = Field(0.0, ge=0.0, le=1.0)
    reward_scale_version: str = "sigmoid-v1"

    reward_drift: float = Field(
        0.0,
        description="Lyapunov drift component of the reward.",
    )
    reward_cost: float = Field(
        0.0,
        description="Infrastructure cost component of the reward.",
    )
    reward_sla: float = Field(
        0.0,
        description="SLA penalty component of the reward.",
    )
    reward_barrier: float = Field(
        0.0,
        description="Barrier function penalty component of the reward.",
    )

    # Unconstrained float; its meaning is not documented in this model.
    choke_level: float = 0.0

    # Per-node telemetry. Required: there is no default.
    nodes: list[NodeObservation]

    # Plain defaults with no validation constraints.
    done: bool = False
    reward: float = 0.0
|
|
|
|