File size: 6,761 Bytes

07e80ad
12439fa
 
dd24a31
77ede9e
 
 
 
cf2697b
77ede9e
dd24a31
12439fa
dd24a31
 
07e80ad
12439fa
dd24a31
12439fa
dd24a31
 
 
12439fa
dd24a31
12439fa
 
 
 
5144b7e
 
12439fa
dd24a31
12439fa
 
 
dd24a31
 
12439fa
dd24a31
 
07e80ad
dd24a31
12439fa
 
 
 
 
 
 
4b5c463
12439fa
 
bba6f8a
 
 
 
dd24a31
bba6f8a
dd24a31
 
 
 
 
 
12439fa
 
dd24a31
 
 
 
 
12439fa
 
dd24a31
 
 
 
 
 
12439fa
 
 
4b5c463
 
 
 
 
 
52a986a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5e5650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52a986a
 
 
 
 
12439fa
 
 
 
 
 
 
 
 
 
 
77ede9e
 
d6439a8
12439fa
dd24a31
 
 
12439fa
 
dd24a31
 
 
 
 
 
12439fa
dd24a31
 
bba6f8a
 
 
279ccf2
dd24a31
279ccf2
dd24a31
 
 
 
 
 
12439fa
dd24a31
 
 
 
12439fa
dd24a31
 
12439fa
dd24a31
12439fa
dd24a31
 
3a871a0
 
 
 
 
4b5c463
 
 
 
 
77ede9e
 
 
 
dfe5268
 
 
6ad7bd8
 
 
52a986a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77ede9e
 
12439fa
dd24a31
12439fa
 
 
77ede9e

from enum import Enum
from typing import Annotated, Literal, Optional
from pydantic import BaseModel, Field

class EnvironmentMode(str, Enum):
    # Mode selector carried by ClusterObservation.mode, which defaults to
    # SIMULATED. Because the class also derives from ``str``, every member is
    # a real string equal to its lowercase value (e.g. ``"simulated"``).
    # NOTE(review): what each mode changes at runtime is not visible in this
    # file -- confirm against the environment implementation.
    SIMULATED = "simulated"
    HYBRID = "hybrid"
    LIVE = "live"
    AWS = "aws"

# ---------------------------------------------------------------------------
# SRE Action Schema (Control Plane)
# ---------------------------------------------------------------------------

class ActionType(str, Enum):
    # Closed set of operations an SREAction can request. Members derive from
    # ``str`` and each value is identical to the member name. The one-line
    # summaries below restate the SREAction docstring.
    NO_OP = "NO_OP"  # not described by SREAction; presumably "do nothing" -- confirm
    SCALE_UP = "SCALE_UP"  # add capacity to the target node
    SCALE_DOWN = "SCALE_DOWN"  # remove capacity from the target node
    REROUTE_TRAFFIC = "REROUTE_TRAFFIC"  # move a share of traffic away from the target
    SHED_LOAD = "SHED_LOAD"  # drop a share of traffic aimed at the target

class SREAction(BaseModel):
    """
    Management action issued by the SRE agent.
    
    * SCALE_UP: Increment capacity on target_node_id by parameter (1-5 units).
    * SCALE_DOWN: Decrement capacity on target_node_id by parameter (1-5 units).
    * REROUTE_TRAFFIC: Shift 'parameter' [0, 1] of incoming traffic AWAY from
      target_node_id and redistribute across healthy peers.
    * SHED_LOAD: Drop 'parameter' [0, 1] of incoming traffic targeting target_node_id for 1 tick.
    """

    # Operation to perform; one of the ActionType members.
    action_type: ActionType

    # Identifier of the node the operation is aimed at.
    target_node_id: str

    # Magnitude of the operation. Its interpretation depends on action_type
    # (see the docstring above); it is optional and falls back to 0.0.
    # NOTE(review): the only range validated here is the blanket [0.0, 10.0].
    # The docstring quotes 1-5 units for scaling and [0, 1] for
    # REROUTE_TRAFFIC / SHED_LOAD, neither of which is enforced by this
    # model -- confirm the consumer clamps or validates per action type.
    parameter: float = Field(
        0.0,
        ge=0.0,
        le=10.0,
    )

# ---------------------------------------------------------------------------
# Observation Schema (Data Plane)
# ---------------------------------------------------------------------------

class NodeStatus(str, Enum):
    # Health state reported per node through NodeObservation.status. Members
    # derive from ``str`` and each value is identical to the member name.
    # NOTE(review): the conditions that move a node between these states are
    # not defined in this file -- confirm against the simulator/collector.
    HEALTHY = "HEALTHY"
    DEGRADED = "DEGRADED"
    FAILED = "FAILED"

class NodeObservation(BaseModel):
    """Telemetry for a single service instance (node)."""

    # -- Identity and health ------------------------------------------------
    # node_id and status are required; is_vip is an opt-in flag.
    node_id: str
    status: NodeStatus
    is_vip: bool = False

    # -- Load telemetry -----------------------------------------------------
    # Every field in this group is validated to lie in [0, 1] and defaults to
    # 0.0. Fields further down (weights, capacity, queue_delta, node_reward)
    # use different ranges, stated next to each of them.
    queue_depth: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Normalized queue depth [0.0, 1.0]. Represents the % of theoretical max queue.",
    )
    # Despite the ``_ms`` suffix the validated value is a [0, 1] fraction,
    # as the description states.
    latency_ms: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Normalized processing latency [0.0, 1.0] relative to 1000ms SLA limit.",
    )
    incoming_request_rate: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Normalized incoming request rate [0.0, 1.0] for this node (requests per tick).",
    )
    cpu_utilization: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Estimated CPU load [0.0, 1.0].",
    )

    # -- Weighting and capacity --------------------------------------------
    # These three are only bounded from below (>= 0); no upper limit is
    # validated. importance_weight is the one field here that defaults to 1.0.
    importance_weight: float = Field(
        1.0, ge=0.0,
        description="Business criticality weight. VIP nodes have higher impact on scoring.",
    )
    # NOTE(review): the description quotes a 0-5 range, but only the lower
    # bound is enforced by this model.
    capacity: float = Field(
        0.0, ge=0.0,
        description="Current capacity units provisioned for this node (0-5).",
    )
    pending_capacity: float = Field(
        0.0, ge=0.0,
        description="Capacity units being booted (will be live after boot delay).",
    )

    # -- Trend and SLA signals ---------------------------------------------
    # queue_delta is signed and validated to [-1, 1]; the others are [0, 1].
    queue_delta: float = Field(
        0.0, ge=-1.0, le=1.0,
        description="Normalized queue depth change from previous tick (-1 to +1).",
    )
    sla_proximity: float = Field(
        0.0, ge=0.0, le=1.0,
        description="How close this node is to SLA violation (0=safe, 1=violating).",
    )

    # -- Topology -----------------------------------------------------------
    # Neighbour lists default to a fresh empty list per instance.
    outflow_rate: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Normalised rate of requests dispatched downstream [0, 1].",
    )
    upstream_nodes: list[str] = Field(default_factory=list)
    downstream_nodes: list[str] = Field(default_factory=list)
    upstream_pressure: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Mean queue depth of upstream parent nodes (normalised).",
    )

    # -- Reward -------------------------------------------------------------
    # Unconstrained: no lower or upper bound is validated.
    node_reward: float = Field(
        0.0,
        description="Per-node reward contribution for credit assignment.",
    )

    # Episode interaction fields (handled by framework)
    done: bool = False
    reward: float = 0.0

class ClusterObservation(BaseModel):
    """System-wide telemetry representing the 'dashboard' for the agent."""

    # -- Episode identity (all required) -----------------------------------
    cluster_id: str
    task_id: str
    step: int
    max_steps: int

    mode: EnvironmentMode = EnvironmentMode.SIMULATED

    # Required; validated to the inclusive range 0..10.
    active_nodes: int = Field(ge=0, le=10)

    # -- Cluster-wide aggregates, each validated to [0, 1] ------------------
    average_latency_ms: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Cluster-wide average latency (normalized [0.0, 1.0]).",
    )
    error_rate: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Cluster-wide fraction of dropped/failed requests [0.0, 1.0].",
    )
    total_queue_backlog: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Normalized sum of queue_depth across all nodes [0.0, 1.0].",
    )

    # Only bounded from below (>= 0).
    current_cost_per_hour: float = Field(
        0.0, ge=0.0,
        description="Infrastructure cost in USD/hr based on provisioned capacity.",
    )

    # No range is validated for the stability metric or the counters below.
    lyapunov_energy: float = Field(
        0.0,
        description="Stability metric (Sum of squares of queue depths). Low is good.",
    )
    sla_violations: int = Field(
        0,
        description="Cumulative count of SLA violations this episode.",
    )
    invalid_action_count: int = Field(
        0,
        description="Number of forbidden actions (e.g. SHED_LOAD on critical nodes).",
    )
    vip_failure_count: int = Field(
        0,
        description="Number of failed VIP nodes in the current observation.",
    )

    # New fields for Prometheus/Kubernetes integration
    # NOTE(review): units/epoch of metric_timestamp and the set of accepted
    # action_ack_status / executor_error_code strings are not defined in this
    # file -- confirm with the producer of these observations.
    metric_timestamp: float = 0.0
    data_freshness_ms: int = 0
    action_ack_status: str = "success"
    action_id: str = ""
    executor_latency_ms: float = Field(0.0, ge=0.0)
    executor_error_code: str = ""

    # Reward as emitted (unbounded) and its [0, 1]-validated counterpart,
    # tagged with the name of the scaling scheme.
    raw_reward: float = 0.0
    normalized_reward: float = Field(0.0, ge=0.0, le=1.0)
    reward_scale_version: str = "sigmoid-v1"

    # Reward components breakdown
    reward_drift: float = Field(
        0.0,
        description="Lyapunov drift component of the reward.",
    )
    reward_cost: float = Field(
        0.0,
        description="Infrastructure cost component of the reward.",
    )
    reward_sla: float = Field(
        0.0,
        description="SLA penalty component of the reward.",
    )
    reward_barrier: float = Field(
        0.0,
        description="Barrier function penalty component of the reward.",
    )

    # NOTE(review): meaning and range of choke_level are not documented here.
    choke_level: float = 0.0

    # Required: per-node telemetry for this observation.
    nodes: list[NodeObservation]

    # Episode interaction fields (handled by framework)
    done: bool = False
    reward: float = 0.0