File size: 6,761 Bytes
07e80ad
12439fa
 
dd24a31
77ede9e
 
 
 
cf2697b
77ede9e
dd24a31
12439fa
dd24a31
 
07e80ad
12439fa
dd24a31
12439fa
dd24a31
 
 
12439fa
dd24a31
12439fa
 
 
 
5144b7e
 
12439fa
dd24a31
12439fa
 
 
dd24a31
 
12439fa
dd24a31
 
07e80ad
dd24a31
12439fa
 
 
 
 
 
 
4b5c463
12439fa
 
bba6f8a
 
 
 
dd24a31
bba6f8a
dd24a31
 
 
 
 
 
12439fa
 
dd24a31
 
 
 
 
12439fa
 
dd24a31
 
 
 
 
 
12439fa
 
 
4b5c463
 
 
 
 
 
52a986a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5e5650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52a986a
 
 
 
 
12439fa
 
 
 
 
 
 
 
 
 
 
77ede9e
 
d6439a8
12439fa
dd24a31
 
 
12439fa
 
dd24a31
 
 
 
 
 
12439fa
dd24a31
 
bba6f8a
 
 
279ccf2
dd24a31
279ccf2
dd24a31
 
 
 
 
 
12439fa
dd24a31
 
 
 
12439fa
dd24a31
 
12439fa
dd24a31
12439fa
dd24a31
 
3a871a0
 
 
 
 
4b5c463
 
 
 
 
77ede9e
 
 
 
dfe5268
 
 
6ad7bd8
 
 
52a986a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77ede9e
 
12439fa
dd24a31
12439fa
 
 
77ede9e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
from enum import Enum
from typing import Annotated, Literal, Optional
from pydantic import BaseModel, Field

class EnvironmentMode(str, Enum):
    """String-valued enumeration of the modes an environment can report.

    Because the class mixes in ``str``, each member compares equal to its
    lowercase value (for example ``EnvironmentMode.AWS == "aws"``).
    """

    # NOTE(review): what each mode actually does is decided by the
    # environment implementation, which is not visible here; confirm there.
    SIMULATED = "simulated"
    HYBRID = "hybrid"
    LIVE = "live"
    AWS = "aws"

# ---------------------------------------------------------------------------
# SRE Action Schema (Control Plane)
# ---------------------------------------------------------------------------

class ActionType(str, Enum):
    """Kinds of management action an SRE agent can issue.

    Members are ``str`` subclasses whose value equals their name, so they
    compare equal to the plain uppercase string.
    """

    NO_OP = "NO_OP"
    SCALE_UP = "SCALE_UP"  # add capacity on the target node
    SCALE_DOWN = "SCALE_DOWN"  # remove capacity from the target node
    REROUTE_TRAFFIC = "REROUTE_TRAFFIC"  # shift traffic away from the target
    SHED_LOAD = "SHED_LOAD"  # drop a fraction of traffic aimed at the target

class SREAction(BaseModel):
    """
    One control-plane command emitted by the SRE agent.

    The meaning of ``parameter`` depends on ``action_type``:

    * SCALE_UP: capacity units to add on target_node_id (intended 1-5).
    * SCALE_DOWN: capacity units to remove on target_node_id (intended 1-5).
    * REROUTE_TRAFFIC: fraction [0, 1] of incoming traffic moved AWAY from
      target_node_id and spread over healthy peers.
    * SHED_LOAD: fraction [0, 1] of traffic aimed at target_node_id that is
      dropped for one tick.

    The schema itself only enforces 0.0 <= parameter <= 10.0; the narrower
    per-action ranges listed above are not validated by this model.
    """

    action_type: ActionType
    target_node_id: str
    # Defaults to 0.0 when the caller omits it.
    parameter: float = Field(0.0, ge=0.0, le=10.0)

# ---------------------------------------------------------------------------
# Observation Schema (Data Plane)
# ---------------------------------------------------------------------------

class NodeStatus(str, Enum):
    """Health state carried in the ``status`` field of a node observation.

    Members are ``str`` subclasses whose value equals their name.
    """

    HEALTHY = "HEALTHY"
    DEGRADED = "DEGRADED"
    FAILED = "FAILED"

class NodeObservation(BaseModel):
    """Per-node telemetry snapshot for one service instance."""

    # --- identity -----------------------------------------------------------
    node_id: str
    status: NodeStatus
    is_vip: bool = False

    # --- load telemetry, each validated to lie in [0, 1] --------------------
    queue_depth: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Normalized queue depth [0.0, 1.0]. Represents the % of theoretical max queue.",
    )
    # Despite the "_ms" suffix this is a normalized value, not milliseconds.
    latency_ms: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Normalized processing latency [0.0, 1.0] relative to 1000ms SLA limit.",
    )
    incoming_request_rate: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Normalized incoming request rate [0.0, 1.0] for this node (requests per tick).",
    )
    cpu_utilization: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Estimated CPU load [0.0, 1.0].",
    )

    # --- weighting and capacity: non-negative, no upper bound enforced ------
    importance_weight: float = Field(
        1.0, ge=0.0,
        description="Business criticality weight. VIP nodes have higher impact on scoring.",
    )
    capacity: float = Field(
        0.0, ge=0.0,
        description="Current capacity units provisioned for this node (0-5).",
    )
    pending_capacity: float = Field(
        0.0, ge=0.0,
        description="Capacity units being booted (will be live after boot delay).",
    )

    # --- derived signals ----------------------------------------------------
    # The only signed telemetry field: validated to [-1, 1].
    queue_delta: float = Field(
        0.0, ge=-1.0, le=1.0,
        description="Normalized queue depth change from previous tick (-1 to +1).",
    )
    sla_proximity: float = Field(
        0.0, ge=0.0, le=1.0,
        description="How close this node is to SLA violation (0=safe, 1=violating).",
    )
    outflow_rate: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Normalised rate of requests dispatched downstream [0, 1].",
    )

    # --- topology -----------------------------------------------------------
    # default_factory gives every instance its own empty list.
    upstream_nodes: list[str] = Field(default_factory=list)
    downstream_nodes: list[str] = Field(default_factory=list)
    upstream_pressure: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Mean queue depth of upstream parent nodes (normalised).",
    )

    # --- reward bookkeeping -------------------------------------------------
    node_reward: float = Field(
        0.0,
        description="Per-node reward contribution for credit assignment.",
    )

    # Episode interaction fields (populated by the framework).
    done: bool = False
    reward: float = 0.0

class ClusterObservation(BaseModel):
    """Cluster-level 'dashboard' view handed to the agent, including per-node telemetry."""

    # --- episode identity (all required) ------------------------------------
    cluster_id: str
    task_id: str
    step: int
    max_steps: int

    mode: EnvironmentMode = EnvironmentMode.SIMULATED

    # Required; validated to the inclusive range 0..10.
    active_nodes: int = Field(ge=0, le=10)

    # --- aggregate health metrics, each validated to [0, 1] -----------------
    average_latency_ms: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Cluster-wide average latency (normalized [0.0, 1.0]).",
    )
    error_rate: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Cluster-wide fraction of dropped/failed requests [0.0, 1.0].",
    )
    total_queue_backlog: float = Field(
        0.0, ge=0.0, le=1.0,
        description="Normalized sum of queue_depth across all nodes [0.0, 1.0].",
    )

    # --- cost and stability -------------------------------------------------
    current_cost_per_hour: float = Field(
        0.0, ge=0.0,
        description="Infrastructure cost in USD/hr based on provisioned capacity.",
    )
    lyapunov_energy: float = Field(
        0.0,
        description="Stability metric (Sum of squares of queue depths). Low is good.",
    )

    # --- counters (no range validation is applied to these) -----------------
    sla_violations: int = Field(
        0,
        description="Cumulative count of SLA violations this episode.",
    )
    invalid_action_count: int = Field(
        0,
        description="Number of forbidden actions (e.g. SHED_LOAD on critical nodes).",
    )
    vip_failure_count: int = Field(
        0,
        description="Number of failed VIP nodes in the current observation.",
    )

    # --- Prometheus/Kubernetes integration fields ---------------------------
    metric_timestamp: float = 0.0
    data_freshness_ms: int = 0
    action_ack_status: str = "success"
    action_id: str = ""
    executor_latency_ms: float = Field(0.0, ge=0.0)
    executor_error_code: str = ""
    raw_reward: float = 0.0
    normalized_reward: float = Field(0.0, ge=0.0, le=1.0)
    reward_scale_version: str = "sigmoid-v1"

    # --- reward component breakdown -----------------------------------------
    reward_drift: float = Field(
        0.0,
        description="Lyapunov drift component of the reward.",
    )
    reward_cost: float = Field(
        0.0,
        description="Infrastructure cost component of the reward.",
    )
    reward_sla: float = Field(
        0.0,
        description="SLA penalty component of the reward.",
    )
    reward_barrier: float = Field(
        0.0,
        description="Barrier function penalty component of the reward.",
    )

    choke_level: float = 0.0

    # Required: one NodeObservation per node included in this snapshot.
    nodes: list[NodeObservation]

    # Episode interaction fields (populated by the framework).
    done: bool = False
    reward: float = 0.0