div18 committed on
Commit ·
92019fc
1
Parent(s): f55f75f
perf(environment): improve cost model with three-tier capacity pricing
Browse files
- Import DEFAULT_CAPACITY constant from simulator module as baseline capacity
- Update Kubernetes deployment replicas from 1 or 2 to 3 for all services in local-laptop and aws manifests
- Increase pod resource quota from 30 to 45 to support higher replica count
- Enhance _compute_cost() method with a three-tier cost calculation
- Define tier 1 as baseline capacity at cheap base rate up to DEFAULT_CAPACITY
- Define tier 2 as needed excess capacity charged at moderate 4× base rate
- Define tier 3 as idle excess capacity charged at high 20× penalty rate
- Provide clear incentives for scaling just enough to meet traffic demand without overprovisioning
- deploy/aws/k8s-workloads.yaml +8 -8
- deploy/local-laptop.yaml +5 -5
- server/AntiAtropos_environment.py +29 -10
deploy/aws/k8s-workloads.yaml
CHANGED
|
@@ -22,7 +22,7 @@ metadata:
|
|
| 22 |
# This is a Kubernetes-level safety net. Even if the agent's Python cap fails,
|
| 23 |
# Kubernetes will refuse to create pods beyond this limit.
|
| 24 |
#
|
| 25 |
-
# Max
|
| 26 |
# Max 8 CPU / 8GB RAM = enough for 30 small nginx pods
|
| 27 |
apiVersion: v1
|
| 28 |
kind: ResourceQuota
|
|
@@ -31,7 +31,7 @@ metadata:
|
|
| 31 |
namespace: prod-sre
|
| 32 |
spec:
|
| 33 |
hard:
|
| 34 |
-
pods: "
|
| 35 |
requests.cpu: "8"
|
| 36 |
requests.memory: 8Gi
|
| 37 |
limits.cpu: "15"
|
|
@@ -39,7 +39,7 @@ spec:
|
|
| 39 |
|
| 40 |
---
|
| 41 |
# payments — node-0 (VIP)
|
| 42 |
-
# Business-critical payment service.
|
| 43 |
# The SRE agent should never SHED_LOAD on this (CRITICAL_NODES in simulator.py).
|
| 44 |
apiVersion: apps/v1
|
| 45 |
kind: Deployment
|
|
@@ -51,7 +51,7 @@ metadata:
|
|
| 51 |
node-id: node-0
|
| 52 |
critical: "true"
|
| 53 |
spec:
|
| 54 |
-
replicas:
|
| 55 |
selector:
|
| 56 |
matchLabels:
|
| 57 |
app: payments
|
|
@@ -102,7 +102,7 @@ metadata:
|
|
| 102 |
app: checkout
|
| 103 |
node-id: node-1
|
| 104 |
spec:
|
| 105 |
-
replicas:
|
| 106 |
selector:
|
| 107 |
matchLabels:
|
| 108 |
app: checkout
|
|
@@ -153,7 +153,7 @@ metadata:
|
|
| 153 |
app: catalog
|
| 154 |
node-id: node-2
|
| 155 |
spec:
|
| 156 |
-
replicas:
|
| 157 |
selector:
|
| 158 |
matchLabels:
|
| 159 |
app: catalog
|
|
@@ -204,7 +204,7 @@ metadata:
|
|
| 204 |
app: cart
|
| 205 |
node-id: node-3
|
| 206 |
spec:
|
| 207 |
-
replicas:
|
| 208 |
selector:
|
| 209 |
matchLabels:
|
| 210 |
app: cart
|
|
@@ -255,7 +255,7 @@ metadata:
|
|
| 255 |
app: auth
|
| 256 |
node-id: node-4
|
| 257 |
spec:
|
| 258 |
-
replicas:
|
| 259 |
selector:
|
| 260 |
matchLabels:
|
| 261 |
app: auth
|
|
|
|
| 22 |
# This is a Kubernetes-level safety net. Even if the agent's Python cap fails,
|
| 23 |
# Kubernetes will refuse to create pods beyond this limit.
|
| 24 |
#
|
| 25 |
+
# Max 45 pods = 9 replicas x 5 deployments (our worst-case budget; baseline is 3 per deployment)
|
| 26 |
# Max 8 CPU / 8GB RAM = enough for 30 small nginx pods
|
| 27 |
apiVersion: v1
|
| 28 |
kind: ResourceQuota
|
|
|
|
| 31 |
namespace: prod-sre
|
| 32 |
spec:
|
| 33 |
hard:
|
| 34 |
+
pods: "45"
|
| 35 |
requests.cpu: "8"
|
| 36 |
requests.memory: 8Gi
|
| 37 |
limits.cpu: "15"
|
|
|
|
| 39 |
|
| 40 |
---
|
| 41 |
# payments — node-0 (VIP)
|
| 42 |
+
# Business-critical payment service. Starts at 3 replicas (matches simulator DEFAULT_CAPACITY).
|
| 43 |
# The SRE agent should never SHED_LOAD on this (CRITICAL_NODES in simulator.py).
|
| 44 |
apiVersion: apps/v1
|
| 45 |
kind: Deployment
|
|
|
|
| 51 |
node-id: node-0
|
| 52 |
critical: "true"
|
| 53 |
spec:
|
| 54 |
+
replicas: 3
|
| 55 |
selector:
|
| 56 |
matchLabels:
|
| 57 |
app: payments
|
|
|
|
| 102 |
app: checkout
|
| 103 |
node-id: node-1
|
| 104 |
spec:
|
| 105 |
+
replicas: 3
|
| 106 |
selector:
|
| 107 |
matchLabels:
|
| 108 |
app: checkout
|
|
|
|
| 153 |
app: catalog
|
| 154 |
node-id: node-2
|
| 155 |
spec:
|
| 156 |
+
replicas: 3
|
| 157 |
selector:
|
| 158 |
matchLabels:
|
| 159 |
app: catalog
|
|
|
|
| 204 |
app: cart
|
| 205 |
node-id: node-3
|
| 206 |
spec:
|
| 207 |
+
replicas: 3
|
| 208 |
selector:
|
| 209 |
matchLabels:
|
| 210 |
app: cart
|
|
|
|
| 255 |
app: auth
|
| 256 |
node-id: node-4
|
| 257 |
spec:
|
| 258 |
+
replicas: 3
|
| 259 |
selector:
|
| 260 |
matchLabels:
|
| 261 |
app: auth
|
deploy/local-laptop.yaml
CHANGED
|
@@ -9,7 +9,7 @@ metadata:
|
|
| 9 |
name: auth
|
| 10 |
namespace: prod-sre
|
| 11 |
spec:
|
| 12 |
-
replicas:
|
| 13 |
selector:
|
| 14 |
matchLabels:
|
| 15 |
app: auth
|
|
@@ -81,7 +81,7 @@ metadata:
|
|
| 81 |
name: cart
|
| 82 |
namespace: prod-sre
|
| 83 |
spec:
|
| 84 |
-
replicas:
|
| 85 |
selector:
|
| 86 |
matchLabels:
|
| 87 |
app: cart
|
|
@@ -153,7 +153,7 @@ metadata:
|
|
| 153 |
name: catalog
|
| 154 |
namespace: prod-sre
|
| 155 |
spec:
|
| 156 |
-
replicas:
|
| 157 |
selector:
|
| 158 |
matchLabels:
|
| 159 |
app: catalog
|
|
@@ -225,7 +225,7 @@ metadata:
|
|
| 225 |
name: checkout
|
| 226 |
namespace: prod-sre
|
| 227 |
spec:
|
| 228 |
-
replicas:
|
| 229 |
selector:
|
| 230 |
matchLabels:
|
| 231 |
app: checkout
|
|
@@ -297,7 +297,7 @@ metadata:
|
|
| 297 |
name: payments
|
| 298 |
namespace: prod-sre
|
| 299 |
spec:
|
| 300 |
-
replicas:
|
| 301 |
selector:
|
| 302 |
matchLabels:
|
| 303 |
app: payments
|
|
|
|
| 9 |
name: auth
|
| 10 |
namespace: prod-sre
|
| 11 |
spec:
|
| 12 |
+
replicas: 3
|
| 13 |
selector:
|
| 14 |
matchLabels:
|
| 15 |
app: auth
|
|
|
|
| 81 |
name: cart
|
| 82 |
namespace: prod-sre
|
| 83 |
spec:
|
| 84 |
+
replicas: 3
|
| 85 |
selector:
|
| 86 |
matchLabels:
|
| 87 |
app: cart
|
|
|
|
| 153 |
name: catalog
|
| 154 |
namespace: prod-sre
|
| 155 |
spec:
|
| 156 |
+
replicas: 3
|
| 157 |
selector:
|
| 158 |
matchLabels:
|
| 159 |
app: catalog
|
|
|
|
| 225 |
name: checkout
|
| 226 |
namespace: prod-sre
|
| 227 |
spec:
|
| 228 |
+
replicas: 3
|
| 229 |
selector:
|
| 230 |
matchLabels:
|
| 231 |
app: checkout
|
|
|
|
| 297 |
name: payments
|
| 298 |
namespace: prod-sre
|
| 299 |
spec:
|
| 300 |
+
replicas: 3
|
| 301 |
selector:
|
| 302 |
matchLabels:
|
| 303 |
app: payments
|
server/AntiAtropos_environment.py
CHANGED
|
@@ -9,7 +9,7 @@ from openenv.core.env_server.types import State
|
|
| 9 |
|
| 10 |
try:
|
| 11 |
from ..models import SREAction, ClusterObservation, NodeObservation, NodeStatus, EnvironmentMode
|
| 12 |
-
from ..simulator import ClusterSimulator, COST_PER_CAPACITY_UNIT_PER_HOUR, OVERPROVISION_COST_PER_UNIT, CLUSTER_TOPOLOGY
|
| 13 |
from ..stability import (
|
| 14 |
compute_lyapunov,
|
| 15 |
compute_lyapunov_graph,
|
|
@@ -25,7 +25,7 @@ try:
|
|
| 25 |
from ..control import KubernetesExecutor, ActionValidator
|
| 26 |
except ImportError:
|
| 27 |
from models import SREAction, ClusterObservation, NodeObservation, NodeStatus, EnvironmentMode # type: ignore[no-redef]
|
| 28 |
-
from simulator import ClusterSimulator, COST_PER_CAPACITY_UNIT_PER_HOUR, OVERPROVISION_COST_PER_UNIT, CLUSTER_TOPOLOGY # type: ignore[no-redef]
|
| 29 |
from stability import ( # type: ignore[no-redef]
|
| 30 |
compute_lyapunov,
|
| 31 |
compute_lyapunov_graph,
|
|
@@ -385,13 +385,23 @@ class AntiAtroposEnvironment(Environment):
|
|
| 385 |
return False, f"Unsupported environment mode: {self._mode}"
|
| 386 |
|
| 387 |
def _compute_cost(self, nodes_true: list[dict]) -> float:
|
| 388 |
-
"""
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
"""
|
| 394 |
total_cost = 0.0
|
|
|
|
| 395 |
for node in nodes_true:
|
| 396 |
if node["status"] == NodeStatus.FAILED:
|
| 397 |
continue
|
|
@@ -400,11 +410,20 @@ class AntiAtroposEnvironment(Environment):
|
|
| 400 |
continue
|
| 401 |
incoming = float(node.get("incoming_request_rate", 0.0))
|
| 402 |
needed = max(1, int(math.ceil(incoming / 15.0)))
|
| 403 |
-
|
|
|
|
|
|
|
| 404 |
total_cost += capacity * COST_PER_CAPACITY_UNIT_PER_HOUR
|
| 405 |
else:
|
| 406 |
-
|
| 407 |
-
total_cost +=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
return total_cost
|
| 409 |
|
| 410 |
def _avg_latency(self, nodes: list[dict]) -> float:
|
|
|
|
| 9 |
|
| 10 |
try:
|
| 11 |
from ..models import SREAction, ClusterObservation, NodeObservation, NodeStatus, EnvironmentMode
|
| 12 |
+
from ..simulator import ClusterSimulator, DEFAULT_CAPACITY, COST_PER_CAPACITY_UNIT_PER_HOUR, OVERPROVISION_COST_PER_UNIT, CLUSTER_TOPOLOGY
|
| 13 |
from ..stability import (
|
| 14 |
compute_lyapunov,
|
| 15 |
compute_lyapunov_graph,
|
|
|
|
| 25 |
from ..control import KubernetesExecutor, ActionValidator
|
| 26 |
except ImportError:
|
| 27 |
from models import SREAction, ClusterObservation, NodeObservation, NodeStatus, EnvironmentMode # type: ignore[no-redef]
|
| 28 |
+
from simulator import ClusterSimulator, DEFAULT_CAPACITY, COST_PER_CAPACITY_UNIT_PER_HOUR, OVERPROVISION_COST_PER_UNIT, CLUSTER_TOPOLOGY # type: ignore[no-redef]
|
| 29 |
from stability import ( # type: ignore[no-redef]
|
| 30 |
compute_lyapunov,
|
| 31 |
compute_lyapunov_graph,
|
|
|
|
| 385 |
return False, f"Unsupported environment mode: {self._mode}"
|
| 386 |
|
| 387 |
def _compute_cost(self, nodes_true: list[dict]) -> float:
|
| 388 |
+
"""Three-tier cost model calibrated to DEFAULT_CAPACITY as the baseline.
|
| 389 |
+
|
| 390 |
+
Tier 1 — Baseline capacity (up to DEFAULT_CAPACITY): cheap base rate.
|
| 391 |
+
Infrastructure already provisioned and paid for — no penalty.
|
| 392 |
+
Tier 2 — Needed excess (above DEFAULT_CAPACITY, up to 'needed'): moderate
|
| 393 |
+
rate (4× base). Agent added capacity that's actually serving traffic —
|
| 394 |
+
costs more but is justified by demand.
|
| 395 |
+
Tier 3 — Idle excess (above 'needed'): expensive penalty rate (20× base).
|
| 396 |
+
Capacity sitting idle beyond what traffic requires — pure waste.
|
| 397 |
+
|
| 398 |
+
'needed' = ceil(incoming_rate / 15) — minimum units to serve traffic.
|
| 399 |
+
With DEFAULT_CAPACITY=3, a node at baseline costs 3 × $0.05 = $0.15/hr
|
| 400 |
+
regardless of traffic. Only scaling ABOVE baseline triggers higher rates,
|
| 401 |
+
giving the agent a clear gradient: scale just enough, not too much.
|
| 402 |
"""
|
| 403 |
total_cost = 0.0
|
| 404 |
+
baseline_cap = int(DEFAULT_CAPACITY) # Tier 1 ceiling (imported from simulator)
|
| 405 |
for node in nodes_true:
|
| 406 |
if node["status"] == NodeStatus.FAILED:
|
| 407 |
continue
|
|
|
|
| 410 |
continue
|
| 411 |
incoming = float(node.get("incoming_request_rate", 0.0))
|
| 412 |
needed = max(1, int(math.ceil(incoming / 15.0)))
|
| 413 |
+
|
| 414 |
+
if capacity <= baseline_cap:
|
| 415 |
+
# Tier 1: baseline provisioned capacity — cheap base rate
|
| 416 |
total_cost += capacity * COST_PER_CAPACITY_UNIT_PER_HOUR
|
| 417 |
else:
|
| 418 |
+
# Tier 1: baseline portion at cheap rate
|
| 419 |
+
total_cost += baseline_cap * COST_PER_CAPACITY_UNIT_PER_HOUR
|
| 420 |
+
above_baseline = capacity - baseline_cap
|
| 421 |
+
justified = max(0, needed - baseline_cap) # excess that serves traffic
|
| 422 |
+
idle = above_baseline - justified # excess sitting idle
|
| 423 |
+
# Tier 2: needed excess at moderate rate (4× base)
|
| 424 |
+
total_cost += justified * (COST_PER_CAPACITY_UNIT_PER_HOUR * 4.0)
|
| 425 |
+
# Tier 3: idle excess at penalty rate (20× base)
|
| 426 |
+
total_cost += idle * OVERPROVISION_COST_PER_UNIT
|
| 427 |
return total_cost
|
| 428 |
|
| 429 |
def _avg_latency(self, nodes: list[dict]) -> float:
|