div18 committed on
Commit
92019fc
·
1 Parent(s): f55f75f

perf(environment): improve cost model with three-tier capacity pricing

Browse files

- Import DEFAULT_CAPACITY constant from simulator module as baseline capacity
- Update Kubernetes deployment replicas from 1 or 2 to 3 for all services in local-laptop and aws manifests
- Increase pod resource quota from 30 to 45 to support higher replica count
- Enhance _compute_cost() method with a three-tier cost calculation
- Define tier 1 as baseline capacity at cheap base rate up to DEFAULT_CAPACITY
- Define tier 2 as needed excess capacity charged at moderate 4× base rate
- Define tier 3 as idle excess capacity charged at high 20× penalty rate
- Provide clear incentives for scaling just enough to meet traffic demand without overprovisioning

deploy/aws/k8s-workloads.yaml CHANGED
@@ -22,7 +22,7 @@ metadata:
22
  # This is a Kubernetes-level safety net. Even if the agent's Python cap fails,
23
  # Kubernetes will refuse to create pods beyond this limit.
24
  #
25
- # Max 30 pods = 6 replicas x 5 deployments (our worst-case budget)
26
  # Max 8 CPU / 8GB RAM = enough for 30 small nginx pods
27
  apiVersion: v1
28
  kind: ResourceQuota
@@ -31,7 +31,7 @@ metadata:
31
  namespace: prod-sre
32
  spec:
33
  hard:
34
- pods: "30"
35
  requests.cpu: "8"
36
  requests.memory: 8Gi
37
  limits.cpu: "15"
@@ -39,7 +39,7 @@ spec:
39
 
40
  ---
41
  # payments — node-0 (VIP)
42
- # Business-critical payment service. Always has 2 replicas for redundancy.
43
  # The SRE agent should never SHED_LOAD on this (CRITICAL_NODES in simulator.py).
44
  apiVersion: apps/v1
45
  kind: Deployment
@@ -51,7 +51,7 @@ metadata:
51
  node-id: node-0
52
  critical: "true"
53
  spec:
54
- replicas: 2
55
  selector:
56
  matchLabels:
57
  app: payments
@@ -102,7 +102,7 @@ metadata:
102
  app: checkout
103
  node-id: node-1
104
  spec:
105
- replicas: 1
106
  selector:
107
  matchLabels:
108
  app: checkout
@@ -153,7 +153,7 @@ metadata:
153
  app: catalog
154
  node-id: node-2
155
  spec:
156
- replicas: 1
157
  selector:
158
  matchLabels:
159
  app: catalog
@@ -204,7 +204,7 @@ metadata:
204
  app: cart
205
  node-id: node-3
206
  spec:
207
- replicas: 1
208
  selector:
209
  matchLabels:
210
  app: cart
@@ -255,7 +255,7 @@ metadata:
255
  app: auth
256
  node-id: node-4
257
  spec:
258
- replicas: 1
259
  selector:
260
  matchLabels:
261
  app: auth
 
22
  # This is a Kubernetes-level safety net. Even if the agent's Python cap fails,
23
  # Kubernetes will refuse to create pods beyond this limit.
24
  #
25
+ # Max 45 pods = 9 replicas x 5 deployments (our worst-case budget; baseline is 3 per deployment)
26
  # Max 8 CPU / 8GB RAM = originally sized for 30 small nginx pods; TODO confirm it still covers the 45-pod limit
27
  apiVersion: v1
28
  kind: ResourceQuota
 
31
  namespace: prod-sre
32
  spec:
33
  hard:
34
+ pods: "45"
35
  requests.cpu: "8"
36
  requests.memory: 8Gi
37
  limits.cpu: "15"
 
39
 
40
  ---
41
  # payments — node-0 (VIP)
42
+ # Business-critical payment service. Starts at 3 replicas (matches simulator DEFAULT_CAPACITY).
43
  # The SRE agent should never SHED_LOAD on this (CRITICAL_NODES in simulator.py).
44
  apiVersion: apps/v1
45
  kind: Deployment
 
51
  node-id: node-0
52
  critical: "true"
53
  spec:
54
+ replicas: 3
55
  selector:
56
  matchLabels:
57
  app: payments
 
102
  app: checkout
103
  node-id: node-1
104
  spec:
105
+ replicas: 3
106
  selector:
107
  matchLabels:
108
  app: checkout
 
153
  app: catalog
154
  node-id: node-2
155
  spec:
156
+ replicas: 3
157
  selector:
158
  matchLabels:
159
  app: catalog
 
204
  app: cart
205
  node-id: node-3
206
  spec:
207
+ replicas: 3
208
  selector:
209
  matchLabels:
210
  app: cart
 
255
  app: auth
256
  node-id: node-4
257
  spec:
258
+ replicas: 3
259
  selector:
260
  matchLabels:
261
  app: auth
deploy/local-laptop.yaml CHANGED
@@ -9,7 +9,7 @@ metadata:
9
  name: auth
10
  namespace: prod-sre
11
  spec:
12
- replicas: 1
13
  selector:
14
  matchLabels:
15
  app: auth
@@ -81,7 +81,7 @@ metadata:
81
  name: cart
82
  namespace: prod-sre
83
  spec:
84
- replicas: 1
85
  selector:
86
  matchLabels:
87
  app: cart
@@ -153,7 +153,7 @@ metadata:
153
  name: catalog
154
  namespace: prod-sre
155
  spec:
156
- replicas: 1
157
  selector:
158
  matchLabels:
159
  app: catalog
@@ -225,7 +225,7 @@ metadata:
225
  name: checkout
226
  namespace: prod-sre
227
  spec:
228
- replicas: 1
229
  selector:
230
  matchLabels:
231
  app: checkout
@@ -297,7 +297,7 @@ metadata:
297
  name: payments
298
  namespace: prod-sre
299
  spec:
300
- replicas: 2
301
  selector:
302
  matchLabels:
303
  app: payments
 
9
  name: auth
10
  namespace: prod-sre
11
  spec:
12
+ replicas: 3
13
  selector:
14
  matchLabels:
15
  app: auth
 
81
  name: cart
82
  namespace: prod-sre
83
  spec:
84
+ replicas: 3
85
  selector:
86
  matchLabels:
87
  app: cart
 
153
  name: catalog
154
  namespace: prod-sre
155
  spec:
156
+ replicas: 3
157
  selector:
158
  matchLabels:
159
  app: catalog
 
225
  name: checkout
226
  namespace: prod-sre
227
  spec:
228
+ replicas: 3
229
  selector:
230
  matchLabels:
231
  app: checkout
 
297
  name: payments
298
  namespace: prod-sre
299
  spec:
300
+ replicas: 3
301
  selector:
302
  matchLabels:
303
  app: payments
server/AntiAtropos_environment.py CHANGED
@@ -9,7 +9,7 @@ from openenv.core.env_server.types import State
9
 
10
  try:
11
  from ..models import SREAction, ClusterObservation, NodeObservation, NodeStatus, EnvironmentMode
12
- from ..simulator import ClusterSimulator, COST_PER_CAPACITY_UNIT_PER_HOUR, OVERPROVISION_COST_PER_UNIT, CLUSTER_TOPOLOGY
13
  from ..stability import (
14
  compute_lyapunov,
15
  compute_lyapunov_graph,
@@ -25,7 +25,7 @@ try:
25
  from ..control import KubernetesExecutor, ActionValidator
26
  except ImportError:
27
  from models import SREAction, ClusterObservation, NodeObservation, NodeStatus, EnvironmentMode # type: ignore[no-redef]
28
- from simulator import ClusterSimulator, COST_PER_CAPACITY_UNIT_PER_HOUR, OVERPROVISION_COST_PER_UNIT, CLUSTER_TOPOLOGY # type: ignore[no-redef]
29
  from stability import ( # type: ignore[no-redef]
30
  compute_lyapunov,
31
  compute_lyapunov_graph,
@@ -385,13 +385,23 @@ class AntiAtroposEnvironment(Environment):
385
  return False, f"Unsupported environment mode: {self._mode}"
386
 
387
  def _compute_cost(self, nodes_true: list[dict]) -> float:
388
- """Two-tier cost: cheap for needed capacity, expensive for idling excess.
389
-
390
- Needed capacity = ceil(incoming_rate / 15.0) — the minimum replicas
391
- required to service current traffic. Excess beyond that is idling
392
- waste and costs OVERPROVISION_COST_PER_UNIT (20x base rate).
 
 
 
 
 
 
 
 
 
393
  """
394
  total_cost = 0.0
 
395
  for node in nodes_true:
396
  if node["status"] == NodeStatus.FAILED:
397
  continue
@@ -400,11 +410,20 @@ class AntiAtroposEnvironment(Environment):
400
  continue
401
  incoming = float(node.get("incoming_request_rate", 0.0))
402
  needed = max(1, int(math.ceil(incoming / 15.0)))
403
- if capacity <= needed:
 
 
404
  total_cost += capacity * COST_PER_CAPACITY_UNIT_PER_HOUR
405
  else:
406
- total_cost += needed * COST_PER_CAPACITY_UNIT_PER_HOUR
407
- total_cost += (capacity - needed) * OVERPROVISION_COST_PER_UNIT
 
 
 
 
 
 
 
408
  return total_cost
409
 
410
  def _avg_latency(self, nodes: list[dict]) -> float:
 
9
 
10
  try:
11
  from ..models import SREAction, ClusterObservation, NodeObservation, NodeStatus, EnvironmentMode
12
+ from ..simulator import ClusterSimulator, DEFAULT_CAPACITY, COST_PER_CAPACITY_UNIT_PER_HOUR, OVERPROVISION_COST_PER_UNIT, CLUSTER_TOPOLOGY
13
  from ..stability import (
14
  compute_lyapunov,
15
  compute_lyapunov_graph,
 
25
  from ..control import KubernetesExecutor, ActionValidator
26
  except ImportError:
27
  from models import SREAction, ClusterObservation, NodeObservation, NodeStatus, EnvironmentMode # type: ignore[no-redef]
28
+ from simulator import ClusterSimulator, DEFAULT_CAPACITY, COST_PER_CAPACITY_UNIT_PER_HOUR, OVERPROVISION_COST_PER_UNIT, CLUSTER_TOPOLOGY # type: ignore[no-redef]
29
  from stability import ( # type: ignore[no-redef]
30
  compute_lyapunov,
31
  compute_lyapunov_graph,
 
385
  return False, f"Unsupported environment mode: {self._mode}"
386
 
387
  def _compute_cost(self, nodes_true: list[dict]) -> float:
388
+ """Three-tier cost model calibrated to DEFAULT_CAPACITY as the baseline.
389
+
390
+ Tier 1 — Baseline capacity (up to DEFAULT_CAPACITY): cheap base rate.
391
+ Infrastructure already provisioned and paid for — no penalty.
392
+ Tier 2 — Needed excess (above DEFAULT_CAPACITY, up to 'needed'): moderate
393
+ rate (4× base). Agent added capacity that's actually serving traffic —
394
+ costs more but is justified by demand.
395
+ Tier 3 — Idle excess (above 'needed'): expensive penalty rate (20× base).
396
+ Capacity sitting idle beyond what traffic requires — pure waste.
397
+
398
+ 'needed' = ceil(incoming_rate / 15) — minimum units to serve traffic.
399
+ With DEFAULT_CAPACITY=3, a node at baseline costs 3 × $0.05 = $0.15/hr
400
+ regardless of traffic. Only scaling ABOVE baseline triggers higher rates,
401
+ giving the agent a clear gradient: scale just enough, not too much.
402
  """
403
  total_cost = 0.0
404
+ baseline_cap = int(DEFAULT_CAPACITY) # Tier 1 ceiling (imported from simulator)
405
  for node in nodes_true:
406
  if node["status"] == NodeStatus.FAILED:
407
  continue
 
410
  continue
411
  incoming = float(node.get("incoming_request_rate", 0.0))
412
  needed = max(1, int(math.ceil(incoming / 15.0)))
413
+
414
+ if capacity <= baseline_cap:
415
+ # Tier 1: baseline provisioned capacity — cheap base rate
416
  total_cost += capacity * COST_PER_CAPACITY_UNIT_PER_HOUR
417
  else:
418
+ # Tier 1: baseline portion at cheap rate
419
+ total_cost += baseline_cap * COST_PER_CAPACITY_UNIT_PER_HOUR
420
+ above_baseline = capacity - baseline_cap
421
+ justified = max(0, needed - baseline_cap) # excess that serves traffic
422
+ idle = above_baseline - justified # excess sitting idle
423
+ # Tier 2: needed excess at moderate rate (4× base)
424
+ total_cost += justified * (COST_PER_CAPACITY_UNIT_PER_HOUR * 4.0)
425
+ # Tier 3: idle excess at penalty rate (20× base)
426
+ total_cost += idle * OVERPROVISION_COST_PER_UNIT
427
  return total_cost
428
 
429
  def _avg_latency(self, nodes: list[dict]) -> float: