perf(environment): improve cost model with three-tier capacity pricing

- Import DEFAULT_CAPACITY constant from simulator module as baseline capacity
- Update Kubernetes deployment replicas from 1 or 2 to 3 for all services in local-laptop and aws manifests
- Increase pod resource quota from 30 to 45 to support higher replica count
- Enhance _compute_cost() method with a three-tier cost calculation
- Define tier 1 as baseline capacity at cheap base rate up to DEFAULT_CAPACITY
- Define tier 2 as needed excess capacity charged at moderate 4× base rate
- Define tier 3 as idle excess capacity charged at high 20× penalty rate
- Provide clear incentives for scaling just enough to meet traffic demand without overprovisioning

Files changed (3)

deploy/aws/k8s-workloads.yaml +8 -8
deploy/local-laptop.yaml +5 -5
server/AntiAtropos_environment.py +29 -10

deploy/aws/k8s-workloads.yaml CHANGED Viewed

@@ -22,7 +22,7 @@ metadata:
 # This is a Kubernetes-level safety net. Even if the agent's Python cap fails,
 # Kubernetes will refuse to create pods beyond this limit.
 #
-# Max 30 pods = 6 replicas x 5 deployments (our worst-case budget)
 # Max 8 CPU / 8GB RAM = enough for 30 small nginx pods
 apiVersion: v1
 kind: ResourceQuota
@@ -31,7 +31,7 @@ metadata:
   namespace: prod-sre
 spec:
   hard:
-    pods: "30"
     requests.cpu: "8"
     requests.memory: 8Gi
     limits.cpu: "15"
@@ -39,7 +39,7 @@ spec:
 ---
 # payments — node-0 (VIP)
-# Business-critical payment service. Always has 2 replicas for redundancy.
 # The SRE agent should never SHED_LOAD on this (CRITICAL_NODES in simulator.py).
 apiVersion: apps/v1
 kind: Deployment
@@ -51,7 +51,7 @@ metadata:
     node-id: node-0
     critical: "true"
 spec:
-  replicas: 2
   selector:
     matchLabels:
       app: payments
@@ -102,7 +102,7 @@ metadata:
     app: checkout
     node-id: node-1
 spec:
-  replicas: 1
   selector:
     matchLabels:
       app: checkout
@@ -153,7 +153,7 @@ metadata:
     app: catalog
     node-id: node-2
 spec:
-  replicas: 1
   selector:
     matchLabels:
       app: catalog
@@ -204,7 +204,7 @@ metadata:
     app: cart
     node-id: node-3
 spec:
-  replicas: 1
   selector:
     matchLabels:
       app: cart
@@ -255,7 +255,7 @@ metadata:
     app: auth
     node-id: node-4
 spec:
-  replicas: 1
   selector:
     matchLabels:
       app: auth

 # This is a Kubernetes-level safety net. Even if the agent's Python cap fails,
 # Kubernetes will refuse to create pods beyond this limit.
 #
+# Max 45 pods = 9 replicas x 5 deployments (our worst-case budget; baseline is 3 per deployment)
 # Max 8 CPU / 8GB RAM = sized for 30 small nginx pods (NOTE(review): pod quota is now 45 — re-check CPU/memory headroom)
 apiVersion: v1
 kind: ResourceQuota
   namespace: prod-sre
 spec:
   hard:
+    pods: "45"
     requests.cpu: "8"
     requests.memory: 8Gi
     limits.cpu: "15"
 ---
 # payments — node-0 (VIP)
+# Business-critical payment service. Starts at 3 replicas (matches simulator DEFAULT_CAPACITY).
 # The SRE agent should never SHED_LOAD on this (CRITICAL_NODES in simulator.py).
 apiVersion: apps/v1
 kind: Deployment
     node-id: node-0
     critical: "true"
 spec:
+  replicas: 3
   selector:
     matchLabels:
       app: payments
     app: checkout
     node-id: node-1
 spec:
+  replicas: 3
   selector:
     matchLabels:
       app: checkout
     app: catalog
     node-id: node-2
 spec:
+  replicas: 3
   selector:
     matchLabels:
       app: catalog
     app: cart
     node-id: node-3
 spec:
+  replicas: 3
   selector:
     matchLabels:
       app: cart
     app: auth
     node-id: node-4
 spec:
+  replicas: 3
   selector:
     matchLabels:
       app: auth

deploy/local-laptop.yaml CHANGED Viewed

@@ -9,7 +9,7 @@ metadata:
   name: auth
   namespace: prod-sre
 spec:
-  replicas: 1
   selector:
     matchLabels:
       app: auth
@@ -81,7 +81,7 @@ metadata:
   name: cart
   namespace: prod-sre
 spec:
-  replicas: 1
   selector:
     matchLabels:
       app: cart
@@ -153,7 +153,7 @@ metadata:
   name: catalog
   namespace: prod-sre
 spec:
-  replicas: 1
   selector:
     matchLabels:
       app: catalog
@@ -225,7 +225,7 @@ metadata:
   name: checkout
   namespace: prod-sre
 spec:
-  replicas: 1
   selector:
     matchLabels:
       app: checkout
@@ -297,7 +297,7 @@ metadata:
   name: payments
   namespace: prod-sre
 spec:
-  replicas: 2
   selector:
     matchLabels:
       app: payments

   name: auth
   namespace: prod-sre
 spec:
+  replicas: 3
   selector:
     matchLabels:
       app: auth
   name: cart
   namespace: prod-sre
 spec:
+  replicas: 3
   selector:
     matchLabels:
       app: cart
   name: catalog
   namespace: prod-sre
 spec:
+  replicas: 3
   selector:
     matchLabels:
       app: catalog
   name: checkout
   namespace: prod-sre
 spec:
+  replicas: 3
   selector:
     matchLabels:
       app: checkout
   name: payments
   namespace: prod-sre
 spec:
+  replicas: 3
   selector:
     matchLabels:
       app: payments

server/AntiAtropos_environment.py CHANGED Viewed

@@ -9,7 +9,7 @@ from openenv.core.env_server.types import State
 try:
     from ..models import SREAction, ClusterObservation, NodeObservation, NodeStatus, EnvironmentMode
-    from ..simulator import ClusterSimulator, COST_PER_CAPACITY_UNIT_PER_HOUR, OVERPROVISION_COST_PER_UNIT, CLUSTER_TOPOLOGY
     from ..stability import (
         compute_lyapunov,
         compute_lyapunov_graph,
@@ -25,7 +25,7 @@ try:
     from ..control import KubernetesExecutor, ActionValidator
 except ImportError:
     from models import SREAction, ClusterObservation, NodeObservation, NodeStatus, EnvironmentMode  # type: ignore[no-redef]
-    from simulator import ClusterSimulator, COST_PER_CAPACITY_UNIT_PER_HOUR, OVERPROVISION_COST_PER_UNIT, CLUSTER_TOPOLOGY  # type: ignore[no-redef]
     from stability import (  # type: ignore[no-redef]
         compute_lyapunov,
         compute_lyapunov_graph,
@@ -385,13 +385,23 @@ class AntiAtroposEnvironment(Environment):
         return False, f"Unsupported environment mode: {self._mode}"
     def _compute_cost(self, nodes_true: list[dict]) -> float:
-        """Two-tier cost: cheap for needed capacity, expensive for idling excess.
-        Needed capacity = ceil(incoming_rate / 15.0) — the minimum replicas
-        required to service current traffic.  Excess beyond that is idling
-        waste and costs OVERPROVISION_COST_PER_UNIT (20x base rate).
         """
         total_cost = 0.0
         for node in nodes_true:
             if node["status"] == NodeStatus.FAILED:
                 continue
@@ -400,11 +410,20 @@ class AntiAtroposEnvironment(Environment):
                 continue
             incoming = float(node.get("incoming_request_rate", 0.0))
             needed = max(1, int(math.ceil(incoming / 15.0)))
-            if capacity <= needed:
                 total_cost += capacity * COST_PER_CAPACITY_UNIT_PER_HOUR
             else:
-                total_cost += needed * COST_PER_CAPACITY_UNIT_PER_HOUR
-                total_cost += (capacity - needed) * OVERPROVISION_COST_PER_UNIT
         return total_cost
     def _avg_latency(self, nodes: list[dict]) -> float:

 try:
     from ..models import SREAction, ClusterObservation, NodeObservation, NodeStatus, EnvironmentMode
+    from ..simulator import ClusterSimulator, DEFAULT_CAPACITY, COST_PER_CAPACITY_UNIT_PER_HOUR, OVERPROVISION_COST_PER_UNIT, CLUSTER_TOPOLOGY
     from ..stability import (
         compute_lyapunov,
         compute_lyapunov_graph,
     from ..control import KubernetesExecutor, ActionValidator
 except ImportError:
     from models import SREAction, ClusterObservation, NodeObservation, NodeStatus, EnvironmentMode  # type: ignore[no-redef]
+    from simulator import ClusterSimulator, DEFAULT_CAPACITY, COST_PER_CAPACITY_UNIT_PER_HOUR, OVERPROVISION_COST_PER_UNIT, CLUSTER_TOPOLOGY  # type: ignore[no-redef]
     from stability import (  # type: ignore[no-redef]
         compute_lyapunov,
         compute_lyapunov_graph,
         return False, f"Unsupported environment mode: {self._mode}"
     def _compute_cost(self, nodes_true: list[dict]) -> float:
+        """Three-tier cost model calibrated to DEFAULT_CAPACITY as the baseline.
+        Tier 1 — Baseline capacity (up to DEFAULT_CAPACITY): cheap base rate.
+            Infrastructure already provisioned and paid for — no penalty.
+        Tier 2 — Needed excess (above DEFAULT_CAPACITY, up to 'needed'): moderate
+            rate (4× base).  Agent added capacity that's actually serving traffic —
+            costs more but is justified by demand.
+        Tier 3 — Idle excess (above 'needed'): expensive penalty rate (20× base).
+            Capacity sitting idle beyond what traffic requires — pure waste.
+        'needed' = ceil(incoming_rate / 15) — minimum units to serve traffic.
+        With DEFAULT_CAPACITY=3, a node at baseline costs 3 × $0.05 = $0.15/hr
+        regardless of traffic.  Only scaling ABOVE baseline triggers higher rates,
+        giving the agent a clear gradient: scale just enough, not too much.
+        The Tier 2 amount is clamped to the capacity actually provisioned above
+        baseline, so an under-provisioned node (needed > capacity) is never billed
+        for units it does not have and the Tier 3 (idle) amount never goes negative.
+        Nodes whose status is NodeStatus.FAILED are skipped.  Returns the summed
+        cost over the remaining nodes in ``nodes_true``.
         """
         total_cost = 0.0
+        baseline_cap = int(DEFAULT_CAPACITY)  # Tier 1 ceiling (imported from simulator)
         for node in nodes_true:
             if node["status"] == NodeStatus.FAILED:
                 continue
                 continue
             incoming = float(node.get("incoming_request_rate", 0.0))
             needed = max(1, int(math.ceil(incoming / 15.0)))
+            if capacity <= baseline_cap:
+                # Tier 1: baseline provisioned capacity — cheap base rate
                 total_cost += capacity * COST_PER_CAPACITY_UNIT_PER_HOUR
             else:
+                # Tier 1: baseline portion at cheap rate
+                total_cost += baseline_cap * COST_PER_CAPACITY_UNIT_PER_HOUR
+                above_baseline = capacity - baseline_cap
+                # Excess that serves traffic, capped at what is provisioned: without
+                # the cap, needed > capacity makes `idle` negative and the 20× term
+                # subtracts from the total (the cost could even drop below zero).
+                justified = min(above_baseline, max(0, needed - baseline_cap))
+                idle = above_baseline - justified  # excess sitting idle (always >= 0)
+                # Tier 2: needed excess at moderate rate (4× base)
+                total_cost += justified * (COST_PER_CAPACITY_UNIT_PER_HOUR * 4.0)
+                # Tier 3: idle excess at penalty rate (20× base)
+                total_cost += idle * OVERPROVISION_COST_PER_UNIT
         return total_cost
     def _avg_latency(self, nodes: list[dict]) -> float: