PranavKK1201 committed on
Commit ·
236abb8
1
Parent(s): d6439a8
increased ramp-up to be proportional to 10 nodes
Browse files- simulator.py +16 -12
simulator.py
CHANGED
|
@@ -7,7 +7,7 @@
|
|
| 7 |
"""
|
| 8 |
AntiAtropos Core Simulation Physics.
|
| 9 |
|
| 10 |
-
A discrete-time fluid-queue model simulating a
|
| 11 |
Each node has stateful queues, capacities, and failure probabilities.
|
| 12 |
Dynamic traffic is injected per tick, and management actions shift capacity
|
| 13 |
and routing parameters.
|
|
@@ -42,21 +42,21 @@ COST_PER_CAPACITY_UNIT_PER_HOUR: float = 0.05
|
|
| 42 |
|
| 43 |
# Task Profiles (Domain Randomization)
|
| 44 |
# Task 1: Start very near capacity so reward/state react earlier.
|
| 45 |
-
# Default μ_total =
|
| 46 |
# λ_initial randomized close to saturation to avoid long flat early phases.
|
| 47 |
-
T1_INITIAL_LAMBDA: float =
|
| 48 |
-
T1_RAMP_SLOPE: float =
|
| 49 |
-
# Task 2: lambda ≈
|
| 50 |
-
T2_INITIAL_LAMBDA: float =
|
| 51 |
T2_FAIL_TICK: int = 20
|
| 52 |
-
T3_INITIAL_LAMBDA: float =
|
| 53 |
|
| 54 |
# Task 3 surge parameters — base window, jitter applied per episode
|
| 55 |
T3_SURGE_CYCLE: int = 60 # Cycle length (ticks)
|
| 56 |
T3_SURGE_BASE_START: int = 30 # Nominal start of surge within cycle
|
| 57 |
T3_SURGE_BASE_END: int = 40 # Nominal end of surge within cycle
|
| 58 |
T3_SURGE_JITTER: int = 10 # ±jitter applied to start/end each episode
|
| 59 |
-
T3_SURGE_MAGNITUDE: float =
|
| 60 |
|
| 61 |
# Hardening: Critical infrastructure that CANNOT be shed
|
| 62 |
# In Task 3, these receive the surge. Forcing the agent to SCALE.
|
|
@@ -134,7 +134,7 @@ class ClusterSimulator:
|
|
| 134 |
3. Failure Logic: Queue overflows trigger status degradation/node death.
|
| 135 |
"""
|
| 136 |
|
| 137 |
-
def __init__(self, n_nodes: int =
|
| 138 |
self._n_nodes = n_nodes
|
| 139 |
self._task_id = task_id
|
| 140 |
# Default to non-deterministic RNG seeding so fresh simulator instances
|
|
@@ -170,8 +170,8 @@ class ClusterSimulator:
|
|
| 170 |
default_mu_total * 0.92, default_mu_total * 0.99
|
| 171 |
)
|
| 172 |
self._t2_fail_tick = self._rng.randint(10, 40)
|
| 173 |
-
# Task 2: guarantee immediate overload
|
| 174 |
-
self._t2_init_lambda = self._rng.uniform(
|
| 175 |
# Task 3: jitter the surge window so the LLM can't memorise it.
|
| 176 |
jitter = self._rng.randint(-T3_SURGE_JITTER, T3_SURGE_JITTER)
|
| 177 |
self._t3_surge_start = T3_SURGE_BASE_START + jitter
|
|
@@ -398,9 +398,13 @@ class ClusterSimulator:
|
|
| 398 |
and n.status != NodeStatus.FAILED
|
| 399 |
]
|
| 400 |
# If every healthy node is rerouted this tick, fall back to all
|
| 401 |
-
# healthy nodes
|
|
|
|
|
|
|
|
|
|
| 402 |
if not absorbers:
|
| 403 |
absorbers = [n for n in self._nodes if n.status != NodeStatus.FAILED]
|
|
|
|
| 404 |
if absorbers:
|
| 405 |
share = total_overflow / len(absorbers)
|
| 406 |
for n in absorbers:
|
|
|
|
| 7 |
"""
|
| 8 |
AntiAtropos Core Simulation Physics.
|
| 9 |
|
| 10 |
+
A discrete-time fluid-queue model simulating a 10-node microservice cluster.
|
| 11 |
Each node has stateful queues, capacities, and failure probabilities.
|
| 12 |
Dynamic traffic is injected per tick, and management actions shift capacity
|
| 13 |
and routing parameters.
|
|
|
|
| 42 |
|
| 43 |
# Task Profiles (Domain Randomization)
|
| 44 |
# Task 1: Start very near capacity so reward/state react earlier.
|
| 45 |
+
# Default μ_total = 10 nodes × 3 capacity × 15 = 450 req/tick.
|
| 46 |
# λ_initial randomized close to saturation to avoid long flat early phases.
|
| 47 |
+
T1_INITIAL_LAMBDA: float = 390.0
|
| 48 |
+
T1_RAMP_SLOPE: float = 2.0 # +2 req per tick globally (doubled for 10 nodes)
|
| 49 |
+
# Task 2: lambda ≈ 460 means 46/node (102% util) — creates dynamic queue pressure for RL signal.
|
| 50 |
+
T2_INITIAL_LAMBDA: float = 460.0
|
| 51 |
T2_FAIL_TICK: int = 20
|
| 52 |
+
T3_INITIAL_LAMBDA: float = 60.0
|
| 53 |
|
| 54 |
# Task 3 surge parameters — base window, jitter applied per episode
|
| 55 |
T3_SURGE_CYCLE: int = 60 # Cycle length (ticks)
|
| 56 |
T3_SURGE_BASE_START: int = 30 # Nominal start of surge within cycle
|
| 57 |
T3_SURGE_BASE_END: int = 40 # Nominal end of surge within cycle
|
| 58 |
T3_SURGE_JITTER: int = 10 # ±jitter applied to start/end each episode
|
| 59 |
+
T3_SURGE_MAGNITUDE: float = 140.0 # Extra req/tick added to node-1 and node-2
|
| 60 |
|
| 61 |
# Hardening: Critical infrastructure that CANNOT be shed
|
| 62 |
# In Task 3, these receive the surge. Forcing the agent to SCALE.
|
|
|
|
| 134 |
3. Failure Logic: Queue overflows trigger status degradation/node death.
|
| 135 |
"""
|
| 136 |
|
| 137 |
+
def __init__(self, n_nodes: int = 10, task_id: str = "task-1", seed: Optional[int] = None):
|
| 138 |
self._n_nodes = n_nodes
|
| 139 |
self._task_id = task_id
|
| 140 |
# Default to non-deterministic RNG seeding so fresh simulator instances
|
|
|
|
| 170 |
default_mu_total * 0.92, default_mu_total * 0.99
|
| 171 |
)
|
| 172 |
self._t2_fail_tick = self._rng.randint(10, 40)
|
| 173 |
+
# Task 2: guarantee immediate overload (46/node vs 45 capacity)
|
| 174 |
+
self._t2_init_lambda = self._rng.uniform(455.0, 475.0)
|
| 175 |
# Task 3: jitter the surge window so the LLM can't memorise it.
|
| 176 |
jitter = self._rng.randint(-T3_SURGE_JITTER, T3_SURGE_JITTER)
|
| 177 |
self._t3_surge_start = T3_SURGE_BASE_START + jitter
|
|
|
|
| 398 |
and n.status != NodeStatus.FAILED
|
| 399 |
]
|
| 400 |
# If every healthy node is rerouted this tick, fall back to all
|
| 401 |
+
# healthy non-rerouted nodes first and, if there are none, to all healthy nodes.
|
| 402 |
+
# CRITICAL: A FAILED node should NEVER be an absorber.
|
| 403 |
+
if not absorbers:
|
| 404 |
+
absorbers = [n for n in self._nodes if n.status != NodeStatus.FAILED and n.node_id not in rerouted_ids]
|
| 405 |
if not absorbers:
|
| 406 |
absorbers = [n for n in self._nodes if n.status != NodeStatus.FAILED]
|
| 407 |
+
|
| 408 |
if absorbers:
|
| 409 |
share = total_overflow / len(absorbers)
|
| 410 |
for n in absorbers:
|