PranavKK1201 committed on
Commit
236abb8
·
1 Parent(s): d6439a8

increased ramp-up to be proportional to 10 nodes

Browse files
Files changed (1) hide show
  1. simulator.py +16 -12
simulator.py CHANGED
@@ -7,7 +7,7 @@
7
  """
8
  AntiAtropos Core Simulation Physics.
9
 
10
- A discrete-time fluid-queue model simulating a 5-node microservice cluster.
11
  Each node has stateful queues, capacities, and failure probabilities.
12
  Dynamic traffic is injected per tick, and management actions shift capacity
13
  and routing parameters.
@@ -42,21 +42,21 @@ COST_PER_CAPACITY_UNIT_PER_HOUR: float = 0.05
42
 
43
  # Task Profiles (Domain Randomization)
44
  # Task 1: Start very near capacity so reward/state react earlier.
45
- # Default μ_total = 5 nodes × 3 capacity × 15 = 225 req/tick.
46
  # λ_initial randomized close to saturation to avoid long flat early phases.
47
- T1_INITIAL_LAMBDA: float = 195.0
48
- T1_RAMP_SLOPE: float = 1.0 # +1 req per tick globally
49
- # Task 2: lambda ≈ 205 means 41/node (91% util), 51/survivor on failure (113% overload).
50
- T2_INITIAL_LAMBDA: float = 205.0
51
  T2_FAIL_TICK: int = 20
52
- T3_INITIAL_LAMBDA: float = 30.0
53
 
54
  # Task 3 surge parameters — base window, jitter applied per episode
55
  T3_SURGE_CYCLE: int = 60 # Cycle length (ticks)
56
  T3_SURGE_BASE_START: int = 30 # Nominal start of surge within cycle
57
  T3_SURGE_BASE_END: int = 40 # Nominal end of surge within cycle
58
  T3_SURGE_JITTER: int = 10 # ±jitter applied to start/end each episode
59
- T3_SURGE_MAGNITUDE: float = 70.0 # Extra req/tick added to node-1 and node-2
60
 
61
  # Hardening: Critical infrastructure that CANNOT be shed
62
  # In Task 3, these receive the surge. Forcing the agent to SCALE.
@@ -134,7 +134,7 @@ class ClusterSimulator:
134
  3. Failure Logic: Queue overflows trigger status degradation/node death.
135
  """
136
 
137
- def __init__(self, n_nodes: int = 5, task_id: str = "task-1", seed: Optional[int] = None):
138
  self._n_nodes = n_nodes
139
  self._task_id = task_id
140
  # Default to non-deterministic RNG seeding so fresh simulator instances
@@ -170,8 +170,8 @@ class ClusterSimulator:
170
  default_mu_total * 0.92, default_mu_total * 0.99
171
  )
172
  self._t2_fail_tick = self._rng.randint(10, 40)
173
- # Task 2: guarantee immediate overload on failure
174
- self._t2_init_lambda = self._rng.uniform(200.0, 220.0)
175
  # Task 3: jitter the surge window so the LLM can't memorise it.
176
  jitter = self._rng.randint(-T3_SURGE_JITTER, T3_SURGE_JITTER)
177
  self._t3_surge_start = T3_SURGE_BASE_START + jitter
@@ -398,9 +398,13 @@ class ClusterSimulator:
398
  and n.status != NodeStatus.FAILED
399
  ]
400
  # If every healthy node is rerouted this tick, fall back to all
401
- # healthy nodes to conserve total incoming traffic.
 
 
 
402
  if not absorbers:
403
  absorbers = [n for n in self._nodes if n.status != NodeStatus.FAILED]
 
404
  if absorbers:
405
  share = total_overflow / len(absorbers)
406
  for n in absorbers:
 
7
  """
8
  AntiAtropos Core Simulation Physics.
9
 
10
+ A discrete-time fluid-queue model simulating a 10-node microservice cluster.
11
  Each node has stateful queues, capacities, and failure probabilities.
12
  Dynamic traffic is injected per tick, and management actions shift capacity
13
  and routing parameters.
 
42
 
43
  # Task Profiles (Domain Randomization)
44
  # Task 1: Start very near capacity so reward/state react earlier.
45
+ # Default μ_total = 10 nodes × 3 capacity × 15 = 450 req/tick.
46
  # λ_initial randomized close to saturation to avoid long flat early phases.
47
+ T1_INITIAL_LAMBDA: float = 390.0
48
+ T1_RAMP_SLOPE: float = 2.0 # +2 req per tick globally (doubled for 10 nodes)
49
+ # Task 2: lambda ≈ 460 means 46/node (102% util), which creates dynamic queue pressure for the RL signal.
50
+ T2_INITIAL_LAMBDA: float = 460.0
51
  T2_FAIL_TICK: int = 20
52
+ T3_INITIAL_LAMBDA: float = 60.0
53
 
54
  # Task 3 surge parameters — base window, jitter applied per episode
55
  T3_SURGE_CYCLE: int = 60 # Cycle length (ticks)
56
  T3_SURGE_BASE_START: int = 30 # Nominal start of surge within cycle
57
  T3_SURGE_BASE_END: int = 40 # Nominal end of surge within cycle
58
  T3_SURGE_JITTER: int = 10 # ±jitter applied to start/end each episode
59
+ T3_SURGE_MAGNITUDE: float = 140.0 # Extra req/tick added to node-1 and node-2
60
 
61
  # Hardening: Critical infrastructure that CANNOT be shed
62
  # In Task 3, these receive the surge. Forcing the agent to SCALE.
 
134
  3. Failure Logic: Queue overflows trigger status degradation/node death.
135
  """
136
 
137
+ def __init__(self, n_nodes: int = 10, task_id: str = "task-1", seed: Optional[int] = None):
138
  self._n_nodes = n_nodes
139
  self._task_id = task_id
140
  # Default to non-deterministic RNG seeding so fresh simulator instances
 
170
  default_mu_total * 0.92, default_mu_total * 0.99
171
  )
172
  self._t2_fail_tick = self._rng.randint(10, 40)
173
+ # Task 2: guarantee immediate overload (46/node vs 45 capacity)
174
+ self._t2_init_lambda = self._rng.uniform(455.0, 475.0)
175
  # Task 3: jitter the surge window so the LLM can't memorise it.
176
  jitter = self._rng.randint(-T3_SURGE_JITTER, T3_SURGE_JITTER)
177
  self._t3_surge_start = T3_SURGE_BASE_START + jitter
 
398
  and n.status != NodeStatus.FAILED
399
  ]
400
  # If no absorber qualifies this tick, fall back to all
401
+ # healthy non-rerouted nodes first, then finally to all healthy nodes.
402
+ # CRITICAL: A FAILED node should NEVER be an absorber.
403
+ if not absorbers:
404
+ absorbers = [n for n in self._nodes if n.status != NodeStatus.FAILED and n.node_id not in rerouted_ids]
405
  if not absorbers:
406
  absorbers = [n for n in self._nodes if n.status != NodeStatus.FAILED]
407
+
408
  if absorbers:
409
  share = total_overflow / len(absorbers)
410
  for n in absorbers: