graph modelling

Files changed (7) hide show

fix_colab_mcp.py +0 -48
models.py +15 -0
server/AntiAtropos_environment.py +26 -4
simulator.py +202 -69
smoke_test.py +8 -8
stability.py +41 -0
validate_dag_physics.py +512 -0

fix_colab_mcp.py DELETED Viewed

@@ -1,48 +0,0 @@
-import re
-path = r'C:\Users\kesha\AppData\Local\uv\cache\archive-v0\XqdAvZFy3eRi9W25WFXDP\Lib\site-packages\colab_mcp\session.py'
-with open(path, 'r') as f:
-    content = f.read()
-# The corrupted version from the failed PowerShell replacement
-old = '''async def check_session_proxy_tool_fn(random_string: str = " \\, ctx: Context = CurrentContext()) -> bool:
- \\\\\\Opens a connection to a Google Colab browser session.
- Args:
- random_string: A dummy parameter required by some MCP clients for
- tools with no real arguments. This value is ignored.
- \\\\\\
- fe_connected'''
-new = '''async def check_session_proxy_tool_fn(random_string: str = "", ctx: Context = CurrentContext()) -> bool:
-    """Opens a connection to a Google Colab browser session.
-    Args:
-        random_string: A dummy parameter required by some MCP clients for
-            tools with no real arguments. This value is ignored.
-    """
-    fe_connected'''
-if old in content:
-    content = content.replace(old, new)
-    with open(path, 'w') as f:
-        f.write(content)
-    print('Fixed corrupted version successfully.')
-else:
-    print('Old pattern not found. Checking current state...')
-    # Check if the file still has the original version or corrupted version
-    if 'async def check_session_proxy_tool_fn(ctx: Context = CurrentContext()) -> bool:' in content:
-        print('Found original version, applying fix...')
-        old2 = 'async def check_session_proxy_tool_fn(ctx: Context = CurrentContext()) -> bool:\n    fe_connected'
-        new2 = 'async def check_session_proxy_tool_fn(random_string: str = "", ctx: Context = CurrentContext()) -> bool:\n    """Opens a connection to a Google Colab browser session.\n\n    Args:\n        random_string: A dummy parameter required by some MCP clients for\n            tools with no real arguments. This value is ignored.\n    """\n    fe_connected'
-        content = content.replace(old2, new2)
-        with open(path, 'w') as f:
-            f.write(content)
-        print('Fixed original version successfully.')
-    else:
-        # Print the current function for debugging
-        lines = content.split('\n')
-        for i, line in enumerate(lines):
-            if 'check_session_proxy_tool_fn' in line:
-                print(f'Line {i}: {line}')
-        print('Could not find expected pattern.')

models.py CHANGED Viewed

@@ -111,6 +111,21 @@ class NodeObservation(BaseModel):
         description="How close this node is to SLA violation (0=safe, 1=violating).",
     )
     node_reward: float = Field(
         default=0.0,
         description="Per-node reward contribution for credit assignment.",

         description="How close this node is to SLA violation (0=safe, 1=violating).",
     )
+    outflow_rate: float = Field(
+        default=0.0,
+        ge=0.0,
+        le=1.0,
+        description="Normalised rate of requests dispatched downstream [0, 1].",
+    )
+    upstream_nodes: list[str] = Field(default_factory=list)
+    downstream_nodes: list[str] = Field(default_factory=list)
+    upstream_pressure: float = Field(
+        default=0.0,
+        ge=0.0,
+        le=1.0,
+        description="Mean queue depth of upstream parent nodes (normalised).",
+    )
     node_reward: float = Field(
         default=0.0,
         description="Per-node reward contribution for credit assignment.",

server/AntiAtropos_environment.py CHANGED Viewed

@@ -9,9 +9,10 @@ from openenv.core.env_server.types import State
 try:
     from ..models import SREAction, ClusterObservation, NodeObservation, NodeStatus, EnvironmentMode
-    from ..simulator import ClusterSimulator, COST_PER_CAPACITY_UNIT_PER_HOUR
     from ..stability import (
         compute_lyapunov,
         compute_reward,
         compute_barrier,
         normalize_reward,
@@ -24,9 +25,10 @@ try:
     from ..control import KubernetesExecutor, ActionValidator
 except ImportError:
     from models import SREAction, ClusterObservation, NodeObservation, NodeStatus, EnvironmentMode  # type: ignore[no-redef]
-    from simulator import ClusterSimulator, COST_PER_CAPACITY_UNIT_PER_HOUR  # type: ignore[no-redef]
     from stability import (  # type: ignore[no-redef]
         compute_lyapunov,
         compute_reward,
         compute_barrier,
         normalize_reward,
@@ -161,7 +163,7 @@ class AntiAtroposEnvironment(Environment):
         self._nodes_true = self._sim.state(for_agent=False)
         self._nodes_obs  = self._sim.state(for_agent=True)
-        self._prev_lyapunov = compute_lyapunov(self._nodes_true)
         return self._build_observation()
@@ -249,7 +251,7 @@ class AntiAtroposEnvironment(Environment):
             self._sla_violations += 1
         # 6. Compute Lyapunov stability metrics from Ground Truth
-        current_lyapunov = compute_lyapunov(self._nodes_true)
         # 7. Compute scalar reward (with barrier function)
         cost = self._compute_cost(self._nodes_true)
@@ -438,6 +440,22 @@ class AntiAtroposEnvironment(Environment):
             node_latency_norm = min(1.0, max(0.0, float(n["latency_ms"]) / MAX_LATENCY_NORM))
             sla_prox = max(0.0, min(1.0, node_latency_norm / 0.20))  # 0.20 is SLA threshold
             node_obs.append(NodeObservation(
                 node_id=n["node_id"],
                 status=n["status"],
@@ -451,6 +469,10 @@ class AntiAtroposEnvironment(Environment):
                 pending_capacity=float(n.get("pending_capacity_units", 0)) / 5.0,
                 queue_delta=queue_delta,
                 sla_proximity=sla_prox,
                 node_reward=node_reward_val,
                 done=False,
                 reward=0.0,

 try:
     from ..models import SREAction, ClusterObservation, NodeObservation, NodeStatus, EnvironmentMode
+    from ..simulator import ClusterSimulator, COST_PER_CAPACITY_UNIT_PER_HOUR, CLUSTER_TOPOLOGY
     from ..stability import (
         compute_lyapunov,
+        compute_lyapunov_graph,
         compute_reward,
         compute_barrier,
         normalize_reward,
     from ..control import KubernetesExecutor, ActionValidator
 except ImportError:
     from models import SREAction, ClusterObservation, NodeObservation, NodeStatus, EnvironmentMode  # type: ignore[no-redef]
+    from simulator import ClusterSimulator, COST_PER_CAPACITY_UNIT_PER_HOUR, CLUSTER_TOPOLOGY  # type: ignore[no-redef]
     from stability import (  # type: ignore[no-redef]
         compute_lyapunov,
+        compute_lyapunov_graph,
         compute_reward,
         compute_barrier,
         normalize_reward,
         self._nodes_true = self._sim.state(for_agent=False)
         self._nodes_obs  = self._sim.state(for_agent=True)
+        self._prev_lyapunov = compute_lyapunov_graph(self._nodes_true, CLUSTER_TOPOLOGY)
         return self._build_observation()
             self._sla_violations += 1
         # 6. Compute Lyapunov stability metrics from Ground Truth
+        current_lyapunov = compute_lyapunov_graph(self._nodes_true, CLUSTER_TOPOLOGY)
         # 7. Compute scalar reward (with barrier function)
         cost = self._compute_cost(self._nodes_true)
             node_latency_norm = min(1.0, max(0.0, float(n["latency_ms"]) / MAX_LATENCY_NORM))
             sla_prox = max(0.0, min(1.0, node_latency_norm / 0.20))  # 0.20 is SLA threshold
+            # Topology context for each node
+            node_upstreams = [
+                pid for pid, children in CLUSTER_TOPOLOGY.items()
+                if n["node_id"] in children
+            ]
+            node_downstreams = CLUSTER_TOPOLOGY.get(n["node_id"], [])
+            # Upstream pressure: mean normalised queue depth of parent nodes.
+            # Clamp to [0,1] because raw queues can transiently exceed MAX_QUEUE_NORM
+            # (e.g. during surge events), producing normalised values > 1.
+            parent_queues = [
+                min(1.0, max(0.0, float(prev_by_id.get(pid, {}).get("queue_depth", 0)) / MAX_QUEUE_NORM))
+                for pid in node_upstreams
+            ]
+            upstream_pressure = sum(parent_queues) / len(parent_queues) if parent_queues else 0.0
             node_obs.append(NodeObservation(
                 node_id=n["node_id"],
                 status=n["status"],
                 pending_capacity=float(n.get("pending_capacity_units", 0)) / 5.0,
                 queue_delta=queue_delta,
                 sla_proximity=sla_prox,
+                outflow_rate=min(1.0, float(n.get("outflow_rate", 0.0)) / MAX_REQUEST_RATE_NORM),
+                upstream_nodes=node_upstreams,
+                downstream_nodes=node_downstreams,
+                upstream_pressure=upstream_pressure,
                 node_reward=node_reward_val,
                 done=False,
                 reward=0.0,

simulator.py CHANGED Viewed

@@ -33,6 +33,8 @@ FATAL_FAIL_THRESHOLD: int   = 200     # Hard cap on queue depth (catastrophic fa
 CASCADE_WINDOW_TICKS: int = 3     # Ticks after a failure to check for cascade effects
 CASCADE_QUEUE_MULTIPLIER: float = 1.2  # Queue must exceed FATAL_FAIL_THRESHOLD * this to cascade
 NODE_RECOVERY_TICKS: int   = 20      # Ticks before a FAILED node auto-recovers
 SENSOR_DROPOUT_PROB:  float = 0.05    # P(node.queue, latency reports 0 or -1.0)
 NODE_FAILURE_PROB:    float = 0.00    # P(node fails naturally) — largely driven by task profile
@@ -41,13 +43,13 @@ NODE_FAILURE_PROB:    float = 0.00    # P(node fails naturally) — largely driv
 COST_PER_CAPACITY_UNIT_PER_HOUR: float = 0.05
 # Task Profiles (Domain Randomization)
-# Task 1: Start very near capacity so reward/state react earlier.
-# Default μ_total = 10 nodes × 3 capacity × 15 = 450 req/tick.
-# λ_initial randomized close to saturation to avoid long flat early phases.
-T1_INITIAL_LAMBDA: float = 390.0
-T1_RAMP_SLOPE:     float = 2.0   # +2 req per tick globally (doubled for 10 nodes)
-# Task 2: lambda ≈ 460 means 46/node (102% util) — creates dynamic queue pressure for RL signal.
-T2_INITIAL_LAMBDA: float = 460.0
 T2_FAIL_TICK:      int   = 20
 T3_INITIAL_LAMBDA: float = 60.0
@@ -71,6 +73,41 @@ VIP_NODE_WEIGHTS: dict[str, float] = {
     "node-0": 2.0,
 }
 class NodeStatus(str, Enum):
     HEALTHY  = "HEALTHY"
@@ -98,6 +135,7 @@ class NodeState:
     pending_capacity_queue: list[int] = field(default_factory=list)
     recovery_timer: int = 0          # Countdown to auto-recovery from FAILED status
     is_scripted_failure: bool = False  # True if failed due to task scripting (no auto-recovery)
     # Derived (recomputed whenever capacity or status changes)
     @property
@@ -124,6 +162,7 @@ class NodeState:
             "pending_capacity_units": int(len(self.pending_capacity_queue)),
             "recovery_timer": self.recovery_timer,
             "is_scripted_failure": self.is_scripted_failure,
         }
@@ -166,15 +205,22 @@ class ClusterSimulator:
     def _randomize_domain(self) -> None:
         """Apply domain randomization for RL robustness across tasks."""
         self._t1_ramp_slope = self._rng.uniform(0.8, 2.0)
-        # Task 1: start between 92–99 % of default cluster capacity so
-        # the system is responsive early (less flat reward plateaus).
-        default_mu_total = self._n_nodes * DEFAULT_CAPACITY * 15.0  # 225
         self._t1_init_lambda = self._rng.uniform(
-            default_mu_total * 0.92, default_mu_total * 0.99
         )
         self._t2_fail_tick = self._rng.randint(10, 40)
-        # Task 2: guarantee immediate overload (46/node vs 45 capacity)
-        self._t2_init_lambda = self._rng.uniform(455.0, 475.0)
         # Task 3: jitter the surge window so the LLM can't memorise it.
         jitter = self._rng.randint(-T3_SURGE_JITTER, T3_SURGE_JITTER)
         self._t3_surge_start = T3_SURGE_BASE_START + jitter
@@ -275,21 +321,27 @@ class ClusterSimulator:
         self._tick_count += 1
         self._update_capacity()
         self._inject_traffic()
         # Reset per-tick shed counters before physics update
         for node in self._nodes:
             node.dropped_requests = 0.0
         self._update_queues()
         self._update_derived_metrics()
         self._update_statuses()
         self._cascade_failures()
         self._process_recovery()
-        # Decay shed fractions gradually (retain 80% per tick = slow decay)
-        # The agent must still re-issue to maintain full effect, but the
-        # effect doesn't vanish instantly.  *= 0.8 means after 3 ticks
-        # the shed is still at 51% (0.8^3), vs old 0.0 after 1 tick.
         for node in self._nodes:
-            node.shed_fraction *= 0.8
-            if node.shed_fraction < 0.01:
                 node.shed_fraction = 0.0
     def _update_capacity(self) -> None:
@@ -304,62 +356,106 @@ class ClusterSimulator:
             node.pending_capacity_queue = [delay for delay in node.pending_capacity_queue if delay > 0]
     def _inject_traffic(self) -> None:
-        """Determine λ_i per node based on task and routing state."""
-        total_lambda = 0.0
         if self._task_id == "task-1":
-            # Task 1: Linear Ramp — starts near cluster capacity
             total_lambda = self._t1_init_lambda + (self._t1_ramp_slope * self._tick_count)
         elif self._task_id == "task-2":
-            # Task 2: Fault Tolerance
             total_lambda = self._t2_init_lambda
             if self._tick_count >= self._t2_fail_tick and not self._failed_node_id:
                 self._failed_node_id = self._rng.choice(
                     [n.node_id for n in self._nodes if n.node_id != "node-0"]
                 )
-                # Mark the chosen node as a scripted (permanent) failure
                 target = next((n for n in self._nodes if n.node_id == self._failed_node_id), None)
                 if target:
                     target.is_scripted_failure = True
-            # Physics change: In Task 2, we do NOT redistribute dead node traffic
-            # automatically. The infrastructure keeps sending λ/N to the failed node
-            # (causing errors) until the agent chooses REROUTE_TRAFFIC or SCALE_UP.
-            base_share = total_lambda / self._n_nodes
-            for n in self._nodes:
-                if n.node_id == self._failed_node_id:
-                    n.status = NodeStatus.FAILED
-                    # If the agent hasn't rerouted traffic away, it still hits the failed node
-                    n.incoming_request_rate = base_share
-                else:
-                    n.incoming_request_rate = base_share
-            # This is where the agent's actions (REROUTE_TRAFFIC) physically
-            # move the share from the failed node to the survivors.
-            self._apply_reroute_weights()
-            return
         elif self._task_id == "task-3":
-            # Task 3: Periodic surge — window is jittered per episode
             total_lambda = T3_INITIAL_LAMBDA
             phase = self._tick_count % T3_SURGE_CYCLE
             if self._t3_surge_start <= phase <= self._t3_surge_end:
-                surge = T3_SURGE_MAGNITUDE
-                for n in self._nodes:
-                    if n.node_id in ["node-1", "node-2"]:
-                        n.incoming_request_rate = (total_lambda / self._n_nodes) + surge
-                    else:
-                        n.incoming_request_rate = total_lambda / self._n_nodes
-                return
-        # --- Default: distribute traffic evenly, then apply rerouting ---
-        base_share = total_lambda / self._n_nodes
-        for n in self._nodes:
-            n.incoming_request_rate = base_share
         self._apply_reroute_weights()
     def _apply_reroute_weights(self) -> None:
         """
         Apply REROUTE_TRAFFIC adjustments.
@@ -414,13 +510,31 @@ class ClusterSimulator:
                     n.incoming_request_rate += share
         # Decay weights — agent must keep re-issuing to maintain effect
-        # *= 0.8 retains 80% per tick (slow decay, persistent effect).
-        # After 5 ticks without re-issue, effect is at 33% (0.8^5).
         for nid in list(self._reroute_weights.keys()):
-            self._reroute_weights[nid] *= 0.8
-            if self._reroute_weights[nid] < 0.01:
                 del self._reroute_weights[nid]
     def _update_queues(self) -> None:
         """
         Fluid-queue update for all nodes.
@@ -484,6 +598,7 @@ class ClusterSimulator:
                 if n.status != NodeStatus.FAILED:
                     n.status = NodeStatus.FAILED
                     n.recovery_timer = NODE_RECOVERY_TICKS
                     self._cascade_triggered = True  # Signal cascade detection
             elif n.queue_depth > OVERLOAD_THRESHOLD:
                 n.status = NodeStatus.DEGRADED
@@ -496,7 +611,7 @@ class ClusterSimulator:
         Guardrails:
         - Only triggers when a NEW failure occurred this tick (not any failed node).
-        - Max one cascade step per failure event (no cascade chains).
         - Scripted failures (Task 2) do not trigger cascades.
         """
         if not self._cascade_triggered:
@@ -509,18 +624,27 @@ class ClusterSimulator:
             self._cascade_tick = 0
             return
         cascade_threshold = FATAL_FAIL_THRESHOLD * CASCADE_QUEUE_MULTIPLIER
-        cascaded_this_tick = 0
         for n in self._nodes:
-            if cascaded_this_tick >= 1:
-                break  # Max one cascade per window to prevent chain reactions
-            if n.status == NodeStatus.FAILED:
-                continue
-            if n.is_scripted_failure:
                 continue
             if n.queue_depth > cascade_threshold:
                 n.status = NodeStatus.DEGRADED
-                cascaded_this_tick += 1
     def _process_recovery(self) -> None:
         """Count down recovery timers and bring FAILED nodes back online.
@@ -528,17 +652,26 @@ class ClusterSimulator:
         Only overload-failed nodes (recovery_timer > 0) can recover.
         Scripted failures (is_scripted_failure=True) are excluded.
         """
         for n in self._nodes:
             if n.is_scripted_failure:
                 continue
-            if n.status == NodeStatus.FAILED and n.recovery_timer > 0:
                 n.recovery_timer -= 1
                 if n.recovery_timer <= 0:
                     n.status = NodeStatus.HEALTHY
-                    n.capacity = 1.0  # Recover at minimum capacity
                     n.queue_depth = 0.0
                     n.latency_ms = BASE_LATENCY_MS
                     n.cpu_utilization = 0.0
     def reconcile_state(self, telemetry_map: dict) -> None:
         """

 CASCADE_WINDOW_TICKS: int = 3     # Ticks after a failure to check for cascade effects
 CASCADE_QUEUE_MULTIPLIER: float = 1.2  # Queue must exceed FATAL_FAIL_THRESHOLD * this to cascade
 NODE_RECOVERY_TICKS: int   = 20      # Ticks before a FAILED node auto-recovers
+BACKPRESSURE_THRESHOLD: float = 60.0   # Queue depth that triggers backpressure
+BACKPRESSURE_MAX_FACTOR: float = 0.4   # Maximum service rate reduction (40%)
 SENSOR_DROPOUT_PROB:  float = 0.05    # P(node.queue, latency reports 0 or -1.0)
 NODE_FAILURE_PROB:    float = 0.00    # P(node fails naturally) — largely driven by task profile
 COST_PER_CAPACITY_UNIT_PER_HOUR: float = 0.05
 # Task Profiles (Domain Randomization)
+# Task 1: Start at 92-99% of ingress capacity (randomised in _randomize_domain).
+# DAG ingress capacity = 2 ingress nodes * DEFAULT_CAPACITY * 15 = 90 req/tick.
+# lambda_init ≈ 83-89 so each ingress node sees ~41-44 req/tick (just under 45 capacity).
+T1_INITIAL_LAMBDA: float = 86.0   # midpoint of [82.8, 89.1]; overridden by _randomize_domain
+T1_RAMP_SLOPE:     float = 0.5    # default ramp: cluster-wide λ grows by 0.5 req/tick each tick
+# Task 2: lambda at 100-110% of ingress capacity — guarantees immediate ingress overload.
+T2_INITIAL_LAMBDA: float = 95.0   # midpoint of [90, 99]; overridden by _randomize_domain
 T2_FAIL_TICK:      int   = 20
 T3_INITIAL_LAMBDA: float = 60.0
     "node-0": 2.0,
 }
+# ---------------------------------------------------------------------------
+# Graph Topology (DAG — fixed 5-node cluster architecture)
+# ---------------------------------------------------------------------------
+# Directed edges: parent -> list of direct children.
+# node-0 (payments/VIP) is the ingress that feeds the rest of the DAG; node-4 (auth) is a
+# second ingress with no upstream or downstream edges.
+CLUSTER_TOPOLOGY: dict[str, list[str]] = {
+    "node-0": ["node-1", "node-2"],
+    "node-1": [],
+    "node-2": ["node-3"],
+    "node-3": [],
+    "node-4": [],
+}
+# Nodes that receive raw external traffic directly.
+EXTERNAL_TRAFFIC_NODES: set[str] = {"node-0", "node-4"}
+# 50/50 external λ split between the two ingress nodes.
+# node-0 (payments/VIP) and node-4 (auth) each receive half of total_lambda.
+# total_lambda is the cluster-wide external arrival rate (req/tick).
+# Each ingress node therefore sees total_lambda * 0.5 req/tick at its input.
+EXTERNAL_LAMBDA_FRACTION: float = 0.5
+# Default upstream-to-downstream routing weights (parent → child splits).
+# These represent the baseline traffic split before agent rerouting.
+DEFAULT_ROUTING_SPLIT: dict[str, dict[str, float]] = {
+    "node-0": {"node-1": 0.5, "node-2": 0.5},
+    "node-2": {"node-3": 1.0},
+}
+# Pre-computed topological order (Kahn's BFS on CLUSTER_TOPOLOGY).
+# Ensures parents are always processed before their children in _inject_traffic().
+# Order: node-0, node-4 (roots) → node-1, node-2 (node-0 children) → node-3 (node-2 child).
+_TOPOLOGICAL_ORDER: tuple[str, ...] = ("node-0", "node-4", "node-1", "node-2", "node-3")
 class NodeStatus(str, Enum):
     HEALTHY  = "HEALTHY"
     pending_capacity_queue: list[int] = field(default_factory=list)
     recovery_timer: int = 0          # Countdown to auto-recovery from FAILED status
     is_scripted_failure: bool = False  # True if failed due to task scripting (no auto-recovery)
+    outflow_rate: float = 0.0         # Requests/tick actually dispatched downstream (DAG edge signal)
     # Derived (recomputed whenever capacity or status changes)
     @property
             "pending_capacity_units": int(len(self.pending_capacity_queue)),
             "recovery_timer": self.recovery_timer,
             "is_scripted_failure": self.is_scripted_failure,
+            "outflow_rate": round(self.outflow_rate, 2),
         }
     def _randomize_domain(self) -> None:
         """Apply domain randomization for RL robustness across tasks."""
         self._t1_ramp_slope = self._rng.uniform(0.8, 2.0)
+        # DAG calibration: total_lambda is split across 2 ingress nodes (node-0, node-4).
+        # Each ingress node's capacity is DEFAULT_CAPACITY * 15 req/tick.
+        # Ingress cluster capacity = len(EXTERNAL_TRAFFIC_NODES) * DEFAULT_CAPACITY * 15 = 90.
+        # Task 1: start between 92-99% of ingress capacity so the ingress nodes are
+        # near saturation immediately, producing rich early reward signal.
+        n_ingress = len(EXTERNAL_TRAFFIC_NODES)  # 2
+        ingress_mu_total = n_ingress * DEFAULT_CAPACITY * 15.0  # 90 req/tick
         self._t1_init_lambda = self._rng.uniform(
+            ingress_mu_total * 0.92, ingress_mu_total * 0.99
         )
         self._t2_fail_tick = self._rng.randint(10, 40)
+        # Task 2: start at 100-110% of ingress capacity so the ingress nodes are at or just
+        # above saturation from the first tick. Each ingress node sees total_lambda / 2.
+        self._t2_init_lambda = self._rng.uniform(
+            ingress_mu_total * 1.00, ingress_mu_total * 1.10
+        )
         # Task 3: jitter the surge window so the LLM can't memorise it.
         jitter = self._rng.randint(-T3_SURGE_JITTER, T3_SURGE_JITTER)
         self._t3_surge_start = T3_SURGE_BASE_START + jitter
         self._tick_count += 1
         self._update_capacity()
         self._inject_traffic()
+        # Save original capacities; backpressure temporarily reduces service_rate
+        # for this tick only. Restore after _update_queues so the reduction does
+        # not compound across ticks and permanently cripple parent nodes.
+        saved_capacities = {n.node_id: n.capacity for n in self._nodes}
+        self._apply_backpressure()
         # Reset per-tick shed counters before physics update
         for node in self._nodes:
             node.dropped_requests = 0.0
         self._update_queues()
+        # Restore capacities so the next tick starts from the true provisioned level
+        for n in self._nodes:
+            n.capacity = saved_capacities.get(n.node_id, n.capacity)
         self._update_derived_metrics()
         self._update_statuses()
         self._cascade_failures()
         self._process_recovery()
+        # Decay shed fractions: *= 0.5 retains 50% per tick (fast decay), and any
+        # fraction that falls below 0.05 is snapped to 0.
         for node in self._nodes:
+            node.shed_fraction *= 0.5
+            if node.shed_fraction < 0.05:
                 node.shed_fraction = 0.0
     def _update_capacity(self) -> None:
             node.pending_capacity_queue = [delay for delay in node.pending_capacity_queue if delay > 0]
     def _inject_traffic(self) -> None:
+        """
+        Distribute traffic through the cluster DAG in three phases.
+        Phase 1 — Task lambda + scripted events:
+            Compute total external λ for this tick and apply any task-specific
+            mutations (node failure scripting, surge flags).  No early returns;
+            all branches fall through to the shared DAG in Phase 2.
+        Phase 2 — Topological DAG distribution:
+            Traverse _TOPOLOGICAL_ORDER (roots first).  Each parent's
+            processed outflow (min(incoming, service_rate)) is split across
+            its children via DEFAULT_ROUTING_SPLIT.  A FAILED node has
+            service_rate=0, so outflow=0 and its children are naturally
+            starved — this is the causal failure chain the RL agent must
+            learn to route around.
+        Phase 3 — Reroute weight correction:
+            Apply REROUTE_TRAFFIC weight adjustments post-DAG, then decay
+            weights.  Keeps reroute semantics identical to pre-DAG behaviour.
+            Finally, each node's outflow_rate is recomputed from its
+            post-reroute incoming rate.
+        """
+        # -------------------------------------------------------------------
+        # Phase 1: task-specific lambda + scripted events (no early returns)
+        # -------------------------------------------------------------------
+        total_lambda: float = 0.0
+        # direct_injections: extra traffic added directly to a node ON TOP OF
+        # the DAG distribution.  Used for Task-3 surge bursts that model a
+        # side-channel load source (e.g. bulk import hitting checkout/catalog
+        # directly), while the base λ still enters through the ingress nodes
+        # (node-0 and node-4).
+        direct_injections: dict[str, float] = {}
         if self._task_id == "task-1":
+            # Linear ramp — starts near ingress capacity (see _randomize_domain)
             total_lambda = self._t1_init_lambda + (self._t1_ramp_slope * self._tick_count)
         elif self._task_id == "task-2":
             total_lambda = self._t2_init_lambda
+            # Scripted node failure fires at the configured tick
             if self._tick_count >= self._t2_fail_tick and not self._failed_node_id:
                 self._failed_node_id = self._rng.choice(
                     [n.node_id for n in self._nodes if n.node_id != "node-0"]
                 )
                 target = next((n for n in self._nodes if n.node_id == self._failed_node_id), None)
                 if target:
                     target.is_scripted_failure = True
+            # No early return: DAG distributes traffic to the failed node normally.
+            # The dead node's service_rate=0 means outflow=0, so its children are
+            # starved. _update_queues() converts all its incoming traffic to
+            # dropped_requests.  The agent must issue REROUTE_TRAFFIC to shift
+            # the parent's split away from the dead child.
         elif self._task_id == "task-3":
             total_lambda = T3_INITIAL_LAMBDA
             phase = self._tick_count % T3_SURGE_CYCLE
             if self._t3_surge_start <= phase <= self._t3_surge_end:
+                # Surge is modelled as a direct external burst arriving at the
+                # checkout (node-1) and catalog (node-2) services from a side
+                # channel that bypasses the payment gateway ingress.
+                # Base λ still routes through the DAG; the surge is overlaid so
+                # CRITICAL_NODE protections (no SHED_LOAD on node-1/2) still apply.
+                for nid in ["node-1", "node-2"]:
+                    direct_injections[nid] = T3_SURGE_MAGNITUDE
+        # -------------------------------------------------------------------
+        # Phase 2: DAG topological distribution
+        # -------------------------------------------------------------------
+        node_incoming: dict[str, float] = {n.node_id: 0.0 for n in self._nodes}
+        node_map: dict[str, "NodeState"] = {n.node_id: n for n in self._nodes}
+        # Seed ingress nodes with their share of external λ
+        node_incoming["node-0"] = total_lambda * EXTERNAL_LAMBDA_FRACTION
+        node_incoming["node-4"] = total_lambda * (1.0 - EXTERNAL_LAMBDA_FRACTION)
+        # Overlay task-specific direct injections (Task-3 surge)
+        for nid, extra in direct_injections.items():
+            node_incoming[nid] = node_incoming.get(nid, 0.0) + extra
+        # Propagate outflow through the graph in topological order
+        for parent_id in _TOPOLOGICAL_ORDER:
+            parent = node_map.get(parent_id)
+            if parent is None:
+                continue
+            parent.incoming_request_rate = node_incoming[parent_id]
+            # Outflow = requests the parent actually forwards downstream.
+            # FAILED nodes have service_rate=0 → outflow=0 → children starved.
+            outflow = min(parent.incoming_request_rate, parent.service_rate)
+            parent.outflow_rate = outflow
+            for child_id, split in DEFAULT_ROUTING_SPLIT.get(parent_id, {}).items():
+                node_incoming[child_id] = node_incoming.get(child_id, 0.0) + outflow * split
+        # -------------------------------------------------------------------
+        # Phase 3: REROUTE_TRAFFIC weight corrections (post-DAG)
+        # -------------------------------------------------------------------
         self._apply_reroute_weights()
+        # Recalculate outflow after reroute so the agent sees accurate
+        # per-node dispatch rates.  Without this, a node whose incoming was
+        # halved by reroute would still report its pre-reroute outflow.
+        for n in self._nodes:
+            n.outflow_rate = min(n.incoming_request_rate, n.service_rate)
     def _apply_reroute_weights(self) -> None:
         """
         Apply REROUTE_TRAFFIC adjustments.
                     n.incoming_request_rate += share
         # Decay weights — agent must keep re-issuing to maintain effect
+        # *= 0.5 retains 50% per tick (fast decay).
         for nid in list(self._reroute_weights.keys()):
+            self._reroute_weights[nid] *= 0.5
+            if self._reroute_weights[nid] < 0.05:
                 del self._reroute_weights[nid]
+    def _apply_backpressure(self) -> None:
+        """Reduce parent service rate when children are overloaded."""
+        for parent_id, children in CLUSTER_TOPOLOGY.items():
+            parent = next((n for n in self._nodes if n.node_id == parent_id), None)
+            if not children or not parent or parent.status == NodeStatus.FAILED:
+                continue
+            # Compute pressure from overloaded children
+            total_pressure = 0.0
+            for child_id in children:
+                child = next((n for n in self._nodes if n.node_id == child_id), None)
+                if child:
+                    excess = max(0.0, child.queue_depth - BACKPRESSURE_THRESHOLD)
+                    total_pressure += excess / FATAL_FAIL_THRESHOLD  # fraction of the fatal queue depth (sum is capped below)
+            # Reduce parent's effective capacity proportionally
+            pressure_factor = min(BACKPRESSURE_MAX_FACTOR, total_pressure * 0.6)
+            parent.capacity = max(1.0, parent.capacity * (1.0 - pressure_factor))
     def _update_queues(self) -> None:
         """
         Fluid-queue update for all nodes.
                 if n.status != NodeStatus.FAILED:
                     n.status = NodeStatus.FAILED
                     n.recovery_timer = NODE_RECOVERY_TICKS
+                    n.capacity = 0.5   # recovery starts from 0.5 capacity units and ramps up in _process_recovery
                     self._cascade_triggered = True  # Signal cascade detection
             elif n.queue_depth > OVERLOAD_THRESHOLD:
                 n.status = NodeStatus.DEGRADED
         Guardrails:
         - Only triggers when a NEW failure occurred this tick (not any failed node).
+        - Graph-bounded: cascade only propagates along edges (parents or children).
         - Scripted failures (Task 2) do not trigger cascades.
         """
         if not self._cascade_triggered:
             self._cascade_tick = 0
             return
+        # Find all currently failed nodes
+        failed_ids = {n.node_id for n in self._nodes if n.status == NodeStatus.FAILED}
+        # Build set of nodes adjacent to any failed node (upstream or downstream)
+        at_risk = set()
+        for failed_id in failed_ids:
+            # Downstream children of the failed node
+            at_risk.update(CLUSTER_TOPOLOGY.get(failed_id, []))
+            # Upstream parents of the failed node
+            for parent_id, children in CLUSTER_TOPOLOGY.items():
+                if failed_id in children:
+                    at_risk.add(parent_id)
         cascade_threshold = FATAL_FAIL_THRESHOLD * CASCADE_QUEUE_MULTIPLIER
         for n in self._nodes:
+            if n.node_id not in at_risk:
+                continue   # Not adjacent to failure — cannot cascade
+            if n.status == NodeStatus.FAILED or n.is_scripted_failure:
                 continue
             if n.queue_depth > cascade_threshold:
                 n.status = NodeStatus.DEGRADED
     def _process_recovery(self) -> None:
         """Count down recovery timers and bring FAILED nodes back online.
         Only overload-failed nodes (recovery_timer > 0) can recover.
         Scripted failures (is_scripted_failure=True) are excluded.
         """
+        RECOVERY_RAMP_PER_TICK: float = 0.5   # capacity added per tick during recovery
         for n in self._nodes:
             if n.is_scripted_failure:
                 continue
+            # Check recovery_timer > 0, not status == FAILED: the first recovery
+            # tick transitions the node to DEGRADED, but the timer must keep
+            # counting until it reaches 0 and the node becomes HEALTHY.
+            if n.recovery_timer > 0:
                 n.recovery_timer -= 1
                 if n.recovery_timer <= 0:
                     n.status = NodeStatus.HEALTHY
                     n.queue_depth = 0.0
                     n.latency_ms = BASE_LATENCY_MS
                     n.cpu_utilization = 0.0
+                    # capacity stays at whatever it ramped to
+                else:
+                    # Still in recovery: ramp capacity up, stay DEGRADED
+                    n.capacity = min(DEFAULT_CAPACITY, n.capacity + RECOVERY_RAMP_PER_TICK)
+                    n.status = NodeStatus.DEGRADED   # stays DEGRADED until recovery_timer reaches 0
     def reconcile_state(self, telemetry_map: dict) -> None:
         """

smoke_test.py CHANGED Viewed

@@ -172,16 +172,16 @@ def test_simulator_node_count():
     sim = ClusterSimulator(n_nodes=5, task_id="task-1", seed=1)
     nodes = sim.state(for_agent=False)
-    record("10 nodes created",
-           PASS if len(nodes) == 10 else FAIL,
            f"got {len(nodes)}")
     record("node-0 is VIP",
            PASS if nodes[0]["is_vip"] else FAIL,
            f"is_vip={nodes[0]['is_vip']}")
-    record("node-0 weight=4.0",
-           PASS if nodes[0]["importance_weight"] == 4.0 else FAIL,
            f"weight={nodes[0]['importance_weight']}")
     non_vip_weights = [n["importance_weight"] for n in nodes[1:]]
@@ -190,8 +190,8 @@ def test_simulator_node_count():
            f"unique weights={set(non_vip_weights)}")
     node_ids = [n["node_id"] for n in nodes]
-    expected_ids = [f"node-{i}" for i in range(10)]
-    record("Node IDs 0-9",
            PASS if node_ids == expected_ids else FAIL,
            f"ids={node_ids}")
@@ -542,8 +542,8 @@ def test_cascade_and_recovery():
     sim = ClusterSimulator(n_nodes=5, task_id="task-1", seed=1)
     sim.reset(task_id="task-1", seed=1)
-    # Artificially overload a node to trigger failure
-    node = sim._nodes[5]
     node.queue_depth = 250.0  # > FATAL_FAIL_THRESHOLD=200
     sim._update_statuses()
     record("Node fails when queue > FATAL_FAIL_THRESHOLD",

     sim = ClusterSimulator(n_nodes=5, task_id="task-1", seed=1)
     nodes = sim.state(for_agent=False)
+    record("5 nodes created",
+           PASS if len(nodes) == 5 else FAIL,
            f"got {len(nodes)}")
     record("node-0 is VIP",
            PASS if nodes[0]["is_vip"] else FAIL,
            f"is_vip={nodes[0]['is_vip']}")
+    record("node-0 weight=2.0",
+           PASS if nodes[0]["importance_weight"] == 2.0 else FAIL,
            f"weight={nodes[0]['importance_weight']}")
     non_vip_weights = [n["importance_weight"] for n in nodes[1:]]
            f"unique weights={set(non_vip_weights)}")
     node_ids = [n["node_id"] for n in nodes]
+    expected_ids = [f"node-{i}" for i in range(5)]
+    record("Node IDs 0-4",
            PASS if node_ids == expected_ids else FAIL,
            f"ids={node_ids}")
     sim = ClusterSimulator(n_nodes=5, task_id="task-1", seed=1)
     sim.reset(task_id="task-1", seed=1)
+    # Artificially overload node-2 (has children, tests graph cascade)
+    node = sim._nodes[2]
     node.queue_depth = 250.0  # > FATAL_FAIL_THRESHOLD=200
     sim._update_statuses()
     record("Node fails when queue > FATAL_FAIL_THRESHOLD",

stability.py CHANGED Viewed

@@ -53,6 +53,9 @@ Q_BARRIER_MAX: float = 150.0
 Set higher than OVERLOAD_THRESHOLD (80) to allow the agent time to react
 before the barrier penalty kicks in."""
 BARRIER_NORM_SCALE: float = 10000.0
 """Normalization divisor for the barrier term.
 The raw barrier H(s) = sum(max(0, Q_i - Q_max)^2) can produce very large values
@@ -100,6 +103,44 @@ def compute_lyapunov(nodes: list[dict]) -> float:
         )
     )
 def compute_drift(v_prev: float, v_curr: float) -> float:
     """

 Set higher than OVERLOAD_THRESHOLD (80) to allow the agent time to react
 before the barrier penalty kicks in."""
+MAX_QUEUE_NORM: float = 200.0
+"""Normalization divisor shared with environment.py.  Matches FATAL_FAIL_THRESHOLD."""
 BARRIER_NORM_SCALE: float = 10000.0
 """Normalization divisor for the barrier term.
 The raw barrier H(s) = sum(max(0, Q_i - Q_max)^2) can produce very large values
         )
     )
+def compute_lyapunov_graph(
+    nodes: list[dict],
+    topology: dict[str, list[str]],
+    edge_weight: float = 5.0,
+) -> float:
+    """
+    Graph-aware Lyapunov energy: per-node energy plus an edge-imbalance term.
+    V_graph(s) = compute_lyapunov(nodes)
+                 + edge_weight · Σ_{(i,j)∈edges} |Q_i - Q_j|
+    The edge term grows with the queue-depth gap between a parent and each
+    of its children, so an overloaded parent next to an idle child is
+    penalised even when the child contributes nothing to the base energy.
+    Example magnitude: with edge_weight=5.0 and a gap of 200 on each of
+    4 edges, the edge term adds 5 · 200 · 4 = 4000.
+    Args:
+        nodes: Node records; each must carry "node_id" and "queue_depth".
+        topology: Adjacency map of parent node id -> list of child node ids.
+        edge_weight: Multiplier applied to the summed edge imbalance.
+    Returns:
+        The base energy plus the weighted edge penalty.
+    """
+    by_id = {record["node_id"]: record for record in nodes}
+    # Base term first, exactly as the flat (non-graph) energy computes it.
+    node_energy = compute_lyapunov(nodes)
+    # Accumulate |Q_parent - Q_child| over every edge whose two endpoints
+    # are both present in ``nodes``; edges to unknown ids are ignored.
+    total_imbalance = 0.0
+    for src_id, dst_ids in topology.items():
+        src = by_id.get(src_id)
+        if src:
+            for dst_id in dst_ids:
+                dst = by_id.get(dst_id)
+                if dst:
+                    gap = float(src["queue_depth"]) - float(dst["queue_depth"])
+                    total_imbalance += abs(gap)
+    return node_energy + edge_weight * total_imbalance
 def compute_drift(v_prev: float, v_curr: float) -> float:
     """

validate_dag_physics.py ADDED Viewed

	@@ -0,0 +1,512 @@

+#!/usr/bin/env python3
+"""
+Comprehensive DAG Physics Validation for AntiAtropos training readiness.
+Verifies:
+  A. DAG traffic routing (parent->child propagation)
+  B. Task-2 scripted failure flows through DAG (not bypassed)
+  C. Task-3 surge correct overlay on DAG
+  D. Backpressure is temporary (not permanent capacity drain)
+  E. Gradual recovery completes fully
+  F. Graph-bounded cascades
+  G. Reroute weights work with DAG
+  H. Graph Lyapunov edge penalty
+  I. Environment observation populates graph fields
+  J. Reward components are non-degenerate across tasks
+Run: python validate_dag_physics.py
+"""
+import sys, os, math
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from simulator import (
+    ClusterSimulator, NodeStatus, DEFAULT_CAPACITY,
+    CLUSTER_TOPOLOGY, EXTERNAL_TRAFFIC_NODES, _TOPOLOGICAL_ORDER,
+    DEFAULT_ROUTING_SPLIT, T1_INITIAL_LAMBDA, T2_INITIAL_LAMBDA,
+    T3_INITIAL_LAMBDA, T3_SURGE_MAGNITUDE,
+    COST_PER_CAPACITY_UNIT_PER_HOUR, FATAL_FAIL_THRESHOLD,
+    NODE_RECOVERY_TICKS, BACKPRESSURE_THRESHOLD,
+)
+from stability import (
+    compute_lyapunov, compute_lyapunov_graph, compute_reward,
+    compute_barrier, normalize_reward, smooth_sla_penalty, compute_drift,
+    BARRIER_NORM_SCALE,
+)
+# --- Test harness ---
+# Status labels shared by every check in this script.
+PASS = "PASS"
+FAIL = "FAIL"
+# One (name, status, detail) tuple is appended here per recorded check.
+results = []
+def record(name, status, detail=""):
+    """Append one check outcome to ``results`` and print a one-line summary.
+    Args:
+        name: Human-readable label of the check.
+        status: PASS or FAIL.
+        detail: Optional extra text shown after the label.
+    """
+    results.append((name, status, detail))
+    marker = "X" if status != PASS else "+"
+    suffix = f"  -- {detail}" if detail else ""
+    print(f"  [{marker}] {name}" + suffix)
+# ============================================================================
+# A. DAG Traffic Routing
+# ============================================================================
+def test_A_dag_routing():
+    """Tick task-1 once with NO_OP and check that every node sees inbound traffic."""
+    print("\n=== A. DAG Traffic Routing ===")
+    cluster = ClusterSimulator(n_nodes=5, task_id="task-1", seed=1)
+    cluster.reset(task_id="task-1", seed=1)
+    # Bare attribute holder used as the action object passed to apply_action.
+    class _Action: pass
+    noop = _Action()
+    noop.action_type = "NO_OP"
+    noop.target_node_id = "node-0"
+    noop.parameter = 0.0
+    cluster.apply_action(noop)
+    cluster.tick()
+    by_id = {member.node_id: member for member in cluster._nodes}
+    # (node id, check label): ingress nodes first, then node-0's children,
+    # then node-3, which is fed by node-2.
+    inflow_checks = (
+        ("node-0", "node-0 (ingress) receives traffic"),
+        ("node-4", "node-4 (ingress) receives traffic"),
+        ("node-1", "node-1 receives from node-0 (DAG child)"),
+        ("node-2", "node-2 receives from node-0 (DAG child)"),
+        ("node-3", "node-3 receives from node-2 (DAG grandchild)"),
+    )
+    for node_id, label in inflow_checks:
+        inbound = by_id[node_id].incoming_request_rate
+        record(label,
+               PASS if inbound > 0 else FAIL,
+               f"incoming={inbound:.1f}")
+    # node-0 must forward something downstream for the children to be fed.
+    root_outflow_ok = by_id["node-0"].outflow_rate > 0
+    record("node-0 has positive outflow_rate",
+           PASS if root_outflow_ok else FAIL,
+           f"outflow={by_id['node-0'].outflow_rate:.1f}")
+# ============================================================================
+# B. Task-2 Scripted Failure Flows Through DAG
+# ============================================================================
+def test_B_task2_dag():
+    """Run task-2 for 60 NO_OP ticks and check how the scripted failure affects DAG flow.
+    Records whether a failed node id was reported and whether that node ends
+    in FAILED status.  Depending on which node failed, it additionally checks
+    that a failed child of node-0 still has inbound traffic, and that node-3
+    is starved when node-2 is the failed node.
+    """
+    print("\n=== B. Task-2 Failure Through DAG ===")
+    sim = ClusterSimulator(n_nodes=5, task_id="task-2", seed=42)
+    sim.reset(task_id="task-2", seed=42)
+    # Run enough ticks for failure to trigger
+    class _A: pass
+    a = _A(); a.action_type = "NO_OP"; a.target_node_id = "node-0"; a.parameter = 0.0
+    failed_id = None
+    for _ in range(60):
+        sim.apply_action(a)
+        sim.tick()
+        # Remember the first failed node id the simulator reports.
+        if failed_id is None and sim._failed_node_id:
+            failed_id = sim._failed_node_id
+    record("Scripted failure was assigned",
+           PASS if failed_id is not None else FAIL,
+           f"failed_id={failed_id}")
+    nodes = {n.node_id: n for n in sim._nodes}
+    failed_node = nodes.get(failed_id)
+    record("Failed node has FAILED status",
+           PASS if failed_node and failed_node.status == NodeStatus.FAILED else FAIL,
+           f"status={failed_node.status if failed_node else 'N/A'}")
+    if failed_node is None:
+        # No failed node object to inspect; the two records above already
+        # report the problem, so stop instead of raising AttributeError below.
+        return
+    # If failed node is a child of node-0 (e.g., node-1 or node-2),
+    # node-0 should still be sending traffic to it (flow not bypassed)
+    if failed_id in CLUSTER_TOPOLOGY.get("node-0", []):
+        # Strictly positive: a ">= 0" comparison can never fail and would
+        # not detect the DAG routing around the failed node.
+        record("Failed child still receives DAG traffic (as dropped requests)",
+               PASS if failed_node.incoming_request_rate > 0 else FAIL,
+               f"incoming={failed_node.incoming_request_rate:.1f} dropped={failed_node.dropped_requests:.1f}")
+    # If failed node was node-2, node-3 should be starved (outflow=0 upstream)
+    if failed_id == "node-2":
+        n3 = nodes["node-3"]
+        record("node-3 starved when parent node-2 fails",
+               PASS if n3.incoming_request_rate == 0 else FAIL,
+               f"node-3 incoming={n3.incoming_request_rate:.1f}")
+# ============================================================================
+# C. Task-3 Surge Overlay on DAG
+# ============================================================================
+def test_C_task3_surge_dag():
+    """Force the task-3 surge window open and check inbound rates after one NO_OP tick."""
+    print("\n=== C. Task-3 Surge Overlay ===")
+    cluster = ClusterSimulator(n_nodes=5, task_id="task-3", seed=7)
+    cluster.reset(task_id="task-3", seed=7)
+    # Widen the surge window so it is already active on the first tick.
+    cluster._t3_surge_start = 0
+    cluster._t3_surge_end = 999
+    class _Action: pass
+    noop = _Action()
+    noop.action_type = "NO_OP"
+    noop.target_node_id = "node-0"
+    noop.parameter = 0.0
+    cluster.apply_action(noop)
+    cluster.tick()
+    by_id = {member.node_id: member for member in cluster._nodes}
+    # node-1 and node-2 are expected to see more than the surge magnitude
+    # alone, i.e. surge traffic on top of what the DAG forwards to them.
+    for surged_id in ("node-1", "node-2"):
+        inbound = by_id[surged_id].incoming_request_rate
+        record(f"{surged_id} receives surge + DAG traffic",
+               PASS if inbound > T3_SURGE_MAGNITUDE else FAIL,
+               f"incoming={inbound:.1f} (surge={T3_SURGE_MAGNITUDE})")
+    # node-0 is expected to stay below the surge magnitude, i.e. the surge
+    # is not injected at node-0.
+    root_inbound = by_id["node-0"].incoming_request_rate
+    record("node-0 gets base DAG traffic (surge is side-channel)",
+           PASS if root_inbound < T3_SURGE_MAGNITUDE else FAIL,
+           f"node-0 incoming={root_inbound:.1f}")
+# ============================================================================
+# D. Backpressure Is Temporary
+# ============================================================================
+def test_D_backpressure_temporary():
+    """Check that node-0's stored capacity is unchanged by ticks taken while its children are overloaded."""
+    print("\n=== D. Backpressure Temporary ===")
+    sim = ClusterSimulator(n_nodes=5, task_id="task-1", seed=1)
+    sim.reset(task_id="task-1", seed=1)
+    root = next(member for member in sim._nodes if member.node_id == "node-0")
+    cap_before = root.capacity
+    class _Action: pass
+    noop = _Action()
+    noop.action_type = "NO_OP"
+    noop.target_node_id = "node-0"
+    noop.parameter = 0.0
+    def _set_child_queues(depth):
+        """Overwrite the queue depth of every direct child of node-0."""
+        for member in sim._nodes:
+            if member.node_id in CLUSTER_TOPOLOGY.get("node-0", []):
+                member.queue_depth = depth
+    def _tick_and_read_capacity():
+        """Apply one NO_OP, advance one tick, and return node-0's capacity."""
+        sim.apply_action(noop)
+        sim.tick()
+        return root.capacity
+    def _verdict(capacity):
+        """PASS when the capacity is within 0.01 of the pre-test value."""
+        return PASS if abs(capacity - cap_before) < 0.01 else FAIL
+    # Phase 1: children far above the backpressure threshold, one tick.
+    _set_child_queues(BACKPRESSURE_THRESHOLD + 100.0)
+    cap_first = _tick_and_read_capacity()
+    record("node-0 capacity restored after backpressure tick",
+           _verdict(cap_first),
+           f"before={cap_before:.2f} after={cap_first:.2f}")
+    # Phase 2: a second tick without resetting the children's queues.
+    cap_second = _tick_and_read_capacity()
+    record("node-0 capacity intact after multiple backpressure ticks",
+           _verdict(cap_second),
+           f"after 2 ticks={cap_second:.2f} original={cap_before:.2f}")
+    # Phase 3: children emptied, one more tick.
+    _set_child_queues(0.0)
+    cap_cleared = _tick_and_read_capacity()
+    record("node-0 capacity unchanged after children clear",
+           _verdict(cap_cleared),
+           f"capacity={cap_cleared:.2f}")
+# ============================================================================
+# E. Gradual Recovery Completes
+# ============================================================================
+def test_E_gradual_recovery():
+    """Overload node index 2, then tick past the recovery window and check it ends HEALTHY at full capacity."""
+    print("\n=== E. Gradual Recovery ===")
+    sim = ClusterSimulator(n_nodes=5, task_id="task-1", seed=1)
+    sim.reset(task_id="task-1", seed=1)
+    victim = sim._nodes[2]
+    victim.queue_depth = 250.0
+    sim._update_statuses()
+    failed_now = victim.status == NodeStatus.FAILED
+    record("Node becomes FAILED on overload",
+           PASS if failed_now else FAIL,
+           f"status={victim.status}")
+    capacity_cut = abs(victim.capacity - 0.5) < 0.01
+    record("Capacity drops to 0.5 at failure",
+           PASS if capacity_cut else FAIL,
+           f"capacity={victim.capacity}")
+    # Tick through the whole recovery window plus a 5-tick margin.
+    class _Action: pass
+    noop = _Action()
+    noop.action_type = "NO_OP"
+    noop.target_node_id = "node-0"
+    noop.parameter = 0.0
+    for _tick in range(NODE_RECOVERY_TICKS + 5):
+        sim.apply_action(noop)
+        sim.tick()
+    healthy_again = victim.status == NodeStatus.HEALTHY
+    record("Node reaches HEALTHY after full recovery",
+           PASS if healthy_again else FAIL,
+           f"status={victim.status}")
+    # Capacity is expected to start at 0.5 and gain 0.5 on each recovery tick
+    # except the last, capped at DEFAULT_CAPACITY.  NOTE(review): this only
+    # reaches the cap if NODE_RECOVERY_TICKS is large enough -- confirm
+    # against the simulator constants.
+    fully_ramped = abs(victim.capacity - DEFAULT_CAPACITY) < 0.01
+    record("Capacity recovered to DEFAULT_CAPACITY (capped)",
+           PASS if fully_ramped else FAIL,
+           f"capacity={victim.capacity:.2f} expected={DEFAULT_CAPACITY}")
+# ============================================================================
+# F. Graph-Bounded Cascades
+# ============================================================================
+def test_F_graph_cascade():
+    """Fail node index 2 and check that only a graph neighbour, not an unrelated node, cascades to DEGRADED."""
+    print("\n=== F. Graph-Bounded Cascades ===")
+    sim = ClusterSimulator(n_nodes=5, task_id="task-1", seed=1)
+    sim.reset(task_id="task-1", seed=1)
+    # Queue depth used to push a node into cascade territory.
+    # NOTE(review): assumes 1.5x FATAL_FAIL_THRESHOLD exceeds the cascade
+    # threshold -- confirm against CASCADE_QUEUE_MULTIPLIER.
+    overload_depth = FATAL_FAIL_THRESHOLD * 1.5
+    # Overload node index 2 so the status update marks it FAILED.
+    origin = sim._nodes[2]
+    origin.queue_depth = 250.0
+    sim._update_statuses()
+    # Node index 3 is a child of the failed node, so it is adjacent to the failure.
+    neighbour = sim._nodes[3]
+    neighbour.queue_depth = overload_depth
+    sim._cascade_failures()
+    neighbour_degraded = neighbour.status == NodeStatus.DEGRADED
+    record("node-3 (child of failed node-2) cascades to DEGRADED",
+           PASS if neighbour_degraded else FAIL,
+           f"node-3 status={neighbour.status}")
+    # Node index 4 is not adjacent to the failed node; the same overload must not degrade it.
+    outsider = sim._nodes[4]
+    outsider.queue_depth = overload_depth
+    sim._cascade_failures()
+    outsider_spared = outsider.status != NodeStatus.DEGRADED
+    record("node-4 (not adjacent to failed) does NOT cascade",
+           PASS if outsider_spared else FAIL,
+           f"node-4 status={outsider.status}")
+# ============================================================================
+# G. Reroute Weights with DAG
+# ============================================================================
+def test_G_reroute_with_dag():
+    """Reroute all traffic away from node-0 and check that its inflow, outflow and reroute weight react.
+    A baseline tick with NO_OP captures node-0's inbound and outbound rates.
+    A REROUTE_TRAFFIC action with parameter 1.0 is then applied and the
+    post-tick rates are compared against those baselines.
+    """
+    print("\n=== G. Reroute Weights with DAG ===")
+    sim = ClusterSimulator(n_nodes=5, task_id="task-1", seed=1)
+    sim.reset(task_id="task-1", seed=1)
+    class _A: pass
+    # Tick once to establish baseline
+    a = _A(); a.action_type = "NO_OP"; a.target_node_id = "node-0"; a.parameter = 0.0
+    sim.apply_action(a)
+    sim.tick()
+    node0 = next(n for n in sim._nodes if n.node_id == "node-0")
+    baseline_n0_in = node0.incoming_request_rate
+    # Capture the outflow baseline too, so the outflow check compares
+    # like with like instead of outflow against inflow.
+    baseline_n0_out = node0.outflow_rate
+    # Reroute 100% of node-0 traffic away
+    a2 = _A(); a2.action_type = "REROUTE_TRAFFIC"; a2.target_node_id = "node-0"; a2.parameter = 1.0
+    sim.apply_action(a2)
+    sim.tick()
+    n0_in_after = node0.incoming_request_rate
+    n0_out_after = node0.outflow_rate
+    record("Reroute reduces node-0 incoming traffic",
+           PASS if n0_in_after < baseline_n0_in else FAIL,
+           f"before={baseline_n0_in:.1f} after={n0_in_after:.1f}")
+    # Verify outflow_rate is also reduced (since incoming is lower)
+    record("node-0 outflow reduced after reroute",
+           PASS if n0_out_after < baseline_n0_out else FAIL,
+           f"before={baseline_n0_out:.1f} after={n0_out_after:.1f}")
+    # The stored reroute weight is expected to have dropped below the 1.0
+    # that was requested (a missing entry counts as 0.0).
+    w = sim._reroute_weights.get("node-0", 0.0)
+    record("Reroute weight decays (0.5 * prev)",
+           PASS if w < 1.0 else FAIL,
+           f"weight after first decay={w:.3f}")
+# ============================================================================
+# H. Graph Lyapunov Edge Penalty
+# ============================================================================
+def test_H_graph_lyapunov():
+    """Compare graph Lyapunov energy for a balanced and an imbalanced cluster, and against the flat energy."""
+    print("\n=== H. Graph Lyapunov Edge Penalty ===")
+    def _as_nodes(rows):
+        """Expand (node_id, queue_depth, importance_weight) rows into node dicts."""
+        return [
+            {"node_id": node_id, "queue_depth": depth, "importance_weight": weight}
+            for node_id, depth, weight in rows
+        ]
+    # Every queue at the same depth: no parent/child gap anywhere.
+    nodes_balanced = _as_nodes([
+        ("node-0", 50.0, 2.0),
+        ("node-4", 50.0, 1.0),
+        ("node-1", 50.0, 1.0),
+        ("node-2", 50.0, 1.0),
+        ("node-3", 50.0, 1.0),
+    ])
+    # node-0 heavily loaded while every other node is nearly idle.
+    nodes_imbalanced = _as_nodes([
+        ("node-0", 200.0, 2.0),
+        ("node-4", 10.0, 1.0),
+        ("node-1", 10.0, 1.0),
+        ("node-2", 10.0, 1.0),
+        ("node-3", 10.0, 1.0),
+    ])
+    energy_balanced = compute_lyapunov_graph(nodes_balanced, CLUSTER_TOPOLOGY)
+    energy_imbalanced = compute_lyapunov_graph(nodes_imbalanced, CLUSTER_TOPOLOGY)
+    record("Graph Lyapunov: imbalanced > balanced",
+           PASS if energy_imbalanced > energy_balanced else FAIL,
+           f"balanced={energy_balanced:.1f} imbalanced={energy_imbalanced:.1f}")
+    # The graph variant should exceed the flat energy on the imbalanced
+    # cluster, because it adds the edge penalty on top.
+    energy_flat = compute_lyapunov(nodes_imbalanced)
+    record("Graph Lyapunov > flat Lyapunov for imbalanced cluster",
+           PASS if energy_imbalanced > energy_flat else FAIL,
+           f"graph={energy_imbalanced:.1f} flat={energy_flat:.1f}")
+# ============================================================================
+# I. Environment Observation Populates Graph Fields
+# ============================================================================
+def test_I_env_graph_fields():
+    """Reset the environment on task-1 and check node-0's graph fields, then one step's reward and energy.
+    Missing (None) values are reported as FAIL records rather than raising
+    while the detail text is being formatted.
+    """
+    print("\n=== I. Environment Graph Fields ===")
+    try:
+        from server.AntiAtropos_environment import AntiAtroposEnvironment
+        from models import SREAction, ActionType
+    except ImportError as exc:
+        # Include the underlying error so a missing module can be identified.
+        record("Environment import", FAIL, f"Cannot import AntiAtroposEnvironment: {exc}")
+        return
+    def _num(value, digits):
+        """Format ``value`` with ``digits`` decimals, or "None" when it is missing."""
+        return "None" if value is None else f"{value:.{digits}f}"
+    env = AntiAtroposEnvironment()
+    obs = env.reset(task_id="task-1", mode="simulated", seed=42)
+    # Check that NodeObservations have graph fields
+    n0 = next((n for n in obs.nodes if n.node_id == "node-0"), None)
+    record("Environment reset succeeds",
+           PASS if n0 is not None else FAIL, "")
+    if n0:
+        record("node-0 has downstream_nodes",
+               PASS if isinstance(n0.downstream_nodes, list) and len(n0.downstream_nodes) > 0 else FAIL,
+               f"downstream={n0.downstream_nodes}")
+        record("node-0 has upstream_nodes",
+               PASS if isinstance(n0.upstream_nodes, list) else FAIL,
+               f"upstream={n0.upstream_nodes}")
+        # _num keeps the None case reportable: a plain ":.3f" would raise
+        # TypeError exactly when these checks are supposed to FAIL.
+        record("node-0 has upstream_pressure",
+               PASS if n0.upstream_pressure is not None else FAIL,
+               f"pressure={_num(n0.upstream_pressure, 3)}")
+        record("node-0 has outflow_rate",
+               PASS if n0.outflow_rate is not None else FAIL,
+               f"outflow={_num(n0.outflow_rate, 3)}")
+    # Step once with SCALE_UP to verify reward computation
+    action = SREAction(action_type=ActionType.SCALE_UP, target_node_id="node-0", parameter=0.5)
+    obs2 = env.step(action)
+    reward_ok = obs2.reward is not None and obs2.reward != 0.0
+    record("Step reward is non-zero",
+           PASS if reward_ok else FAIL,
+           f"reward={_num(obs2.reward, 4)}")
+    # Lyapunov can be 0 on first tick if all queues are below capacity.
+    # Verify it is a real, non-negative number, as the check label states.
+    energy = obs2.lyapunov_energy
+    lyap_ok = energy is not None and not math.isnan(energy) and energy >= 0
+    record("lyapunov_energy is valid (>=0, not NaN)",
+           PASS if lyap_ok else FAIL,
+           f"energy={energy}")
+# ============================================================================
+# J. Reward Components Across Tasks
+# ============================================================================
+def test_J_reward_components():
+    """Warm up each task with NO_OP steps and check that reward sub-components are populated."""
+    print("\n=== J. Reward Components ===")
+    try:
+        from server.AntiAtropos_environment import AntiAtroposEnvironment
+        from models import SREAction, ActionType
+    except ImportError:
+        record("Reward components", FAIL, "Cannot import environment")
+        return
+    # Number of NO_OP steps taken per task before inspecting the reward.
+    # task-1 gets the longest warm-up; per the original author's note its
+    # load ramps slowly, so drift needs more ticks to become non-zero.
+    warmup_by_task = {"task-1": 30, "task-2": 5, "task-3": 5}
+    for task_id, warmup_ticks in warmup_by_task.items():
+        env = AntiAtroposEnvironment()
+        env.reset(task_id=task_id, mode="simulated", seed=42)
+        if task_id == "task-3":
+            # Open the surge window wide so the surge is active during the
+            # short warm-up instead of depending on its scheduled timing.
+            env._sim._t3_surge_start = 1
+            env._sim._t3_surge_end = 999
+        noop = SREAction(action_type=ActionType.NO_OP, target_node_id="node-0", parameter=0.0)
+        last_obs = None
+        for _step in range(warmup_ticks):
+            last_obs = env.step(noop)
+        # Both sub-components are expected to be non-zero after the warm-up.
+        drift_present = last_obs.reward_drift != 0.0
+        cost_present = last_obs.reward_cost != 0.0
+        record(f"{task_id}: reward_drift non-zero (after {warmup_ticks} ticks)",
+               PASS if drift_present else FAIL,
+               f"drift={last_obs.reward_drift:.4f}")
+        record(f"{task_id}: reward_cost non-zero",
+               PASS if cost_present else FAIL,
+               f"cost={last_obs.reward_cost:.4f}")
+        raw_is_number = not math.isnan(last_obs.raw_reward)
+        record(f"{task_id}: no NaN in raw_reward",
+               PASS if raw_is_number else FAIL,
+               f"raw={last_obs.raw_reward:.4f}")
+    # Sign check on a fresh task-1 environment after a single NO_OP step:
+    # the drift component must not be positive.  Only drift is checked here.
+    fresh_env = AntiAtroposEnvironment()
+    fresh_env.reset(task_id="task-1", mode="simulated", seed=42)
+    first_step = SREAction(action_type=ActionType.NO_OP, target_node_id="node-0", parameter=0.0)
+    first_obs = fresh_env.step(first_step)
+    record("reward_drift <= 0 (penalty, not reward)",
+           PASS if first_obs.reward_drift <= 0 else FAIL,
+           f"drift={first_obs.reward_drift:.4f}")
+# ============================================================================
+# Main
+# ============================================================================
+def main():
+    """Run every validation section in order, print a summary, and return the process exit code.
+    Returns:
+        0 when no check recorded FAIL, otherwise 1.
+    """
+    banner = "=" * 65
+    print(banner)
+    print("AntiAtropos DAG Physics Validation")
+    print(banner)
+    sections = (
+        test_A_dag_routing,
+        test_B_task2_dag,
+        test_C_task3_surge_dag,
+        test_D_backpressure_temporary,
+        test_E_gradual_recovery,
+        test_F_graph_cascade,
+        test_G_reroute_with_dag,
+        test_H_graph_lyapunov,
+        test_I_env_graph_fields,
+        test_J_reward_components,
+    )
+    for run_section in sections:
+        run_section()
+    # Tally outcomes from the shared ``results`` list filled by record().
+    total = len(results)
+    passed = sum(status == PASS for _, status, _ in results)
+    failures = [(name, detail) for name, status, detail in results if status == FAIL]
+    failed = len(failures)
+    print("\n" + banner)
+    print(f"RESULTS: {passed}/{total} passed, {failed} failed")
+    print(banner)
+    if failures:
+        print("\nFAILED TESTS:")
+        for name, detail in failures:
+            print(f"  X {name}: {detail}")
+    return 1 if failures else 0
+if __name__ == "__main__":
+    sys.exit(main())