Spaces:

Pranavkk
/

AntiAtropos

Sleeping

App Files Files Community

AntiAtropos / validate_dag_physics.py

Pranavkk

Upload folder using huggingface_hub

dcb9a04 verified about 1 month ago

raw

history blame contribute delete

21.4 kB

	#!/usr/bin/env python3
	"""
	Comprehensive DAG Physics Validation for AntiAtropos training readiness.

	Verifies:
	A. DAG traffic routing (parent->child propagation)
	B. Task-2 scripted failure flows through DAG (not bypassed)
	C. Task-3 surge correct overlay on DAG
	D. Backpressure is temporary (not permanent capacity drain)
	E. Gradual recovery completes fully
	F. Graph-bounded cascades
	G. Reroute weights work with DAG
	H. Graph Lyapunov edge penalty
	I. Environment observation populates graph fields
	J. Reward components are non-degenerate across tasks

	Run: python validate_dag_physics.py
	"""

	import sys, os, math
	sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

	from simulator import (
	ClusterSimulator, NodeStatus, DEFAULT_CAPACITY,
	CLUSTER_TOPOLOGY, EXTERNAL_TRAFFIC_NODES, _TOPOLOGICAL_ORDER,
	DEFAULT_ROUTING_SPLIT, T1_INITIAL_LAMBDA, T2_INITIAL_LAMBDA,
	T3_INITIAL_LAMBDA, T3_SURGE_MAGNITUDE,
	COST_PER_CAPACITY_UNIT_PER_HOUR, FATAL_FAIL_THRESHOLD,
	NODE_RECOVERY_TICKS, BACKPRESSURE_THRESHOLD,
	)
	from stability import (
	compute_lyapunov, compute_lyapunov_graph, compute_reward,
	compute_barrier, normalize_reward, smooth_sla_penalty, compute_drift,
	BARRIER_NORM_SCALE,
	)

	# --- Test harness ---
	PASS = "PASS"
	FAIL = "FAIL"
	results = []

	def record(name, status, detail=""):
	results.append((name, status, detail))
	icon = "+" if status == PASS else "X"
	msg = f" [{icon}] {name}"
	if detail:
	msg += f" -- {detail}"
	print(msg)

	# ============================================================================
	# A. DAG Traffic Routing
	# ============================================================================
	def test_A_dag_routing():
	print("\n=== A. DAG Traffic Routing ===")
	sim = ClusterSimulator(n_nodes=5, task_id="task-1", seed=1)
	sim.reset(task_id="task-1", seed=1)

	# Tick once with NO_OP
	class _A: pass
	a = _A(); a.action_type = "NO_OP"; a.target_node_id = "node-0"; a.parameter = 0.0
	sim.apply_action(a)
	sim.tick()

	nodes = {n.node_id: n for n in sim._nodes}

	# node-0 and node-4 should receive external traffic
	n0_in = nodes["node-0"].incoming_request_rate
	n4_in = nodes["node-4"].incoming_request_rate
	record("node-0 (ingress) receives traffic",
	PASS if n0_in > 0 else FAIL,
	f"incoming={n0_in:.1f}")
	record("node-4 (ingress) receives traffic",
	PASS if n4_in > 0 else FAIL,
	f"incoming={n4_in:.1f}")

	# node-1, node-2 should receive outflow from node-0
	n1_in = nodes["node-1"].incoming_request_rate
	n2_in = nodes["node-2"].incoming_request_rate
	record("node-1 receives from node-0 (DAG child)",
	PASS if n1_in > 0 else FAIL,
	f"incoming={n1_in:.1f}")
	record("node-2 receives from node-0 (DAG child)",
	PASS if n2_in > 0 else FAIL,
	f"incoming={n2_in:.1f}")

	# node-3 should receive outflow from node-2
	n3_in = nodes["node-3"].incoming_request_rate
	record("node-3 receives from node-2 (DAG grandchild)",
	PASS if n3_in > 0 else FAIL,
	f"incoming={n3_in:.1f}")

	# node-0 outflow should be ~incoming (since capacity >> lambda at start)
	record("node-0 has positive outflow_rate",
	PASS if nodes["node-0"].outflow_rate > 0 else FAIL,
	f"outflow={nodes['node-0'].outflow_rate:.1f}")


	# ============================================================================
	# B. Task-2 Scripted Failure Flows Through DAG
	# ============================================================================
	def test_B_task2_dag():
	print("\n=== B. Task-2 Failure Through DAG ===")
	sim = ClusterSimulator(n_nodes=5, task_id="task-2", seed=42)
	sim.reset(task_id="task-2", seed=42)

	# Run enough ticks for failure to trigger
	class _A: pass
	a = _A(); a.action_type = "NO_OP"; a.target_node_id = "node-0"; a.parameter = 0.0

	failed_id = None
	for _ in range(60):
	sim.apply_action(a)
	sim.tick()
	if sim._failed_node_id and failed_id is None:
	failed_id = sim._failed_node_id

	record("Scripted failure was assigned",
	PASS if failed_id is not None else FAIL,
	f"failed_id={failed_id}")

	nodes = {n.node_id: n for n in sim._nodes}
	failed_node = nodes.get(failed_id)
	record("Failed node has FAILED status",
	PASS if failed_node and failed_node.status == NodeStatus.FAILED else FAIL,
	f"status={failed_node.status if failed_node else 'N/A'}")

	# If failed node is a child of node-0 (e.g., node-1 or node-2),
	# node-0 should still be sending traffic to it (flow not bypassed)
	if failed_id in CLUSTER_TOPOLOGY.get("node-0", []):
	# The failed node outflow should be 0 (service_rate=0),
	# but it should still have incoming from DAG
	record("Failed child still receives DAG traffic (as dropped requests)",
	PASS if failed_node.incoming_request_rate >= 0 else FAIL,
	f"incoming={failed_node.incoming_request_rate:.1f} dropped={failed_node.dropped_requests:.1f}")

	# If failed node was node-2, node-3 should be starved (outflow=0 upstream)
	if failed_id == "node-2":
	n3 = nodes["node-3"]
	record("node-3 starved when parent node-2 fails",
	PASS if n3.incoming_request_rate == 0 else FAIL,
	f"node-3 incoming={n3.incoming_request_rate:.1f}")


	# ============================================================================
	# C. Task-3 Surge Overlay on DAG
	# ============================================================================
	def test_C_task3_surge_dag():
	print("\n=== C. Task-3 Surge Overlay ===")
	sim = ClusterSimulator(n_nodes=5, task_id="task-3", seed=7)
	sim.reset(task_id="task-3", seed=7)

	# Force surge window to be active immediately
	sim._t3_surge_start = 0
	sim._t3_surge_end = 999

	class _A: pass
	a = _A(); a.action_type = "NO_OP"; a.target_node_id = "node-0"; a.parameter = 0.0
	sim.apply_action(a)
	sim.tick()

	nodes = {n.node_id: n for n in sim._nodes}

	# node-1 and node-2 should have surge + DAG traffic
	n1_in = nodes["node-1"].incoming_request_rate
	n2_in = nodes["node-2"].incoming_request_rate
	record("node-1 receives surge + DAG traffic",
	PASS if n1_in > T3_SURGE_MAGNITUDE else FAIL,
	f"incoming={n1_in:.1f} (surge={T3_SURGE_MAGNITUDE})")
	record("node-2 receives surge + DAG traffic",
	PASS if n2_in > T3_SURGE_MAGNITUDE else FAIL,
	f"incoming={n2_in:.1f} (surge={T3_SURGE_MAGNITUDE})")

	# node-0 should still have base DAG traffic (not affected by surge directly)
	n0_in = nodes["node-0"].incoming_request_rate
	record("node-0 gets base DAG traffic (surge is side-channel)",
	PASS if n0_in < T3_SURGE_MAGNITUDE else FAIL,
	f"node-0 incoming={n0_in:.1f}")


	# ============================================================================
	# D. Backpressure Is Temporary
	# ============================================================================
	def test_D_backpressure_temporary():
	print("\n=== D. Backpressure Temporary ===")
	sim = ClusterSimulator(n_nodes=5, task_id="task-1", seed=1)
	sim.reset(task_id="task-1", seed=1)

	node0 = next(n for n in sim._nodes if n.node_id == "node-0")
	original_cap = node0.capacity

	# Artificially overload node-0's children to trigger backpressure
	for n in sim._nodes:
	if n.node_id in CLUSTER_TOPOLOGY.get("node-0", []):
	n.queue_depth = BACKPRESSURE_THRESHOLD + 100.0 # well above threshold

	# Tick: backpressure should reduce node-0's capacity for THIS tick only
	class _A: pass
	a = _A(); a.action_type = "NO_OP"; a.target_node_id = "node-0"; a.parameter = 0.0
	sim.apply_action(a)
	sim.tick()

	cap_after_tick = node0.capacity
	record("node-0 capacity restored after backpressure tick",
	PASS if abs(cap_after_tick - original_cap) < 0.01 else FAIL,
	f"before={original_cap:.2f} after={cap_after_tick:.2f}")

	# Tick again (children still overloaded) — capacity should still be original
	sim.apply_action(a)
	sim.tick()
	cap_after_tick2 = node0.capacity
	record("node-0 capacity intact after multiple backpressure ticks",
	PASS if abs(cap_after_tick2 - original_cap) < 0.01 else FAIL,
	f"after 2 ticks={cap_after_tick2:.2f} original={original_cap:.2f}")

	# Clear children overload — capacity should remain original
	for n in sim._nodes:
	if n.node_id in CLUSTER_TOPOLOGY.get("node-0", []):
	n.queue_depth = 0.0
	sim.apply_action(a)
	sim.tick()
	cap_clear = node0.capacity
	record("node-0 capacity unchanged after children clear",
	PASS if abs(cap_clear - original_cap) < 0.01 else FAIL,
	f"capacity={cap_clear:.2f}")


	# ============================================================================
	# E. Gradual Recovery Completes
	# ============================================================================
	def test_E_gradual_recovery():
	print("\n=== E. Gradual Recovery ===")
	sim = ClusterSimulator(n_nodes=5, task_id="task-1", seed=1)
	sim.reset(task_id="task-1", seed=1)

	node = sim._nodes[2]
	node.queue_depth = 250.0
	sim._update_statuses()

	record("Node becomes FAILED on overload",
	PASS if node.status == NodeStatus.FAILED else FAIL,
	f"status={node.status}")
	record("Capacity drops to 0.5 at failure",
	PASS if abs(node.capacity - 0.5) < 0.01 else FAIL,
	f"capacity={node.capacity}")

	# Tick through full recovery (NODE_RECOVERY_TICKS + some margin)
	class _A: pass
	a = _A(); a.action_type = "NO_OP"; a.target_node_id = "node-0"; a.parameter = 0.0
	for _ in range(NODE_RECOVERY_TICKS + 5):
	sim.apply_action(a)
	sim.tick()

	record("Node reaches HEALTHY after full recovery",
	PASS if node.status == NodeStatus.HEALTHY else FAIL,
	f"status={node.status}")

	# Capacity should have ramped: start=0.5, each recovery tick adds 0.5
	# After NODE_RECOVERY_TICKS=20 ticks: 0.5 + 20*0.5 = 10.5, capped at 3.0
	record("Capacity recovered to DEFAULT_CAPACITY (capped)",
	PASS if abs(node.capacity - DEFAULT_CAPACITY) < 0.01 else FAIL,
	f"capacity={node.capacity:.2f} expected={DEFAULT_CAPACITY}")


	# ============================================================================
	# F. Graph-Bounded Cascades
	# ============================================================================
	def test_F_graph_cascade():
	print("\n=== F. Graph-Bounded Cascades ===")
	sim = ClusterSimulator(n_nodes=5, task_id="task-1", seed=1)
	sim.reset(task_id="task-1", seed=1)

	# Fail node-2, overload its children/parents
	node2 = sim._nodes[2]
	node2.queue_depth = 250.0
	sim._update_statuses() # node-2 becomes FAILED, triggers cascade

	# node-3 is child of node-2 — should be at_risk
	# node-0 is parent of node-2 — should be at_risk
	node3 = sim._nodes[3]
	node0 = sim._nodes[0]
	# Overload node-3 to trigger cascade
	node3.queue_depth = FATAL_FAIL_THRESHOLD * 1.5 # > cascade threshold
	sim._cascade_failures()

	record("node-3 (child of failed node-2) cascades to DEGRADED",
	PASS if node3.status == NodeStatus.DEGRADED else FAIL,
	f"node-3 status={node3.status}")

	# node-4 is NOT adjacent to node-2 — should NOT cascade
	node4 = sim._nodes[4]
	node4.queue_depth = FATAL_FAIL_THRESHOLD * 1.5
	sim._cascade_failures()
	record("node-4 (not adjacent to failed) does NOT cascade",
	PASS if node4.status != NodeStatus.DEGRADED else FAIL,
	f"node-4 status={node4.status}")


	# ============================================================================
	# G. Reroute Weights with DAG
	# ============================================================================
	def test_G_reroute_with_dag():
	print("\n=== G. Reroute Weights with DAG ===")
	sim = ClusterSimulator(n_nodes=5, task_id="task-1", seed=1)
	sim.reset(task_id="task-1", seed=1)

	class _A: pass

	# Tick once to establish baseline
	a = _A(); a.action_type = "NO_OP"; a.target_node_id = "node-0"; a.parameter = 0.0
	sim.apply_action(a)
	sim.tick()

	node0 = next(n for n in sim._nodes if n.node_id == "node-0")
	node4 = next(n for n in sim._nodes if n.node_id == "node-4")
	baseline_n0_in = node0.incoming_request_rate

	# Reroute 100% of node-0 traffic away
	a2 = _A(); a2.action_type = "REROUTE_TRAFFIC"; a2.target_node_id = "node-0"; a2.parameter = 1.0
	sim.apply_action(a2)
	sim.tick()

	n0_in_after = node0.incoming_request_rate
	record("Reroute reduces node-0 incoming traffic",
	PASS if n0_in_after < baseline_n0_in else FAIL,
	f"before={baseline_n0_in:.1f} after={n0_in_after:.1f}")

	# Verify outflow_rate is also reduced (since incoming is lower)
	record("node-0 outflow reduced after reroute",
	PASS if node0.outflow_rate < baseline_n0_in else FAIL,
	f"outflow={node0.outflow_rate:.1f}")

	# Reroute weight should decay each tick (0.5 factor)
	w = sim._reroute_weights.get("node-0", 0.0)
	record("Reroute weight decays (0.5 * prev)",
	PASS if w < 1.0 else FAIL,
	f"weight after first decay={w:.3f}")


	# ============================================================================
	# H. Graph Lyapunov Edge Penalty
	# ============================================================================
	def test_H_graph_lyapunov():
	print("\n=== H. Graph Lyapunov Edge Penalty ===")
	nodes_balanced = [
	{"node_id": "node-0", "queue_depth": 50.0, "importance_weight": 2.0},
	{"node_id": "node-4", "queue_depth": 50.0, "importance_weight": 1.0},
	{"node_id": "node-1", "queue_depth": 50.0, "importance_weight": 1.0},
	{"node_id": "node-2", "queue_depth": 50.0, "importance_weight": 1.0},
	{"node_id": "node-3", "queue_depth": 50.0, "importance_weight": 1.0},
	]
	nodes_imbalanced = [
	{"node_id": "node-0", "queue_depth": 200.0, "importance_weight": 2.0},
	{"node_id": "node-4", "queue_depth": 10.0, "importance_weight": 1.0},
	{"node_id": "node-1", "queue_depth": 10.0, "importance_weight": 1.0},
	{"node_id": "node-2", "queue_depth": 10.0, "importance_weight": 1.0},
	{"node_id": "node-3", "queue_depth": 10.0, "importance_weight": 1.0},
	]

	v_bal = compute_lyapunov_graph(nodes_balanced, CLUSTER_TOPOLOGY)
	v_imb = compute_lyapunov_graph(nodes_imbalanced, CLUSTER_TOPOLOGY)
	record("Graph Lyapunov: imbalanced > balanced",
	PASS if v_imb > v_bal else FAIL,
	f"balanced={v_bal:.1f} imbalanced={v_imb:.1f}")

	# Compare with flat Lyapunov: graph version should add edge penalty
	v_flat_imb = compute_lyapunov(nodes_imbalanced)
	record("Graph Lyapunov > flat Lyapunov for imbalanced cluster",
	PASS if v_imb > v_flat_imb else FAIL,
	f"graph={v_imb:.1f} flat={v_flat_imb:.1f}")


	# ============================================================================
	# I. Environment Observation Populates Graph Fields
	# ============================================================================
	def test_I_env_graph_fields():
	print("\n=== I. Environment Graph Fields ===")
	try:
	from server.AntiAtropos_environment import AntiAtroposEnvironment
	from models import SREAction, ActionType
	except ImportError:
	record("Environment import", FAIL, "Cannot import AntiAtroposEnvironment")
	return

	env = AntiAtroposEnvironment()
	obs = env.reset(task_id="task-1", mode="simulated", seed=42)

	# Check that NodeObservations have graph fields
	n0 = next((n for n in obs.nodes if n.node_id == "node-0"), None)
	record("Environment reset succeeds",
	PASS if n0 is not None else FAIL, "")

	if n0:
	record("node-0 has downstream_nodes",
	PASS if isinstance(n0.downstream_nodes, list) and len(n0.downstream_nodes) > 0 else FAIL,
	f"downstream={n0.downstream_nodes}")
	record("node-0 has upstream_nodes",
	PASS if isinstance(n0.upstream_nodes, list) else FAIL,
	f"upstream={n0.upstream_nodes}")
	record("node-0 has upstream_pressure",
	PASS if n0.upstream_pressure is not None else FAIL,
	f"pressure={n0.upstream_pressure:.3f}")
	record("node-0 has outflow_rate",
	PASS if n0.outflow_rate is not None else FAIL,
	f"outflow={n0.outflow_rate:.3f}")

	# Step once with SCALE_UP to verify reward computation
	action = SREAction(action_type=ActionType.SCALE_UP, target_node_id="node-0", parameter=0.5)
	obs2 = env.step(action)

	record("Step reward is non-zero",
	PASS if obs2.reward != 0.0 else FAIL,
	f"reward={obs2.reward:.4f}")
	# Lyapunov can be 0 on first tick if all queues are below capacity.
	# Just verify it's a number (not None/NaN).
	lyap_ok = obs2.lyapunov_energy is not None and not math.isnan(obs2.lyapunov_energy)
	record("lyapunov_energy is valid (>=0, not NaN)",
	PASS if lyap_ok else FAIL,
	f"energy={obs2.lyapunov_energy}")


	# ============================================================================
	# J. Reward Components Across Tasks
	# ============================================================================
	def test_J_reward_components():
	print("\n=== J. Reward Components ===")
	env_module = None
	try:
	from server.AntiAtropos_environment import AntiAtroposEnvironment
	from models import SREAction, ActionType
	except ImportError:
	record("Reward components", FAIL, "Cannot import environment")
	return

	for task_id, warmup_ticks in [("task-1", 30), ("task-2", 5), ("task-3", 5)]:
	env = AntiAtroposEnvironment()
	env.reset(task_id=task_id, mode="simulated", seed=42)

	# Task-3 surge may not be active in early ticks (window depends on
	# jitter). Force the surge window to be wide-open for validation.
	if task_id == "task-3":
	env._sim._t3_surge_start = 1
	env._sim._t3_surge_end = 999

	# Run enough ticks for queues to accumulate and drift to become non-zero.
	# Task-1 has a slow ramp (0.5/tick) and needs ~15+ ticks to exceed ingress
	# capacity of 90 req/tick (starting from ~86).
	action = SREAction(action_type=ActionType.NO_OP, target_node_id="node-0", parameter=0.0)
	obs = None
	for _ in range(warmup_ticks):
	obs = env.step(action)

	# All sub-components should be present
	has_drift = obs.reward_drift != 0.0
	has_cost = obs.reward_cost != 0.0
	record(f"{task_id}: reward_drift non-zero (after {warmup_ticks} ticks)",
	PASS if has_drift else FAIL,
	f"drift={obs.reward_drift:.4f}")
	record(f"{task_id}: reward_cost non-zero",
	PASS if has_cost else FAIL,
	f"cost={obs.reward_cost:.4f}")
	record(f"{task_id}: no NaN in raw_reward",
	PASS if not math.isnan(obs.raw_reward) else FAIL,
	f"raw={obs.raw_reward:.4f}")

	# Quick check: drift and cost should be negative (penalties)
	env = AntiAtroposEnvironment()
	env.reset(task_id="task-1", mode="simulated", seed=42)
	action = SREAction(action_type=ActionType.NO_OP, target_node_id="node-0", parameter=0.0)
	obs = env.step(action)
	record("reward_drift <= 0 (penalty, not reward)",
	PASS if obs.reward_drift <= 0 else FAIL,
	f"drift={obs.reward_drift:.4f}")


	# ============================================================================
	# Main
	# ============================================================================
	def main():
	print("=" * 65)
	print("AntiAtropos DAG Physics Validation")
	print("=" * 65)

	test_A_dag_routing()
	test_B_task2_dag()
	test_C_task3_surge_dag()
	test_D_backpressure_temporary()
	test_E_gradual_recovery()
	test_F_graph_cascade()
	test_G_reroute_with_dag()
	test_H_graph_lyapunov()
	test_I_env_graph_fields()
	test_J_reward_components()

	passed = sum(1 for _, s, _ in results if s == PASS)
	failed = sum(1 for _, s, _ in results if s == FAIL)
	total = len(results)

	print("\n" + "=" * 65)
	print(f"RESULTS: {passed}/{total} passed, {failed} failed")
	print("=" * 65)

	if failed > 0:
	print("\nFAILED TESTS:")
	for name, status, detail in results:
	if status == FAIL:
	print(f" X {name}: {detail}")

	return 0 if failed == 0 else 1


	if __name__ == "__main__":
	sys.exit(main())