Spaces:
Configuration error
Configuration error
| """ | |
| Graded tasks with curriculum-based difficulty levels for the | |
| Distributed Infrastructure Management Environment. | |
| Each task provides: | |
| - setup(env, rng): configure initial node states and scenario parameters | |
| - grade(env): return float in (0.0, 1.0) with partial credit | |
| - is_done(env): termination condition check | |
| - hint: natural language task description for the agent | |
| Curriculum Levels | |
| ----------------- | |
| Level 1 Warm Start — Identify the failing node from logs (high success rate) | |
| Level 2 Single Fix — One node fails, agent must restart it | |
| Level 3 Stochastic — Gaussian traffic spikes, multi-step interventions | |
| Level 4 Expert — Brutal cascading failures with tight budgets | |
| """ | |
| from __future__ import annotations | |
| from typing import TYPE_CHECKING | |
| if TYPE_CHECKING: | |
| import random | |
| from server.environment import DistributedInfraEnvironment | |
| # ============================================================================ | |
| # Level 1 — Warm Start: Read Logs & Identify Failing Node | |
| # ============================================================================ | |
| def _setup_level_1(env: "DistributedInfraEnvironment", rng: "random.Random"): | |
| """One node is pre-failed. Agent just needs to identify it via query_logs.""" | |
| sim = env.sim | |
| sim.max_steps = 15 | |
| sim.current_request_rate = sim.base_request_rate * 1.0 # normal traffic | |
| # Fail one random node | |
| fail_idx = rng.randint(0, len(sim.nodes) - 1) | |
| sim.nodes[fail_idx].is_failed = True | |
| sim.nodes[fail_idx].cpu_util = 0.0 | |
| sim.nodes[fail_idx].queue_length = 0 | |
| def _grade_level_1(env: "DistributedInfraEnvironment") -> float: | |
| """ | |
| Score = 0.7 * identified failing node (restarted it) + 0.3 * speed. | |
| """ | |
| sim = env.sim | |
| # Did the agent restart the failed node? | |
| all_alive = all(not n.is_failed for n in sim.nodes) | |
| identification = 1.0 if all_alive else 0.2 | |
| # Speed bonus: faster = better | |
| speed = max(0.0, 1.0 - sim.step_count / sim.max_steps) | |
| score = 0.70 * identification + 0.30 * speed | |
| return round(min(0.99, max(0.01, score)), 4) | |
| def _is_done_level_1(env: "DistributedInfraEnvironment") -> bool: | |
| sim = env.sim | |
| # Done if agent fixed the node or time ran out | |
| all_alive = all(not n.is_failed and n.restart_countdown == 0 for n in sim.nodes) | |
| return all_alive or sim.step_count >= sim.max_steps | |
| # ============================================================================ | |
| # Level 2 / Task 1 — Traffic Spike Recovery | |
| # ============================================================================ | |
| def _setup_traffic_spike(env: "DistributedInfraEnvironment", rng: "random.Random"): | |
| """System receives 3x normal request rate.""" | |
| sim = env.sim | |
| sim.current_request_rate = sim.base_request_rate * 3.0 | |
| sim.max_steps = 30 | |
| # Start with moderate load | |
| for node in sim.nodes: | |
| node.cpu_util = 0.45 + rng.uniform(-0.05, 0.1) | |
| node.queue_length = rng.randint(5, 15) | |
| def _grade_traffic_spike(env: "DistributedInfraEnvironment") -> float: | |
| """ | |
| Score = latency below threshold (50%) + uptime (30%) + resource efficiency (20%). | |
| """ | |
| sim = env.sim | |
| if not sim.latency_history: | |
| return 0.01 | |
| # Latency component: fraction of steps where latency was below target | |
| target = 50.0 # ms | |
| below_target = sum(1 for lat in sim.latency_history if lat < target) | |
| latency_score = below_target / len(sim.latency_history) | |
| # Uptime component: average uptime ratio | |
| avg_uptime = ( | |
| sum(sim.uptime_history) / len(sim.uptime_history) if sim.uptime_history else 1.0 | |
| ) | |
| # Efficiency: penalty for excessive actions | |
| max_reasonable = sim.max_steps * 0.5 | |
| efficiency = max(0.0, 1.0 - sim.actions_taken / max(1, max_reasonable)) | |
| score = 0.50 * latency_score + 0.30 * avg_uptime + 0.20 * efficiency | |
| return round(min(0.99, max(0.01, score)), 4) | |
| def _is_done_traffic_spike(env: "DistributedInfraEnvironment") -> bool: | |
| return env.sim.step_count >= env.sim.max_steps | |
| # ============================================================================ | |
| # Level 2 / Task 2 — Single Node Failure | |
| # ============================================================================ | |
| def _setup_node_failure(env: "DistributedInfraEnvironment", rng: "random.Random"): | |
| """One node will fail at step 5. Agent must maintain 80%+ uptime.""" | |
| sim = env.sim | |
| sim.max_steps = 40 | |
| sim.current_request_rate = sim.base_request_rate * 1.5 | |
| # Mark node 3 for pre-programmed failure | |
| sim.nodes[3].cpu_util = 0.60 | |
| sim.nodes[3].queue_length = 20 | |
| def _grade_node_failure(env: "DistributedInfraEnvironment") -> float: | |
| """ | |
| Score = MTTR (40%) + uptime during failure window (40%) - restart penalty (20%). | |
| """ | |
| sim = env.sim | |
| if not sim.uptime_history: | |
| return 0.01 | |
| # MTTR: how quickly system recovered from the failure | |
| failure_duration = 0 | |
| in_failure = False | |
| for uptime in sim.uptime_history: | |
| if uptime < 1.0: | |
| in_failure = True | |
| failure_duration += 1 | |
| elif in_failure: | |
| break | |
| max_failure_window = 10 | |
| mttr_score = max(0.0, 1.0 - failure_duration / max_failure_window) | |
| # Uptime component: fraction of steps with >80% uptime | |
| above_80 = sum(1 for u in sim.uptime_history if u >= 0.80) | |
| uptime_score = above_80 / len(sim.uptime_history) | |
| # Restart penalty: more than 2 restarts is wasteful | |
| restart_penalty = max(0.0, 1.0 - max(0, sim.restart_count - 1) / 5) | |
| score = 0.40 * mttr_score + 0.40 * uptime_score + 0.20 * restart_penalty | |
| return round(min(0.99, max(0.01, score)), 4) | |
| def _is_done_node_failure(env: "DistributedInfraEnvironment") -> bool: | |
| sim = env.sim | |
| # Inject failure at step 5 | |
| if sim.step_count == 5 and 3 < len(sim.nodes) and not sim.nodes[3].is_failed: | |
| sim.nodes[3].is_failed = True | |
| sim.nodes[3].cpu_util = 0.0 | |
| sim.nodes[3].queue_length = 0 | |
| # Redistribute its load | |
| env._redistribute_from_node(3) | |
| return sim.step_count >= sim.max_steps | |
| # ============================================================================ | |
| # Level 2 — Alias: Single Fix (same as node_failure) | |
| # ============================================================================ | |
# Level 2 reuses the node-failure hooks unchanged; only the registry key and
# hint text differ.
_setup_level_2, _grade_level_2, _is_done_level_2 = (
    _setup_node_failure,
    _grade_node_failure,
    _is_done_node_failure,
)
| # ============================================================================ | |
| # Level 3 / Task 3 — Cascading Failure Prevention (Stochastic) | |
| # ============================================================================ | |
| def _setup_cascading_failure(env: "DistributedInfraEnvironment", rng: "random.Random"): | |
| """Two nodes near critical CPU. Agent must prevent cascade chain.""" | |
| sim = env.sim | |
| sim.max_steps = 50 | |
| sim.current_request_rate = sim.base_request_rate * 2.0 | |
| # Put nodes 1 and 4 near critical | |
| sim.nodes[1].cpu_util = 0.88 | |
| sim.nodes[1].queue_length = 30 | |
| sim.nodes[1].high_cpu_streak = 2 | |
| sim.nodes[4].cpu_util = 0.86 | |
| sim.nodes[4].queue_length = 25 | |
| sim.nodes[4].high_cpu_streak = 1 | |
| # Higher base load across all nodes | |
| for i, node in enumerate(sim.nodes): | |
| if i not in (1, 4): | |
| node.cpu_util = 0.55 + rng.uniform(-0.05, 0.1) | |
| node.queue_length = rng.randint(8, 20) | |
| def _grade_cascading_failure(env: "DistributedInfraEnvironment") -> float: | |
| """ | |
| Score = cascade prevented (50%) + nodes below 85% CPU (30%) | |
| + action efficiency (20%). | |
| """ | |
| sim = env.sim | |
| cascade_score = 1.0 if not sim.cascade_occurred else 0.3 | |
| if sim.uptime_history: | |
| healthy_now = sum(1 for n in sim.nodes if not n.is_failed and n.cpu_util < 0.85) | |
| total_now = len(sim.nodes) | |
| cpu_score = healthy_now / total_now if total_now > 0 else 0.0 | |
| else: | |
| cpu_score = 0.0 | |
| max_reasonable = sim.max_steps * 0.4 | |
| efficiency = max(0.0, 1.0 - sim.actions_taken / max(1, max_reasonable)) | |
| score = 0.50 * cascade_score + 0.30 * cpu_score + 0.20 * efficiency | |
| return round(min(0.99, max(0.01, score)), 4) | |
| def _is_done_cascading_failure(env: "DistributedInfraEnvironment") -> bool: | |
| sim = env.sim | |
| failed_count = sum(1 for n in sim.nodes if n.is_failed) | |
| if failed_count > len(sim.nodes) // 2: | |
| return True | |
| return sim.step_count >= sim.max_steps | |
| # ============================================================================ | |
| # Level 3 — Alias: Stochastic (enhanced version of cascading_failure) | |
| # ============================================================================ | |
def _setup_level_3(env: "DistributedInfraEnvironment", rng: "random.Random"):
    """Configure Level 3: the cascade scenario with a noisy starting load.

    Runs ``_setup_cascading_failure`` first, then overrides two of its
    settings: the starting request rate becomes the baseline times
    ``2.0 + N(0, 0.5)``, and the step budget drops from 50 to 45.

    The Gaussian draw is unbounded, so a rare extreme sample could push the
    multiplier below zero and produce a negative request rate. The
    multiplier is therefore floored at 0.0. Exactly one ``rng.gauss`` sample
    is still consumed, so seeded runs are unchanged whenever the draw is
    non-negative.
    """
    _setup_cascading_failure(env, rng)
    sim = env.sim

    # NOTE(review): only the initial rate is randomised here; any per-step
    # noise would have to come from the simulator -- confirm.
    multiplier = max(0.0, 2.0 + rng.gauss(0, 0.5))
    sim.current_request_rate = sim.base_request_rate * multiplier
    sim.max_steps = 45
# Level 3 customises setup only; grading and termination are shared with the
# cascading-failure task.
_grade_level_3, _is_done_level_3 = (
    _grade_cascading_failure,
    _is_done_cascading_failure,
)
| # ============================================================================ | |
| # Level 4 / Task 4 — Expert: Flash Crowd | |
| # ============================================================================ | |
| def _setup_flash_crowd(env: "DistributedInfraEnvironment", rng: "random.Random"): | |
| """Massive 5x traffic spike. Agent must scale up AND throttle to survive.""" | |
| sim = env.sim | |
| sim.current_request_rate = sim.base_request_rate * 5.0 | |
| sim.max_steps = 40 | |
| for node in sim.nodes: | |
| node.cpu_util = 0.60 + rng.uniform(-0.05, 0.1) | |
| node.queue_length = rng.randint(15, 30) | |
| def _grade_flash_crowd(env: "DistributedInfraEnvironment") -> float: | |
| """ | |
| Score = Survival Uptime (50%) + Latency control (50%). | |
| Cascade penalty applied if the system collapses. | |
| """ | |
| sim = env.sim | |
| avg_uptime = ( | |
| sum(sim.uptime_history) / len(sim.uptime_history) if sim.uptime_history else 0.0 | |
| ) | |
| # Latency target is more generous for a massive flash crowd (100ms) | |
| target = 100.0 | |
| below_target = sum(1 for lat in sim.latency_history if lat < target) | |
| latency_score = ( | |
| below_target / len(sim.latency_history) if sim.latency_history else 0.0 | |
| ) | |
| cascade_penalty = 0.4 if sim.cascade_occurred else 0.0 | |
| score = 0.50 * avg_uptime + 0.50 * latency_score - cascade_penalty | |
| return round(min(0.99, max(0.01, score)), 4) | |
| def _is_done_flash_crowd(env: "DistributedInfraEnvironment") -> bool: | |
| failed_count = sum(1 for n in env.sim.nodes if n.is_failed) | |
| # Terminate early if more than 60% of the cluster dies | |
| if failed_count > len(env.sim.nodes) * 0.6: | |
| return True | |
| return env.sim.step_count >= env.sim.max_steps | |
| # ============================================================================ | |
| # Level 4 — Alias: Expert (flash crowd with tightest constraints) | |
| # ============================================================================ | |
# Level 4 is a straight alias of the flash-crowd hooks; no extra constraints
# are applied here.
_setup_level_4, _grade_level_4, _is_done_level_4 = (
    _setup_flash_crowd,
    _grade_flash_crowd,
    _is_done_flash_crowd,
)
| # ============================================================================ | |
| # Level 5 — Alibaba Trace Replay (Real-World Production Traffic) | |
| # ============================================================================ | |
| def _setup_alibaba_trace(env: "DistributedInfraEnvironment", rng: "random.Random"): | |
| """Load real Alibaba cluster trace and replay it step-by-step.""" | |
| from server.trace_loader import load_default_trace | |
| sim = env.sim | |
| sim.max_steps = 60 # ~30 minutes of real time at 30s intervals (or longer trace window) | |
| sim.cloud_budget = 8 # tight budget | |
| sim.error_budget = 100.0 | |
| sim.scenario = sim.task_id | |
| trace = load_default_trace() | |
| if trace is not None: | |
| sim.trace_replay = trace | |
| # Start replay from a deterministic benchmark offset when provided, | |
| # otherwise preserve the existing stochastic task variation. | |
| if not sim.trace_offset_locked: | |
| sim.trace_offset = rng.randint(0, max(1, len(trace) - sim.max_steps)) | |
| sim.current_request_rate = trace.get_step(0, offset=sim.trace_offset).request_rate | |
| else: | |
| # Fallback: synthetic 2x traffic if trace not generated | |
| sim.current_request_rate = sim.base_request_rate * 2.0 | |
| # Pre-stress the cluster slightly | |
| for node in sim.nodes: | |
| if node.role == "app_server": | |
| node.cpu_util = 0.40 + rng.uniform(-0.05, 0.1) | |
| node.queue_length = rng.randint(3, 12) | |
| elif node.role == "database": | |
| node.cpu_util = 0.35 + rng.uniform(-0.03, 0.05) | |
| node.queue_length = rng.randint(2, 8) | |
| def _grade_alibaba_trace(env: "DistributedInfraEnvironment") -> float: | |
| """ | |
| Score = Uptime (35%) + Latency (30%) + Throughput (20%) + Efficiency (15%). | |
| """ | |
| sim = env.sim | |
| # Uptime | |
| avg_uptime = ( | |
| sum(sim.uptime_history) / len(sim.uptime_history) if sim.uptime_history else 0.0 | |
| ) | |
| # Latency: fraction of steps below 80ms (more generous for real traffic) | |
| target = 80.0 | |
| below_target = sum(1 for lat in sim.latency_history if lat < target) | |
| latency_score = ( | |
| below_target / len(sim.latency_history) if sim.latency_history else 0.0 | |
| ) | |
| # Throughput: did the agent actually serve requests? | |
| throughput_ratio = sim.total_requests_served / max(1, sim.total_requests_received) | |
| throughput_score = min(1.0, throughput_ratio / 0.6) # 60% = full marks | |
| # Efficiency: budget conservation | |
| budget_used = 8 - sim.cloud_budget | |
| efficiency_score = max(0.0, 1.0 - budget_used / 8) | |
| score = ( | |
| 0.35 * avg_uptime | |
| + 0.30 * latency_score | |
| + 0.20 * throughput_score | |
| + 0.15 * efficiency_score | |
| ) | |
| return round(min(0.99, max(0.01, score)), 4) | |
| def _is_done_alibaba_trace(env: "DistributedInfraEnvironment") -> bool: | |
| sim = env.sim | |
| # Terminate early if >70% of cluster dies | |
| failed_count = sum(1 for n in sim.nodes if n.is_failed) | |
| if failed_count > len(sim.nodes) * 0.7: | |
| return True | |
| return sim.step_count >= sim.max_steps | |
| # ============================================================================ | |
| # Task Registry | |
| # ============================================================================ | |
def _task(setup, grade, is_done, hint):
    """Assemble one registry entry from the four hooks every task exposes."""
    return {"setup": setup, "grade": grade, "is_done": is_done, "hint": hint}


def _trace_task(hint):
    """Build a registry entry that uses the shared trace-replay hooks."""
    return _task(
        _setup_alibaba_trace, _grade_alibaba_trace, _is_done_alibaba_trace, hint
    )


# Registry of every task id -> {"setup", "grade", "is_done", "hint"}.
# NOTE(review): several hints contain a doubled "%%". That only renders as a
# single percent sign if the consumer applies %-formatting to the hint;
# confirm how hints are displayed before changing the text.
TASKS = {
    # --- Curriculum levels ---
    "level_1_read_logs": _task(
        _setup_level_1,
        _grade_level_1,
        _is_done_level_1,
        (
            "WARM START (Level 1): One node in your cluster has silently failed. "
            "Use 'query_logs' to investigate nodes with telemetry dropouts and "
            "identify the failing node. Then restart it. "
            "This is a diagnostic exercise — focus on observation before action."
        ),
    ),
    "level_2_single_fix": _task(
        _setup_level_2,
        _grade_level_2,
        _is_done_level_2,
        (
            "SINGLE FIX (Level 2): A node failure will occur during this episode. "
            "Detect the failure, restart the affected node, and maintain uptime "
            "above 80%%. Minimise unnecessary restarts."
        ),
    ),
    "level_3_stochastic": _task(
        _setup_level_3,
        _grade_level_3,
        _is_done_level_3,
        (
            "STOCHASTIC SPIKES (Level 3): Traffic follows a noisy Gaussian pattern. "
            "Multiple nodes are near critical CPU. Proactively reroute traffic and "
            "scale up before cascading failures occur. Telemetry may be spotty — "
            "use query_logs to investigate timeouts."
        ),
    ),
    "level_4_expert": _task(
        _setup_level_4,
        _grade_level_4,
        _is_done_level_4,
        (
            "EXPERT MODE (Level 4): A brutal 5x flash crowd with tight cloud budget. "
            "You MUST aggressively scale up AND throttle to survive. Budget is limited — "
            "every scale_up costs 1 unit. If you exhaust your budget, you cannot add "
            "more capacity. Plan wisely."
        ),
    ),
    # --- Original task aliases (backward-compatible) ---
    "traffic_spike": _task(
        _setup_traffic_spike,
        _grade_traffic_spike,
        _is_done_traffic_spike,
        (
            "TRAFFIC SPIKE: The system is experiencing 3x normal request volume. "
            "Your goal is to keep latency below 50ms while maintaining full uptime. "
            "Consider rerouting traffic from overloaded nodes, scaling up capacity, "
            "or throttling incoming requests. Minimize unnecessary actions."
        ),
    ),
    "node_failure": _task(
        _setup_node_failure,
        _grade_node_failure,
        _is_done_node_failure,
        (
            "NODE FAILURE: A node failure will occur during this episode. "
            "You must detect the failure, restart the affected node, and maintain "
            "system uptime above 80%%. React quickly — Mean Time To Repair matters. "
            "Avoid unnecessary restarts of healthy nodes."
        ),
    ),
    "cascading_failure": _task(
        _setup_cascading_failure,
        _grade_cascading_failure,
        _is_done_cascading_failure,
        (
            "CASCADING FAILURE PREVENTION: Two nodes are near critical CPU load "
            "(>85%%). If they reach 90%% for 3 consecutive steps, they will fail "
            "and their load will cascade to neighbors, potentially triggering a "
            "chain reaction. ACT PROACTIVELY: reroute traffic away from hot nodes "
            "BEFORE they fail. Scaling up can help absorb excess load. "
            "Prevention is rewarded more than recovery."
        ),
    ),
    "flash_crowd": _task(
        _setup_flash_crowd,
        _grade_flash_crowd,
        _is_done_flash_crowd,
        (
            "FLASH CROWD: The system is facing an unprecedented 5x traffic surge. "
            "Your objective is pure survival. You MUST aggressively use 'scale_up' "
            "to add capacity AND use 'throttle' to drop excess traffic. "
            "If you do not shed load, the cluster will collapse."
        ),
    ),
    # --- Level 5: Alibaba Trace Replay ---
    "level_5_alibaba_trace": _trace_task(
        "ALIBABA TRACE REPLAY (Level 5): You are operating on REAL production "
        "traffic from Alibaba's microservices cluster (2021 trace data). "
        "Traffic has multimodal spikes, micro-bursts, and silent maintenance windows. "
        "Node 0 is the DATABASE (single point of failure). Nodes 1-7 are app servers. "
        "New nodes have a 3-step cold start. Budget is tight (8 credits). "
        "Read Prometheus metrics carefully — they follow production scrape format."
    ),
    # --- Level 5 Trace-backed chaos scenarios (aliases for evaluation loops) ---
    "thundering_herd": _trace_task(
        "THUNDERING HERD: Sudden heavy-tailed traffic spikes drive tail latency and "
        "retry amplification. Use throttling judiciously to prevent runaway retries "
        "while protecting the DB (node 0)."
    ),
    "zombie_node": _trace_task(
        "ZOMBIE NODE: A worker may appear underutilized while contributing to severe "
        "tail latency. Consider rerouting away from suspicious nodes and restarting "
        "if needed."
    ),
    "memory_leak_slow_burn": _trace_task(
        "MEMORY LEAK (SLOW BURN): One worker's memory creeps upward toward an OOM cliff. "
        "Scaling does not fix leaks—restart the leaking pod before it crosses the cliff."
    ),
    "split_brain_io_bottleneck": _trace_task(
        "SPLIT-BRAIN / IO BOTTLENECK: The database disk (node 0) can saturate (high io_wait). "
        "Avoid scaling when DB I/O is pegged; shed load to protect the SPOF."
    ),
    "black_swan_az_failure": _trace_task(
        "BLACK SWAN: Multi-node failures can cascade. Prioritize survivor protection "
        "and graceful load shedding while recovering failed capacity."
    ),
    "retry_storm": _trace_task(
        "RETRY STORM: Tail latency spikes can induce exponential retries. Break the loop "
        "with traffic throttling before the DB collapses."
    ),
    "hot_shard_skew": _trace_task(
        "HOT SHARD / SKEW: One worker may run hot while others remain cool. Prefer traffic "
        "shifts (reroute) over scaling when the cluster has spare headroom."
    ),
    "connection_pool_deadlock": _trace_task(
        "CONNECTION POOL DEADLOCK: Symptoms can look like low CPU with high latency. "
        "Reroute away from the stuck node or restart it to clear deadlocks."
    ),
    "autoscaler_flapping_trap": _trace_task(
        "AUTOSCALER FLAPPING TRAP: Avoid overreacting to small oscillations. Prefer no_op "
        "when stable, and make only high-confidence interventions."
    ),
}