Spaces:
Sleeping
Sleeping
"""
Task catalog for the hardened NovaTech incident environment.
"""
| from __future__ import annotations | |
| from typing import Dict, List | |
# Adjacency list of the service topology: each key is a service name and its
# value is the ordered list of component names linked to it.
# NOTE(review): the edge direction is presumably "key depends on value" --
# confirm against the code that walks this graph.
#
# Two properties of the data that any traversal has to respect:
#   * The graph contains cycles (auth-service <-> user-service,
#     auth-service <-> payment-api, order-service <-> notification-service),
#     so a walk needs a visited set to terminate.
#   * Some listed components (ldap-directory, payment-gateway, mysql,
#     email-relay) never appear as keys, so lookups on them need a default.
DEPENDENCY_GRAPH: Dict[str, List[str]] = {
    "auth-service": ["user-service", "payment-api", "ldap-directory"],
    "payment-api": ["auth-service", "payment-gateway", "mysql"],
    "order-service": ["payment-api", "mysql", "notification-service"],
    "notification-service": ["order-service", "email-relay"],
    "reporting-service": ["mysql"],
    "user-service": ["auth-service", "ldap-directory"],
}
# Lookup table from containment-action identifier to a one-sentence,
# human-readable description of that action.
CONTAINMENT_DESCRIPTIONS: Dict[str, str] = {
    # Actions that appear in at least one task's "recommended_containment" list.
    "increase_auth_heap": "Increase heap headroom for auth-service.",
    "enable_login_rate_limiting": "Slow abusive login traffic without fully blocking healthy users.",
    "restore_payment_gateway_connectivity": "Repair egress, routes, or credentials to the payment gateway.",
    "reduce_checkout_retry_pressure": "Reduce retry storms and non-critical checkout retries.",
    "free_order_log_disk": "Recover /var/log capacity on order-service hosts.",
    # Not referenced by any task spec visible in this file.
    "reset_mysql_connection_pool": "Safely recycle exhausted database connection pools.",
    "renew_smtp_certificate": "Renew SMTP or relay TLS certificates before reconnecting.",
    # Not referenced by any task spec visible in this file.
    "reroute_notification_traffic": "Use a safe backup path for notifications.",
    "page_major_incident_team": "Escalate to major-incident coordination.",
    # Actions that appear in at least one task's "forbidden_containment" list.
    "block_all_login_traffic": "Broadly block all login traffic.",
    "wipe_application_logs": "Delete logs to free resources.",
    "restart_everything": "Restart all major services immediately.",
}
# Catalog of incident tasks, keyed by difficulty label ("easy", "medium",
# "hard"). Every spec carries the same set of fields.
#
# Properties that hold for all three entries below:
#   * "gold_evidence_ids" and "plausible_but_wrong_ids" are disjoint, and both
#     are subsets of "must_include_ids".
#   * Every action named in "recommended_containment" or
#     "forbidden_containment" is a key of CONTAINMENT_DESCRIPTIONS.
#   * "root_cause" always has the keys "primary_service", "failure_mode",
#     "dependency" and "customer_impact".
#
# NOTE(review): the numeric IDs are presumably log-record identifiers and
# "noise_sample_size" presumably the number of distractor records to sample --
# confirm against the code that consumes this catalog.
TASK_SPECS: Dict[str, Dict[str, object]] = {
    "easy": {
        "incident_id": "INC-2101",
        "title": "Auth Heap Exhaustion",
        # Matches the "six actions" wording in operational_constraints.
        "max_steps": 6,
        "objective": "Detect the direct cause of a login outage and choose a safe first containment move.",
        "incident_window_start": "2025-06-15 02:00:00",
        "incident_window_end": "2025-06-15 02:25:59",
        "suspected_services": ["auth-service", "user-service", "payment-api"],
        "customer_statement": "Support agents report that enterprise admins cannot log in reliably.",
        "operational_constraints": [
            "Do not erase evidence.",
            "Do not fully block all login traffic.",
            "You have six actions before leadership expects a recommendation.",
        ],
        "scope_servers": ["server_01"],
        "scope_services": [
            "auth-service",
            "user-service",
            "payment-api",
            "notification-service",
        ],
        "must_include_ids": [72, 74, 75, 76, 77],
        "noise_sample_size": 8,
        "gold_evidence_ids": [74, 76, 77],
        "plausible_but_wrong_ids": [72, 75],
        "root_cause": {
            "primary_service": "auth-service",
            "failure_mode": "resource_exhaustion",
            # The literal string "none" (not None) marks "no dependency".
            "dependency": "none",
            "customer_impact": "login_failures",
        },
        "impacted_services": ["auth-service"],
        "recommended_containment": [
            "increase_auth_heap",
            "enable_login_rate_limiting",
        ],
        "forbidden_containment": [
            "block_all_login_traffic",
            "wipe_application_logs",
            "restart_everything",
        ],
    },
    "medium": {
        "incident_id": "INC-2144",
        "title": "Checkout Competing Hypotheses",
        "max_steps": 7,
        "objective": "Distinguish a genuine payment dependency outage from plausible but unrelated upstream noise.",
        "incident_window_start": "2025-06-15 06:20:00",
        "incident_window_end": "2025-06-15 06:45:59",
        "suspected_services": ["payment-api", "auth-service", "user-service"],
        "customer_statement": "Customers complete checkout, but confirmations remain pending for tens of seconds.",
        "operational_constraints": [
            "Keep checkout partially available if possible.",
            "Avoid blind restarts.",
            "You must justify why the leading hypothesis beats the alternative.",
        ],
        "scope_servers": ["server_01", "server_02"],
        "scope_services": [
            "payment-api",
            "auth-service",
            "user-service",
            "order-service",
        ],
        "must_include_ids": [74, 76, 77, 193, 194, 195, 607],
        "noise_sample_size": 12,
        "gold_evidence_ids": [193, 194, 195],
        # IDs 74, 76 and 77 are gold evidence in the "easy" task but count as
        # plausible-but-wrong here.
        "plausible_but_wrong_ids": [74, 76, 77, 607],
        "root_cause": {
            "primary_service": "payment-api",
            "failure_mode": "dependency_outage",
            "dependency": "payment-gateway",
            "customer_impact": "checkout_delays",
        },
        "impacted_services": ["payment-api"],
        "recommended_containment": [
            "restore_payment_gateway_connectivity",
            "reduce_checkout_retry_pressure",
        ],
        "forbidden_containment": ["restart_everything", "wipe_application_logs"],
    },
    "hard": {
        "incident_id": "INC-2199",
        "title": "Cascading Multi-Service Incident",
        "max_steps": 9,
        "objective": "Reconstruct a cascading outage under partial observability, identify the initiating service, and choose layered containment.",
        "incident_window_start": "2025-06-15 02:00:00",
        # NOTE(review): the other tasks end their window on a ":59" second;
        # confirm whether the final minute (18:45) is meant to be included.
        "incident_window_end": "2025-06-15 18:45:00",
        "suspected_services": [
            "auth-service",
            "payment-api",
            "order-service",
            "notification-service",
            "reporting-service",
        ],
        "customer_statement": "Users report failed logins, stuck payments, delayed orders, and missing outbound notifications.",
        "operational_constraints": [
            "Preserve evidence across services.",
            "Prioritize customer safety and data integrity over broad resets.",
            "A major-incident bridge is already open.",
        ],
        "scope_servers": ["server_01", "server_02", "server_03", "server_04"],
        "scope_services": [
            "auth-service",
            "payment-api",
            "order-service",
            "notification-service",
            "reporting-service",
            "user-service",
        ],
        # NOTE(review): ID 72 is required here but is listed in neither
        # gold_evidence_ids nor plausible_but_wrong_ids -- confirm intended.
        "must_include_ids": [
            72, 74, 76, 77, 193, 194, 195, 266, 267, 334, 426, 429, 481, 564, 607,
        ],
        "noise_sample_size": 24,
        "gold_evidence_ids": [74, 76, 77, 193, 194, 266, 267, 426, 429, 564],
        "plausible_but_wrong_ids": [195, 334, 481, 607],
        "root_cause": {
            "primary_service": "auth-service",
            "failure_mode": "resource_exhaustion",
            "dependency": "payment-api",
            "customer_impact": "cross_service_major_incident",
        },
        "impacted_services": [
            "auth-service",
            "payment-api",
            "order-service",
            "notification-service",
        ],
        "recommended_containment": [
            "increase_auth_heap",
            "enable_login_rate_limiting",
            "restore_payment_gateway_connectivity",
            "free_order_log_disk",
            "renew_smtp_certificate",
            "page_major_incident_team",
        ],
        "forbidden_containment": [
            "wipe_application_logs",
            "block_all_login_traffic",
            "restart_everything",
        ],
    },
}