""" Task catalog for the hardened NovaTech incident environment. """ from __future__ import annotations from typing import Dict, List DEPENDENCY_GRAPH: Dict[str, List[str]] = { "auth-service": ["user-service", "payment-api", "ldap-directory"], "payment-api": ["auth-service", "payment-gateway", "mysql"], "order-service": ["payment-api", "mysql", "notification-service"], "notification-service": ["order-service", "email-relay"], "reporting-service": ["mysql"], "user-service": ["auth-service", "ldap-directory"], } CONTAINMENT_DESCRIPTIONS: Dict[str, str] = { "increase_auth_heap": "Increase heap headroom for auth-service.", "enable_login_rate_limiting": "Slow abusive login traffic without fully blocking healthy users.", "restore_payment_gateway_connectivity": "Repair egress, routes, or credentials to the payment gateway.", "reduce_checkout_retry_pressure": "Reduce retry storms and non-critical checkout retries.", "free_order_log_disk": "Recover /var/log capacity on order-service hosts.", "reset_mysql_connection_pool": "Safely recycle exhausted database connection pools.", "renew_smtp_certificate": "Renew SMTP or relay TLS certificates before reconnecting.", "reroute_notification_traffic": "Use a safe backup path for notifications.", "page_major_incident_team": "Escalate to major-incident coordination.", "block_all_login_traffic": "Broadly block all login traffic.", "wipe_application_logs": "Delete logs to free resources.", "restart_everything": "Restart all major services immediately.", } TASK_SPECS: Dict[str, Dict[str, object]] = { "easy": { "incident_id": "INC-2101", "title": "Auth Heap Exhaustion", "max_steps": 6, "objective": "Detect the direct cause of a login outage and choose a safe first containment move.", "incident_window_start": "2025-06-15 02:00:00", "incident_window_end": "2025-06-15 02:25:59", "suspected_services": ["auth-service", "user-service", "payment-api"], "customer_statement": "Support agents report that enterprise admins cannot log in reliably.", "operational_constraints": [ "Do not erase evidence.", "Do not fully block all login traffic.", "You have six actions before leadership expects a recommendation.", ], "scope_servers": ["server_01"], "scope_services": ["auth-service", "user-service", "payment-api", "notification-service"], "must_include_ids": [72, 74, 75, 76, 77], "noise_sample_size": 8, "gold_evidence_ids": [74, 76, 77], "plausible_but_wrong_ids": [72, 75], "root_cause": { "primary_service": "auth-service", "failure_mode": "resource_exhaustion", "dependency": "none", "customer_impact": "login_failures", }, "impacted_services": ["auth-service"], "recommended_containment": ["increase_auth_heap", "enable_login_rate_limiting"], "forbidden_containment": ["block_all_login_traffic", "wipe_application_logs", "restart_everything"], }, "medium": { "incident_id": "INC-2144", "title": "Checkout Competing Hypotheses", "max_steps": 7, "objective": "Distinguish a genuine payment dependency outage from plausible but unrelated upstream noise.", "incident_window_start": "2025-06-15 06:20:00", "incident_window_end": "2025-06-15 06:45:59", "suspected_services": ["payment-api", "auth-service", "user-service"], "customer_statement": "Customers complete checkout, but confirmations remain pending for tens of seconds.", "operational_constraints": [ "Keep checkout partially available if possible.", "Avoid blind restarts.", "You must justify why the leading hypothesis beats the alternative.", ], "scope_servers": ["server_01", "server_02"], "scope_services": ["payment-api", "auth-service", "user-service", "order-service"], "must_include_ids": [74, 76, 77, 193, 194, 195, 607], "noise_sample_size": 12, "gold_evidence_ids": [193, 194, 195], "plausible_but_wrong_ids": [74, 76, 77, 607], "root_cause": { "primary_service": "payment-api", "failure_mode": "dependency_outage", "dependency": "payment-gateway", "customer_impact": "checkout_delays", }, "impacted_services": ["payment-api"], "recommended_containment": ["restore_payment_gateway_connectivity", "reduce_checkout_retry_pressure"], "forbidden_containment": ["restart_everything", "wipe_application_logs"], }, "hard": { "incident_id": "INC-2199", "title": "Cascading Multi-Service Incident", "max_steps": 9, "objective": "Reconstruct a cascading outage under partial observability, identify the initiating service, and choose layered containment.", "incident_window_start": "2025-06-15 02:00:00", "incident_window_end": "2025-06-15 18:45:00", "suspected_services": ["auth-service", "payment-api", "order-service", "notification-service", "reporting-service"], "customer_statement": "Users report failed logins, stuck payments, delayed orders, and missing outbound notifications.", "operational_constraints": [ "Preserve evidence across services.", "Prioritize customer safety and data integrity over broad resets.", "A major-incident bridge is already open.", ], "scope_servers": ["server_01", "server_02", "server_03", "server_04"], "scope_services": ["auth-service", "payment-api", "order-service", "notification-service", "reporting-service", "user-service"], "must_include_ids": [72, 74, 76, 77, 193, 194, 195, 266, 267, 334, 426, 429, 481, 564, 607], "noise_sample_size": 24, "gold_evidence_ids": [74, 76, 77, 193, 194, 266, 267, 426, 429, 564], "plausible_but_wrong_ids": [195, 334, 481, 607], "root_cause": { "primary_service": "auth-service", "failure_mode": "resource_exhaustion", "dependency": "payment-api", "customer_impact": "cross_service_major_incident", }, "impacted_services": ["auth-service", "payment-api", "order-service", "notification-service"], "recommended_containment": [ "increase_auth_heap", "enable_login_rate_limiting", "restore_payment_gateway_connectivity", "free_order_log_disk", "renew_smtp_certificate", "page_major_incident_team", ], "forbidden_containment": ["wipe_application_logs", "block_all_login_traffic", "restart_everything"], }, }