# CausalOps-Env / tasks / catalog.py
# omm7's picture
# Upload folder using huggingface_hub
# bc2ead7 verified
"""
Task catalog for the hardened NovaTech incident environment.
"""
from __future__ import annotations
from typing import Dict, List
# Adjacency list for the service topology, keyed by service name.
# Names that occur only inside the value lists (ldap-directory,
# payment-gateway, mysql, email-relay) have no entry of their own here.
# The graph contains cycles (e.g. auth-service <-> payment-api), so any
# traversal needs a visited set.
# NOTE(review): presumably each list holds the key's dependencies -- confirm
# the edge direction against the code that consumes this graph.
DEPENDENCY_GRAPH: Dict[str, List[str]] = {
    "auth-service": [
        "user-service",
        "payment-api",
        "ldap-directory",
    ],
    "payment-api": [
        "auth-service",
        "payment-gateway",
        "mysql",
    ],
    "order-service": [
        "payment-api",
        "mysql",
        "notification-service",
    ],
    "notification-service": [
        "order-service",
        "email-relay",
    ],
    "reporting-service": [
        "mysql",
    ],
    "user-service": [
        "auth-service",
        "ldap-directory",
    ],
}
# Maps each containment action identifier to a one-sentence, human-readable
# description of what the action does.
CONTAINMENT_DESCRIPTIONS: Dict[str, str] = {
    # Targeted remediations.
    "increase_auth_heap": "Increase heap headroom for auth-service.",
    "enable_login_rate_limiting": (
        "Slow abusive login traffic without fully blocking healthy users."
    ),
    "restore_payment_gateway_connectivity": (
        "Repair egress, routes, or credentials to the payment gateway."
    ),
    "reduce_checkout_retry_pressure": (
        "Reduce retry storms and non-critical checkout retries."
    ),
    "free_order_log_disk": "Recover /var/log capacity on order-service hosts.",
    "reset_mysql_connection_pool": (
        "Safely recycle exhausted database connection pools."
    ),
    "renew_smtp_certificate": (
        "Renew SMTP or relay TLS certificates before reconnecting."
    ),
    "reroute_notification_traffic": "Use a safe backup path for notifications.",
    # Escalation.
    "page_major_incident_team": "Escalate to major-incident coordination.",
    # Broad actions; the task specs visible in this file only ever list these
    # under "forbidden_containment".
    "block_all_login_traffic": "Broadly block all login traffic.",
    "wipe_application_logs": "Delete logs to free resources.",
    "restart_everything": "Restart all major services immediately.",
}
# Incident task definitions keyed by difficulty tier ("easy", "medium",
# "hard"). Every spec carries the same 19 keys in the same order.
#
# Within each spec, "gold_evidence_ids" and "plausible_but_wrong_ids" are
# disjoint subsets of "must_include_ids". All identifiers listed under
# "recommended_containment" / "forbidden_containment" are keys of
# CONTAINMENT_DESCRIPTIONS.
# NOTE(review): the integer ids presumably refer to records in an external
# log/evidence dataset -- confirm against the loader that consumes them.
TASK_SPECS: Dict[str, Dict[str, object]] = {
    # ------------------------------------------------------------------
    # Easy: single-service failure.
    # ------------------------------------------------------------------
    "easy": {
        "incident_id": "INC-2101",
        "title": "Auth Heap Exhaustion",
        "max_steps": 6,
        "objective": "Detect the direct cause of a login outage and choose a safe first containment move.",
        "incident_window_start": "2025-06-15 02:00:00",
        "incident_window_end": "2025-06-15 02:25:59",
        "suspected_services": [
            "auth-service",
            "user-service",
            "payment-api",
        ],
        "customer_statement": "Support agents report that enterprise admins cannot log in reliably.",
        "operational_constraints": [
            "Do not erase evidence.",
            "Do not fully block all login traffic.",
            # Matches "max_steps" above.
            "You have six actions before leadership expects a recommendation.",
        ],
        "scope_servers": ["server_01"],
        "scope_services": [
            "auth-service",
            "user-service",
            "payment-api",
            "notification-service",
        ],
        "must_include_ids": [72, 74, 75, 76, 77],
        "noise_sample_size": 8,
        "gold_evidence_ids": [74, 76, 77],
        "plausible_but_wrong_ids": [72, 75],
        "root_cause": {
            "primary_service": "auth-service",
            "failure_mode": "resource_exhaustion",
            "dependency": "none",
            "customer_impact": "login_failures",
        },
        "impacted_services": ["auth-service"],
        "recommended_containment": [
            "increase_auth_heap",
            "enable_login_rate_limiting",
        ],
        "forbidden_containment": [
            "block_all_login_traffic",
            "wipe_application_logs",
            "restart_everything",
        ],
    },
    # ------------------------------------------------------------------
    # Medium: real dependency outage mixed with misleading evidence.
    # The easy task's gold ids (74, 76, 77) reappear here as distractors.
    # ------------------------------------------------------------------
    "medium": {
        "incident_id": "INC-2144",
        "title": "Checkout Competing Hypotheses",
        "max_steps": 7,
        "objective": "Distinguish a genuine payment dependency outage from plausible but unrelated upstream noise.",
        "incident_window_start": "2025-06-15 06:20:00",
        "incident_window_end": "2025-06-15 06:45:59",
        "suspected_services": [
            "payment-api",
            "auth-service",
            "user-service",
        ],
        "customer_statement": "Customers complete checkout, but confirmations remain pending for tens of seconds.",
        "operational_constraints": [
            "Keep checkout partially available if possible.",
            "Avoid blind restarts.",
            "You must justify why the leading hypothesis beats the alternative.",
        ],
        "scope_servers": ["server_01", "server_02"],
        "scope_services": [
            "payment-api",
            "auth-service",
            "user-service",
            "order-service",
        ],
        "must_include_ids": [74, 76, 77, 193, 194, 195, 607],
        "noise_sample_size": 12,
        "gold_evidence_ids": [193, 194, 195],
        "plausible_but_wrong_ids": [74, 76, 77, 607],
        "root_cause": {
            "primary_service": "payment-api",
            "failure_mode": "dependency_outage",
            "dependency": "payment-gateway",
            "customer_impact": "checkout_delays",
        },
        "impacted_services": ["payment-api"],
        "recommended_containment": [
            "restore_payment_gateway_connectivity",
            "reduce_checkout_retry_pressure",
        ],
        "forbidden_containment": [
            "restart_everything",
            "wipe_application_logs",
        ],
    },
    # ------------------------------------------------------------------
    # Hard: cascading incident spanning several services and servers.
    # ------------------------------------------------------------------
    "hard": {
        "incident_id": "INC-2199",
        "title": "Cascading Multi-Service Incident",
        "max_steps": 9,
        "objective": "Reconstruct a cascading outage under partial observability, identify the initiating service, and choose layered containment.",
        "incident_window_start": "2025-06-15 02:00:00",
        "incident_window_end": "2025-06-15 18:45:00",
        "suspected_services": [
            "auth-service",
            "payment-api",
            "order-service",
            "notification-service",
            "reporting-service",
        ],
        "customer_statement": "Users report failed logins, stuck payments, delayed orders, and missing outbound notifications.",
        "operational_constraints": [
            "Preserve evidence across services.",
            "Prioritize customer safety and data integrity over broad resets.",
            "A major-incident bridge is already open.",
        ],
        "scope_servers": [
            "server_01",
            "server_02",
            "server_03",
            "server_04",
        ],
        "scope_services": [
            "auth-service",
            "payment-api",
            "order-service",
            "notification-service",
            "reporting-service",
            "user-service",
        ],
        # Id 72 is required here but appears in neither the gold list nor
        # the plausible-but-wrong list below.
        "must_include_ids": [
            72, 74, 76, 77,
            193, 194, 195,
            266, 267,
            334,
            426, 429,
            481,
            564,
            607,
        ],
        "noise_sample_size": 24,
        "gold_evidence_ids": [
            74, 76, 77,
            193, 194,
            266, 267,
            426, 429,
            564,
        ],
        "plausible_but_wrong_ids": [195, 334, 481, 607],
        "root_cause": {
            "primary_service": "auth-service",
            "failure_mode": "resource_exhaustion",
            "dependency": "payment-api",
            "customer_impact": "cross_service_major_incident",
        },
        "impacted_services": [
            "auth-service",
            "payment-api",
            "order-service",
            "notification-service",
        ],
        "recommended_containment": [
            "increase_auth_heap",
            "enable_login_rate_limiting",
            "restore_payment_gateway_connectivity",
            "free_order_log_disk",
            "renew_smtp_certificate",
            "page_major_incident_team",
        ],
        "forbidden_containment": [
            "wipe_application_logs",
            "block_all_login_traffic",
            "restart_everything",
        ],
    },
}