"""
Incident scenarios for the demo - EXPANDED VERSION WITH REALISM UPGRADES
Version: 3.3.9+realism

Defines ``INCIDENT_SCENARIOS``: a mapping from a scenario's display name to a
nested dict holding its description, severity, component, metrics,
business-impact figures, ROI inputs and a scenario-specific ``realism``
section.
"""

# Every scenario in this module uses the same engineer hourly rate.
_ENGINEER_HOURLY_RATE = 150


def _roi(hourly_revenue_loss, manual_recovery_hours, enterprise_recovery_hours,
         engineers_required, estimated_monthly_occurrences,
         enterprise_savings_percentage):
    """Build one scenario's ``roi_data`` dict with its keys in a fixed order."""
    return {
        "hourly_revenue_loss": hourly_revenue_loss,
        "manual_recovery_hours": manual_recovery_hours,
        "enterprise_recovery_hours": enterprise_recovery_hours,
        "engineers_required": engineers_required,
        "engineer_hourly_rate": _ENGINEER_HOURLY_RATE,
        "estimated_monthly_occurrences": estimated_monthly_occurrences,
        "enterprise_savings_percentage": enterprise_savings_percentage,
    }


def _action(rank, confidence, action, rationale, risk, **extras):
    """Build one ``ranked_actions`` entry.

    The five fields shared by every entry come first; any additional
    keyword arguments are appended afterwards in the order they were passed.
    """
    entry = {
        "rank": rank,
        "confidence": confidence,
        "action": action,
        "rationale": rationale,
        "risk": risk,
    }
    entry.update(extras)
    return entry


def _hypothesis(cause, confidence, evidence, investigation_path):
    """Build one ``competing_hypotheses`` entry."""
    return {
        "cause": cause,
        "confidence": confidence,
        "evidence": evidence,
        "investigation_path": investigation_path,
    }


INCIDENT_SCENARIOS = {
    "Cache Miss Storm": {
        "description": "Redis cluster experiencing 80% cache miss rate causing database overload",
        "severity": "CRITICAL",
        "component": "redis_cache",
        "metrics": {
            "cache_hit_rate": 18.5,
            "database_load": 92,
            "response_time_ms": 1850,
            "affected_users": 45000,
            "eviction_rate_per_sec": 125,
        },
        "business_impact": {
            "revenue_loss_per_hour": 8500,
            "sla_violation": True,
            "customer_sat_change": -40,
            "affected_services": ["API Gateway", "User Service", "Payment"],
        },
        "roi_data": _roi(
            hourly_revenue_loss=8500,
            manual_recovery_hours=1.0,
            enterprise_recovery_hours=0.2,
            engineers_required=4,
            estimated_monthly_occurrences=2,
            enterprise_savings_percentage=0.85,
        ),
        # Realism section: ranked remediation options plus risk/constraint context.
        "realism": {
            "ranked_actions": [
                _action(
                    rank=1,
                    confidence=87,
                    action="Scale Redis cluster from 3 to 5 nodes",
                    rationale="Immediate throughput increase, reduces contention",
                    risk="Cold cache amplification: Medium",
                    tradeoff="Adds $420/month infrastructure cost",
                    execution_time="8-12 minutes",
                    success_rate="94% based on 18 similar incidents",
                ),
                _action(
                    rank=2,
                    confidence=62,
                    action="Implement request coalescing with 500ms window",
                    rationale="Reduces duplicate DB queries, lower blast radius",
                    risk="Adds 150-200ms latency per request",
                    tradeoff="Slower stabilization (15-20 minutes)",
                    rejection_note="Secondary option if scaling unavailable",
                ),
                _action(
                    rank=3,
                    confidence=34,
                    action="Restart Redis cluster with warmup script",
                    rationale="Clears fragmentation, resets eviction policies",
                    risk="HIGH: 45-second service interruption",
                    rejection_reason="Rejected: High data loss risk during peak traffic",
                    safety_override="Required for Enterprise execution",
                ),
            ],
            "risk_assessment": {
                "stampede_probability": "18%",
                "cold_cache_impact": "Medium",
                "data_inconsistency_risk": "Low",
                "recovery_complexity": "Medium",
            },
            "constraints": {
                "max_redis_nodes": 8,
                "scaling_cooldown": "30 minutes",
                "concurrent_connections": "25,000",
                "data_size_gb": 42,
            },
            "confidence_degradation": {
                "initial": 94,
                "after_8_min": 71,
                "after_15_min": 52,
                "escalation_threshold": 60,
            },
        },
    },
    "Database Connection Pool Exhaustion": {
        "description": "PostgreSQL connection pool exhausted causing API timeouts",
        "severity": "HIGH",
        "component": "postgresql_database",
        "metrics": {
            "active_connections": 98,
            "max_connections": 100,
            "api_latency_ms": 2450,
            "error_rate": 15.2,
            "queue_depth": 1250,
            "connection_wait_seconds": 45,
        },
        "business_impact": {
            "revenue_loss_per_hour": 4200,
            "affected_services": ["API Gateway", "User Service", "Payment Service"],
            "sla_violation": True,
            "partner_api_impact": 3,
        },
        "roi_data": _roi(
            hourly_revenue_loss=4200,
            manual_recovery_hours=0.75,
            enterprise_recovery_hours=0.13,
            engineers_required=2,
            estimated_monthly_occurrences=3,
            enterprise_savings_percentage=0.82,
        ),
        # Realism section: ranked remediation options plus constraint awareness.
        "realism": {
            "ranked_actions": [
                _action(
                    rank=1,
                    confidence=82,
                    action="Increase max_connections from 100 to 115 (+15%)",
                    rationale="Immediate relief, within safe operating limits",
                    risk="Disk I/O contention: Medium",
                    constraint="DB max_connections: 82% utilized (pre)",
                    monitoring="Monitor connection churn for 30 minutes",
                ),
                _action(
                    rank=2,
                    confidence=58,
                    action="Enable statement timeout (5s) + connection recycling",
                    rationale="Prevents runaway queries, faster pool turnover",
                    risk="Query cancellation may cause application errors",
                    tradeoff="Adds development/testing overhead",
                ),
                _action(
                    rank=3,
                    confidence=29,
                    action="Switch to pgbouncer in transaction pooling mode",
                    rationale="10x connection multiplexing possible",
                    risk="HIGH: Requires application changes, 2-hour migration",
                    rejection_reason="Rejected: Too invasive for incident response",
                ),
            ],
            "constraint_awareness": {
                "disk_io_headroom": "Low",
                "memory_available_gb": 8.2,
                "pool_increase_cap": "+15%",
                "monitoring_gap": "Connection churn not tracked",
            },
        },
    },
    "Kubernetes Memory Leak": {
        "description": "Java microservice memory leak causing pod restarts",
        "severity": "HIGH",
        "component": "java_payment_service",
        "metrics": {
            "memory_usage": 96,
            "gc_pause_time_ms": 4500,
            "error_rate": 28.5,
            "restart_frequency_per_hour": 12,
            "heap_fragmentation": 42,
        },
        "business_impact": {
            "revenue_loss_per_hour": 5500,
            "session_loss": 8500,
            "payment_failures_percentage": 3.2,
            "support_tickets_increase": 300,
        },
        "roi_data": _roi(
            hourly_revenue_loss=5500,
            manual_recovery_hours=1.5,
            enterprise_recovery_hours=0.25,
            engineers_required=3,
            estimated_monthly_occurrences=1,
            enterprise_savings_percentage=0.79,
        ),
        # Realism section: ranked remediation options with blast-radius costs.
        "realism": {
            "ranked_actions": [
                _action(
                    rank=1,
                    confidence=76,
                    action="Canary restart (1/4 pods) with heap dump analysis",
                    rationale="Minimizes blast radius, enables root cause capture",
                    risk="Cold-start latency: +2.3s per pod",
                    blast_radius_economics={
                        "canary_restart_cost": "$850",
                        "full_restart_cost": "$3,400",
                        "payment_retry_risk": "Medium",
                        "safer_order": "Canary → scale → rollout",
                    },
                ),
                _action(
                    rank=2,
                    confidence=63,
                    action="Increase heap from 2GB to 3GB with monitoring",
                    rationale="Buy time for analysis, reduces restart frequency",
                    risk="Delays root cause identification",
                    tradeoff="Temporary fix, adds memory cost",
                ),
            ],
        },
    },
    "Network Partition": {
        "description": "Network partition causing split-brain in distributed database",
        "severity": "CRITICAL",
        "component": "distributed_database",
        "metrics": {
            "partition_detected": True,
            "write_conflicts": 1250,
            "data_inconsistency_percentage": 8.5,
            "replication_lag_seconds": 45,
            "quorum_lost": True,
        },
        "business_impact": {
            "revenue_loss_per_hour": 12000,
            "data_corruption_risk": True,
            "recovery_complexity": "HIGH",
            "compliance_violation": True,
        },
        "roi_data": _roi(
            hourly_revenue_loss=12000,
            manual_recovery_hours=2.0,
            enterprise_recovery_hours=0.3,
            engineers_required=5,
            estimated_monthly_occurrences=0.5,
            enterprise_savings_percentage=0.88,
        ),
        # Realism section: candidate root causes; the confidences sum to 100.
        "realism": {
            "competing_hypotheses": [
                _hypothesis(
                    cause="Network partition (control plane)",
                    confidence=61,
                    evidence="Quorum lost, replication lag > 30s",
                    investigation_path="Check network mesh, BGP status",
                ),
                _hypothesis(
                    cause="Control plane overload",
                    confidence=24,
                    evidence="High CPU on orchestration nodes",
                    investigation_path="Scale control plane, check etcd health",
                ),
                _hypothesis(
                    cause="Downstream timeout amplification",
                    confidence=15,
                    evidence="Cascading failures in 3 dependent services",
                    investigation_path="Implement circuit breakers",
                ),
            ],
        },
    },
    "API Rate Limit Storm": {
        "description": "Third-party API rate limiting causing cascading failures",
        "severity": "MEDIUM",
        "component": "external_api_gateway",
        "metrics": {
            "rate_limit_hits_percentage": 95,
            "error_rate": 42.8,
            "retry_storm": True,
            "cascade_effect_services": 3,
            "queue_backlog": 8500,
        },
        "business_impact": {
            "revenue_loss_per_hour": 3800,
            "partner_sla_breach": True,
            "data_sync_delay_hours": 4,
            "customer_reports_delay_hours": 6,
        },
        "roi_data": _roi(
            hourly_revenue_loss=3800,
            manual_recovery_hours=1.25,
            enterprise_recovery_hours=0.17,
            engineers_required=3,
            estimated_monthly_occurrences=4,
            enterprise_savings_percentage=0.85,
        ),
        # Realism section: contract terms attached to the third-party API.
        "realism": {
            "contract_aware_reasoning": {
                "burst_limit": "1.2× allowed",
                "penalty_window": "15 minutes",
                "degradation_mode": "Non-premium users only",
                "contractual_limits": {
                    "requests_per_second": 100,
                    "monthly_overage_fee": "$0.15/request",
                    "suspension_threshold": "3 violations/month",
                },
            },
        },
    },
    "Storage I/O Saturation": {
        "description": "Storage system I/O saturation causing application timeouts",
        "severity": "HIGH",
        "component": "storage_cluster",
        "metrics": {
            "io_utilization": 98,
            "latency_ms": 450,
            "throughput_mbps": 1250,
            "queue_depth": 850,
            "error_rate": 8.5,
        },
        "business_impact": {
            "revenue_loss_per_hour": 6800,
            "data_processing_delay_hours": 3,
            "analytics_backlog": True,
            "reporting_failure": True,
        },
        "roi_data": _roi(
            hourly_revenue_loss=6800,
            manual_recovery_hours=1.75,
            enterprise_recovery_hours=0.22,
            engineers_required=3,
            estimated_monthly_occurrences=1.5,
            enterprise_savings_percentage=0.83,
        ),
        # Realism section: warnings about hard-to-reverse remediation.
        "realism": {
            "irreversibility_warnings": {
                "rebalance_duration": "18-25 minutes",
                "write_amplification_risk": "High",
                "requires_explicit_approval": True,
                "approval_level": "Director+",
                "rollback_complexity": "High (requires snapshot restore)",
            },
        },
    },
}