"""
Incident scenarios for the demo - EXPANDED VERSION WITH REALISM UPGRADES
Version: 3.3.9+realism
"""


# Maps a scenario's display name to its record. Every record in this table
# carries the same top-level keys:
#   description, severity, component  - plain strings
#   metrics                           - scenario-specific measurements
#   business_impact                   - scenario-specific impact figures/flags
#   roi_data                          - same seven keys in every scenario
#   realism                           - scenario-specific reasoning payload
#
# The shape of "realism" differs per scenario:
#   Cache Miss Storm                    -> ranked_actions, risk_assessment,
#                                          constraints, confidence_degradation
#   Database Connection Pool Exhaustion -> ranked_actions, constraint_awareness
#   Kubernetes Memory Leak              -> ranked_actions
#   Network Partition                   -> competing_hypotheses
#   API Rate Limit Storm                -> contract_aware_reasoning
#   Storage I/O Saturation              -> irreversibility_warnings
#
# NOTE(review): in every scenario roi_data["hourly_revenue_loss"] repeats
# business_impact["revenue_loss_per_hour"]; keep the two in sync when editing.
# NOTE(review): "enterprise_savings_percentage" holds values in the 0-1 range,
# while other *_percentage fields in this table (e.g. 3.2, 8.5, 95) look like
# 0-100 values - confirm what consumers expect before changing either.
INCIDENT_SCENARIOS = {
    "Cache Miss Storm": {
        "description": "Redis cluster experiencing 80% cache miss rate causing database overload",
        "severity": "CRITICAL",
        "component": "redis_cache",
        "metrics": {
            "cache_hit_rate": 18.5,
            "database_load": 92,
            "response_time_ms": 1850,
            "affected_users": 45000,
            "eviction_rate_per_sec": 125
        },
        "business_impact": {
            "revenue_loss_per_hour": 8500,
            "sla_violation": True,
            "customer_sat_change": -40,
            # NOTE(review): "Payment" here vs "Payment Service" in the
            # connection-pool scenario below - confirm which name is intended.
            "affected_services": ["API Gateway", "User Service", "Payment"]
        },
        "roi_data": {
            "hourly_revenue_loss": 8500,
            "manual_recovery_hours": 1.0,
            "enterprise_recovery_hours": 0.2,
            "engineers_required": 4,
            "engineer_hourly_rate": 150,
            "estimated_monthly_occurrences": 2,
            "enterprise_savings_percentage": 0.85
        },

        "realism": {
            # Candidate remediations, listed in ascending "rank" order.
            "ranked_actions": [
                {
                    "rank": 1,
                    "confidence": 87,
                    "action": "Scale Redis cluster from 3 to 5 nodes",
                    "rationale": "Immediate throughput increase, reduces contention",
                    "risk": "Cold cache amplification: Medium",
                    "tradeoff": "Adds $420/month infrastructure cost",
                    "execution_time": "8-12 minutes",
                    "success_rate": "94% based on 18 similar incidents"
                },
                {
                    "rank": 2,
                    "confidence": 62,
                    "action": "Implement request coalescing with 500ms window",
                    "rationale": "Reduces duplicate DB queries, lower blast radius",
                    "risk": "Adds 150-200ms latency per request",
                    "tradeoff": "Slower stabilization (15-20 minutes)",
                    "rejection_note": "Secondary option if scaling unavailable"
                },
                {
                    "rank": 3,
                    "confidence": 34,
                    "action": "Restart Redis cluster with warmup script",
                    "rationale": "Clears fragmentation, resets eviction policies",
                    "risk": "HIGH: 45-second service interruption",
                    "rejection_reason": "Rejected: High data loss risk during peak traffic",
                    "safety_override": "Required for Enterprise execution"
                }
            ],
            "risk_assessment": {
                "stampede_probability": "18%",
                "cold_cache_impact": "Medium",
                "data_inconsistency_risk": "Low",
                "recovery_complexity": "Medium"
            },
            "constraints": {
                "max_redis_nodes": 8,
                "scaling_cooldown": "30 minutes",
                "concurrent_connections": "25,000",
                "data_size_gb": 42
            },
            "confidence_degradation": {
                "initial": 94,
                "after_8_min": 71,
                "after_15_min": 52,
                "escalation_threshold": 60
            }
        }
    },

    "Database Connection Pool Exhaustion": {
        "description": "PostgreSQL connection pool exhausted causing API timeouts",
        "severity": "HIGH",
        "component": "postgresql_database",
        "metrics": {
            "active_connections": 98,
            "max_connections": 100,
            "api_latency_ms": 2450,
            "error_rate": 15.2,
            "queue_depth": 1250,
            "connection_wait_seconds": 45
        },
        "business_impact": {
            "revenue_loss_per_hour": 4200,
            "affected_services": ["API Gateway", "User Service", "Payment Service"],
            "sla_violation": True,
            "partner_api_impact": 3
        },
        "roi_data": {
            "hourly_revenue_loss": 4200,
            "manual_recovery_hours": 0.75,
            "enterprise_recovery_hours": 0.13,
            "engineers_required": 2,
            "engineer_hourly_rate": 150,
            "estimated_monthly_occurrences": 3,
            "enterprise_savings_percentage": 0.82
        },

        "realism": {
            "ranked_actions": [
                {
                    "rank": 1,
                    "confidence": 82,
                    "action": "Increase max_connections from 100 to 115 (+15%)",
                    "rationale": "Immediate relief, within safe operating limits",
                    "risk": "Disk I/O contention: Medium",
                    # NOTE(review): "82% utilized" does not obviously match
                    # metrics above (active_connections 98 of max_connections
                    # 100) - confirm which figure this text refers to.
                    "constraint": "DB max_connections: 82% utilized (pre)",
                    "monitoring": "Monitor connection churn for 30 minutes"
                },
                {
                    "rank": 2,
                    "confidence": 58,
                    "action": "Enable statement timeout (5s) + connection recycling",
                    "rationale": "Prevents runaway queries, faster pool turnover",
                    "risk": "Query cancellation may cause application errors",
                    "tradeoff": "Adds development/testing overhead"
                },
                {
                    "rank": 3,
                    "confidence": 29,
                    "action": "Switch to pgbouncer in transaction pooling mode",
                    "rationale": "10x connection multiplexing possible",
                    "risk": "HIGH: Requires application changes, 2-hour migration",
                    "rejection_reason": "Rejected: Too invasive for incident response"
                }
            ],
            "constraint_awareness": {
                "disk_io_headroom": "Low",
                "memory_available_gb": 8.2,
                "pool_increase_cap": "+15%",
                "monitoring_gap": "Connection churn not tracked"
            }
        }
    },

    "Kubernetes Memory Leak": {
        "description": "Java microservice memory leak causing pod restarts",
        "severity": "HIGH",
        "component": "java_payment_service",
        "metrics": {
            "memory_usage": 96,
            "gc_pause_time_ms": 4500,
            "error_rate": 28.5,
            "restart_frequency_per_hour": 12,
            "heap_fragmentation": 42
        },
        "business_impact": {
            "revenue_loss_per_hour": 5500,
            "session_loss": 8500,
            "payment_failures_percentage": 3.2,
            "support_tickets_increase": 300
        },
        "roi_data": {
            "hourly_revenue_loss": 5500,
            "manual_recovery_hours": 1.5,
            "enterprise_recovery_hours": 0.25,
            "engineers_required": 3,
            "engineer_hourly_rate": 150,
            "estimated_monthly_occurrences": 1,
            "enterprise_savings_percentage": 0.79
        },

        "realism": {
            "ranked_actions": [
                {
                    "rank": 1,
                    "confidence": 76,
                    "action": "Canary restart (1/4 pods) with heap dump analysis",
                    "rationale": "Minimizes blast radius, enables root cause capture",
                    "risk": "Cold-start latency: +2.3s per pod",
                    "blast_radius_economics": {
                        "canary_restart_cost": "$850",
                        "full_restart_cost": "$3,400",
                        "payment_retry_risk": "Medium",
                        "safer_order": "Canary → scale → rollout"
                    }
                },
                {
                    "rank": 2,
                    "confidence": 63,
                    "action": "Increase heap from 2GB to 3GB with monitoring",
                    "rationale": "Buy time for analysis, reduces restart frequency",
                    "risk": "Delays root cause identification",
                    "tradeoff": "Temporary fix, adds memory cost"
                }
            ]
        }
    },

    "Network Partition": {
        "description": "Network partition causing split-brain in distributed database",
        "severity": "CRITICAL",
        "component": "distributed_database",
        "metrics": {
            "partition_detected": True,
            "write_conflicts": 1250,
            "data_inconsistency_percentage": 8.5,
            "replication_lag_seconds": 45,
            "quorum_lost": True
        },
        "business_impact": {
            "revenue_loss_per_hour": 12000,
            "data_corruption_risk": True,
            "recovery_complexity": "HIGH",
            "compliance_violation": True
        },
        "roi_data": {
            "hourly_revenue_loss": 12000,
            "manual_recovery_hours": 2.0,
            "enterprise_recovery_hours": 0.3,
            "engineers_required": 5,
            "engineer_hourly_rate": 150,
            "estimated_monthly_occurrences": 0.5,
            "enterprise_savings_percentage": 0.88
        },

        "realism": {
            # The three "confidence" values below add up to 100.
            "competing_hypotheses": [
                {
                    "cause": "Network partition (control plane)",
                    "confidence": 61,
                    "evidence": "Quorum lost, replication lag > 30s",
                    "investigation_path": "Check network mesh, BGP status"
                },
                {
                    "cause": "Control plane overload",
                    "confidence": 24,
                    "evidence": "High CPU on orchestration nodes",
                    "investigation_path": "Scale control plane, check etcd health"
                },
                {
                    "cause": "Downstream timeout amplification",
                    "confidence": 15,
                    "evidence": "Cascading failures in 3 dependent services",
                    "investigation_path": "Implement circuit breakers"
                }
            ]
        }
    },

    "API Rate Limit Storm": {
        "description": "Third-party API rate limiting causing cascading failures",
        "severity": "MEDIUM",
        "component": "external_api_gateway",
        "metrics": {
            "rate_limit_hits_percentage": 95,
            "error_rate": 42.8,
            "retry_storm": True,
            "cascade_effect_services": 3,
            "queue_backlog": 8500
        },
        "business_impact": {
            "revenue_loss_per_hour": 3800,
            "partner_sla_breach": True,
            "data_sync_delay_hours": 4,
            "customer_reports_delay_hours": 6
        },
        "roi_data": {
            "hourly_revenue_loss": 3800,
            "manual_recovery_hours": 1.25,
            "enterprise_recovery_hours": 0.17,
            "engineers_required": 3,
            "engineer_hourly_rate": 150,
            "estimated_monthly_occurrences": 4,
            "enterprise_savings_percentage": 0.85
        },

        "realism": {
            "contract_aware_reasoning": {
                "burst_limit": "1.2× allowed",
                "penalty_window": "15 minutes",
                "degradation_mode": "Non-premium users only",
                "contractual_limits": {
                    "requests_per_second": 100,
                    "monthly_overage_fee": "$0.15/request",
                    "suspension_threshold": "3 violations/month"
                }
            }
        }
    },

    "Storage I/O Saturation": {
        "description": "Storage system I/O saturation causing application timeouts",
        "severity": "HIGH",
        "component": "storage_cluster",
        "metrics": {
            "io_utilization": 98,
            "latency_ms": 450,
            "throughput_mbps": 1250,
            "queue_depth": 850,
            "error_rate": 8.5
        },
        "business_impact": {
            "revenue_loss_per_hour": 6800,
            "data_processing_delay_hours": 3,
            "analytics_backlog": True,
            "reporting_failure": True
        },
        "roi_data": {
            "hourly_revenue_loss": 6800,
            "manual_recovery_hours": 1.75,
            "enterprise_recovery_hours": 0.22,
            "engineers_required": 3,
            "engineer_hourly_rate": 150,
            "estimated_monthly_occurrences": 1.5,
            "enterprise_savings_percentage": 0.83
        },

        "realism": {
            "irreversibility_warnings": {
                "rebalance_duration": "18-25 minutes",
                "write_amplification_risk": "High",
                "requires_explicit_approval": True,
                "approval_level": "Director+",
                "rollback_complexity": "High (requires snapshot restore)"
            }
        }
    }
}