| Cache Miss Storm: | |
| description: "Redis cluster experiencing 80% cache miss rate causing database overload" | |
| severity: "CRITICAL" | |
| component: "redis_cache" | |
| metrics: | |
| cache_hit_rate: 18.5 | |
| database_load: 92 | |
| response_time_ms: 1850 | |
| affected_users: 45000 | |
| eviction_rate_per_sec: 125 | |
| business_impact: | |
| revenue_loss_per_hour: 8500 | |
| sla_violation: true | |
| customer_sat_change: -40 | |
| affected_services: | |
| - "API Gateway" | |
| - "User Service" | |
| - "Payment Service" | |
| roi_data: | |
| hourly_revenue_loss: 8500 | |
| manual_recovery_hours: 1.0 | |
| enterprise_recovery_hours: 0.2 | |
| engineers_required: 4 | |
| engineer_hourly_rate: 150 | |
| estimated_monthly_occurrences: 2 | |
| enterprise_savings_percentage: 0.85 | |
| Database Connection Pool Exhaustion: | |
| description: "PostgreSQL connection pool exhausted causing API timeouts" | |
| severity: "HIGH" | |
| component: "postgresql_database" | |
| metrics: | |
| active_connections: 98 | |
| max_connections: 100 | |
| api_latency_ms: 2450 | |
| error_rate: 15.2 | |
| queue_depth: 1250 | |
| connection_wait_seconds: 45 | |
| business_impact: | |
| revenue_loss_per_hour: 4200 | |
| affected_services: | |
| - "API Gateway" | |
| - "User Service" | |
| - "Payment Service" | |
| sla_violation: true | |
| partner_api_impact: 3 | |
| roi_data: | |
| hourly_revenue_loss: 4200 | |
| manual_recovery_hours: 0.75 | |
| enterprise_recovery_hours: 0.13 | |
| engineers_required: 2 | |
| engineer_hourly_rate: 150 | |
| estimated_monthly_occurrences: 3 | |
| enterprise_savings_percentage: 0.82 | |
| Kubernetes Memory Leak: | |
| description: "Java microservice memory leak causing pod restarts" | |
| severity: "HIGH" | |
| component: "java_payment_service" | |
| metrics: | |
| memory_usage: 96 | |
| gc_pause_time_ms: 4500 | |
| error_rate: 28.5 | |
| restart_frequency_per_hour: 12 | |
| heap_fragmentation: 42 | |
| business_impact: | |
| revenue_loss_per_hour: 5500 | |
| session_loss: 8500 | |
| payment_failures_percentage: 3.2 | |
| support_tickets_increase: 300 | |
| roi_data: | |
| hourly_revenue_loss: 5500 | |
| manual_recovery_hours: 1.5 | |
| enterprise_recovery_hours: 0.25 | |
| engineers_required: 3 | |
| engineer_hourly_rate: 150 | |
| estimated_monthly_occurrences: 1 | |
| enterprise_savings_percentage: 0.79 | |
| API Rate Limit Storm: | |
| description: "Third-party API rate limiting causing cascading failures" | |
| severity: "MEDIUM" | |
| component: "external_api_gateway" | |
| metrics: | |
| rate_limit_hits_percentage: 95 | |
| error_rate: 42.8 | |
| retry_storm: true | |
| cascade_effect_services: 3 | |
| queue_backlog: 8500 | |
| business_impact: | |
| revenue_loss_per_hour: 3800 | |
| partner_sla_breach: true | |
| data_sync_delay_hours: 4 | |
| customer_reports_delay_hours: 6 | |
| roi_data: | |
| hourly_revenue_loss: 3800 | |
| manual_recovery_hours: 1.25 | |
| enterprise_recovery_hours: 0.17 | |
| engineers_required: 3 | |
| engineer_hourly_rate: 150 | |
| estimated_monthly_occurrences: 4 | |
| enterprise_savings_percentage: 0.85 | |
| Network Partition: | |
| description: "Network partition causing split-brain in distributed database" | |
| severity: "CRITICAL" | |
| component: "distributed_database" | |
| metrics: | |
| partition_detected: true | |
| write_conflicts: 1250 | |
| data_inconsistency_percentage: 8.5 | |
| replication_lag_seconds: 45 | |
| quorum_lost: true | |
| business_impact: | |
| revenue_loss_per_hour: 12000 | |
| data_corruption_risk: true | |
| recovery_complexity: "HIGH" | |
| compliance_violation: true | |
| roi_data: | |
| hourly_revenue_loss: 12000 | |
| manual_recovery_hours: 2.0 | |
| enterprise_recovery_hours: 0.3 | |
| engineers_required: 5 | |
| engineer_hourly_rate: 150 | |
| estimated_monthly_occurrences: 0.5 | |
| enterprise_savings_percentage: 0.88 | |
| Storage I/O Saturation: | |
| description: "Storage system I/O saturation causing application timeouts" | |
| severity: "HIGH" | |
| component: "storage_cluster" | |
| metrics: | |
| io_utilization: 98 | |
| latency_ms: 450 | |
| throughput_mbps: 1250 | |
| queue_depth: 850 | |
| error_rate: 8.5 | |
| business_impact: | |
| revenue_loss_per_hour: 6800 | |
| data_processing_delay_hours: 3 | |
| analytics_backlog: true | |
| reporting_failure: true | |
| roi_data: | |
| hourly_revenue_loss: 6800 | |
| manual_recovery_hours: 1.75 | |
| enterprise_recovery_hours: 0.22 | |
| engineers_required: 3 | |
| engineer_hourly_rate: 150 | |
| estimated_monthly_occurrences: 1.5 | |
| enterprise_savings_percentage: 0.83 |