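# NOTE: the lines below appear to be file-viewer page residue, not YAML data;
# they are kept as comments so the document still parses.
# petter2025's picture
# Create scenarios.yaml
# bdff4ef verified
# raw
# history blame
# 4.23 kB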
# Scenario: Redis cache-miss storm overloading the database.
# Layout: scalar attributes (description, severity, component) followed by
# three nested mappings: metrics, business_impact and roi_data.
Cache Miss Storm:
  description: "Redis cluster experiencing 80% cache miss rate causing database overload"
  severity: "CRITICAL"
  component: "redis_cache"
  # Metric values describing the incident.
  metrics:
    cache_hit_rate: 18.5  # presumably percent (description cites ~80% misses)
    database_load: 92  # presumably percent -- TODO confirm
    response_time_ms: 1850
    affected_users: 45000
    eviction_rate_per_sec: 125
  business_impact:
    revenue_loss_per_hour: 8500
    sla_violation: true
    customer_sat_change: -40
    # NOTE(review): nested under business_impact because the file's other
    # affected_services list (Database Connection Pool Exhaustion) sits
    # between business_impact keys -- confirm against the consumer.
    # NOTE(review): that scenario spells the last entry "Payment Service";
    # the value here is left unchanged.
    affected_services:
      - "API Gateway"
      - "User Service"
      - "Payment"
  # Inputs for a return-on-investment estimate.
  # NOTE(review): assumed to be a scenario-level sibling of metrics and
  # business_impact -- confirm against the consumer.
  roi_data:
    hourly_revenue_loss: 8500  # same figure as business_impact.revenue_loss_per_hour
    manual_recovery_hours: 1.0
    enterprise_recovery_hours: 0.2
    engineers_required: 4
    engineer_hourly_rate: 150
    estimated_monthly_occurrences: 2
    # NOTE(review): looks like a 0-1 fraction despite the "_percentage" name.
    enterprise_savings_percentage: 0.85
# Scenario: PostgreSQL connection pool exhaustion causing API timeouts.
# Layout: scalar attributes (description, severity, component) followed by
# three nested mappings: metrics, business_impact and roi_data.
Database Connection Pool Exhaustion:
  description: "PostgreSQL connection pool exhausted causing API timeouts"
  severity: "HIGH"
  component: "postgresql_database"
  # Metric values describing the incident.
  metrics:
    active_connections: 98  # against max_connections below
    max_connections: 100
    api_latency_ms: 2450
    error_rate: 15.2  # presumably percent -- TODO confirm
    queue_depth: 1250
    connection_wait_seconds: 45
  business_impact:
    revenue_loss_per_hour: 4200
    affected_services:
      - "API Gateway"
      - "User Service"
      - "Payment Service"
    sla_violation: true
    partner_api_impact: 3  # presumably a count of partner APIs -- TODO confirm
  # Inputs for a return-on-investment estimate.
  # NOTE(review): assumed to be a scenario-level sibling of metrics and
  # business_impact -- confirm against the consumer.
  roi_data:
    hourly_revenue_loss: 4200  # same figure as business_impact.revenue_loss_per_hour
    manual_recovery_hours: 0.75
    enterprise_recovery_hours: 0.13
    engineers_required: 2
    engineer_hourly_rate: 150
    estimated_monthly_occurrences: 3
    # NOTE(review): looks like a 0-1 fraction despite the "_percentage" name.
    enterprise_savings_percentage: 0.82
# Scenario: Java microservice memory leak causing pod restarts.
# Layout: scalar attributes (description, severity, component) followed by
# three nested mappings: metrics, business_impact and roi_data.
Kubernetes Memory Leak:
  description: "Java microservice memory leak causing pod restarts"
  severity: "HIGH"
  component: "java_payment_service"
  # Metric values describing the incident.
  metrics:
    memory_usage: 96  # presumably percent -- TODO confirm
    gc_pause_time_ms: 4500
    error_rate: 28.5  # presumably percent -- TODO confirm
    restart_frequency_per_hour: 12
    heap_fragmentation: 42  # presumably percent -- TODO confirm
  business_impact:
    revenue_loss_per_hour: 5500
    session_loss: 8500
    payment_failures_percentage: 3.2  # 0-100 scale, unlike the savings ratio below
    support_tickets_increase: 300
  # Inputs for a return-on-investment estimate.
  # NOTE(review): assumed to be a scenario-level sibling of metrics and
  # business_impact -- confirm against the consumer.
  roi_data:
    hourly_revenue_loss: 5500  # same figure as business_impact.revenue_loss_per_hour
    manual_recovery_hours: 1.5
    enterprise_recovery_hours: 0.25
    engineers_required: 3
    engineer_hourly_rate: 150
    estimated_monthly_occurrences: 1
    # NOTE(review): looks like a 0-1 fraction despite the "_percentage" name.
    enterprise_savings_percentage: 0.79
# Scenario: third-party API rate limiting causing cascading failures.
# Layout: scalar attributes (description, severity, component) followed by
# three nested mappings: metrics, business_impact and roi_data.
API Rate Limit Storm:
  description: "Third-party API rate limiting causing cascading failures"
  severity: "MEDIUM"
  component: "external_api_gateway"
  # Metric values describing the incident.
  metrics:
    rate_limit_hits_percentage: 95  # 0-100 scale, unlike the savings ratio below
    error_rate: 42.8  # presumably percent -- TODO confirm
    retry_storm: true
    cascade_effect_services: 3
    queue_backlog: 8500
  business_impact:
    revenue_loss_per_hour: 3800
    partner_sla_breach: true
    data_sync_delay_hours: 4
    customer_reports_delay_hours: 6
  # Inputs for a return-on-investment estimate.
  # NOTE(review): assumed to be a scenario-level sibling of metrics and
  # business_impact -- confirm against the consumer.
  roi_data:
    hourly_revenue_loss: 3800  # same figure as business_impact.revenue_loss_per_hour
    manual_recovery_hours: 1.25
    enterprise_recovery_hours: 0.17
    engineers_required: 3
    engineer_hourly_rate: 150
    estimated_monthly_occurrences: 4
    # NOTE(review): looks like a 0-1 fraction despite the "_percentage" name.
    enterprise_savings_percentage: 0.85
# Scenario: network partition causing split-brain in a distributed database.
# Layout: scalar attributes (description, severity, component) followed by
# three nested mappings: metrics, business_impact and roi_data.
Network Partition:
  description: "Network partition causing split-brain in distributed database"
  severity: "CRITICAL"
  component: "distributed_database"
  # Metric values describing the incident.
  metrics:
    partition_detected: true
    write_conflicts: 1250
    data_inconsistency_percentage: 8.5  # 0-100 scale, unlike the savings ratio below
    replication_lag_seconds: 45
    quorum_lost: true
  business_impact:
    revenue_loss_per_hour: 12000
    data_corruption_risk: true
    recovery_complexity: "HIGH"
    compliance_violation: true
  # Inputs for a return-on-investment estimate.
  # NOTE(review): assumed to be a scenario-level sibling of metrics and
  # business_impact -- confirm against the consumer.
  roi_data:
    hourly_revenue_loss: 12000  # same figure as business_impact.revenue_loss_per_hour
    manual_recovery_hours: 2.0
    enterprise_recovery_hours: 0.3
    engineers_required: 5
    engineer_hourly_rate: 150
    # Fractional value: consumers must accept a float here, not only an int.
    estimated_monthly_occurrences: 0.5
    # NOTE(review): looks like a 0-1 fraction despite the "_percentage" name.
    enterprise_savings_percentage: 0.88
# Scenario: storage I/O saturation causing application timeouts.
# Layout: scalar attributes (description, severity, component) followed by
# three nested mappings: metrics, business_impact and roi_data.
Storage I/O Saturation:
  description: "Storage system I/O saturation causing application timeouts"
  severity: "HIGH"
  component: "storage_cluster"
  # Metric values describing the incident.
  metrics:
    io_utilization: 98  # presumably percent -- TODO confirm
    latency_ms: 450
    throughput_mbps: 1250
    queue_depth: 850
    error_rate: 8.5  # presumably percent -- TODO confirm
  business_impact:
    revenue_loss_per_hour: 6800
    data_processing_delay_hours: 3
    analytics_backlog: true
    reporting_failure: true
  # Inputs for a return-on-investment estimate.
  # NOTE(review): assumed to be a scenario-level sibling of metrics and
  # business_impact -- confirm against the consumer.
  roi_data:
    hourly_revenue_loss: 6800  # same figure as business_impact.revenue_loss_per_hour
    manual_recovery_hours: 1.75
    enterprise_recovery_hours: 0.22
    engineers_required: 3
    engineer_hourly_rate: 150
    # Fractional value: consumers must accept a float here, not only an int.
    estimated_monthly_occurrences: 1.5
    # NOTE(review): looks like a 0-1 fraction despite the "_percentage" name.
    enterprise_savings_percentage: 0.83