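# NOTE: the following lines are page chrome captured from a web file viewer,
# not module code; they are kept here as comments so the file parses.
#   petter2025's picture
#   Update demo/scenarios.py
#   fde25b3 verified
#   raw
#   history blame
#   13.1 kB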
"""
Incident scenarios for the demo - EXPANDED VERSION WITH REALISM UPGRADES
Version: 3.3.9+realism
"""
# Each scenario is written as its own private constant and then registered in
# INCIDENT_SCENARIOS under its display name (registration order is preserved).
#
# Every scenario has the same top-level keys, in this order:
#   description, severity, component, metrics, business_impact, roi_data, realism
# "roi_data" has an identical set of keys in every scenario, and its
# "hourly_revenue_loss" repeats business_impact["revenue_loss_per_hour"].
# The contents of "metrics", "business_impact" and "realism" differ per scenario.

# --- Cache Miss Storm -------------------------------------------------------
_CACHE_MISS_STORM = {
    "description": "Redis cluster experiencing 80% cache miss rate causing database overload",
    "severity": "CRITICAL",
    "component": "redis_cache",
    "metrics": {
        "cache_hit_rate": 18.5,
        "database_load": 92,
        "response_time_ms": 1850,
        "affected_users": 45000,
        "eviction_rate_per_sec": 125,
    },
    "business_impact": {
        "revenue_loss_per_hour": 8500,
        "sla_violation": True,
        "customer_sat_change": -40,
        "affected_services": ["API Gateway", "User Service", "Payment"],
    },
    "roi_data": {
        "hourly_revenue_loss": 8500,
        "manual_recovery_hours": 1.0,
        "enterprise_recovery_hours": 0.2,
        "engineers_required": 4,
        "engineer_hourly_rate": 150,
        "estimated_monthly_occurrences": 2,
        "enterprise_savings_percentage": 0.85,
    },
    # Realism data: ranked remediation options plus risk, constraint and
    # confidence-over-time figures.
    "realism": {
        "ranked_actions": [
            {
                "rank": 1,
                "confidence": 87,
                "action": "Scale Redis cluster from 3 to 5 nodes",
                "rationale": "Immediate throughput increase, reduces contention",
                "risk": "Cold cache amplification: Medium",
                "tradeoff": "Adds $420/month infrastructure cost",
                "execution_time": "8-12 minutes",
                "success_rate": "94% based on 18 similar incidents",
            },
            {
                "rank": 2,
                "confidence": 62,
                "action": "Implement request coalescing with 500ms window",
                "rationale": "Reduces duplicate DB queries, lower blast radius",
                "risk": "Adds 150-200ms latency per request",
                "tradeoff": "Slower stabilization (15-20 minutes)",
                "rejection_note": "Secondary option if scaling unavailable",
            },
            {
                "rank": 3,
                "confidence": 34,
                "action": "Restart Redis cluster with warmup script",
                "rationale": "Clears fragmentation, resets eviction policies",
                "risk": "HIGH: 45-second service interruption",
                "rejection_reason": "Rejected: High data loss risk during peak traffic",
                "safety_override": "Required for Enterprise execution",
            },
        ],
        "risk_assessment": {
            "stampede_probability": "18%",
            "cold_cache_impact": "Medium",
            "data_inconsistency_risk": "Low",
            "recovery_complexity": "Medium",
        },
        "constraints": {
            "max_redis_nodes": 8,
            "scaling_cooldown": "30 minutes",
            "concurrent_connections": "25,000",
            "data_size_gb": 42,
        },
        "confidence_degradation": {
            "initial": 94,
            "after_8_min": 71,
            "after_15_min": 52,
            "escalation_threshold": 60,
        },
    },
}

# --- Database Connection Pool Exhaustion ------------------------------------
_DB_CONNECTION_POOL_EXHAUSTION = {
    "description": "PostgreSQL connection pool exhausted causing API timeouts",
    "severity": "HIGH",
    "component": "postgresql_database",
    "metrics": {
        "active_connections": 98,
        "max_connections": 100,
        "api_latency_ms": 2450,
        "error_rate": 15.2,
        "queue_depth": 1250,
        "connection_wait_seconds": 45,
    },
    "business_impact": {
        "revenue_loss_per_hour": 4200,
        "affected_services": ["API Gateway", "User Service", "Payment Service"],
        "sla_violation": True,
        "partner_api_impact": 3,
    },
    "roi_data": {
        "hourly_revenue_loss": 4200,
        "manual_recovery_hours": 0.75,
        "enterprise_recovery_hours": 0.13,
        "engineers_required": 2,
        "engineer_hourly_rate": 150,
        "estimated_monthly_occurrences": 3,
        "enterprise_savings_percentage": 0.82,
    },
    # Realism data: ranked remediation options plus the operating constraints
    # they were weighed against.
    "realism": {
        "ranked_actions": [
            {
                "rank": 1,
                "confidence": 82,
                "action": "Increase max_connections from 100 to 115 (+15%)",
                "rationale": "Immediate relief, within safe operating limits",
                "risk": "Disk I/O contention: Medium",
                # NOTE(review): "metrics" above lists 98 of 100 connections in
                # use; confirm what the 82% figure below is meant to describe.
                "constraint": "DB max_connections: 82% utilized (pre)",
                "monitoring": "Monitor connection churn for 30 minutes",
            },
            {
                "rank": 2,
                "confidence": 58,
                "action": "Enable statement timeout (5s) + connection recycling",
                "rationale": "Prevents runaway queries, faster pool turnover",
                "risk": "Query cancellation may cause application errors",
                "tradeoff": "Adds development/testing overhead",
            },
            {
                "rank": 3,
                "confidence": 29,
                "action": "Switch to pgbouncer in transaction pooling mode",
                "rationale": "10x connection multiplexing possible",
                "risk": "HIGH: Requires application changes, 2-hour migration",
                "rejection_reason": "Rejected: Too invasive for incident response",
            },
        ],
        "constraint_awareness": {
            "disk_io_headroom": "Low",
            "memory_available_gb": 8.2,
            "pool_increase_cap": "+15%",
            "monitoring_gap": "Connection churn not tracked",
        },
    },
}

# --- Kubernetes Memory Leak -------------------------------------------------
_KUBERNETES_MEMORY_LEAK = {
    "description": "Java microservice memory leak causing pod restarts",
    "severity": "HIGH",
    "component": "java_payment_service",
    "metrics": {
        "memory_usage": 96,
        "gc_pause_time_ms": 4500,
        "error_rate": 28.5,
        "restart_frequency_per_hour": 12,
        "heap_fragmentation": 42,
    },
    "business_impact": {
        "revenue_loss_per_hour": 5500,
        "session_loss": 8500,
        "payment_failures_percentage": 3.2,
        "support_tickets_increase": 300,
    },
    "roi_data": {
        "hourly_revenue_loss": 5500,
        "manual_recovery_hours": 1.5,
        "enterprise_recovery_hours": 0.25,
        "engineers_required": 3,
        "engineer_hourly_rate": 150,
        "estimated_monthly_occurrences": 1,
        "enterprise_savings_percentage": 0.79,
    },
    # Realism data: two ranked options; the first also carries a cost
    # comparison between a canary restart and a full restart.
    "realism": {
        "ranked_actions": [
            {
                "rank": 1,
                "confidence": 76,
                "action": "Canary restart (1/4 pods) with heap dump analysis",
                "rationale": "Minimizes blast radius, enables root cause capture",
                "risk": "Cold-start latency: +2.3s per pod",
                "blast_radius_economics": {
                    "canary_restart_cost": "$850",
                    "full_restart_cost": "$3,400",
                    "payment_retry_risk": "Medium",
                    "safer_order": "Canary → scale → rollout",
                },
            },
            {
                "rank": 2,
                "confidence": 63,
                "action": "Increase heap from 2GB to 3GB with monitoring",
                "rationale": "Buy time for analysis, reduces restart frequency",
                "risk": "Delays root cause identification",
                "tradeoff": "Temporary fix, adds memory cost",
            },
        ],
    },
}

# --- Network Partition ------------------------------------------------------
_NETWORK_PARTITION = {
    "description": "Network partition causing split-brain in distributed database",
    "severity": "CRITICAL",
    "component": "distributed_database",
    "metrics": {
        "partition_detected": True,
        "write_conflicts": 1250,
        "data_inconsistency_percentage": 8.5,
        "replication_lag_seconds": 45,
        "quorum_lost": True,
    },
    "business_impact": {
        "revenue_loss_per_hour": 12000,
        "data_corruption_risk": True,
        "recovery_complexity": "HIGH",
        "compliance_violation": True,
    },
    "roi_data": {
        "hourly_revenue_loss": 12000,
        "manual_recovery_hours": 2.0,
        "enterprise_recovery_hours": 0.3,
        "engineers_required": 5,
        "engineer_hourly_rate": 150,
        "estimated_monthly_occurrences": 0.5,
        "enterprise_savings_percentage": 0.88,
    },
    # Realism data: three candidate root causes; their confidence values
    # (61, 24, 15) add up to 100.
    "realism": {
        "competing_hypotheses": [
            {
                "cause": "Network partition (control plane)",
                "confidence": 61,
                "evidence": "Quorum lost, replication lag > 30s",
                "investigation_path": "Check network mesh, BGP status",
            },
            {
                "cause": "Control plane overload",
                "confidence": 24,
                "evidence": "High CPU on orchestration nodes",
                "investigation_path": "Scale control plane, check etcd health",
            },
            {
                "cause": "Downstream timeout amplification",
                "confidence": 15,
                "evidence": "Cascading failures in 3 dependent services",
                "investigation_path": "Implement circuit breakers",
            },
        ],
    },
}

# --- API Rate Limit Storm ---------------------------------------------------
_API_RATE_LIMIT_STORM = {
    "description": "Third-party API rate limiting causing cascading failures",
    "severity": "MEDIUM",
    "component": "external_api_gateway",
    "metrics": {
        "rate_limit_hits_percentage": 95,
        "error_rate": 42.8,
        "retry_storm": True,
        "cascade_effect_services": 3,
        "queue_backlog": 8500,
    },
    "business_impact": {
        "revenue_loss_per_hour": 3800,
        "partner_sla_breach": True,
        "data_sync_delay_hours": 4,
        "customer_reports_delay_hours": 6,
    },
    "roi_data": {
        "hourly_revenue_loss": 3800,
        "manual_recovery_hours": 1.25,
        "enterprise_recovery_hours": 0.17,
        "engineers_required": 3,
        "engineer_hourly_rate": 150,
        "estimated_monthly_occurrences": 4,
        "enterprise_savings_percentage": 0.85,
    },
    # Realism data: rate-limit contract terms attached to the scenario.
    "realism": {
        "contract_aware_reasoning": {
            "burst_limit": "1.2× allowed",
            "penalty_window": "15 minutes",
            "degradation_mode": "Non-premium users only",
            "contractual_limits": {
                "requests_per_second": 100,
                "monthly_overage_fee": "$0.15/request",
                "suspension_threshold": "3 violations/month",
            },
        },
    },
}

# --- Storage I/O Saturation -------------------------------------------------
_STORAGE_IO_SATURATION = {
    "description": "Storage system I/O saturation causing application timeouts",
    "severity": "HIGH",
    "component": "storage_cluster",
    "metrics": {
        "io_utilization": 98,
        "latency_ms": 450,
        "throughput_mbps": 1250,
        "queue_depth": 850,
        "error_rate": 8.5,
    },
    "business_impact": {
        "revenue_loss_per_hour": 6800,
        "data_processing_delay_hours": 3,
        "analytics_backlog": True,
        "reporting_failure": True,
    },
    "roi_data": {
        "hourly_revenue_loss": 6800,
        "manual_recovery_hours": 1.75,
        "enterprise_recovery_hours": 0.22,
        "engineers_required": 3,
        "engineer_hourly_rate": 150,
        "estimated_monthly_occurrences": 1.5,
        "enterprise_savings_percentage": 0.83,
    },
    # Realism data: warnings and approval requirements for the remediation.
    "realism": {
        "irreversibility_warnings": {
            "rebalance_duration": "18-25 minutes",
            "write_amplification_risk": "High",
            "requires_explicit_approval": True,
            "approval_level": "Director+",
            "rollback_complexity": "High (requires snapshot restore)",
        },
    },
}

# Public registry: scenario display name -> scenario definition.
INCIDENT_SCENARIOS = {
    "Cache Miss Storm": _CACHE_MISS_STORM,
    "Database Connection Pool Exhaustion": _DB_CONNECTION_POOL_EXHAUSTION,
    "Kubernetes Memory Leak": _KUBERNETES_MEMORY_LEAK,
    "Network Partition": _NETWORK_PARTITION,
    "API Rate Limit Storm": _API_RATE_LIMIT_STORM,
    "Storage I/O Saturation": _STORAGE_IO_SATURATION,
}