Update demo/scenarios.py
Browse files- demo/scenarios.py +188 -28
demo/scenarios.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
"""
|
| 2 |
-
Incident scenarios for the demo - EXPANDED VERSION
|
|
|
|
| 3 |
"""
|
| 4 |
|
| 5 |
INCIDENT_SCENARIOS = {
|
|
@@ -28,6 +29,57 @@ INCIDENT_SCENARIOS = {
|
|
| 28 |
"engineer_hourly_rate": 150,
|
| 29 |
"estimated_monthly_occurrences": 2,
|
| 30 |
"enterprise_savings_percentage": 0.85
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
}
|
| 32 |
},
|
| 33 |
|
|
@@ -57,6 +109,42 @@ INCIDENT_SCENARIOS = {
|
|
| 57 |
"engineer_hourly_rate": 150,
|
| 58 |
"estimated_monthly_occurrences": 3,
|
| 59 |
"enterprise_savings_percentage": 0.82
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
}
|
| 61 |
},
|
| 62 |
|
|
@@ -85,34 +173,32 @@ INCIDENT_SCENARIOS = {
|
|
| 85 |
"engineer_hourly_rate": 150,
|
| 86 |
"estimated_monthly_occurrences": 1,
|
| 87 |
"enterprise_savings_percentage": 0.79
|
| 88 |
-
}
|
| 89 |
-
},
|
| 90 |
-
|
| 91 |
-
"API Rate Limit Storm": {
|
| 92 |
-
"description": "Third-party API rate limiting causing cascading failures",
|
| 93 |
-
"severity": "MEDIUM",
|
| 94 |
-
"component": "external_api_gateway",
|
| 95 |
-
"metrics": {
|
| 96 |
-
"rate_limit_hits_percentage": 95,
|
| 97 |
-
"error_rate": 42.8,
|
| 98 |
-
"retry_storm": True,
|
| 99 |
-
"cascade_effect_services": 3,
|
| 100 |
-
"queue_backlog": 8500
|
| 101 |
},
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
"
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
}
|
| 117 |
},
|
| 118 |
|
|
@@ -141,6 +227,70 @@ INCIDENT_SCENARIOS = {
|
|
| 141 |
"engineer_hourly_rate": 150,
|
| 142 |
"estimated_monthly_occurrences": 0.5,
|
| 143 |
"enterprise_savings_percentage": 0.88
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
}
|
| 145 |
},
|
| 146 |
|
|
@@ -169,6 +319,16 @@ INCIDENT_SCENARIOS = {
|
|
| 169 |
"engineer_hourly_rate": 150,
|
| 170 |
"estimated_monthly_occurrences": 1.5,
|
| 171 |
"enterprise_savings_percentage": 0.83
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
}
|
| 173 |
}
|
| 174 |
}
|
|
|
|
| 1 |
"""
|
| 2 |
+
Incident scenarios for the demo - EXPANDED VERSION WITH REALISM UPGRADES
|
| 3 |
+
Version: 3.3.9+realism
|
| 4 |
"""
|
| 5 |
|
| 6 |
INCIDENT_SCENARIOS = {
|
|
|
|
| 29 |
"engineer_hourly_rate": 150,
|
| 30 |
"estimated_monthly_occurrences": 2,
|
| 31 |
"enterprise_savings_percentage": 0.85
|
| 32 |
+
},
|
| 33 |
+
# ============ REALISM UPGRADES ============
|
| 34 |
+
"realism": {
|
| 35 |
+
"ranked_actions": [
|
| 36 |
+
{
|
| 37 |
+
"rank": 1,
|
| 38 |
+
"confidence": 87,
|
| 39 |
+
"action": "Scale Redis cluster from 3 to 5 nodes",
|
| 40 |
+
"rationale": "Immediate throughput increase, reduces contention",
|
| 41 |
+
"risk": "Cold cache amplification: Medium",
|
| 42 |
+
"tradeoff": "Adds $420/month infrastructure cost",
|
| 43 |
+
"execution_time": "8-12 minutes",
|
| 44 |
+
"success_rate": "94% based on 18 similar incidents"
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"rank": 2,
|
| 48 |
+
"confidence": 62,
|
| 49 |
+
"action": "Implement request coalescing with 500ms window",
|
| 50 |
+
"rationale": "Reduces duplicate DB queries, lower blast radius",
|
| 51 |
+
"risk": "Adds 150-200ms latency per request",
|
| 52 |
+
"tradeoff": "Slower stabilization (15-20 minutes)",
|
| 53 |
+
"rejection_note": "Secondary option if scaling unavailable"
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"rank": 3,
|
| 57 |
+
"confidence": 34,
|
| 58 |
+
"action": "Restart Redis cluster with warmup script",
|
| 59 |
+
"rationale": "Clears fragmentation, resets eviction policies",
|
| 60 |
+
"risk": "HIGH: 45-second service interruption",
|
| 61 |
+
"rejection_reason": "Rejected: High data loss risk during peak traffic",
|
| 62 |
+
"safety_override": "Required for Enterprise execution"
|
| 63 |
+
}
|
| 64 |
+
],
|
| 65 |
+
"risk_assessment": {
|
| 66 |
+
"stampede_probability": "18%",
|
| 67 |
+
"cold_cache_impact": "Medium",
|
| 68 |
+
"data_inconsistency_risk": "Low",
|
| 69 |
+
"recovery_complexity": "Medium"
|
| 70 |
+
},
|
| 71 |
+
"constraints": {
|
| 72 |
+
"max_redis_nodes": 8,
|
| 73 |
+
"scaling_cooldown": "30 minutes",
|
| 74 |
+
"concurrent_connections": "25,000",
|
| 75 |
+
"data_size_gb": 42
|
| 76 |
+
},
|
| 77 |
+
"confidence_degradation": {
|
| 78 |
+
"initial": 94,
|
| 79 |
+
"after_8_min": 71,
|
| 80 |
+
"after_15_min": 52,
|
| 81 |
+
"escalation_threshold": 60
|
| 82 |
+
}
|
| 83 |
}
|
| 84 |
},
|
| 85 |
|
|
|
|
| 109 |
"engineer_hourly_rate": 150,
|
| 110 |
"estimated_monthly_occurrences": 3,
|
| 111 |
"enterprise_savings_percentage": 0.82
|
| 112 |
+
},
|
| 113 |
+
# ============ REALISM UPGRADES ============
|
| 114 |
+
"realism": {
|
| 115 |
+
"ranked_actions": [
|
| 116 |
+
{
|
| 117 |
+
"rank": 1,
|
| 118 |
+
"confidence": 82,
|
| 119 |
+
"action": "Increase max_connections from 100 to 115 (+15%)",
|
| 120 |
+
"rationale": "Immediate relief, within safe operating limits",
|
| 121 |
+
"risk": "Disk I/O contention: Medium",
|
| 122 |
+
"constraint": "DB max_connections: 82% utilized (pre)",
|
| 123 |
+
"monitoring": "Monitor connection churn for 30 minutes"
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"rank": 2,
|
| 127 |
+
"confidence": 58,
|
| 128 |
+
"action": "Enable statement timeout (5s) + connection recycling",
|
| 129 |
+
"rationale": "Prevents runaway queries, faster pool turnover",
|
| 130 |
+
"risk": "Query cancellation may cause application errors",
|
| 131 |
+
"tradeoff": "Adds development/testing overhead"
|
| 132 |
+
},
|
| 133 |
+
{
|
| 134 |
+
"rank": 3,
|
| 135 |
+
"confidence": 29,
|
| 136 |
+
"action": "Switch to pgbouncer in transaction pooling mode",
|
| 137 |
+
"rationale": "10x connection multiplexing possible",
|
| 138 |
+
"risk": "HIGH: Requires application changes, 2-hour migration",
|
| 139 |
+
"rejection_reason": "Rejected: Too invasive for incident response"
|
| 140 |
+
}
|
| 141 |
+
],
|
| 142 |
+
"constraint_awareness": {
|
| 143 |
+
"disk_io_headroom": "Low",
|
| 144 |
+
"memory_available_gb": 8.2,
|
| 145 |
+
"pool_increase_cap": "+15%",
|
| 146 |
+
"monitoring_gap": "Connection churn not tracked"
|
| 147 |
+
}
|
| 148 |
}
|
| 149 |
},
|
| 150 |
|
|
|
|
| 173 |
"engineer_hourly_rate": 150,
|
| 174 |
"estimated_monthly_occurrences": 1,
|
| 175 |
"enterprise_savings_percentage": 0.79
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
},
|
| 177 |
+
# ============ REALISM UPGRADES ============
|
| 178 |
+
"realism": {
|
| 179 |
+
"ranked_actions": [
|
| 180 |
+
{
|
| 181 |
+
"rank": 1,
|
| 182 |
+
"confidence": 76,
|
| 183 |
+
"action": "Canary restart (1/4 pods) with heap dump analysis",
|
| 184 |
+
"rationale": "Minimizes blast radius, enables root cause capture",
|
| 185 |
+
"risk": "Cold-start latency: +2.3s per pod",
|
| 186 |
+
"blast_radius_economics": {
|
| 187 |
+
"canary_restart_cost": "$850",
|
| 188 |
+
"full_restart_cost": "$3,400",
|
| 189 |
+
"payment_retry_risk": "Medium",
|
| 190 |
+
"safer_order": "Canary → scale → rollout"
|
| 191 |
+
}
|
| 192 |
+
},
|
| 193 |
+
{
|
| 194 |
+
"rank": 2,
|
| 195 |
+
"confidence": 63,
|
| 196 |
+
"action": "Increase heap from 2GB to 3GB with monitoring",
|
| 197 |
+
"rationale": "Buy time for analysis, reduces restart frequency",
|
| 198 |
+
"risk": "Delays root cause identification",
|
| 199 |
+
"tradeoff": "Temporary fix, adds memory cost"
|
| 200 |
+
}
|
| 201 |
+
]
|
| 202 |
}
|
| 203 |
},
|
| 204 |
|
|
|
|
| 227 |
"engineer_hourly_rate": 150,
|
| 228 |
"estimated_monthly_occurrences": 0.5,
|
| 229 |
"enterprise_savings_percentage": 0.88
|
| 230 |
+
},
|
| 231 |
+
# ============ REALISM UPGRADES ============
|
| 232 |
+
"realism": {
|
| 233 |
+
"competing_hypotheses": [
|
| 234 |
+
{
|
| 235 |
+
"cause": "Network partition (control plane)",
|
| 236 |
+
"confidence": 61,
|
| 237 |
+
"evidence": "Quorum lost, replication lag > 30s",
|
| 238 |
+
"investigation_path": "Check network mesh, BGP status"
|
| 239 |
+
},
|
| 240 |
+
{
|
| 241 |
+
"cause": "Control plane overload",
|
| 242 |
+
"confidence": 24,
|
| 243 |
+
"evidence": "High CPU on orchestration nodes",
|
| 244 |
+
"investigation_path": "Scale control plane, check etcd health"
|
| 245 |
+
},
|
| 246 |
+
{
|
| 247 |
+
"cause": "Downstream timeout amplification",
|
| 248 |
+
"confidence": 15,
|
| 249 |
+
"evidence": "Cascading failures in 3 dependent services",
|
| 250 |
+
"investigation_path": "Implement circuit breakers"
|
| 251 |
+
}
|
| 252 |
+
]
|
| 253 |
+
}
|
| 254 |
+
},
|
| 255 |
+
|
| 256 |
+
"API Rate Limit Storm": {
|
| 257 |
+
"description": "Third-party API rate limiting causing cascading failures",
|
| 258 |
+
"severity": "MEDIUM",
|
| 259 |
+
"component": "external_api_gateway",
|
| 260 |
+
"metrics": {
|
| 261 |
+
"rate_limit_hits_percentage": 95,
|
| 262 |
+
"error_rate": 42.8,
|
| 263 |
+
"retry_storm": True,
|
| 264 |
+
"cascade_effect_services": 3,
|
| 265 |
+
"queue_backlog": 8500
|
| 266 |
+
},
|
| 267 |
+
"business_impact": {
|
| 268 |
+
"revenue_loss_per_hour": 3800,
|
| 269 |
+
"partner_sla_breach": True,
|
| 270 |
+
"data_sync_delay_hours": 4,
|
| 271 |
+
"customer_reports_delay_hours": 6
|
| 272 |
+
},
|
| 273 |
+
"roi_data": {
|
| 274 |
+
"hourly_revenue_loss": 3800,
|
| 275 |
+
"manual_recovery_hours": 1.25,
|
| 276 |
+
"enterprise_recovery_hours": 0.17,
|
| 277 |
+
"engineers_required": 3,
|
| 278 |
+
"engineer_hourly_rate": 150,
|
| 279 |
+
"estimated_monthly_occurrences": 4,
|
| 280 |
+
"enterprise_savings_percentage": 0.85
|
| 281 |
+
},
|
| 282 |
+
# ============ REALISM UPGRADES ============
|
| 283 |
+
"realism": {
|
| 284 |
+
"contract_aware_reasoning": {
|
| 285 |
+
"burst_limit": "1.2× allowed",
|
| 286 |
+
"penalty_window": "15 minutes",
|
| 287 |
+
"degradation_mode": "Non-premium users only",
|
| 288 |
+
"contractual_limits": {
|
| 289 |
+
"requests_per_second": 100,
|
| 290 |
+
"monthly_overage_fee": "$0.15/request",
|
| 291 |
+
"suspension_threshold": "3 violations/month"
|
| 292 |
+
}
|
| 293 |
+
}
|
| 294 |
}
|
| 295 |
},
|
| 296 |
|
|
|
|
| 319 |
"engineer_hourly_rate": 150,
|
| 320 |
"estimated_monthly_occurrences": 1.5,
|
| 321 |
"enterprise_savings_percentage": 0.83
|
| 322 |
+
},
|
| 323 |
+
# ============ REALISM UPGRADES ============
|
| 324 |
+
"realism": {
|
| 325 |
+
"irreversibility_warnings": {
|
| 326 |
+
"rebalance_duration": "18-25 minutes",
|
| 327 |
+
"write_amplification_risk": "High",
|
| 328 |
+
"requires_explicit_approval": True,
|
| 329 |
+
"approval_level": "Director+",
|
| 330 |
+
"rollback_complexity": "High (requires snapshot restore)"
|
| 331 |
+
}
|
| 332 |
}
|
| 333 |
}
|
| 334 |
}
|