petter2025 committed on
Commit
fde25b3
·
verified ·
1 Parent(s): 2eed631

Update demo/scenarios.py

Browse files
Files changed (1) hide show
  1. demo/scenarios.py +188 -28
demo/scenarios.py CHANGED
@@ -1,5 +1,6 @@
1
  """
2
- Incident scenarios for the demo - EXPANDED VERSION
 
3
  """
4
 
5
  INCIDENT_SCENARIOS = {
@@ -28,6 +29,57 @@ INCIDENT_SCENARIOS = {
28
  "engineer_hourly_rate": 150,
29
  "estimated_monthly_occurrences": 2,
30
  "enterprise_savings_percentage": 0.85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  }
32
  },
33
 
@@ -57,6 +109,42 @@ INCIDENT_SCENARIOS = {
57
  "engineer_hourly_rate": 150,
58
  "estimated_monthly_occurrences": 3,
59
  "enterprise_savings_percentage": 0.82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  }
61
  },
62
 
@@ -85,34 +173,32 @@ INCIDENT_SCENARIOS = {
85
  "engineer_hourly_rate": 150,
86
  "estimated_monthly_occurrences": 1,
87
  "enterprise_savings_percentage": 0.79
88
- }
89
- },
90
-
91
- "API Rate Limit Storm": {
92
- "description": "Third-party API rate limiting causing cascading failures",
93
- "severity": "MEDIUM",
94
- "component": "external_api_gateway",
95
- "metrics": {
96
- "rate_limit_hits_percentage": 95,
97
- "error_rate": 42.8,
98
- "retry_storm": True,
99
- "cascade_effect_services": 3,
100
- "queue_backlog": 8500
101
  },
102
- "business_impact": {
103
- "revenue_loss_per_hour": 3800,
104
- "partner_sla_breach": True,
105
- "data_sync_delay_hours": 4,
106
- "customer_reports_delay_hours": 6
107
- },
108
- "roi_data": {
109
- "hourly_revenue_loss": 3800,
110
- "manual_recovery_hours": 1.25,
111
- "enterprise_recovery_hours": 0.17,
112
- "engineers_required": 3,
113
- "engineer_hourly_rate": 150,
114
- "estimated_monthly_occurrences": 4,
115
- "enterprise_savings_percentage": 0.85
 
 
 
 
 
 
 
 
 
 
 
116
  }
117
  },
118
 
@@ -141,6 +227,70 @@ INCIDENT_SCENARIOS = {
141
  "engineer_hourly_rate": 150,
142
  "estimated_monthly_occurrences": 0.5,
143
  "enterprise_savings_percentage": 0.88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  }
145
  },
146
 
@@ -169,6 +319,16 @@ INCIDENT_SCENARIOS = {
169
  "engineer_hourly_rate": 150,
170
  "estimated_monthly_occurrences": 1.5,
171
  "enterprise_savings_percentage": 0.83
 
 
 
 
 
 
 
 
 
 
172
  }
173
  }
174
  }
 
1
  """
2
+ Incident scenarios for the demo - EXPANDED VERSION WITH REALISM UPGRADES
3
+ Version: 3.3.9+realism
4
  """
5
 
6
  INCIDENT_SCENARIOS = {
 
29
  "engineer_hourly_rate": 150,
30
  "estimated_monthly_occurrences": 2,
31
  "enterprise_savings_percentage": 0.85
32
+ },
33
+ # ============ REALISM UPGRADES ============
34
+ "realism": {
35
+ "ranked_actions": [
36
+ {
37
+ "rank": 1,
38
+ "confidence": 87,
39
+ "action": "Scale Redis cluster from 3 to 5 nodes",
40
+ "rationale": "Immediate throughput increase, reduces contention",
41
+ "risk": "Cold cache amplification: Medium",
42
+ "tradeoff": "Adds $420/month infrastructure cost",
43
+ "execution_time": "8-12 minutes",
44
+ "success_rate": "94% based on 18 similar incidents"
45
+ },
46
+ {
47
+ "rank": 2,
48
+ "confidence": 62,
49
+ "action": "Implement request coalescing with 500ms window",
50
+ "rationale": "Reduces duplicate DB queries, lower blast radius",
51
+ "risk": "Adds 150-200ms latency per request",
52
+ "tradeoff": "Slower stabilization (15-20 minutes)",
53
+ "rejection_note": "Secondary option if scaling unavailable"
54
+ },
55
+ {
56
+ "rank": 3,
57
+ "confidence": 34,
58
+ "action": "Restart Redis cluster with warmup script",
59
+ "rationale": "Clears fragmentation, resets eviction policies",
60
+ "risk": "HIGH: 45-second service interruption",
61
+ "rejection_reason": "Rejected: High data loss risk during peak traffic",
62
+ "safety_override": "Required for Enterprise execution"
63
+ }
64
+ ],
65
+ "risk_assessment": {
66
+ "stampede_probability": "18%",
67
+ "cold_cache_impact": "Medium",
68
+ "data_inconsistency_risk": "Low",
69
+ "recovery_complexity": "Medium"
70
+ },
71
+ "constraints": {
72
+ "max_redis_nodes": 8,
73
+ "scaling_cooldown": "30 minutes",
74
+ "concurrent_connections": "25,000",
75
+ "data_size_gb": 42
76
+ },
77
+ "confidence_degradation": {
78
+ "initial": 94,
79
+ "after_8_min": 71,
80
+ "after_15_min": 52,
81
+ "escalation_threshold": 60
82
+ }
83
  }
84
  },
85
 
 
109
  "engineer_hourly_rate": 150,
110
  "estimated_monthly_occurrences": 3,
111
  "enterprise_savings_percentage": 0.82
112
+ },
113
+ # ============ REALISM UPGRADES ============
114
+ "realism": {
115
+ "ranked_actions": [
116
+ {
117
+ "rank": 1,
118
+ "confidence": 82,
119
+ "action": "Increase max_connections from 100 to 115 (+15%)",
120
+ "rationale": "Immediate relief, within safe operating limits",
121
+ "risk": "Disk I/O contention: Medium",
122
+ "constraint": "DB max_connections: 82% utilized (pre)",
123
+ "monitoring": "Monitor connection churn for 30 minutes"
124
+ },
125
+ {
126
+ "rank": 2,
127
+ "confidence": 58,
128
+ "action": "Enable statement timeout (5s) + connection recycling",
129
+ "rationale": "Prevents runaway queries, faster pool turnover",
130
+ "risk": "Query cancellation may cause application errors",
131
+ "tradeoff": "Adds development/testing overhead"
132
+ },
133
+ {
134
+ "rank": 3,
135
+ "confidence": 29,
136
+ "action": "Switch to pgbouncer in transaction pooling mode",
137
+ "rationale": "10x connection multiplexing possible",
138
+ "risk": "HIGH: Requires application changes, 2-hour migration",
139
+ "rejection_reason": "Rejected: Too invasive for incident response"
140
+ }
141
+ ],
142
+ "constraint_awareness": {
143
+ "disk_io_headroom": "Low",
144
+ "memory_available_gb": 8.2,
145
+ "pool_increase_cap": "+15%",
146
+ "monitoring_gap": "Connection churn not tracked"
147
+ }
148
  }
149
  },
150
 
 
173
  "engineer_hourly_rate": 150,
174
  "estimated_monthly_occurrences": 1,
175
  "enterprise_savings_percentage": 0.79
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  },
177
+ # ============ REALISM UPGRADES ============
178
+ "realism": {
179
+ "ranked_actions": [
180
+ {
181
+ "rank": 1,
182
+ "confidence": 76,
183
+ "action": "Canary restart (1/4 pods) with heap dump analysis",
184
+ "rationale": "Minimizes blast radius, enables root cause capture",
185
+ "risk": "Cold-start latency: +2.3s per pod",
186
+ "blast_radius_economics": {
187
+ "canary_restart_cost": "$850",
188
+ "full_restart_cost": "$3,400",
189
+ "payment_retry_risk": "Medium",
190
+ "safer_order": "Canary → scale → rollout"
191
+ }
192
+ },
193
+ {
194
+ "rank": 2,
195
+ "confidence": 63,
196
+ "action": "Increase heap from 2GB to 3GB with monitoring",
197
+ "rationale": "Buy time for analysis, reduces restart frequency",
198
+ "risk": "Delays root cause identification",
199
+ "tradeoff": "Temporary fix, adds memory cost"
200
+ }
201
+ ]
202
  }
203
  },
204
 
 
227
  "engineer_hourly_rate": 150,
228
  "estimated_monthly_occurrences": 0.5,
229
  "enterprise_savings_percentage": 0.88
230
+ },
231
+ # ============ REALISM UPGRADES ============
232
+ "realism": {
233
+ "competing_hypotheses": [
234
+ {
235
+ "cause": "Network partition (control plane)",
236
+ "confidence": 61,
237
+ "evidence": "Quorum lost, replication lag > 30s",
238
+ "investigation_path": "Check network mesh, BGP status"
239
+ },
240
+ {
241
+ "cause": "Control plane overload",
242
+ "confidence": 24,
243
+ "evidence": "High CPU on orchestration nodes",
244
+ "investigation_path": "Scale control plane, check etcd health"
245
+ },
246
+ {
247
+ "cause": "Downstream timeout amplification",
248
+ "confidence": 15,
249
+ "evidence": "Cascading failures in 3 dependent services",
250
+ "investigation_path": "Implement circuit breakers"
251
+ }
252
+ ]
253
+ }
254
+ },
255
+
256
+ "API Rate Limit Storm": {
257
+ "description": "Third-party API rate limiting causing cascading failures",
258
+ "severity": "MEDIUM",
259
+ "component": "external_api_gateway",
260
+ "metrics": {
261
+ "rate_limit_hits_percentage": 95,
262
+ "error_rate": 42.8,
263
+ "retry_storm": True,
264
+ "cascade_effect_services": 3,
265
+ "queue_backlog": 8500
266
+ },
267
+ "business_impact": {
268
+ "revenue_loss_per_hour": 3800,
269
+ "partner_sla_breach": True,
270
+ "data_sync_delay_hours": 4,
271
+ "customer_reports_delay_hours": 6
272
+ },
273
+ "roi_data": {
274
+ "hourly_revenue_loss": 3800,
275
+ "manual_recovery_hours": 1.25,
276
+ "enterprise_recovery_hours": 0.17,
277
+ "engineers_required": 3,
278
+ "engineer_hourly_rate": 150,
279
+ "estimated_monthly_occurrences": 4,
280
+ "enterprise_savings_percentage": 0.85
281
+ },
282
+ # ============ REALISM UPGRADES ============
283
+ "realism": {
284
+ "contract_aware_reasoning": {
285
+ "burst_limit": "1.2× allowed",
286
+ "penalty_window": "15 minutes",
287
+ "degradation_mode": "Non-premium users only",
288
+ "contractual_limits": {
289
+ "requests_per_second": 100,
290
+ "monthly_overage_fee": "$0.15/request",
291
+ "suspension_threshold": "3 violations/month"
292
+ }
293
+ }
294
  }
295
  },
296
 
 
319
  "engineer_hourly_rate": 150,
320
  "estimated_monthly_occurrences": 1.5,
321
  "enterprise_savings_percentage": 0.83
322
+ },
323
+ # ============ REALISM UPGRADES ============
324
+ "realism": {
325
+ "irreversibility_warnings": {
326
+ "rebalance_duration": "18-25 minutes",
327
+ "write_amplification_risk": "High",
328
+ "requires_explicit_approval": True,
329
+ "approval_level": "Director+",
330
+ "rollback_complexity": "High (requires snapshot restore)"
331
+ }
332
  }
333
  }
334
  }