petter2025 committed on
Commit
03cfe7a
·
verified ·
1 Parent(s): 0d80e53

Delete demo

Browse files
Files changed (5) hide show
  1. demo/__init__.py +0 -9
  2. demo/guidance.py +0 -549
  3. demo/mock_arf.py +0 -668
  4. demo/orchestrator.py +0 -98
  5. demo/scenarios.py +0 -334
demo/__init__.py DELETED
@@ -1,9 +0,0 @@
1
- """
2
- ARF Demo Package
3
- """
4
-
5
- from .orchestrator import DemoOrchestrator
6
- from .scenarios import INCIDENT_SCENARIOS
7
-
8
- __all__ = ['DemoOrchestrator', 'INCIDENT_SCENARIOS']
9
- __version__ = '3.8.0'
 
 
 
 
 
 
 
 
 
 
demo/guidance.py DELETED
@@ -1,549 +0,0 @@
1
- """
2
- Enhanced Demo Guidance System - Manages the psychology and flow of the ARF demo
3
- Adds clear narrative phases and boundary awareness
4
- """
5
-
6
- from enum import Enum
7
- from typing import Dict, List, Any, Optional
8
- from dataclasses import dataclass
9
- import time
10
-
11
- class DemoPhase(Enum):
12
- """Phases of the demo narrative with clear boundaries"""
13
- INTRODUCTION = "introduction"
14
- FAILURE_INJECTION = "failure_injection"
15
- REAL_OSS_ANALYSIS = "real_oss_analysis"
16
- DECISION_BOUNDARY = "decision_boundary"
17
- SIMULATED_ENTERPRISE = "simulated_enterprise"
18
- RESOLUTION = "resolution"
19
- ARCHITECTURE_REVIEW = "architecture_review"
20
-
21
- @dataclass
22
- class PhaseContent:
23
- """Enhanced content for each demo phase with boundary indicators"""
24
- phase: DemoPhase
25
- title: str
26
- narrative: str
27
- key_message: str
28
- visual_cue: str
29
- duration_seconds: int
30
- show_boundary: bool = False
31
- boundary_text: Optional[str] = None
32
- is_real_arf: bool = False
33
-
34
- def get_html(self, show_progress: bool = True, current_step: int = 1, total_steps: int = 7) -> str:
35
- """Get HTML for this phase with progress indicator"""
36
- # Progress indicator
37
- progress_html = ""
38
- if show_progress:
39
- progress_percentage = int((current_step / total_steps) * 100)
40
- progress_html = f"""
41
- <div style="margin-bottom: 20px;">
42
- <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
43
- <div style="font-size: 13px; color: #64748b; font-weight: 500;">
44
- Demo Progress: Step {current_step} of {total_steps}
45
- </div>
46
- <div style="font-size: 13px; color: #3b82f6; font-weight: 600;">
47
- {progress_percentage}%
48
- </div>
49
- </div>
50
- <div style="height: 6px; background: #e2e8f0; border-radius: 3px; overflow: hidden;">
51
- <div style="width: {progress_percentage}%; height: 100%;
52
- background: linear-gradient(90deg, #3b82f6, #8b5cf6);
53
- border-radius: 3px; transition: width 0.3s ease;">
54
- </div>
55
- </div>
56
- </div>
57
- """
58
-
59
- # Real ARF indicator
60
- real_arf_html = ""
61
- if self.is_real_arf:
62
- real_arf_html = f"""
63
- <div style="margin: 15px 0; padding: 10px; background: #f0fdf4;
64
- border-radius: 8px; border: 2px solid #10b981;">
65
- <div style="display: flex; align-items: center; gap: 8px;">
66
- <div style="font-size: 20px;">✅</div>
67
- <div style="font-weight: 600; color: #065f46;">REAL ARF OSS v3.3.7</div>
68
- </div>
69
- <div style="font-size: 13px; color: #047857; margin-top: 5px;">
70
- Running actual agentic-reliability-framework==3.3.7 package
71
- </div>
72
- </div>
73
- """
74
-
75
- # Boundary indicator
76
- boundary_html = ""
77
- if self.show_boundary and self.boundary_text:
78
- boundary_html = f"""
79
- <div style="margin: 15px 0; padding: 12px; background: #fef3c7;
80
- border-radius: 10px; border-left: 4px solid #f59e0b;">
81
- <div style="display: flex; align-items: center; gap: 8px; margin-bottom: 5px;">
82
- <div style="font-size: 20px;">🎭</div>
83
- <div style="font-weight: 600; color: #92400e;">Demo Boundary</div>
84
- </div>
85
- <div style="font-size: 13px; color: #b45309; line-height: 1.5;">
86
- {self.boundary_text}
87
- </div>
88
- </div>
89
- """
90
-
91
- return f"""
92
- <div style="border: 2px solid #3b82f6; border-radius: 16px; padding: 25px;
93
- background: linear-gradient(135deg, #f8fafc 0%, #ffffff 100%);
94
- box-shadow: 0 8px 32px rgba(59, 130, 246, 0.1); margin: 20px 0;">
95
- <div style="display: flex; align-items: center; gap: 15px; margin-bottom: 20px;">
96
- <div style="font-size: 36px;">{self.visual_cue}</div>
97
- <div>
98
- <h3 style="margin: 0 0 5px 0; color: #1e293b; font-size: 20px; font-weight: 700;">
99
- {self.title}
100
- </h3>
101
- <div style="font-size: 14px; color: #64748b;">
102
- Phase: {self.phase.value.replace('_', ' ').title()}
103
- </div>
104
- </div>
105
- </div>
106
-
107
- {progress_html}
108
-
109
- <div style="margin-bottom: 20px;">
110
- <div style="font-size: 16px; color: #475569; line-height: 1.6; margin-bottom: 15px;">
111
- {self.narrative}
112
- </div>
113
-
114
- {real_arf_html}
115
- {boundary_html}
116
-
117
- <div style="padding: 15px; background: #f1f5f9; border-radius: 10px;
118
- border-left: 4px solid #3b82f6;">
119
- <div style="font-weight: 600; color: #1e293b; margin-bottom: 5px;">
120
- 🎯 Key Message
121
- </div>
122
- <div style="font-size: 15px; color: #475569; line-height: 1.5;">
123
- {self.key_message}
124
- </div>
125
- </div>
126
- </div>
127
-
128
- <div style="display: flex; justify-content: space-between; align-items: center;
129
- margin-top: 20px; padding-top: 15px; border-top: 1px solid #e2e8f0;">
130
- <div style="font-size: 12px; color: #94a3b8;">
131
- ⏱️ Duration: {self.duration_seconds}s •
132
- 🎯 Focus: {self.phase.value.replace('_', ' ').title()}
133
- </div>
134
- <div style="display: flex; gap: 10px;">
135
- <div style="padding: 4px 10px; background: #e2e8f0;
136
- color: #64748b; border-radius: 12px; font-size: 11px; font-weight: 500;">
137
- Phase {current_step}
138
- </div>
139
- </div>
140
- </div>
141
- </div>
142
- """
143
-
144
- # Complete demo flow with psychological pacing
145
- DEMO_FLOW = {
146
- DemoPhase.INTRODUCTION: PhaseContent(
147
- phase=DemoPhase.INTRODUCTION,
148
- title="🚀 Welcome to ARF v3.3.7 - The Architecture Demo",
149
- narrative="""
150
- Most AI systems fail silently in production. This one doesn't. We're about to demonstrate
151
- a production-grade agentic reliability system with <strong>clear architectural boundaries</strong>.
152
-
153
- This demo shows:
154
- 1. <strong>Real ARF OSS v3.3.7</strong> - Actual advisory intelligence
155
- 2. <strong>Simulated Enterprise</strong> - Value proposition without infrastructure access
156
- 3. <strong>Clear separation</strong> - Honest boundaries between OSS and Enterprise
157
- """,
158
- key_message="This isn't AI theater. It's a production-ready system with architectural honesty.",
159
- visual_cue="🎭",
160
- duration_seconds=30,
161
- show_boundary=True,
162
- boundary_text="We're simulating Enterprise execution for the demo. Real execution requires production infrastructure.",
163
- is_real_arf=False
164
- ),
165
-
166
- DemoPhase.FAILURE_INJECTION: PhaseContent(
167
- phase=DemoPhase.FAILURE_INJECTION,
168
- title="🚨 Phase 1: Inject Production Failure",
169
- narrative="""
170
- We're simulating a <strong>Cache Miss Storm</strong> affecting 45,000 users with $8,500/hour revenue risk.
171
-
172
- This is how most systems look right before they fail silently. The metrics show:
173
- • Cache hit rate dropped from 85% to 18%
174
- • Database load increased to 92%
175
- • Response time spiked to 1,850ms
176
-
177
- Notice: No remediation is running yet. We're letting you feel the tension.
178
- """,
179
- key_message="Failure happens. The question is how quickly and intelligently you respond.",
180
- visual_cue="📉",
181
- duration_seconds=20,
182
- show_boundary=False,
183
- is_real_arf=False
184
- ),
185
-
186
- DemoPhase.REAL_OSS_ANALYSIS: PhaseContent(
187
- phase=DemoPhase.REAL_OSS_ANALYSIS,
188
- title="🧠 Phase 2: Real ARF OSS Intelligence Activates",
189
- narrative="""
190
- ARF OSS v3.3.7 is now <strong>analyzing the incident in real-time</strong>. This is not a mock:
191
-
192
- 1. <strong>Detection Agent</strong> - Finds anomalies with 98.7% confidence
193
- 2. <strong>Recall Agent</strong> - Searches RAG memory for similar incidents
194
- 3. <strong>Decision Agent</strong> - Generates healing intent with reasoning
195
-
196
- Watch the confidence scores increase as evidence accumulates. This is <strong>real inference</strong>,
197
- not pre-programmed responses. The system is reasoning, not reacting.
198
- """,
199
- key_message="ARF OSS provides production-grade intelligence. It reasons before it recommends.",
200
- visual_cue="🤖",
201
- duration_seconds=45,
202
- show_boundary=True,
203
- boundary_text="This is REAL ARF OSS v3.3.7 (Apache 2.0). It can analyze but not execute.",
204
- is_real_arf=True
205
- ),
206
-
207
- DemoPhase.DECISION_BOUNDARY: PhaseContent(
208
- phase=DemoPhase.DECISION_BOUNDARY,
209
- title="🎯 Phase 3: The Execution Boundary",
210
- narrative="""
211
- ARF OSS has created a <strong>HealingIntent with 94% confidence</strong>:
212
- • Action: Scale Redis cluster from 3 to 5 nodes
213
- • Pattern match: 87% success rate from similar incidents
214
- • Safety check: ✅ Passed (blast radius: 2 services)
215
-
216
- Now we pause intentionally. This is the <strong>architectural boundary</strong>:
217
- • <strong>OSS can reason</strong> (Apache 2.0, advisory only)
218
- • <strong>Enterprise can execute</strong> (Commercial, with safety guarantees)
219
-
220
- The system knows what to do, but requires authority to act.
221
- """,
222
- key_message="Reasoning and authority are not the same thing. This boundary is intentional.",
223
- visual_cue="⚖️",
224
- duration_seconds=25,
225
- show_boundary=True,
226
- boundary_text="OSS boundary reached. Execution requires Enterprise edition and infrastructure authority.",
227
- is_real_arf=True
228
- ),
229
-
230
- DemoPhase.SIMULATED_ENTERPRISE: PhaseContent(
231
- phase=DemoPhase.SIMULATED_ENTERPRISE,
232
- title="🏢 Phase 4: Simulated Enterprise Execution",
233
- narrative="""
234
- We're now simulating what <strong>ARF Enterprise</strong> would do:
235
-
236
- 1. <strong>Validate safety constraints</strong> - Business hours, blast radius, rollback plans
237
- 2. <strong>Apply novel execution protocols</strong> - Deterministic confidence, not just ML probabilities
238
- 3. <strong>Execute with guarantees</strong> - Rollback prepared, circuit breakers set
239
-
240
- In production, this would execute against real infrastructure (Kubernetes, cloud APIs, etc.).
241
- For the demo, we're showing the value proposition without real side effects.
242
- """,
243
- key_message="Enterprise adds execution authority, not just better intelligence.",
244
- visual_cue="⚡",
245
- duration_seconds=35,
246
- show_boundary=True,
247
- boundary_text="SIMULATED EXECUTION - Real Enterprise would execute against production infrastructure.",
248
- is_real_arf=False
249
- ),
250
-
251
- DemoPhase.RESOLUTION: PhaseContent(
252
- phase=DemoPhase.RESOLUTION,
253
- title="✅ Phase 5: Incident Resolution",
254
- narrative="""
255
- The simulated execution completes:
256
- • <strong>Recovery time:</strong> 12 minutes (vs 45 minutes manual)
257
- • <strong>Cost saved:</strong> $6,375
258
- • <strong>Users protected:</strong> 45,000 → 0 impacted
259
- • <strong>Learning:</strong> Pattern added to RAG memory
260
-
261
- System health normalizes. Confidence scores stabilize. The incident is marked as
262
- <strong>resolved autonomously</strong>.
263
-
264
- Key metrics show the impact:
265
- • Detection time: 45s (89% faster than average)
266
- • Auto-heal rate: 81.7% (5.4× industry average)
267
- """,
268
- key_message="Autonomous reliability creates measurable business impact.",
269
- visual_cue="📊",
270
- duration_seconds=30,
271
- show_boundary=False,
272
- is_real_arf=False
273
- ),
274
-
275
- DemoPhase.ARCHITECTURE_REVIEW: PhaseContent(
276
- phase=DemoPhase.ARCHITECTURE_REVIEW,
277
- title="🏗️ Phase 6: Architecture Validated",
278
- narrative="""
279
- Let's review what we demonstrated:
280
-
281
- <strong>✅ Real Components (Production-Ready):</strong>
282
- • ARF OSS v3.3.7 intelligence engine
283
- • Three-agent pattern (Detection, Recall, Decision)
284
- • RAG-based similarity search
285
- • Confidence scoring and reasoning chains
286
-
287
- <strong>🎭 Simulated Components (Demo Value):</strong>
288
- • Enterprise execution authority
289
- • Infrastructure orchestration
290
- • Rollback guarantees
291
- • Novel execution protocols
292
-
293
- <strong>🎯 Clear Boundaries (Architectural Honesty):</strong>
294
- • OSS advises, Enterprise executes
295
- • No hidden automation or deception
296
- • Production-ready separation
297
- """,
298
- key_message="This demo shows production architecture, not just AI capabilities.",
299
- visual_cue="💎",
300
- duration_seconds=40,
301
- show_boundary=True,
302
- boundary_text="Architecture validated: OSS for intelligence, Enterprise for execution.",
303
- is_real_arf=False
304
- )
305
- }
306
-
307
- # Original user journey steps (enhanced with phase alignment)
308
- USER_JOURNEY_STEPS = [
309
- {
310
- "step": 1,
311
- "title": "🎭 Understand the Architecture",
312
- "description": "Review the demo flow to understand clear boundaries between OSS and Enterprise",
313
- "tab": "All Tabs",
314
- "action": "Read the phase guidance",
315
- "learning": "See how ARF separates intelligence (OSS) from execution (Enterprise)",
316
- "phase": DemoPhase.INTRODUCTION.value,
317
- "duration": "30s"
318
- },
319
- {
320
- "step": 2,
321
- "title": "🔥 Experience REAL ARF OSS Analysis",
322
- "description": "Select an incident and run OSS analysis to see actual ARF v3.3.7 intelligence",
323
- "tab": "Live Incident Demo",
324
- "action": "Click 'Run OSS Analysis'",
325
- "learning": "See real ARF OSS package analyzing incidents with confidence scores",
326
- "phase": DemoPhase.REAL_OSS_ANALYSIS.value,
327
- "duration": "45s"
328
- },
329
- {
330
- "step": 3,
331
- "title": "🎯 Observe the Execution Boundary",
332
- "description": "Notice where OSS stops and Enterprise would begin",
333
- "tab": "Live Incident Demo",
334
- "action": "Review HealingIntent and boundary indicators",
335
- "learning": "Understand the architectural separation between advisory and execution",
336
- "phase": DemoPhase.DECISION_BOUNDARY.value,
337
- "duration": "25s"
338
- },
339
- {
340
- "step": 4,
341
- "title": "⚡ Simulate Enterprise Healing",
342
- "description": "Experience autonomous healing with simulated execution",
343
- "tab": "Live Incident Demo",
344
- "action": "Click 'Execute Enterprise Healing'",
345
- "learning": "See the Enterprise value proposition without real infrastructure",
346
- "phase": DemoPhase.SIMULATED_ENTERPRISE.value,
347
- "duration": "35s"
348
- },
349
- {
350
- "step": 5,
351
- "title": "💰 Calculate Your Business ROI",
352
- "description": "Adjust the sliders to see potential savings for your organization",
353
- "tab": "Business Impact & ROI",
354
- "action": "Use sliders then click 'Calculate My ROI'",
355
- "learning": "Understand the business case with your specific numbers",
356
- "phase": "business_roi",
357
- "duration": "60s"
358
- },
359
- {
360
- "step": 6,
361
- "title": "📜 Explore Enterprise-Grade Compliance",
362
- "description": "View comprehensive audit trail and compliance features",
363
- "tab": "Audit Trail & History",
364
- "action": "Check execution and incident history",
365
- "learning": "See enterprise-level logging, compliance, and audit capabilities",
366
- "phase": "compliance",
367
- "duration": "45s"
368
- },
369
- {
370
- "step": 7,
371
- "title": "🧠 Discover the Learning Engine",
372
- "description": "Explore pattern detection and similarity search",
373
- "tab": "Learning Engine",
374
- "action": "Search for similar incidents and view patterns",
375
- "learning": "See how ARF learns from past incidents to improve future responses",
376
- "phase": "learning",
377
- "duration": "50s"
378
- }
379
- ]
380
-
381
- # Enhanced demo tips with boundary awareness
382
- DEMO_TIPS = [
383
- "💎 **Architecture Tip**: Look for the 'REAL ARF' vs 'SIMULATED' indicators to understand boundaries",
384
- "🎭 **Demo Tip**: The 'Run Complete Demo' button follows our psychological pacing guide",
385
- "⚡ **Enterprise Tip**: Toggle approval mode to see different execution workflows",
386
- "📊 **ROI Tip**: Use realistic numbers for your organization in the ROI calculator",
387
- "🔍 **Analysis Tip**: Try different incident scenarios to see varied ARF responses",
388
- "📜 **Compliance Tip**: Export the audit trail to see comprehensive JSON structure",
389
- "🧠 **Learning Tip**: Search for patterns to see how ARF improves over time",
390
- "🎯 **Boundary Tip**: Notice where OSS analysis ends and Enterprise execution would begin"
391
- ]
392
-
393
- # Psychology-driven quick start guide
394
- QUICK_START_GUIDE = {
395
- "for_executives": {
396
- "focus": "Business Impact & ROI",
397
- "steps": [
398
- "1. Go to 'Business Impact & ROI' tab",
399
- "2. Adjust sliders to match your organization",
400
- "3. Click 'Calculate My ROI'",
401
- "4. Review the 5.2× ROI multiplier",
402
- "5. Ask: 'What would 73% faster MTTR mean for us?'"
403
- ],
404
- "time": "2 minutes",
405
- "key_question": "What's the cost of NOT having autonomous reliability?"
406
- },
407
- "for_engineers": {
408
- "focus": "Real ARF OSS Analysis",
409
- "steps": [
410
- "1. Select 'Cache Miss Storm' scenario",
411
- "2. Click 'Run OSS Analysis'",
412
- "3. Watch the three agents work in real-time",
413
- "4. Review the HealingIntent with 94% confidence",
414
- "5. Notice the reasoning chain and evidence"
415
- ],
416
- "time": "3 minutes",
417
- "key_question": "How would this intelligence change your on-call experience?"
418
- },
419
- "for_architects": {
420
- "focus": "Architecture Boundaries",
421
- "steps": [
422
- "1. Run the complete demo walkthrough",
423
- "2. Look for 'REAL ARF' vs 'SIMULATED' indicators",
424
- "3. Notice the execution boundary",
425
- "4. Review the architecture validation phase",
426
- "5. Ask: 'How would this integrate with our stack?'"
427
- ],
428
- "time": "4 minutes",
429
- "key_question": "Is our current approach proactive or reactive?"
430
- }
431
- }
432
-
433
- def get_phase_content(phase: DemoPhase) -> PhaseContent:
434
- """Get content for a specific demo phase"""
435
- return DEMO_FLOW.get(phase, DEMO_FLOW[DemoPhase.INTRODUCTION])
436
-
437
- def get_phase_html(phase: DemoPhase, current_step: int = 1) -> str:
438
- """Get HTML for a demo phase with progress indicator"""
439
- content = get_phase_content(phase)
440
- total_steps = len(DEMO_FLOW)
441
-
442
- # Calculate step number based on phase
443
- phase_order = list(DEMO_FLOW.keys())
444
- step_number = phase_order.index(phase) + 1 if phase in phase_order else current_step
445
-
446
- return content.get_html(
447
- show_progress=True,
448
- current_step=step_number,
449
- total_steps=total_steps
450
- )
451
-
452
- def get_demo_progress(current_phase: DemoPhase) -> Dict[str, Any]:
453
- """Get current demo progress information"""
454
- phase_order = list(DEMO_FLOW.keys())
455
- current_index = phase_order.index(current_phase) if current_phase in phase_order else 0
456
-
457
- return {
458
- "current_phase": current_phase.value,
459
- "current_step": current_index + 1,
460
- "total_steps": len(phase_order),
461
- "progress_percentage": int(((current_index + 1) / len(phase_order)) * 100),
462
- "next_phase": phase_order[current_index + 1].value if current_index + 1 < len(phase_order) else None,
463
- "estimated_time_remaining": sum(
464
- DEMO_FLOW[phase].duration_seconds
465
- for i, phase in enumerate(phase_order)
466
- if i > current_index
467
- )
468
- }
469
-
470
- def get_quick_start_guide(role: str = "executives") -> Dict[str, Any]:
471
- """Get quick start guide for specific role"""
472
- return QUICK_START_GUIDE.get(role, QUICK_START_GUIDE["for_executives"])
473
-
474
- # Psychology-focused demo controller
475
- class DemoPsychologyController:
476
- """Manages the psychological flow of the demo"""
477
-
478
- def __init__(self):
479
- self.current_phase = DemoPhase.INTRODUCTION
480
- self.phase_start_time = time.time()
481
- self.completed_phases = []
482
- self.user_attention_score = 100 # Start with full attention
483
-
484
- def transition_to_phase(self, phase: DemoPhase) -> Dict[str, Any]:
485
- """Transition to a new demo phase with psychological timing"""
486
- current_time = time.time()
487
- phase_duration = current_time - self.phase_start_time
488
-
489
- # Calculate attention score (decays over time, refreshes on phase change)
490
- self.user_attention_score = max(60, self.user_attention_score - (phase_duration / 10))
491
-
492
- # If phase was too short, user might have missed it
493
- if phase_duration < 10 and self.current_phase != DemoPhase.INTRODUCTION:
494
- self.user_attention_score -= 10
495
-
496
- # Update state
497
- self.completed_phases.append(self.current_phase)
498
- self.current_phase = phase
499
- self.phase_start_time = time.time()
500
-
501
- # Refresh attention on phase change
502
- self.user_attention_score = min(100, self.user_attention_score + 20)
503
-
504
- return {
505
- "new_phase": phase.value,
506
- "previous_phase_duration": int(phase_duration),
507
- "user_attention_score": int(self.user_attention_score),
508
- "recommended_pause": self._get_recommended_pause(phase),
509
- "key_message": DEMO_FLOW[phase].key_message
510
- }
511
-
512
- def _get_recommended_pause(self, phase: DemoPhase) -> str:
513
- """Get recommended pause based on phase psychology"""
514
- pauses = {
515
- DemoPhase.INTRODUCTION: "Pause to set context",
516
- DemoPhase.FAILURE_INJECTION: "Let the tension build",
517
- DemoPhase.REAL_OSS_ANALYSIS: "Watch the reasoning unfold",
518
- DemoPhase.DECISION_BOUNDARY: "Pause intentionally here",
519
- DemoPhase.SIMULATED_ENTERPRISE: "Explain the simulation",
520
- DemoPhase.RESOLUTION: "Show the impact",
521
- DemoPhase.ARCHITECTURE_REVIEW: "Summarize the architecture"
522
- }
523
- return pauses.get(phase, "Continue")
524
-
525
- def get_current_guidance(self) -> str:
526
- """Get current guidance HTML"""
527
- return get_phase_html(self.current_phase, len(self.completed_phases) + 1)
528
-
529
- def should_speed_up(self) -> bool:
530
- """Determine if we should speed up based on attention score"""
531
- return self.user_attention_score < 70
532
-
533
- def should_slow_down(self) -> bool:
534
- """Determine if we should slow down for emphasis"""
535
- important_phases = [
536
- DemoPhase.DECISION_BOUNDARY,
537
- DemoPhase.ARCHITECTURE_REVIEW
538
- ]
539
- return self.current_phase in important_phases
540
-
541
- # Global demo controller instance
542
- _demo_controller = None
543
-
544
- def get_demo_controller() -> DemoPsychologyController:
545
- """Get singleton demo controller instance"""
546
- global _demo_controller
547
- if _demo_controller is None:
548
- _demo_controller = DemoPsychologyController()
549
- return _demo_controller
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
demo/mock_arf.py DELETED
@@ -1,668 +0,0 @@
1
- # demo/mock_arf.py
2
- """
3
- Enhanced Mock ARF with scenario-aware metrics
4
- Generates different values based on scenario characteristics
5
- DOCTRINAL COMPLIANCE VERSION 3.3.9+restraint
6
- Key Addition: Explicit Observation Gate for psychological advantage
7
- """
8
- import random
9
- import time
10
- import datetime
11
- from typing import Dict, Any, List
12
- import json
13
-
14
- # Scenario-specific configurations
15
- SCENARIO_CONFIGS = {
16
- "Cache Miss Storm": {
17
- "detection_confidence_range": (0.97, 0.995), # 97-99.5%
18
- "detection_time_range": (35, 55), # 35-55 seconds
19
- "accuracy_range": (0.97, 0.995), # 97-99.5%
20
- "similar_incidents_range": (2, 5), # 2-5 similar incidents
21
- "similarity_score_range": (0.88, 0.96), # 88-96% similarity
22
- "pattern_confidence_range": (0.91, 0.97), # 91-97% confidence
23
- "success_rate_range": (0.82, 0.93), # 82-93% success rate
24
- "cost_savings_range": (5000, 9000), # $5K-$9K savings
25
- "resolution_time_range": (10, 18), # 10-18 minutes
26
- "affected_users_range": (30000, 60000), # 30K-60K users
27
- "tags": ["cache", "redis", "latency", "memory"]
28
- },
29
- "Database Connection Pool Exhaustion": {
30
- "detection_confidence_range": (0.92, 0.98),
31
- "detection_time_range": (40, 65),
32
- "accuracy_range": (0.95, 0.985),
33
- "similar_incidents_range": (1, 4),
34
- "similarity_score_range": (0.85, 0.94),
35
- "pattern_confidence_range": (0.88, 0.95),
36
- "success_rate_range": (0.78, 0.88),
37
- "cost_savings_range": (3500, 5500),
38
- "resolution_time_range": (15, 25),
39
- "affected_users_range": (15000, 30000),
40
- "tags": ["database", "postgres", "connections", "pool"]
41
- },
42
- "Kubernetes Memory Leak": {
43
- "detection_confidence_range": (0.94, 0.99),
44
- "detection_time_range": (30, 50),
45
- "accuracy_range": (0.96, 0.99),
46
- "similar_incidents_range": (3, 6),
47
- "similarity_score_range": (0.89, 0.95),
48
- "pattern_confidence_range": (0.90, 0.96),
49
- "success_rate_range": (0.85, 0.92),
50
- "cost_savings_range": (4500, 7500),
51
- "resolution_time_range": (12, 22),
52
- "affected_users_range": (20000, 40000),
53
- "tags": ["kubernetes", "memory", "container", "leak"]
54
- },
55
- "API Rate Limit Storm": {
56
- "detection_confidence_range": (0.96, 0.99),
57
- "detection_time_range": (25, 45),
58
- "accuracy_range": (0.97, 0.99),
59
- "similar_incidents_range": (2, 4),
60
- "similarity_score_range": (0.87, 0.93),
61
- "pattern_confidence_range": (0.89, 0.94),
62
- "success_rate_range": (0.80, 0.90),
63
- "cost_savings_range": (3000, 5000),
64
- "resolution_time_range": (8, 15),
65
- "affected_users_range": (10000, 25000),
66
- "tags": ["api", "rate_limit", "throttling", "ddos"]
67
- },
68
- "Network Partition": {
69
- "detection_confidence_range": (0.98, 0.999),
70
- "detection_time_range": (20, 40),
71
- "accuracy_range": (0.98, 0.995),
72
- "similar_incidents_range": (1, 3),
73
- "similarity_score_range": (0.90, 0.97),
74
- "pattern_confidence_range": (0.93, 0.98),
75
- "success_rate_range": (0.75, 0.85),
76
- "cost_savings_range": (8000, 15000),
77
- "resolution_time_range": (20, 35),
78
- "affected_users_range": (50000, 100000),
79
- "tags": ["network", "partition", "connectivity", "failure"]
80
- },
81
- "Storage I/O Saturation": {
82
- "detection_confidence_range": (0.93, 0.98),
83
- "detection_time_range": (45, 70),
84
- "accuracy_range": (0.94, 0.98),
85
- "similar_incidents_range": (2, 5),
86
- "similarity_score_range": (0.86, 0.92),
87
- "pattern_confidence_range": (0.87, 0.93),
88
- "success_rate_range": (0.79, 0.87),
89
- "cost_savings_range": (5500, 8500),
90
- "resolution_time_range": (18, 28),
91
- "affected_users_range": (25000, 45000),
92
- "tags": ["storage", "disk", "io", "saturation"]
93
- }
94
- }
95
-
96
- def get_scenario_config(scenario_name: str) -> Dict[str, Any]:
97
- """Get configuration for a specific scenario with defaults"""
98
- return SCENARIO_CONFIGS.get(scenario_name, {
99
- "detection_confidence_range": (0.90, 0.98),
100
- "detection_time_range": (30, 60),
101
- "accuracy_range": (0.92, 0.98),
102
- "similar_incidents_range": (1, 3),
103
- "similarity_score_range": (0.85, 0.95),
104
- "pattern_confidence_range": (0.85, 0.95),
105
- "success_rate_range": (0.75, 0.90),
106
- "cost_savings_range": (4000, 8000),
107
- "resolution_time_range": (15, 30),
108
- "affected_users_range": (20000, 50000),
109
- "tags": ["unknown", "incident"]
110
- })
111
-
112
- def simulate_arf_analysis(scenario_data: Dict[str, Any]) -> Dict[str, Any]:
113
- """
114
- Simulate ARF analysis with scenario-specific metrics
115
-
116
- Args:
117
- scenario_data: Dictionary containing scenario information
118
-
119
- Returns:
120
- Dictionary with analysis results
121
- """
122
- scenario_name = scenario_data.get("name", "Unknown Scenario")
123
- config = get_scenario_config(scenario_name)
124
-
125
- # Generate scenario-specific values
126
- detection_confidence = random.uniform(*config["detection_confidence_range"])
127
- detection_time = random.randint(*config["detection_time_range"])
128
- accuracy = random.uniform(*config["accuracy_range"])
129
-
130
- return {
131
- "analysis_complete": True,
132
- "anomaly_detected": True,
133
- "severity": scenario_data.get("severity", "HIGH_VARIANCE"), # Changed from "HIGH" to "HIGH_VARIANCE"
134
- "confidence": round(detection_confidence, 3), # Round to 3 decimals
135
- "detection_time_ms": detection_time * 1000, # Convert to ms for display
136
- "detection_time_seconds": detection_time,
137
- "accuracy": round(accuracy, 3),
138
- "component": scenario_data.get("component", "unknown"),
139
- "scenario_specific": True,
140
- "scenario_name": scenario_name,
141
- "tags": config["tags"]
142
- }
143
-
144
- def run_rag_similarity_search(scenario_data: Dict[str, Any]) -> List[Dict[str, Any]]:
145
- """
146
- Simulate RAG similarity search with scenario-specific results
147
-
148
- Args:
149
- scenario_data: Dictionary containing scenario information
150
-
151
- Returns:
152
- List of similar incidents
153
- """
154
- scenario_name = scenario_data.get("name", "Unknown Scenario")
155
- config = get_scenario_config(scenario_name)
156
-
157
- similar_count = random.randint(*config["similar_incidents_range"])
158
- similar_incidents = []
159
-
160
- # Generate similar incidents based on scenario
161
- base_time = int(time.time())
162
-
163
- for i in range(similar_count):
164
- similarity_score = random.uniform(*config["similarity_score_range"])
165
- cost_savings = random.randint(*config["cost_savings_range"])
166
- resolution_time = random.randint(*config["resolution_time_range"])
167
- affected_users = random.randint(*config["affected_users_range"])
168
-
169
- # Different resolutions based on scenario type
170
- if "cache" in scenario_name.lower() or "redis" in scenario_name.lower():
171
- resolution = random.choice(["scale_out", "warm_cache", "memory_increase", "add_replicas"])
172
- elif "database" in scenario_name.lower():
173
- resolution = random.choice(["restart", "connection_pool_resize", "index_optimization", "vacuum"])
174
- elif "kubernetes" in scenario_name.lower():
175
- resolution = random.choice(["restart_pod", "memory_limit_increase", "node_drain", "resource_quota"])
176
- elif "api" in scenario_name.lower():
177
- resolution = random.choice(["circuit_breaker", "rate_limit_increase", "caching", "load_balancer"])
178
- elif "network" in scenario_name.lower():
179
- resolution = random.choice(["route_update", "failover", "bandwidth_increase", "redundancy"])
180
- elif "storage" in scenario_name.lower():
181
- resolution = random.choice(["io_optimization", "disk_upgrade", "cache_addition", "load_distribution"])
182
- else:
183
- resolution = random.choice(["investigate", "scale", "restart", "optimize"])
184
-
185
- similar_incidents.append({
186
- "incident_id": f"inc_{base_time - random.randint(1, 90)}_00{i}",
187
- "similarity_score": round(similarity_score, 3),
188
- "success": random.random() > 0.15, # 85% success rate
189
- "resolution": resolution,
190
- "cost_savings": cost_savings,
191
- "detection_time": f"{random.randint(30, 60)}s",
192
- "resolution_time": f"{resolution_time}m",
193
- "pattern": f"{scenario_name.lower().replace(' ', '_')}_v{random.randint(1, 3)}",
194
- "affected_users": affected_users,
195
- "component_match": scenario_data.get("component", "unknown"),
196
- "rag_source": "production_memory_v3",
197
- "timestamp": f"2024-{random.randint(1, 12):02d}-{random.randint(1, 28):02d}"
198
- })
199
-
200
- return similar_incidents
201
-
202
- def calculate_pattern_confidence(scenario_data: Dict[str, Any], similar_incidents: List[Dict[str, Any]]) -> float:
203
- """
204
- Calculate pattern confidence based on similar incidents
205
-
206
- Args:
207
- scenario_data: Dictionary containing scenario information
208
- similar_incidents: List of similar incidents from RAG search
209
-
210
- Returns:
211
- Pattern confidence score (0-1)
212
- """
213
- scenario_name = scenario_data.get("name", "Unknown Scenario")
214
- config = get_scenario_config(scenario_name)
215
-
216
- if not similar_incidents:
217
- return random.uniform(*config["pattern_confidence_range"])
218
-
219
- # Calculate average similarity and success rate
220
- similarity_scores = [inc["similarity_score"] for inc in similar_incidents]
221
- success_rates = [1.0 if inc["success"] else 0.0 for inc in similar_incidents]
222
-
223
- avg_similarity = sum(similarity_scores) / len(similarity_scores)
224
- avg_success = sum(success_rates) / len(success_rates)
225
-
226
- # Weighted average: 60% similarity, 40% success rate
227
- confidence = (avg_similarity * 0.6) + (avg_success * 0.4)
228
-
229
- # Add some randomness but keep within scenario range
230
- min_conf, max_conf = config["pattern_confidence_range"]
231
- confidence = max(min_conf, min(max_conf, confidence))
232
-
233
- return round(confidence, 3)
234
-
235
- def calculate_internal_success_rate(similar_incidents: List[Dict[str, Any]]) -> float:
236
- """
237
- Calculate success rate for internal logic only.
238
- Not for UI display in Decision View.
239
-
240
- Doctrinal: Percentages invite debate, narratives shut it down.
241
- Keep this internal for logic, surface only in Outcome View.
242
- """
243
- if not similar_incidents:
244
- return 0.0
245
-
246
- success_count = sum(1 for inc in similar_incidents if inc.get("success", False))
247
- return round(success_count / len(similar_incidents), 3)
248
-
249
- def check_contraindications(scenario_data: Dict[str, Any], similar_incidents: List[Dict[str, Any]]) -> Dict[str, Any]:
250
- """
251
- Check for contraindications based on retry amplification signatures and historical evidence
252
-
253
- Returns:
254
- Dictionary with contraindication analysis
255
- """
256
- component = scenario_data.get("component", "").lower()
257
- scenario_name = scenario_data.get("name", "").lower()
258
-
259
- # Detect retry amplification signatures
260
- retry_amplification = False
261
- evidence = []
262
-
263
- # Check telemetry for retry storm indicators
264
- telemetry = scenario_data.get("telemetry", {})
265
- if telemetry.get("retry_storm", False):
266
- retry_amplification = True
267
- evidence.append("Telemetry shows retry_storm: True")
268
-
269
- # Check for amplification factor in metrics
270
- metrics = scenario_data.get("metrics", {})
271
- amplification_factor = metrics.get("amplification_factor", 1.0)
272
- if amplification_factor > 2.0:
273
- retry_amplification = True
274
- evidence.append(f"Amplification factor {amplification_factor} > 2.0")
275
-
276
- # Check database load
277
- db_load = metrics.get("database_load_percent", 0)
278
- if db_load > 85:
279
- retry_amplification = True
280
- evidence.append(f"Database load {db_load}% > 85%")
281
-
282
- # Check historical incidents for scaling-first failures
283
- historical_scaling_failures = False
284
- scaling_failure_evidence = []
285
-
286
- for incident in similar_incidents:
287
- resolution = incident.get("resolution", "").lower()
288
- success = incident.get("success", True)
289
-
290
- # Check for scaling-first resolutions that failed
291
- if any(scale_term in resolution for scale_term in ["scale", "increase", "add_replicas"]):
292
- if not success:
293
- historical_scaling_failures = True
294
- scaling_failure_evidence.append(
295
- f"{incident.get('timestamp', 'Unknown date')}: {resolution} failed"
296
- )
297
-
298
- contraindicated_actions = []
299
- if retry_amplification or historical_scaling_failures:
300
- contraindicated_actions.append("scale_during_retry_amplification")
301
-
302
- return {
303
- "retry_amplification": retry_amplification,
304
- "historical_scaling_failures": historical_scaling_failures,
305
- "evidence": evidence + scaling_failure_evidence,
306
- "contraindicated_actions": contraindicated_actions,
307
- "confidence": 0.92 if evidence else 0.0
308
- }
309
-
310
- def create_mock_healing_intent(scenario_data: Dict[str, Any], similar_incidents: List[Dict[str, Any]], confidence: float) -> Dict[str, Any]:
311
- """
312
- Create doctrinally compliant healing intent with sequencing thesis enforcement
313
-
314
- Doctrinal Addition: Explicit Observation Gate when contraindications exist OR confidence < threshold
315
- Psychological Goal: Make inaction an explicit, powerful decision
316
- """
317
- # Check for contraindications FIRST (doctrinal constraint)
318
- contraindications = check_contraindications(scenario_data, similar_incidents)
319
-
320
- scenario_name = scenario_data.get("name", "Unknown Scenario")
321
- config = get_scenario_config(scenario_name)
322
- component = scenario_data.get("component", "unknown")
323
-
324
- # ============ OBSERVATION GATE LOGIC ============
325
- # Key psychological addition: Explicit deferral when uncertainty is high
326
- observation_gate_threshold = 0.70 # Below this, we observe first
327
-
328
- should_observe_first = (
329
- contraindications["retry_amplification"] or
330
- contraindications["historical_scaling_failures"] or
331
- confidence < observation_gate_threshold or
332
- len(similar_incidents) < 2 # Insufficient historical evidence
333
- )
334
-
335
- if should_observe_first:
336
- # Return OBSERVATION GATE state - intentional inaction
337
- current_time = datetime.datetime.now()
338
- next_evaluation = current_time + datetime.timedelta(minutes=5)
339
-
340
- return {
341
- "action": "defer_decision_for_trend_confirmation",
342
- "component": component,
343
- "confidence": round(confidence, 3),
344
- "parameters": {
345
- "observation_window": "5m",
346
- "metrics_to_watch": ["retry_count", "database_load_percent", "error_rate"],
347
- "trend_threshold": "stabilizing_or_declining"
348
- },
349
- "source": "observation_gate_logic",
350
- "requires_enterprise": False,
351
- "advisory_only": True,
352
- # CRITICAL PSYCHOLOGICAL FIELDS
353
- "execution_state": "observe_only",
354
- "next_evaluation_window": "5m",
355
- "decision_frozen_until": next_evaluation.isoformat(),
356
- "deferral_reason": "uncertainty_too_high_for_action" if confidence < observation_gate_threshold else
357
- "contraindications_present" if contraindications["retry_amplification"] else
358
- "historical_failures_detected" if contraindications["historical_scaling_failures"] else
359
- "insufficient_historical_evidence",
360
- # FORMAL HEALINGINTENT FIELDS
361
- "preconditions": [
362
- f"Confidence threshold not met ({confidence:.2f} < {observation_gate_threshold})" if confidence < observation_gate_threshold else
363
- "Retry amplification detected" if contraindications["retry_amplification"] else
364
- "Historical scaling failures present" if contraindications["historical_scaling_failures"] else
365
- "Insufficient similar incidents for pattern matching"
366
- ],
367
- "contraindicated_actions": ["any_healing_action_during_high_uncertainty"],
368
- "reversibility_statement": "Evaluation resumes automatically after 5-minute observation window",
369
- "sequencing_rule": "observe_before_any_action_when_uncertain",
370
- "historical_evidence": [
371
- f"{len(similar_incidents)} similar incidents analyzed (minimum 2 required)",
372
- "Observation-first reduces incorrect actions by 67% (historical analysis)"
373
- ],
374
- # SUCCESS RATE HANDLING (kept internal, not surfaced early)
375
- "_internal_success_rate": calculate_internal_success_rate(similar_incidents) if similar_incidents else 0.0,
376
- "_internal_notes": "Success rate kept internal; percentages invite debate, narratives shut it down",
377
- "scenario_specific": True,
378
- "scenario_name": scenario_name
379
- }
380
-
381
- # If retry amplification detected (but passed observation gate threshold), enforce dampening-first logic
382
- if contraindications["retry_amplification"]:
383
- return {
384
- "action": "implement_request_coalescing_with_exponential_backoff",
385
- "component": component,
386
- "confidence": max(confidence, 0.85), # High confidence for dampening-first
387
- "parameters": {
388
- "coalescing_window_ms": "100-500ms",
389
- "backoff_factor": "exponential",
390
- "max_retries": 3,
391
- "timeout": "10m"
392
- },
393
- "source": "contraindication_detection",
394
- "requires_enterprise": False,
395
- "advisory_only": False,
396
- # CRITICAL: Add observation window even for dampening actions
397
- "post_action_observation": {
398
- "required": True,
399
- "duration": "5m",
400
- "metrics": ["retry_count", "database_load_percent", "latency_p99"]
401
- },
402
- "success_rate": 0.88,
403
- "estimated_impact": {
404
- "cost_savings": 4500,
405
- "resolution_time_minutes": 12,
406
- "users_protected": random.randint(*config["affected_users_range"]),
407
- "mttr_reduction": "73%"
408
- },
409
- "safety_checks": {
410
- "blast_radius": "single_service",
411
- "business_hours": "compliant",
412
- "rollback_plan": "coalescing_disable",
413
- "approval_required": False,
414
- "risk_level": "low"
415
- },
416
- # FORMAL HEALINGINTENT FIELDS (doctrinal constraint)
417
- "preconditions": [
418
- "Retry amplification signature detected",
419
- f"Amplification factor > {scenario_data.get('metrics', {}).get('amplification_factor', 2.0)}",
420
- "Database load > 85%"
421
- ],
422
- "contraindicated_actions": ["scale_during_retry_storm", "add_capacity_during_amplification"],
423
- "reversibility_statement": "Remove coalescing window after 10 minutes of stable operation",
424
- "sequencing_rule": "dampening_first_then_observe_then_optional_scale",
425
- "historical_evidence": contraindications["evidence"][:3], # Top 3 evidence items
426
- "scenario_specific": True,
427
- "scenario_name": scenario_name
428
- }
429
-
430
- # Only proceed with normal logic if no contraindications AND passed observation gate
431
- # Determine action based on component and scenario WITH sequencing logic
432
- ranked_actions = []
433
-
434
- # DAMPENING actions (always first in sequence)
435
- dampening_actions = []
436
- if "api" in component.lower() or "rate" in scenario_name.lower():
437
- dampening_actions.append({
438
- "action": "circuit_breaker",
439
- "confidence": confidence * 0.95, # Slightly lower confidence for dampening
440
- "parameters": {
441
- "threshold": f"{random.randint(70, 85)}%",
442
- "window": f"{random.randint(3, 10)}m",
443
- "fallback": "cached_response",
444
- "retry_after": f"{random.randint(30, 120)}s"
445
- }
446
- })
447
-
448
- # Add general dampening for retry-prone scenarios
449
- if any(term in component.lower() for term in ["redis", "cache", "database"]):
450
- dampening_actions.append({
451
- "action": "request_batching_with_timeout",
452
- "confidence": confidence * 0.92,
453
- "parameters": {
454
- "batch_size": "10-50 requests",
455
- "timeout_ms": "100ms",
456
- "strategy": "adaptive"
457
- }
458
- })
459
-
460
- # Add dampening actions to ranked list
461
- for i, act in enumerate(dampening_actions):
462
- ranked_actions.append({
463
- "rank": len(ranked_actions) + 1,
464
- "action": act["action"],
465
- "confidence": round(act["confidence"], 3),
466
- "parameters": act["parameters"],
467
- "category": "dampening"
468
- })
469
-
470
- # CONCURRENCY CAP actions (second in sequence)
471
- if "database" in component.lower():
472
- ranked_actions.append({
473
- "rank": len(ranked_actions) + 1,
474
- "action": "connection_pool_limit_adjustment",
475
- "confidence": confidence * 0.88,
476
- "parameters": {
477
- "max_connections": f"{random.randint(100, 200)}",
478
- "timeout": f"{random.randint(30, 60)}s"
479
- },
480
- "category": "concurrency_control"
481
- })
482
-
483
- # OBSERVE actions (third in sequence)
484
- ranked_actions.append({
485
- "rank": len(ranked_actions) + 1,
486
- "action": "enhanced_monitoring_with_telemetry",
487
- "confidence": confidence * 0.85,
488
- "parameters": {
489
- "duration": "5m",
490
- "metrics": ["latency_p99", "error_rate", "throughput"],
491
- "alert_threshold": "2x_baseline"
492
- },
493
- "category": "observation"
494
- })
495
-
496
- # SCALING actions (ONLY if no contraindications AND last in sequence)
497
- # AND only if confidence justifies scaling over dampening
498
- scaling_confidence_threshold = 0.75 # Scaling requires higher confidence
499
-
500
- if confidence > scaling_confidence_threshold and not contraindications["historical_scaling_failures"]:
501
- if "cache" in component.lower() or "redis" in component.lower():
502
- scaling_action = {
503
- "rank": len(ranked_actions) + 1,
504
- "action": "gradual_scale_out",
505
- "confidence": confidence * 0.80, # Lower confidence than dampening
506
- "parameters": {
507
- "nodes": f"{random.randint(2, 4)}→{random.randint(4, 6)}",
508
- "strategy": "one_by_one",
509
- "health_check_interval": "30s"
510
- },
511
- "category": "scaling",
512
- "constraints": ["Only if dampening insufficient after 5 minutes"]
513
- }
514
- ranked_actions.append(scaling_action)
515
-
516
- # Calculate success rate internally only
517
- _internal_success_rate = calculate_internal_success_rate(similar_incidents) if similar_incidents else random.uniform(*config["success_rate_range"])
518
-
519
- # Calculate estimated impact
520
- if similar_incidents:
521
- avg_cost_savings = sum(inc["cost_savings"] for inc in similar_incidents) / len(similar_incidents)
522
- avg_resolution_time = sum(int(inc["resolution_time"].replace('m', '')) for inc in similar_incidents) / len(similar_incidents)
523
- else:
524
- avg_cost_savings = sum(config["cost_savings_range"]) / 2
525
- avg_resolution_time = sum(config["resolution_time_range"]) / 2
526
-
527
- # Primary action is first in ranked_actions (dampening-first)
528
- primary_action = ranked_actions[0] if ranked_actions else {
529
- "action": "investigate",
530
- "confidence": confidence,
531
- "parameters": {"priority": "high"}
532
- }
533
-
534
- return {
535
- "action": primary_action["action"],
536
- "component": component,
537
- "confidence": round(confidence, 3),
538
- "parameters": primary_action.get("parameters", {}),
539
- "source": "sequencing_analysis",
540
- "requires_enterprise": True,
541
- "advisory_only": True,
542
- # SUCCESS RATE: Internal only, not for UI display in Decision View
543
- "_internal_success_rate": _internal_success_rate,
544
- "_internal_notes": "Success rate for internal logic; surface narrative outcomes, not percentages",
545
- "estimated_impact": {
546
- "cost_savings": int(avg_cost_savings),
547
- "resolution_time_minutes": int(avg_resolution_time),
548
- "users_protected": random.randint(*config["affected_users_range"]),
549
- "mttr_reduction": f"{random.randint(60, 80)}%"
550
- },
551
- "safety_checks": {
552
- "blast_radius": f"{random.randint(1, 3)} services",
553
- "business_hours": "compliant",
554
- "rollback_plan": "available",
555
- "approval_required": True,
556
- "risk_level": "medium" if confidence < 0.9 else "low"
557
- },
558
- # FORMAL HEALINGINTENT FIELDS (doctrinal constraint)
559
- "preconditions": [
560
- f"Component: {component}",
561
- f"Confidence threshold > {scaling_confidence_threshold}",
562
- "No retry amplification detected",
563
- "Historical scaling success rate > 70%"
564
- ],
565
- "contraindicated_actions": contraindications["contraindicated_actions"],
566
- "reversibility_statement": f"Rollback to previous configuration available within {random.randint(5, 15)} minutes",
567
- "sequencing_rule": "dampening_before_concurrency_before_observation_before_scaling",
568
- "ranked_actions": ranked_actions,
569
- "historical_evidence": [f"{len(similar_incidents)} similar incidents analyzed"],
570
- "scenario_specific": True,
571
- "scenario_name": scenario_name
572
- }
573
-
574
- def get_scenario_metrics(scenario_name: str) -> Dict[str, Any]:
575
- """
576
- Get dynamic metrics for a specific scenario
577
-
578
- Args:
579
- scenario_name: Name of the scenario
580
-
581
- Returns:
582
- Dictionary with scenario-specific metrics
583
- """
584
- config = get_scenario_config(scenario_name)
585
-
586
- # Generate dynamic values within ranges
587
- return {
588
- "detection_confidence": round(random.uniform(*config["detection_confidence_range"]), 3),
589
- "detection_time_seconds": random.randint(*config["detection_time_range"]),
590
- "accuracy": round(random.uniform(*config["accuracy_range"]), 3),
591
- "expected_similar_incidents": random.randint(*config["similar_incidents_range"]),
592
- "avg_similarity_score": round(random.uniform(*config["similarity_score_range"]), 3),
593
- "pattern_confidence": round(random.uniform(*config["pattern_confidence_range"]), 3),
594
- "success_rate": round(random.uniform(*config["success_rate_range"]), 3),
595
- "cost_savings_range": config["cost_savings_range"],
596
- "resolution_time_range": config["resolution_time_range"],
597
- "affected_users_range": config["affected_users_range"],
598
- "tags": config["tags"]
599
- }
600
-
601
- def detect_retry_amplification(telemetry_data: Dict[str, Any]) -> Dict[str, Any]:
602
- """
603
- Detect retry amplification signatures from telemetry data
604
-
605
- Doctrinal constraint: Must be REAL detection, not hardcoded in scenarios
606
-
607
- Args:
608
- telemetry_data: Dictionary containing telemetry metrics
609
-
610
- Returns:
611
- Dictionary with detection results
612
- """
613
- # Extract metrics with defaults
614
- retry_storm = telemetry_data.get("retry_storm", False)
615
- retry_count = telemetry_data.get("retry_count", 0)
616
- success_count = telemetry_data.get("success_count", 1) # Avoid division by zero
617
- database_load = telemetry_data.get("database_load_percent", 0)
618
- retry_cascade_depth = telemetry_data.get("retry_cascade_depth", 0)
619
-
620
- # Calculate amplification factor
621
- amplification_factor = 1.0
622
- if success_count > 0:
623
- amplification_factor = retry_count / success_count
624
-
625
- # Detect signatures
626
- detected = (
627
- retry_storm or
628
- amplification_factor > 2.0 or
629
- retry_cascade_depth > 2 or
630
- database_load > 85
631
- )
632
-
633
- signature = None
634
- if detected:
635
- if retry_storm and amplification_factor > 3.0:
636
- signature = "exponential_retry_cascade"
637
- elif database_load > 85 and amplification_factor > 1.5:
638
- signature = "database_amplified_retry"
639
- else:
640
- signature = "retry_amplification_detected"
641
-
642
- # Calculate confidence based on evidence strength
643
- confidence_factors = []
644
- if retry_storm:
645
- confidence_factors.append(0.3)
646
- if amplification_factor > 2.0:
647
- confidence_factors.append(0.25 * min(amplification_factor / 5.0, 1.0))
648
- if retry_cascade_depth > 2:
649
- confidence_factors.append(0.2 * min(retry_cascade_depth / 5.0, 1.0))
650
- if database_load > 85:
651
- confidence_factors.append(0.25 * min(database_load / 100.0, 1.0))
652
-
653
- confidence = min(0.98, 0.1 + sum(confidence_factors)) if confidence_factors else 0.0
654
-
655
- return {
656
- "detected": detected,
657
- "amplification_factor": round(amplification_factor, 2),
658
- "signature": signature,
659
- "confidence": round(confidence, 3),
660
- "metrics": {
661
- "retry_storm": retry_storm,
662
- "retry_count": retry_count,
663
- "success_count": success_count,
664
- "database_load_percent": database_load,
665
- "retry_cascade_depth": retry_cascade_depth
666
- },
667
- "recommendation": "implement_dampening_first" if detected else "proceed_with_caution"
668
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
demo/orchestrator.py DELETED
@@ -1,98 +0,0 @@
1
- # demo/orchestrator.py - COMPLETE FIXED VERSION
2
- from __future__ import annotations
3
-
4
- import logging
5
- import asyncio
6
- from typing import Any, Dict, Optional, List
7
- import time
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
- # Import mock ARF functions
12
- try:
13
- from demo.mock_arf import (
14
- simulate_arf_analysis,
15
- run_rag_similarity_search,
16
- create_mock_healing_intent,
17
- calculate_pattern_confidence
18
- )
19
- MOCK_ARF_AVAILABLE = True
20
- logger.info("Mock ARF functions available")
21
- except ImportError as e:
22
- logger.warning(f"Mock ARF functions not available: {e}")
23
- MOCK_ARF_AVAILABLE = False
24
-
25
-
26
- class DemoOrchestrator:
27
- """
28
- Orchestrates demo scenarios with proper agent workflow.
29
- """
30
-
31
- def __init__(self, enable_streamlit: bool = False):
32
- self.enable_streamlit = enable_streamlit
33
- logger.info("DemoOrchestrator initialized")
34
-
35
- async def analyze_incident(self, scenario_name: str, scenario_data: Dict[str, Any]) -> Dict[str, Any]:
36
- """
37
- Analyze an incident using the ARF agent workflow.
38
- This is the method called by app.py
39
- """
40
- logger.info(f"Analyzing incident: {scenario_name}")
41
-
42
- if not MOCK_ARF_AVAILABLE:
43
- logger.error("Mock ARF functions not available")
44
- return {
45
- "status": "error",
46
- "message": "Mock ARF functions not available",
47
- "scenario": scenario_name
48
- }
49
-
50
- try:
51
- # Step 1: Detection Agent
52
- logger.debug("Running detection agent...")
53
- detection_result = simulate_arf_analysis(scenario_data)
54
-
55
- # Step 2: Recall Agent
56
- logger.debug("Running recall agent...")
57
- similar_incidents = run_rag_similarity_search(scenario_data)
58
-
59
- # Step 3: Decision Agent
60
- logger.debug("Running decision agent...")
61
- confidence = calculate_pattern_confidence(scenario_data, similar_incidents)
62
- healing_intent = create_mock_healing_intent(scenario_data, similar_incidents, confidence)
63
-
64
- # Simulate processing time
65
- await asyncio.sleep(0.5)
66
-
67
- result = {
68
- "status": "success",
69
- "scenario": scenario_name,
70
- "detection": detection_result,
71
- "recall": similar_incidents,
72
- "decision": healing_intent,
73
- "confidence": confidence,
74
- "processing_time_ms": 450
75
- }
76
-
77
- logger.info(f"Analysis complete for {scenario_name}")
78
- return result
79
-
80
- except Exception as e:
81
- logger.error(f"Error analyzing incident: {e}", exc_info=True)
82
- return {
83
- "status": "error",
84
- "message": str(e),
85
- "scenario": scenario_name
86
- }
87
-
88
- def run_scenario(self, scenario: Dict[str, Any]) -> Dict[str, Any]:
89
- """
90
- Run a demo scenario (legacy method).
91
- """
92
- logger.info("Running scenario: %s", scenario.get("name", "unknown"))
93
-
94
- return {
95
- "scenario": scenario.get("name"),
96
- "status": "completed",
97
- "output": scenario,
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
demo/scenarios.py DELETED
@@ -1,334 +0,0 @@
1
- """
2
- Incident scenarios for the demo - EXPANDED VERSION WITH REALISM UPGRADES
3
- Version: 3.3.9+realism
4
- """
5
-
6
- INCIDENT_SCENARIOS = {
7
- "Cache Miss Storm": {
8
- "description": "Redis cluster experiencing 80% cache miss rate causing database overload",
9
- "severity": "CRITICAL",
10
- "component": "redis_cache",
11
- "metrics": {
12
- "cache_hit_rate": 18.5,
13
- "database_load": 92,
14
- "response_time_ms": 1850,
15
- "affected_users": 45000,
16
- "eviction_rate_per_sec": 125
17
- },
18
- "business_impact": {
19
- "revenue_loss_per_hour": 8500,
20
- "sla_violation": True,
21
- "customer_sat_change": -40,
22
- "affected_services": ["API Gateway", "User Service", "Payment"]
23
- },
24
- "roi_data": {
25
- "hourly_revenue_loss": 8500,
26
- "manual_recovery_hours": 1.0,
27
- "enterprise_recovery_hours": 0.2,
28
- "engineers_required": 4,
29
- "engineer_hourly_rate": 150,
30
- "estimated_monthly_occurrences": 2,
31
- "enterprise_savings_percentage": 0.85
32
- },
33
- # ============ REALISM UPGRADES ============
34
- "realism": {
35
- "ranked_actions": [
36
- {
37
- "rank": 1,
38
- "confidence": 87,
39
- "action": "Scale Redis cluster from 3 to 5 nodes",
40
- "rationale": "Immediate throughput increase, reduces contention",
41
- "risk": "Cold cache amplification: Medium",
42
- "tradeoff": "Adds $420/month infrastructure cost",
43
- "execution_time": "8-12 minutes",
44
- "success_rate": "94% based on 18 similar incidents"
45
- },
46
- {
47
- "rank": 2,
48
- "confidence": 62,
49
- "action": "Implement request coalescing with 500ms window",
50
- "rationale": "Reduces duplicate DB queries, lower blast radius",
51
- "risk": "Adds 150-200ms latency per request",
52
- "tradeoff": "Slower stabilization (15-20 minutes)",
53
- "rejection_note": "Secondary option if scaling unavailable"
54
- },
55
- {
56
- "rank": 3,
57
- "confidence": 34,
58
- "action": "Restart Redis cluster with warmup script",
59
- "rationale": "Clears fragmentation, resets eviction policies",
60
- "risk": "HIGH: 45-second service interruption",
61
- "rejection_reason": "Rejected: High data loss risk during peak traffic",
62
- "safety_override": "Required for Enterprise execution"
63
- }
64
- ],
65
- "risk_assessment": {
66
- "stampede_probability": "18%",
67
- "cold_cache_impact": "Medium",
68
- "data_inconsistency_risk": "Low",
69
- "recovery_complexity": "Medium"
70
- },
71
- "constraints": {
72
- "max_redis_nodes": 8,
73
- "scaling_cooldown": "30 minutes",
74
- "concurrent_connections": "25,000",
75
- "data_size_gb": 42
76
- },
77
- "confidence_degradation": {
78
- "initial": 94,
79
- "after_8_min": 71,
80
- "after_15_min": 52,
81
- "escalation_threshold": 60
82
- }
83
- }
84
- },
85
-
86
- "Database Connection Pool Exhaustion": {
87
- "description": "PostgreSQL connection pool exhausted causing API timeouts",
88
- "severity": "HIGH",
89
- "component": "postgresql_database",
90
- "metrics": {
91
- "active_connections": 98,
92
- "max_connections": 100,
93
- "api_latency_ms": 2450,
94
- "error_rate": 15.2,
95
- "queue_depth": 1250,
96
- "connection_wait_seconds": 45
97
- },
98
- "business_impact": {
99
- "revenue_loss_per_hour": 4200,
100
- "affected_services": ["API Gateway", "User Service", "Payment Service"],
101
- "sla_violation": True,
102
- "partner_api_impact": 3
103
- },
104
- "roi_data": {
105
- "hourly_revenue_loss": 4200,
106
- "manual_recovery_hours": 0.75,
107
- "enterprise_recovery_hours": 0.13,
108
- "engineers_required": 2,
109
- "engineer_hourly_rate": 150,
110
- "estimated_monthly_occurrences": 3,
111
- "enterprise_savings_percentage": 0.82
112
- },
113
- # ============ REALISM UPGRADES ============
114
- "realism": {
115
- "ranked_actions": [
116
- {
117
- "rank": 1,
118
- "confidence": 82,
119
- "action": "Increase max_connections from 100 to 115 (+15%)",
120
- "rationale": "Immediate relief, within safe operating limits",
121
- "risk": "Disk I/O contention: Medium",
122
- "constraint": "DB max_connections: 82% utilized (pre)",
123
- "monitoring": "Monitor connection churn for 30 minutes"
124
- },
125
- {
126
- "rank": 2,
127
- "confidence": 58,
128
- "action": "Enable statement timeout (5s) + connection recycling",
129
- "rationale": "Prevents runaway queries, faster pool turnover",
130
- "risk": "Query cancellation may cause application errors",
131
- "tradeoff": "Adds development/testing overhead"
132
- },
133
- {
134
- "rank": 3,
135
- "confidence": 29,
136
- "action": "Switch to pgbouncer in transaction pooling mode",
137
- "rationale": "10x connection multiplexing possible",
138
- "risk": "HIGH: Requires application changes, 2-hour migration",
139
- "rejection_reason": "Rejected: Too invasive for incident response"
140
- }
141
- ],
142
- "constraint_awareness": {
143
- "disk_io_headroom": "Low",
144
- "memory_available_gb": 8.2,
145
- "pool_increase_cap": "+15%",
146
- "monitoring_gap": "Connection churn not tracked"
147
- }
148
- }
149
- },
150
-
151
- "Kubernetes Memory Leak": {
152
- "description": "Java microservice memory leak causing pod restarts",
153
- "severity": "HIGH",
154
- "component": "java_payment_service",
155
- "metrics": {
156
- "memory_usage": 96,
157
- "gc_pause_time_ms": 4500,
158
- "error_rate": 28.5,
159
- "restart_frequency_per_hour": 12,
160
- "heap_fragmentation": 42
161
- },
162
- "business_impact": {
163
- "revenue_loss_per_hour": 5500,
164
- "session_loss": 8500,
165
- "payment_failures_percentage": 3.2,
166
- "support_tickets_increase": 300
167
- },
168
- "roi_data": {
169
- "hourly_revenue_loss": 5500,
170
- "manual_recovery_hours": 1.5,
171
- "enterprise_recovery_hours": 0.25,
172
- "engineers_required": 3,
173
- "engineer_hourly_rate": 150,
174
- "estimated_monthly_occurrences": 1,
175
- "enterprise_savings_percentage": 0.79
176
- },
177
- # ============ REALISM UPGRADES ============
178
- "realism": {
179
- "ranked_actions": [
180
- {
181
- "rank": 1,
182
- "confidence": 76,
183
- "action": "Canary restart (1/4 pods) with heap dump analysis",
184
- "rationale": "Minimizes blast radius, enables root cause capture",
185
- "risk": "Cold-start latency: +2.3s per pod",
186
- "blast_radius_economics": {
187
- "canary_restart_cost": "$850",
188
- "full_restart_cost": "$3,400",
189
- "payment_retry_risk": "Medium",
190
- "safer_order": "Canary → scale → rollout"
191
- }
192
- },
193
- {
194
- "rank": 2,
195
- "confidence": 63,
196
- "action": "Increase heap from 2GB to 3GB with monitoring",
197
- "rationale": "Buy time for analysis, reduces restart frequency",
198
- "risk": "Delays root cause identification",
199
- "tradeoff": "Temporary fix, adds memory cost"
200
- }
201
- ]
202
- }
203
- },
204
-
205
- "Network Partition": {
206
- "description": "Network partition causing split-brain in distributed database",
207
- "severity": "CRITICAL",
208
- "component": "distributed_database",
209
- "metrics": {
210
- "partition_detected": True,
211
- "write_conflicts": 1250,
212
- "data_inconsistency_percentage": 8.5,
213
- "replication_lag_seconds": 45,
214
- "quorum_lost": True
215
- },
216
- "business_impact": {
217
- "revenue_loss_per_hour": 12000,
218
- "data_corruption_risk": True,
219
- "recovery_complexity": "HIGH",
220
- "compliance_violation": True
221
- },
222
- "roi_data": {
223
- "hourly_revenue_loss": 12000,
224
- "manual_recovery_hours": 2.0,
225
- "enterprise_recovery_hours": 0.3,
226
- "engineers_required": 5,
227
- "engineer_hourly_rate": 150,
228
- "estimated_monthly_occurrences": 0.5,
229
- "enterprise_savings_percentage": 0.88
230
- },
231
- # ============ REALISM UPGRADES ============
232
- "realism": {
233
- "competing_hypotheses": [
234
- {
235
- "cause": "Network partition (control plane)",
236
- "confidence": 61,
237
- "evidence": "Quorum lost, replication lag > 30s",
238
- "investigation_path": "Check network mesh, BGP status"
239
- },
240
- {
241
- "cause": "Control plane overload",
242
- "confidence": 24,
243
- "evidence": "High CPU on orchestration nodes",
244
- "investigation_path": "Scale control plane, check etcd health"
245
- },
246
- {
247
- "cause": "Downstream timeout amplification",
248
- "confidence": 15,
249
- "evidence": "Cascading failures in 3 dependent services",
250
- "investigation_path": "Implement circuit breakers"
251
- }
252
- ]
253
- }
254
- },
255
-
256
- "API Rate Limit Storm": {
257
- "description": "Third-party API rate limiting causing cascading failures",
258
- "severity": "MEDIUM",
259
- "component": "external_api_gateway",
260
- "metrics": {
261
- "rate_limit_hits_percentage": 95,
262
- "error_rate": 42.8,
263
- "retry_storm": True,
264
- "cascade_effect_services": 3,
265
- "queue_backlog": 8500
266
- },
267
- "business_impact": {
268
- "revenue_loss_per_hour": 3800,
269
- "partner_sla_breach": True,
270
- "data_sync_delay_hours": 4,
271
- "customer_reports_delay_hours": 6
272
- },
273
- "roi_data": {
274
- "hourly_revenue_loss": 3800,
275
- "manual_recovery_hours": 1.25,
276
- "enterprise_recovery_hours": 0.17,
277
- "engineers_required": 3,
278
- "engineer_hourly_rate": 150,
279
- "estimated_monthly_occurrences": 4,
280
- "enterprise_savings_percentage": 0.85
281
- },
282
- # ============ REALISM UPGRADES ============
283
- "realism": {
284
- "contract_aware_reasoning": {
285
- "burst_limit": "1.2× allowed",
286
- "penalty_window": "15 minutes",
287
- "degradation_mode": "Non-premium users only",
288
- "contractual_limits": {
289
- "requests_per_second": 100,
290
- "monthly_overage_fee": "$0.15/request",
291
- "suspension_threshold": "3 violations/month"
292
- }
293
- }
294
- }
295
- },
296
-
297
- "Storage I/O Saturation": {
298
- "description": "Storage system I/O saturation causing application timeouts",
299
- "severity": "HIGH",
300
- "component": "storage_cluster",
301
- "metrics": {
302
- "io_utilization": 98,
303
- "latency_ms": 450,
304
- "throughput_mbps": 1250,
305
- "queue_depth": 850,
306
- "error_rate": 8.5
307
- },
308
- "business_impact": {
309
- "revenue_loss_per_hour": 6800,
310
- "data_processing_delay_hours": 3,
311
- "analytics_backlog": True,
312
- "reporting_failure": True
313
- },
314
- "roi_data": {
315
- "hourly_revenue_loss": 6800,
316
- "manual_recovery_hours": 1.75,
317
- "enterprise_recovery_hours": 0.22,
318
- "engineers_required": 3,
319
- "engineer_hourly_rate": 150,
320
- "estimated_monthly_occurrences": 1.5,
321
- "enterprise_savings_percentage": 0.83
322
- },
323
- # ============ REALISM UPGRADES ============
324
- "realism": {
325
- "irreversibility_warnings": {
326
- "rebalance_duration": "18-25 minutes",
327
- "write_amplification_risk": "High",
328
- "requires_explicit_approval": True,
329
- "approval_level": "Director+",
330
- "rollback_complexity": "High (requires snapshot restore)"
331
- }
332
- }
333
- }
334
- }