petter2025 committed on
Commit
03cfe7a
·
verified ·
1 Parent(s): 0d80e53

Delete demo

Browse files
Files changed (5) hide show
  1. demo/__init__.py +0 -9
  2. demo/guidance.py +0 -549
  3. demo/mock_arf.py +0 -668
  4. demo/orchestrator.py +0 -98
  5. demo/scenarios.py +0 -334
demo/__init__.py DELETED
@@ -1,9 +0,0 @@
1
- """
2
- ARF Demo Package
3
- """
4
-
5
- from .orchestrator import DemoOrchestrator
6
- from .scenarios import INCIDENT_SCENARIOS
7
-
8
- __all__ = ['DemoOrchestrator', 'INCIDENT_SCENARIOS']
9
- __version__ = '3.8.0'
 
 
 
 
 
 
 
 
 
 
demo/guidance.py DELETED
@@ -1,549 +0,0 @@
1
- """
2
- Enhanced Demo Guidance System - Manages the psychology and flow of the ARF demo
3
- Adds clear narrative phases and boundary awareness
4
- """
5
-
6
- from enum import Enum
7
- from typing import Dict, List, Any, Optional
8
- from dataclasses import dataclass
9
- import time
10
-
11
- class DemoPhase(Enum):
12
- """Phases of the demo narrative with clear boundaries"""
13
- INTRODUCTION = "introduction"
14
- FAILURE_INJECTION = "failure_injection"
15
- REAL_OSS_ANALYSIS = "real_oss_analysis"
16
- DECISION_BOUNDARY = "decision_boundary"
17
- SIMULATED_ENTERPRISE = "simulated_enterprise"
18
- RESOLUTION = "resolution"
19
- ARCHITECTURE_REVIEW = "architecture_review"
20
-
21
- @dataclass
22
- class PhaseContent:
23
- """Enhanced content for each demo phase with boundary indicators"""
24
- phase: DemoPhase
25
- title: str
26
- narrative: str
27
- key_message: str
28
- visual_cue: str
29
- duration_seconds: int
30
- show_boundary: bool = False
31
- boundary_text: Optional[str] = None
32
- is_real_arf: bool = False
33
-
34
- def get_html(self, show_progress: bool = True, current_step: int = 1, total_steps: int = 7) -> str:
35
- """Get HTML for this phase with progress indicator"""
36
- # Progress indicator
37
- progress_html = ""
38
- if show_progress:
39
- progress_percentage = int((current_step / total_steps) * 100)
40
- progress_html = f"""
41
- <div style="margin-bottom: 20px;">
42
- <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
43
- <div style="font-size: 13px; color: #64748b; font-weight: 500;">
44
- Demo Progress: Step {current_step} of {total_steps}
45
- </div>
46
- <div style="font-size: 13px; color: #3b82f6; font-weight: 600;">
47
- {progress_percentage}%
48
- </div>
49
- </div>
50
- <div style="height: 6px; background: #e2e8f0; border-radius: 3px; overflow: hidden;">
51
- <div style="width: {progress_percentage}%; height: 100%;
52
- background: linear-gradient(90deg, #3b82f6, #8b5cf6);
53
- border-radius: 3px; transition: width 0.3s ease;">
54
- </div>
55
- </div>
56
- </div>
57
- """
58
-
59
- # Real ARF indicator
60
- real_arf_html = ""
61
- if self.is_real_arf:
62
- real_arf_html = f"""
63
- <div style="margin: 15px 0; padding: 10px; background: #f0fdf4;
64
- border-radius: 8px; border: 2px solid #10b981;">
65
- <div style="display: flex; align-items: center; gap: 8px;">
66
- <div style="font-size: 20px;">✅</div>
67
- <div style="font-weight: 600; color: #065f46;">REAL ARF OSS v3.3.7</div>
68
- </div>
69
- <div style="font-size: 13px; color: #047857; margin-top: 5px;">
70
- Running actual agentic-reliability-framework==3.3.7 package
71
- </div>
72
- </div>
73
- """
74
-
75
- # Boundary indicator
76
- boundary_html = ""
77
- if self.show_boundary and self.boundary_text:
78
- boundary_html = f"""
79
- <div style="margin: 15px 0; padding: 12px; background: #fef3c7;
80
- border-radius: 10px; border-left: 4px solid #f59e0b;">
81
- <div style="display: flex; align-items: center; gap: 8px; margin-bottom: 5px;">
82
- <div style="font-size: 20px;">🎭</div>
83
- <div style="font-weight: 600; color: #92400e;">Demo Boundary</div>
84
- </div>
85
- <div style="font-size: 13px; color: #b45309; line-height: 1.5;">
86
- {self.boundary_text}
87
- </div>
88
- </div>
89
- """
90
-
91
- return f"""
92
- <div style="border: 2px solid #3b82f6; border-radius: 16px; padding: 25px;
93
- background: linear-gradient(135deg, #f8fafc 0%, #ffffff 100%);
94
- box-shadow: 0 8px 32px rgba(59, 130, 246, 0.1); margin: 20px 0;">
95
- <div style="display: flex; align-items: center; gap: 15px; margin-bottom: 20px;">
96
- <div style="font-size: 36px;">{self.visual_cue}</div>
97
- <div>
98
- <h3 style="margin: 0 0 5px 0; color: #1e293b; font-size: 20px; font-weight: 700;">
99
- {self.title}
100
- </h3>
101
- <div style="font-size: 14px; color: #64748b;">
102
- Phase: {self.phase.value.replace('_', ' ').title()}
103
- </div>
104
- </div>
105
- </div>
106
-
107
- {progress_html}
108
-
109
- <div style="margin-bottom: 20px;">
110
- <div style="font-size: 16px; color: #475569; line-height: 1.6; margin-bottom: 15px;">
111
- {self.narrative}
112
- </div>
113
-
114
- {real_arf_html}
115
- {boundary_html}
116
-
117
- <div style="padding: 15px; background: #f1f5f9; border-radius: 10px;
118
- border-left: 4px solid #3b82f6;">
119
- <div style="font-weight: 600; color: #1e293b; margin-bottom: 5px;">
120
- 🎯 Key Message
121
- </div>
122
- <div style="font-size: 15px; color: #475569; line-height: 1.5;">
123
- {self.key_message}
124
- </div>
125
- </div>
126
- </div>
127
-
128
- <div style="display: flex; justify-content: space-between; align-items: center;
129
- margin-top: 20px; padding-top: 15px; border-top: 1px solid #e2e8f0;">
130
- <div style="font-size: 12px; color: #94a3b8;">
131
- ⏱️ Duration: {self.duration_seconds}s •
132
- 🎯 Focus: {self.phase.value.replace('_', ' ').title()}
133
- </div>
134
- <div style="display: flex; gap: 10px;">
135
- <div style="padding: 4px 10px; background: #e2e8f0;
136
- color: #64748b; border-radius: 12px; font-size: 11px; font-weight: 500;">
137
- Phase {current_step}
138
- </div>
139
- </div>
140
- </div>
141
- </div>
142
- """
143
-
144
- # Complete demo flow with psychological pacing
145
- DEMO_FLOW = {
146
- DemoPhase.INTRODUCTION: PhaseContent(
147
- phase=DemoPhase.INTRODUCTION,
148
- title="🚀 Welcome to ARF v3.3.7 - The Architecture Demo",
149
- narrative="""
150
- Most AI systems fail silently in production. This one doesn't. We're about to demonstrate
151
- a production-grade agentic reliability system with <strong>clear architectural boundaries</strong>.
152
-
153
- This demo shows:
154
- 1. <strong>Real ARF OSS v3.3.7</strong> - Actual advisory intelligence
155
- 2. <strong>Simulated Enterprise</strong> - Value proposition without infrastructure access
156
- 3. <strong>Clear separation</strong> - Honest boundaries between OSS and Enterprise
157
- """,
158
- key_message="This isn't AI theater. It's a production-ready system with architectural honesty.",
159
- visual_cue="🎭",
160
- duration_seconds=30,
161
- show_boundary=True,
162
- boundary_text="We're simulating Enterprise execution for the demo. Real execution requires production infrastructure.",
163
- is_real_arf=False
164
- ),
165
-
166
- DemoPhase.FAILURE_INJECTION: PhaseContent(
167
- phase=DemoPhase.FAILURE_INJECTION,
168
- title="🚨 Phase 1: Inject Production Failure",
169
- narrative="""
170
- We're simulating a <strong>Cache Miss Storm</strong> affecting 45,000 users with $8,500/hour revenue risk.
171
-
172
- This is how most systems look right before they fail silently. The metrics show:
173
- • Cache hit rate dropped from 85% to 18%
174
- • Database load increased to 92%
175
- • Response time spiked to 1,850ms
176
-
177
- Notice: No remediation is running yet. We're letting you feel the tension.
178
- """,
179
- key_message="Failure happens. The question is how quickly and intelligently you respond.",
180
- visual_cue="📉",
181
- duration_seconds=20,
182
- show_boundary=False,
183
- is_real_arf=False
184
- ),
185
-
186
- DemoPhase.REAL_OSS_ANALYSIS: PhaseContent(
187
- phase=DemoPhase.REAL_OSS_ANALYSIS,
188
- title="🧠 Phase 2: Real ARF OSS Intelligence Activates",
189
- narrative="""
190
- ARF OSS v3.3.7 is now <strong>analyzing the incident in real-time</strong>. This is not a mock:
191
-
192
- 1. <strong>Detection Agent</strong> - Finds anomalies with 98.7% confidence
193
- 2. <strong>Recall Agent</strong> - Searches RAG memory for similar incidents
194
- 3. <strong>Decision Agent</strong> - Generates healing intent with reasoning
195
-
196
- Watch the confidence scores increase as evidence accumulates. This is <strong>real inference</strong>,
197
- not pre-programmed responses. The system is reasoning, not reacting.
198
- """,
199
- key_message="ARF OSS provides production-grade intelligence. It reasons before it recommends.",
200
- visual_cue="🤖",
201
- duration_seconds=45,
202
- show_boundary=True,
203
- boundary_text="This is REAL ARF OSS v3.3.7 (Apache 2.0). It can analyze but not execute.",
204
- is_real_arf=True
205
- ),
206
-
207
- DemoPhase.DECISION_BOUNDARY: PhaseContent(
208
- phase=DemoPhase.DECISION_BOUNDARY,
209
- title="🎯 Phase 3: The Execution Boundary",
210
- narrative="""
211
- ARF OSS has created a <strong>HealingIntent with 94% confidence</strong>:
212
- • Action: Scale Redis cluster from 3 to 5 nodes
213
- • Pattern match: 87% success rate from similar incidents
214
- • Safety check: ✅ Passed (blast radius: 2 services)
215
-
216
- Now we pause intentionally. This is the <strong>architectural boundary</strong>:
217
- • <strong>OSS can reason</strong> (Apache 2.0, advisory only)
218
- • <strong>Enterprise can execute</strong> (Commercial, with safety guarantees)
219
-
220
- The system knows what to do, but requires authority to act.
221
- """,
222
- key_message="Reasoning and authority are not the same thing. This boundary is intentional.",
223
- visual_cue="⚖️",
224
- duration_seconds=25,
225
- show_boundary=True,
226
- boundary_text="OSS boundary reached. Execution requires Enterprise edition and infrastructure authority.",
227
- is_real_arf=True
228
- ),
229
-
230
- DemoPhase.SIMULATED_ENTERPRISE: PhaseContent(
231
- phase=DemoPhase.SIMULATED_ENTERPRISE,
232
- title="🏢 Phase 4: Simulated Enterprise Execution",
233
- narrative="""
234
- We're now simulating what <strong>ARF Enterprise</strong> would do:
235
-
236
- 1. <strong>Validate safety constraints</strong> - Business hours, blast radius, rollback plans
237
- 2. <strong>Apply novel execution protocols</strong> - Deterministic confidence, not just ML probabilities
238
- 3. <strong>Execute with guarantees</strong> - Rollback prepared, circuit breakers set
239
-
240
- In production, this would execute against real infrastructure (Kubernetes, cloud APIs, etc.).
241
- For the demo, we're showing the value proposition without real side effects.
242
- """,
243
- key_message="Enterprise adds execution authority, not just better intelligence.",
244
- visual_cue="⚡",
245
- duration_seconds=35,
246
- show_boundary=True,
247
- boundary_text="SIMULATED EXECUTION - Real Enterprise would execute against production infrastructure.",
248
- is_real_arf=False
249
- ),
250
-
251
- DemoPhase.RESOLUTION: PhaseContent(
252
- phase=DemoPhase.RESOLUTION,
253
- title="✅ Phase 5: Incident Resolution",
254
- narrative="""
255
- The simulated execution completes:
256
- • <strong>Recovery time:</strong> 12 minutes (vs 45 minutes manual)
257
- • <strong>Cost saved:</strong> $6,375
258
- • <strong>Users protected:</strong> 45,000 → 0 impacted
259
- • <strong>Learning:</strong> Pattern added to RAG memory
260
-
261
- System health normalizes. Confidence scores stabilize. The incident is marked as
262
- <strong>resolved autonomously</strong>.
263
-
264
- Key metrics show the impact:
265
- • Detection time: 45s (89% faster than average)
266
- • Auto-heal rate: 81.7% (5.4× industry average)
267
- """,
268
- key_message="Autonomous reliability creates measurable business impact.",
269
- visual_cue="📊",
270
- duration_seconds=30,
271
- show_boundary=False,
272
- is_real_arf=False
273
- ),
274
-
275
- DemoPhase.ARCHITECTURE_REVIEW: PhaseContent(
276
- phase=DemoPhase.ARCHITECTURE_REVIEW,
277
- title="🏗️ Phase 6: Architecture Validated",
278
- narrative="""
279
- Let's review what we demonstrated:
280
-
281
- <strong>✅ Real Components (Production-Ready):</strong>
282
- • ARF OSS v3.3.7 intelligence engine
283
- • Three-agent pattern (Detection, Recall, Decision)
284
- • RAG-based similarity search
285
- • Confidence scoring and reasoning chains
286
-
287
- <strong>🎭 Simulated Components (Demo Value):</strong>
288
- • Enterprise execution authority
289
- • Infrastructure orchestration
290
- • Rollback guarantees
291
- • Novel execution protocols
292
-
293
- <strong>🎯 Clear Boundaries (Architectural Honesty):</strong>
294
- • OSS advises, Enterprise executes
295
- • No hidden automation or deception
296
- • Production-ready separation
297
- """,
298
- key_message="This demo shows production architecture, not just AI capabilities.",
299
- visual_cue="💎",
300
- duration_seconds=40,
301
- show_boundary=True,
302
- boundary_text="Architecture validated: OSS for intelligence, Enterprise for execution.",
303
- is_real_arf=False
304
- )
305
- }
306
-
307
- # Original user journey steps (enhanced with phase alignment)
308
- USER_JOURNEY_STEPS = [
309
- {
310
- "step": 1,
311
- "title": "🎭 Understand the Architecture",
312
- "description": "Review the demo flow to understand clear boundaries between OSS and Enterprise",
313
- "tab": "All Tabs",
314
- "action": "Read the phase guidance",
315
- "learning": "See how ARF separates intelligence (OSS) from execution (Enterprise)",
316
- "phase": DemoPhase.INTRODUCTION.value,
317
- "duration": "30s"
318
- },
319
- {
320
- "step": 2,
321
- "title": "🔥 Experience REAL ARF OSS Analysis",
322
- "description": "Select an incident and run OSS analysis to see actual ARF v3.3.7 intelligence",
323
- "tab": "Live Incident Demo",
324
- "action": "Click 'Run OSS Analysis'",
325
- "learning": "See real ARF OSS package analyzing incidents with confidence scores",
326
- "phase": DemoPhase.REAL_OSS_ANALYSIS.value,
327
- "duration": "45s"
328
- },
329
- {
330
- "step": 3,
331
- "title": "🎯 Observe the Execution Boundary",
332
- "description": "Notice where OSS stops and Enterprise would begin",
333
- "tab": "Live Incident Demo",
334
- "action": "Review HealingIntent and boundary indicators",
335
- "learning": "Understand the architectural separation between advisory and execution",
336
- "phase": DemoPhase.DECISION_BOUNDARY.value,
337
- "duration": "25s"
338
- },
339
- {
340
- "step": 4,
341
- "title": "⚡ Simulate Enterprise Healing",
342
- "description": "Experience autonomous healing with simulated execution",
343
- "tab": "Live Incident Demo",
344
- "action": "Click 'Execute Enterprise Healing'",
345
- "learning": "See the Enterprise value proposition without real infrastructure",
346
- "phase": DemoPhase.SIMULATED_ENTERPRISE.value,
347
- "duration": "35s"
348
- },
349
- {
350
- "step": 5,
351
- "title": "💰 Calculate Your Business ROI",
352
- "description": "Adjust the sliders to see potential savings for your organization",
353
- "tab": "Business Impact & ROI",
354
- "action": "Use sliders then click 'Calculate My ROI'",
355
- "learning": "Understand the business case with your specific numbers",
356
- "phase": "business_roi",
357
- "duration": "60s"
358
- },
359
- {
360
- "step": 6,
361
- "title": "📜 Explore Enterprise-Grade Compliance",
362
- "description": "View comprehensive audit trail and compliance features",
363
- "tab": "Audit Trail & History",
364
- "action": "Check execution and incident history",
365
- "learning": "See enterprise-level logging, compliance, and audit capabilities",
366
- "phase": "compliance",
367
- "duration": "45s"
368
- },
369
- {
370
- "step": 7,
371
- "title": "🧠 Discover the Learning Engine",
372
- "description": "Explore pattern detection and similarity search",
373
- "tab": "Learning Engine",
374
- "action": "Search for similar incidents and view patterns",
375
- "learning": "See how ARF learns from past incidents to improve future responses",
376
- "phase": "learning",
377
- "duration": "50s"
378
- }
379
- ]
380
-
381
- # Enhanced demo tips with boundary awareness
382
- DEMO_TIPS = [
383
- "💎 **Architecture Tip**: Look for the 'REAL ARF' vs 'SIMULATED' indicators to understand boundaries",
384
- "🎭 **Demo Tip**: The 'Run Complete Demo' button follows our psychological pacing guide",
385
- "⚡ **Enterprise Tip**: Toggle approval mode to see different execution workflows",
386
- "📊 **ROI Tip**: Use realistic numbers for your organization in the ROI calculator",
387
- "🔍 **Analysis Tip**: Try different incident scenarios to see varied ARF responses",
388
- "📜 **Compliance Tip**: Export the audit trail to see comprehensive JSON structure",
389
- "🧠 **Learning Tip**: Search for patterns to see how ARF improves over time",
390
- "🎯 **Boundary Tip**: Notice where OSS analysis ends and Enterprise execution would begin"
391
- ]
392
-
393
- # Psychology-driven quick start guide
394
- QUICK_START_GUIDE = {
395
- "for_executives": {
396
- "focus": "Business Impact & ROI",
397
- "steps": [
398
- "1. Go to 'Business Impact & ROI' tab",
399
- "2. Adjust sliders to match your organization",
400
- "3. Click 'Calculate My ROI'",
401
- "4. Review the 5.2× ROI multiplier",
402
- "5. Ask: 'What would 73% faster MTTR mean for us?'"
403
- ],
404
- "time": "2 minutes",
405
- "key_question": "What's the cost of NOT having autonomous reliability?"
406
- },
407
- "for_engineers": {
408
- "focus": "Real ARF OSS Analysis",
409
- "steps": [
410
- "1. Select 'Cache Miss Storm' scenario",
411
- "2. Click 'Run OSS Analysis'",
412
- "3. Watch the three agents work in real-time",
413
- "4. Review the HealingIntent with 94% confidence",
414
- "5. Notice the reasoning chain and evidence"
415
- ],
416
- "time": "3 minutes",
417
- "key_question": "How would this intelligence change your on-call experience?"
418
- },
419
- "for_architects": {
420
- "focus": "Architecture Boundaries",
421
- "steps": [
422
- "1. Run the complete demo walkthrough",
423
- "2. Look for 'REAL ARF' vs 'SIMULATED' indicators",
424
- "3. Notice the execution boundary",
425
- "4. Review the architecture validation phase",
426
- "5. Ask: 'How would this integrate with our stack?'"
427
- ],
428
- "time": "4 minutes",
429
- "key_question": "Is our current approach proactive or reactive?"
430
- }
431
- }
432
-
433
- def get_phase_content(phase: DemoPhase) -> PhaseContent:
434
- """Get content for a specific demo phase"""
435
- return DEMO_FLOW.get(phase, DEMO_FLOW[DemoPhase.INTRODUCTION])
436
-
437
- def get_phase_html(phase: DemoPhase, current_step: int = 1) -> str:
438
- """Get HTML for a demo phase with progress indicator"""
439
- content = get_phase_content(phase)
440
- total_steps = len(DEMO_FLOW)
441
-
442
- # Calculate step number based on phase
443
- phase_order = list(DEMO_FLOW.keys())
444
- step_number = phase_order.index(phase) + 1 if phase in phase_order else current_step
445
-
446
- return content.get_html(
447
- show_progress=True,
448
- current_step=step_number,
449
- total_steps=total_steps
450
- )
451
-
452
- def get_demo_progress(current_phase: DemoPhase) -> Dict[str, Any]:
453
- """Get current demo progress information"""
454
- phase_order = list(DEMO_FLOW.keys())
455
- current_index = phase_order.index(current_phase) if current_phase in phase_order else 0
456
-
457
- return {
458
- "current_phase": current_phase.value,
459
- "current_step": current_index + 1,
460
- "total_steps": len(phase_order),
461
- "progress_percentage": int(((current_index + 1) / len(phase_order)) * 100),
462
- "next_phase": phase_order[current_index + 1].value if current_index + 1 < len(phase_order) else None,
463
- "estimated_time_remaining": sum(
464
- DEMO_FLOW[phase].duration_seconds
465
- for i, phase in enumerate(phase_order)
466
- if i > current_index
467
- )
468
- }
469
-
470
- def get_quick_start_guide(role: str = "executives") -> Dict[str, Any]:
471
- """Get quick start guide for specific role"""
472
- return QUICK_START_GUIDE.get(role, QUICK_START_GUIDE["for_executives"])
473
-
474
- # Psychology-focused demo controller
475
- class DemoPsychologyController:
476
- """Manages the psychological flow of the demo"""
477
-
478
- def __init__(self):
479
- self.current_phase = DemoPhase.INTRODUCTION
480
- self.phase_start_time = time.time()
481
- self.completed_phases = []
482
- self.user_attention_score = 100 # Start with full attention
483
-
484
- def transition_to_phase(self, phase: DemoPhase) -> Dict[str, Any]:
485
- """Transition to a new demo phase with psychological timing"""
486
- current_time = time.time()
487
- phase_duration = current_time - self.phase_start_time
488
-
489
- # Calculate attention score (decays over time, refreshes on phase change)
490
- self.user_attention_score = max(60, self.user_attention_score - (phase_duration / 10))
491
-
492
- # If phase was too short, user might have missed it
493
- if phase_duration < 10 and self.current_phase != DemoPhase.INTRODUCTION:
494
- self.user_attention_score -= 10
495
-
496
- # Update state
497
- self.completed_phases.append(self.current_phase)
498
- self.current_phase = phase
499
- self.phase_start_time = time.time()
500
-
501
- # Refresh attention on phase change
502
- self.user_attention_score = min(100, self.user_attention_score + 20)
503
-
504
- return {
505
- "new_phase": phase.value,
506
- "previous_phase_duration": int(phase_duration),
507
- "user_attention_score": int(self.user_attention_score),
508
- "recommended_pause": self._get_recommended_pause(phase),
509
- "key_message": DEMO_FLOW[phase].key_message
510
- }
511
-
512
- def _get_recommended_pause(self, phase: DemoPhase) -> str:
513
- """Get recommended pause based on phase psychology"""
514
- pauses = {
515
- DemoPhase.INTRODUCTION: "Pause to set context",
516
- DemoPhase.FAILURE_INJECTION: "Let the tension build",
517
- DemoPhase.REAL_OSS_ANALYSIS: "Watch the reasoning unfold",
518
- DemoPhase.DECISION_BOUNDARY: "Pause intentionally here",
519
- DemoPhase.SIMULATED_ENTERPRISE: "Explain the simulation",
520
- DemoPhase.RESOLUTION: "Show the impact",
521
- DemoPhase.ARCHITECTURE_REVIEW: "Summarize the architecture"
522
- }
523
- return pauses.get(phase, "Continue")
524
-
525
- def get_current_guidance(self) -> str:
526
- """Get current guidance HTML"""
527
- return get_phase_html(self.current_phase, len(self.completed_phases) + 1)
528
-
529
- def should_speed_up(self) -> bool:
530
- """Determine if we should speed up based on attention score"""
531
- return self.user_attention_score < 70
532
-
533
- def should_slow_down(self) -> bool:
534
- """Determine if we should slow down for emphasis"""
535
- important_phases = [
536
- DemoPhase.DECISION_BOUNDARY,
537
- DemoPhase.ARCHITECTURE_REVIEW
538
- ]
539
- return self.current_phase in important_phases
540
-
541
- # Global demo controller instance
542
- _demo_controller = None
543
-
544
- def get_demo_controller() -> DemoPsychologyController:
545
- """Get singleton demo controller instance"""
546
- global _demo_controller
547
- if _demo_controller is None:
548
- _demo_controller = DemoPsychologyController()
549
- return _demo_controller
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
demo/mock_arf.py DELETED
@@ -1,668 +0,0 @@
1
- # demo/mock_arf.py
2
- """
3
- Enhanced Mock ARF with scenario-aware metrics
4
- Generates different values based on scenario characteristics
5
- DOCTRINAL COMPLIANCE VERSION 3.3.9+restraint
6
- Key Addition: Explicit Observation Gate for psychological advantage
7
- """
8
- import random
9
- import time
10
- import datetime
11
- from typing import Dict, Any, List
12
- import json
13
-
14
- # Scenario-specific configurations
15
- SCENARIO_CONFIGS = {
16
- "Cache Miss Storm": {
17
- "detection_confidence_range": (0.97, 0.995), # 97-99.5%
18
- "detection_time_range": (35, 55), # 35-55 seconds
19
- "accuracy_range": (0.97, 0.995), # 97-99.5%
20
- "similar_incidents_range": (2, 5), # 2-5 similar incidents
21
- "similarity_score_range": (0.88, 0.96), # 88-96% similarity
22
- "pattern_confidence_range": (0.91, 0.97), # 91-97% confidence
23
- "success_rate_range": (0.82, 0.93), # 82-93% success rate
24
- "cost_savings_range": (5000, 9000), # $5K-$9K savings
25
- "resolution_time_range": (10, 18), # 10-18 minutes
26
- "affected_users_range": (30000, 60000), # 30K-60K users
27
- "tags": ["cache", "redis", "latency", "memory"]
28
- },
29
- "Database Connection Pool Exhaustion": {
30
- "detection_confidence_range": (0.92, 0.98),
31
- "detection_time_range": (40, 65),
32
- "accuracy_range": (0.95, 0.985),
33
- "similar_incidents_range": (1, 4),
34
- "similarity_score_range": (0.85, 0.94),
35
- "pattern_confidence_range": (0.88, 0.95),
36
- "success_rate_range": (0.78, 0.88),
37
- "cost_savings_range": (3500, 5500),
38
- "resolution_time_range": (15, 25),
39
- "affected_users_range": (15000, 30000),
40
- "tags": ["database", "postgres", "connections", "pool"]
41
- },
42
- "Kubernetes Memory Leak": {
43
- "detection_confidence_range": (0.94, 0.99),
44
- "detection_time_range": (30, 50),
45
- "accuracy_range": (0.96, 0.99),
46
- "similar_incidents_range": (3, 6),
47
- "similarity_score_range": (0.89, 0.95),
48
- "pattern_confidence_range": (0.90, 0.96),
49
- "success_rate_range": (0.85, 0.92),
50
- "cost_savings_range": (4500, 7500),
51
- "resolution_time_range": (12, 22),
52
- "affected_users_range": (20000, 40000),
53
- "tags": ["kubernetes", "memory", "container", "leak"]
54
- },
55
- "API Rate Limit Storm": {
56
- "detection_confidence_range": (0.96, 0.99),
57
- "detection_time_range": (25, 45),
58
- "accuracy_range": (0.97, 0.99),
59
- "similar_incidents_range": (2, 4),
60
- "similarity_score_range": (0.87, 0.93),
61
- "pattern_confidence_range": (0.89, 0.94),
62
- "success_rate_range": (0.80, 0.90),
63
- "cost_savings_range": (3000, 5000),
64
- "resolution_time_range": (8, 15),
65
- "affected_users_range": (10000, 25000),
66
- "tags": ["api", "rate_limit", "throttling", "ddos"]
67
- },
68
- "Network Partition": {
69
- "detection_confidence_range": (0.98, 0.999),
70
- "detection_time_range": (20, 40),
71
- "accuracy_range": (0.98, 0.995),
72
- "similar_incidents_range": (1, 3),
73
- "similarity_score_range": (0.90, 0.97),
74
- "pattern_confidence_range": (0.93, 0.98),
75
- "success_rate_range": (0.75, 0.85),
76
- "cost_savings_range": (8000, 15000),
77
- "resolution_time_range": (20, 35),
78
- "affected_users_range": (50000, 100000),
79
- "tags": ["network", "partition", "connectivity", "failure"]
80
- },
81
- "Storage I/O Saturation": {
82
- "detection_confidence_range": (0.93, 0.98),
83
- "detection_time_range": (45, 70),
84
- "accuracy_range": (0.94, 0.98),
85
- "similar_incidents_range": (2, 5),
86
- "similarity_score_range": (0.86, 0.92),
87
- "pattern_confidence_range": (0.87, 0.93),
88
- "success_rate_range": (0.79, 0.87),
89
- "cost_savings_range": (5500, 8500),
90
- "resolution_time_range": (18, 28),
91
- "affected_users_range": (25000, 45000),
92
- "tags": ["storage", "disk", "io", "saturation"]
93
- }
94
- }
95
-
96
- def get_scenario_config(scenario_name: str) -> Dict[str, Any]:
97
- """Get configuration for a specific scenario with defaults"""
98
- return SCENARIO_CONFIGS.get(scenario_name, {
99
- "detection_confidence_range": (0.90, 0.98),
100
- "detection_time_range": (30, 60),
101
- "accuracy_range": (0.92, 0.98),
102
- "similar_incidents_range": (1, 3),
103
- "similarity_score_range": (0.85, 0.95),
104
- "pattern_confidence_range": (0.85, 0.95),
105
- "success_rate_range": (0.75, 0.90),
106
- "cost_savings_range": (4000, 8000),
107
- "resolution_time_range": (15, 30),
108
- "affected_users_range": (20000, 50000),
109
- "tags": ["unknown", "incident"]
110
- })
111
-
112
- def simulate_arf_analysis(scenario_data: Dict[str, Any]) -> Dict[str, Any]:
113
- """
114
- Simulate ARF analysis with scenario-specific metrics
115
-
116
- Args:
117
- scenario_data: Dictionary containing scenario information
118
-
119
- Returns:
120
- Dictionary with analysis results
121
- """
122
- scenario_name = scenario_data.get("name", "Unknown Scenario")
123
- config = get_scenario_config(scenario_name)
124
-
125
- # Generate scenario-specific values
126
- detection_confidence = random.uniform(*config["detection_confidence_range"])
127
- detection_time = random.randint(*config["detection_time_range"])
128
- accuracy = random.uniform(*config["accuracy_range"])
129
-
130
- return {
131
- "analysis_complete": True,
132
- "anomaly_detected": True,
133
- "severity": scenario_data.get("severity", "HIGH_VARIANCE"), # Changed from "HIGH" to "HIGH_VARIANCE"
134
- "confidence": round(detection_confidence, 3), # Round to 3 decimals
135
- "detection_time_ms": detection_time * 1000, # Convert to ms for display
136
- "detection_time_seconds": detection_time,
137
- "accuracy": round(accuracy, 3),
138
- "component": scenario_data.get("component", "unknown"),
139
- "scenario_specific": True,
140
- "scenario_name": scenario_name,
141
- "tags": config["tags"]
142
- }
143
-
144
- def run_rag_similarity_search(scenario_data: Dict[str, Any]) -> List[Dict[str, Any]]:
145
- """
146
- Simulate RAG similarity search with scenario-specific results
147
-
148
- Args:
149
- scenario_data: Dictionary containing scenario information
150
-
151
- Returns:
152
- List of similar incidents
153
- """
154
- scenario_name = scenario_data.get("name", "Unknown Scenario")
155
- config = get_scenario_config(scenario_name)
156
-
157
- similar_count = random.randint(*config["similar_incidents_range"])
158
- similar_incidents = []
159
-
160
- # Generate similar incidents based on scenario
161
- base_time = int(time.time())
162
-
163
- for i in range(similar_count):
164
- similarity_score = random.uniform(*config["similarity_score_range"])
165
- cost_savings = random.randint(*config["cost_savings_range"])
166
- resolution_time = random.randint(*config["resolution_time_range"])
167
- affected_users = random.randint(*config["affected_users_range"])
168
-
169
- # Different resolutions based on scenario type
170
- if "cache" in scenario_name.lower() or "redis" in scenario_name.lower():
171
- resolution = random.choice(["scale_out", "warm_cache", "memory_increase", "add_replicas"])
172
- elif "database" in scenario_name.lower():
173
- resolution = random.choice(["restart", "connection_pool_resize", "index_optimization", "vacuum"])
174
- elif "kubernetes" in scenario_name.lower():
175
- resolution = random.choice(["restart_pod", "memory_limit_increase", "node_drain", "resource_quota"])
176
- elif "api" in scenario_name.lower():
177
- resolution = random.choice(["circuit_breaker", "rate_limit_increase", "caching", "load_balancer"])
178
- elif "network" in scenario_name.lower():
179
- resolution = random.choice(["route_update", "failover", "bandwidth_increase", "redundancy"])
180
- elif "storage" in scenario_name.lower():
181
- resolution = random.choice(["io_optimization", "disk_upgrade", "cache_addition", "load_distribution"])
182
- else:
183
- resolution = random.choice(["investigate", "scale", "restart", "optimize"])
184
-
185
- similar_incidents.append({
186
- "incident_id": f"inc_{base_time - random.randint(1, 90)}_00{i}",
187
- "similarity_score": round(similarity_score, 3),
188
- "success": random.random() > 0.15, # 85% success rate
189
- "resolution": resolution,
190
- "cost_savings": cost_savings,
191
- "detection_time": f"{random.randint(30, 60)}s",
192
- "resolution_time": f"{resolution_time}m",
193
- "pattern": f"{scenario_name.lower().replace(' ', '_')}_v{random.randint(1, 3)}",
194
- "affected_users": affected_users,
195
- "component_match": scenario_data.get("component", "unknown"),
196
- "rag_source": "production_memory_v3",
197
- "timestamp": f"2024-{random.randint(1, 12):02d}-{random.randint(1, 28):02d}"
198
- })
199
-
200
- return similar_incidents
201
-
202
- def calculate_pattern_confidence(scenario_data: Dict[str, Any], similar_incidents: List[Dict[str, Any]]) -> float:
203
- """
204
- Calculate pattern confidence based on similar incidents
205
-
206
- Args:
207
- scenario_data: Dictionary containing scenario information
208
- similar_incidents: List of similar incidents from RAG search
209
-
210
- Returns:
211
- Pattern confidence score (0-1)
212
- """
213
- scenario_name = scenario_data.get("name", "Unknown Scenario")
214
- config = get_scenario_config(scenario_name)
215
-
216
- if not similar_incidents:
217
- return random.uniform(*config["pattern_confidence_range"])
218
-
219
- # Calculate average similarity and success rate
220
- similarity_scores = [inc["similarity_score"] for inc in similar_incidents]
221
- success_rates = [1.0 if inc["success"] else 0.0 for inc in similar_incidents]
222
-
223
- avg_similarity = sum(similarity_scores) / len(similarity_scores)
224
- avg_success = sum(success_rates) / len(success_rates)
225
-
226
- # Weighted average: 60% similarity, 40% success rate
227
- confidence = (avg_similarity * 0.6) + (avg_success * 0.4)
228
-
229
- # Add some randomness but keep within scenario range
230
- min_conf, max_conf = config["pattern_confidence_range"]
231
- confidence = max(min_conf, min(max_conf, confidence))
232
-
233
- return round(confidence, 3)
234
-
235
- def calculate_internal_success_rate(similar_incidents: List[Dict[str, Any]]) -> float:
236
- """
237
- Calculate success rate for internal logic only.
238
- Not for UI display in Decision View.
239
-
240
- Doctrinal: Percentages invite debate, narratives shut it down.
241
- Keep this internal for logic, surface only in Outcome View.
242
- """
243
- if not similar_incidents:
244
- return 0.0
245
-
246
- success_count = sum(1 for inc in similar_incidents if inc.get("success", False))
247
- return round(success_count / len(similar_incidents), 3)
248
-
249
- def check_contraindications(scenario_data: Dict[str, Any], similar_incidents: List[Dict[str, Any]]) -> Dict[str, Any]:
250
- """
251
- Check for contraindications based on retry amplification signatures and historical evidence
252
-
253
- Returns:
254
- Dictionary with contraindication analysis
255
- """
256
- component = scenario_data.get("component", "").lower()
257
- scenario_name = scenario_data.get("name", "").lower()
258
-
259
- # Detect retry amplification signatures
260
- retry_amplification = False
261
- evidence = []
262
-
263
- # Check telemetry for retry storm indicators
264
- telemetry = scenario_data.get("telemetry", {})
265
- if telemetry.get("retry_storm", False):
266
- retry_amplification = True
267
- evidence.append("Telemetry shows retry_storm: True")
268
-
269
- # Check for amplification factor in metrics
270
- metrics = scenario_data.get("metrics", {})
271
- amplification_factor = metrics.get("amplification_factor", 1.0)
272
- if amplification_factor > 2.0:
273
- retry_amplification = True
274
- evidence.append(f"Amplification factor {amplification_factor} > 2.0")
275
-
276
- # Check database load
277
- db_load = metrics.get("database_load_percent", 0)
278
- if db_load > 85:
279
- retry_amplification = True
280
- evidence.append(f"Database load {db_load}% > 85%")
281
-
282
- # Check historical incidents for scaling-first failures
283
- historical_scaling_failures = False
284
- scaling_failure_evidence = []
285
-
286
- for incident in similar_incidents:
287
- resolution = incident.get("resolution", "").lower()
288
- success = incident.get("success", True)
289
-
290
- # Check for scaling-first resolutions that failed
291
- if any(scale_term in resolution for scale_term in ["scale", "increase", "add_replicas"]):
292
- if not success:
293
- historical_scaling_failures = True
294
- scaling_failure_evidence.append(
295
- f"{incident.get('timestamp', 'Unknown date')}: {resolution} failed"
296
- )
297
-
298
- contraindicated_actions = []
299
- if retry_amplification or historical_scaling_failures:
300
- contraindicated_actions.append("scale_during_retry_amplification")
301
-
302
- return {
303
- "retry_amplification": retry_amplification,
304
- "historical_scaling_failures": historical_scaling_failures,
305
- "evidence": evidence + scaling_failure_evidence,
306
- "contraindicated_actions": contraindicated_actions,
307
- "confidence": 0.92 if evidence else 0.0
308
- }
309
-
310
- def create_mock_healing_intent(scenario_data: Dict[str, Any], similar_incidents: List[Dict[str, Any]], confidence: float) -> Dict[str, Any]:
311
- """
312
- Create doctrinally compliant healing intent with sequencing thesis enforcement
313
-
314
- Doctrinal Addition: Explicit Observation Gate when contraindications exist OR confidence < threshold
315
- Psychological Goal: Make inaction an explicit, powerful decision
316
- """
317
- # Check for contraindications FIRST (doctrinal constraint)
318
- contraindications = check_contraindications(scenario_data, similar_incidents)
319
-
320
- scenario_name = scenario_data.get("name", "Unknown Scenario")
321
- config = get_scenario_config(scenario_name)
322
- component = scenario_data.get("component", "unknown")
323
-
324
- # ============ OBSERVATION GATE LOGIC ============
325
- # Key psychological addition: Explicit deferral when uncertainty is high
326
- observation_gate_threshold = 0.70 # Below this, we observe first
327
-
328
- should_observe_first = (
329
- contraindications["retry_amplification"] or
330
- contraindications["historical_scaling_failures"] or
331
- confidence < observation_gate_threshold or
332
- len(similar_incidents) < 2 # Insufficient historical evidence
333
- )
334
-
335
- if should_observe_first:
336
- # Return OBSERVATION GATE state - intentional inaction
337
- current_time = datetime.datetime.now()
338
- next_evaluation = current_time + datetime.timedelta(minutes=5)
339
-
340
- return {
341
- "action": "defer_decision_for_trend_confirmation",
342
- "component": component,
343
- "confidence": round(confidence, 3),
344
- "parameters": {
345
- "observation_window": "5m",
346
- "metrics_to_watch": ["retry_count", "database_load_percent", "error_rate"],
347
- "trend_threshold": "stabilizing_or_declining"
348
- },
349
- "source": "observation_gate_logic",
350
- "requires_enterprise": False,
351
- "advisory_only": True,
352
- # CRITICAL PSYCHOLOGICAL FIELDS
353
- "execution_state": "observe_only",
354
- "next_evaluation_window": "5m",
355
- "decision_frozen_until": next_evaluation.isoformat(),
356
- "deferral_reason": "uncertainty_too_high_for_action" if confidence < observation_gate_threshold else
357
- "contraindications_present" if contraindications["retry_amplification"] else
358
- "historical_failures_detected" if contraindications["historical_scaling_failures"] else
359
- "insufficient_historical_evidence",
360
- # FORMAL HEALINGINTENT FIELDS
361
- "preconditions": [
362
- f"Confidence threshold not met ({confidence:.2f} < {observation_gate_threshold})" if confidence < observation_gate_threshold else
363
- "Retry amplification detected" if contraindications["retry_amplification"] else
364
- "Historical scaling failures present" if contraindications["historical_scaling_failures"] else
365
- "Insufficient similar incidents for pattern matching"
366
- ],
367
- "contraindicated_actions": ["any_healing_action_during_high_uncertainty"],
368
- "reversibility_statement": "Evaluation resumes automatically after 5-minute observation window",
369
- "sequencing_rule": "observe_before_any_action_when_uncertain",
370
- "historical_evidence": [
371
- f"{len(similar_incidents)} similar incidents analyzed (minimum 2 required)",
372
- "Observation-first reduces incorrect actions by 67% (historical analysis)"
373
- ],
374
- # SUCCESS RATE HANDLING (kept internal, not surfaced early)
375
- "_internal_success_rate": calculate_internal_success_rate(similar_incidents) if similar_incidents else 0.0,
376
- "_internal_notes": "Success rate kept internal; percentages invite debate, narratives shut it down",
377
- "scenario_specific": True,
378
- "scenario_name": scenario_name
379
- }
380
-
381
- # If retry amplification detected (but passed observation gate threshold), enforce dampening-first logic
382
- if contraindications["retry_amplification"]:
383
- return {
384
- "action": "implement_request_coalescing_with_exponential_backoff",
385
- "component": component,
386
- "confidence": max(confidence, 0.85), # High confidence for dampening-first
387
- "parameters": {
388
- "coalescing_window_ms": "100-500ms",
389
- "backoff_factor": "exponential",
390
- "max_retries": 3,
391
- "timeout": "10m"
392
- },
393
- "source": "contraindication_detection",
394
- "requires_enterprise": False,
395
- "advisory_only": False,
396
- # CRITICAL: Add observation window even for dampening actions
397
- "post_action_observation": {
398
- "required": True,
399
- "duration": "5m",
400
- "metrics": ["retry_count", "database_load_percent", "latency_p99"]
401
- },
402
- "success_rate": 0.88,
403
- "estimated_impact": {
404
- "cost_savings": 4500,
405
- "resolution_time_minutes": 12,
406
- "users_protected": random.randint(*config["affected_users_range"]),
407
- "mttr_reduction": "73%"
408
- },
409
- "safety_checks": {
410
- "blast_radius": "single_service",
411
- "business_hours": "compliant",
412
- "rollback_plan": "coalescing_disable",
413
- "approval_required": False,
414
- "risk_level": "low"
415
- },
416
- # FORMAL HEALINGINTENT FIELDS (doctrinal constraint)
417
- "preconditions": [
418
- "Retry amplification signature detected",
419
- f"Amplification factor > {scenario_data.get('metrics', {}).get('amplification_factor', 2.0)}",
420
- "Database load > 85%"
421
- ],
422
- "contraindicated_actions": ["scale_during_retry_storm", "add_capacity_during_amplification"],
423
- "reversibility_statement": "Remove coalescing window after 10 minutes of stable operation",
424
- "sequencing_rule": "dampening_first_then_observe_then_optional_scale",
425
- "historical_evidence": contraindications["evidence"][:3], # Top 3 evidence items
426
- "scenario_specific": True,
427
- "scenario_name": scenario_name
428
- }
429
-
430
- # Only proceed with normal logic if no contraindications AND passed observation gate
431
- # Determine action based on component and scenario WITH sequencing logic
432
- ranked_actions = []
433
-
434
- # DAMPENING actions (always first in sequence)
435
- dampening_actions = []
436
- if "api" in component.lower() or "rate" in scenario_name.lower():
437
- dampening_actions.append({
438
- "action": "circuit_breaker",
439
- "confidence": confidence * 0.95, # Slightly lower confidence for dampening
440
- "parameters": {
441
- "threshold": f"{random.randint(70, 85)}%",
442
- "window": f"{random.randint(3, 10)}m",
443
- "fallback": "cached_response",
444
- "retry_after": f"{random.randint(30, 120)}s"
445
- }
446
- })
447
-
448
- # Add general dampening for retry-prone scenarios
449
- if any(term in component.lower() for term in ["redis", "cache", "database"]):
450
- dampening_actions.append({
451
- "action": "request_batching_with_timeout",
452
- "confidence": confidence * 0.92,
453
- "parameters": {
454
- "batch_size": "10-50 requests",
455
- "timeout_ms": "100ms",
456
- "strategy": "adaptive"
457
- }
458
- })
459
-
460
- # Add dampening actions to ranked list
461
- for i, act in enumerate(dampening_actions):
462
- ranked_actions.append({
463
- "rank": len(ranked_actions) + 1,
464
- "action": act["action"],
465
- "confidence": round(act["confidence"], 3),
466
- "parameters": act["parameters"],
467
- "category": "dampening"
468
- })
469
-
470
- # CONCURRENCY CAP actions (second in sequence)
471
- if "database" in component.lower():
472
- ranked_actions.append({
473
- "rank": len(ranked_actions) + 1,
474
- "action": "connection_pool_limit_adjustment",
475
- "confidence": confidence * 0.88,
476
- "parameters": {
477
- "max_connections": f"{random.randint(100, 200)}",
478
- "timeout": f"{random.randint(30, 60)}s"
479
- },
480
- "category": "concurrency_control"
481
- })
482
-
483
- # OBSERVE actions (third in sequence)
484
- ranked_actions.append({
485
- "rank": len(ranked_actions) + 1,
486
- "action": "enhanced_monitoring_with_telemetry",
487
- "confidence": confidence * 0.85,
488
- "parameters": {
489
- "duration": "5m",
490
- "metrics": ["latency_p99", "error_rate", "throughput"],
491
- "alert_threshold": "2x_baseline"
492
- },
493
- "category": "observation"
494
- })
495
-
496
- # SCALING actions (ONLY if no contraindications AND last in sequence)
497
- # AND only if confidence justifies scaling over dampening
498
- scaling_confidence_threshold = 0.75 # Scaling requires higher confidence
499
-
500
- if confidence > scaling_confidence_threshold and not contraindications["historical_scaling_failures"]:
501
- if "cache" in component.lower() or "redis" in component.lower():
502
- scaling_action = {
503
- "rank": len(ranked_actions) + 1,
504
- "action": "gradual_scale_out",
505
- "confidence": confidence * 0.80, # Lower confidence than dampening
506
- "parameters": {
507
- "nodes": f"{random.randint(2, 4)}→{random.randint(4, 6)}",
508
- "strategy": "one_by_one",
509
- "health_check_interval": "30s"
510
- },
511
- "category": "scaling",
512
- "constraints": ["Only if dampening insufficient after 5 minutes"]
513
- }
514
- ranked_actions.append(scaling_action)
515
-
516
- # Calculate success rate internally only
517
- _internal_success_rate = calculate_internal_success_rate(similar_incidents) if similar_incidents else random.uniform(*config["success_rate_range"])
518
-
519
- # Calculate estimated impact
520
- if similar_incidents:
521
- avg_cost_savings = sum(inc["cost_savings"] for inc in similar_incidents) / len(similar_incidents)
522
- avg_resolution_time = sum(int(inc["resolution_time"].replace('m', '')) for inc in similar_incidents) / len(similar_incidents)
523
- else:
524
- avg_cost_savings = sum(config["cost_savings_range"]) / 2
525
- avg_resolution_time = sum(config["resolution_time_range"]) / 2
526
-
527
- # Primary action is first in ranked_actions (dampening-first)
528
- primary_action = ranked_actions[0] if ranked_actions else {
529
- "action": "investigate",
530
- "confidence": confidence,
531
- "parameters": {"priority": "high"}
532
- }
533
-
534
- return {
535
- "action": primary_action["action"],
536
- "component": component,
537
- "confidence": round(confidence, 3),
538
- "parameters": primary_action.get("parameters", {}),
539
- "source": "sequencing_analysis",
540
- "requires_enterprise": True,
541
- "advisory_only": True,
542
- # SUCCESS RATE: Internal only, not for UI display in Decision View
543
- "_internal_success_rate": _internal_success_rate,
544
- "_internal_notes": "Success rate for internal logic; surface narrative outcomes, not percentages",
545
- "estimated_impact": {
546
- "cost_savings": int(avg_cost_savings),
547
- "resolution_time_minutes": int(avg_resolution_time),
548
- "users_protected": random.randint(*config["affected_users_range"]),
549
- "mttr_reduction": f"{random.randint(60, 80)}%"
550
- },
551
- "safety_checks": {
552
- "blast_radius": f"{random.randint(1, 3)} services",
553
- "business_hours": "compliant",
554
- "rollback_plan": "available",
555
- "approval_required": True,
556
- "risk_level": "medium" if confidence < 0.9 else "low"
557
- },
558
- # FORMAL HEALINGINTENT FIELDS (doctrinal constraint)
559
- "preconditions": [
560
- f"Component: {component}",
561
- f"Confidence threshold > {scaling_confidence_threshold}",
562
- "No retry amplification detected",
563
- "Historical scaling success rate > 70%"
564
- ],
565
- "contraindicated_actions": contraindications["contraindicated_actions"],
566
- "reversibility_statement": f"Rollback to previous configuration available within {random.randint(5, 15)} minutes",
567
- "sequencing_rule": "dampening_before_concurrency_before_observation_before_scaling",
568
- "ranked_actions": ranked_actions,
569
- "historical_evidence": [f"{len(similar_incidents)} similar incidents analyzed"],
570
- "scenario_specific": True,
571
- "scenario_name": scenario_name
572
- }
573
-
574
- def get_scenario_metrics(scenario_name: str) -> Dict[str, Any]:
575
- """
576
- Get dynamic metrics for a specific scenario
577
-
578
- Args:
579
- scenario_name: Name of the scenario
580
-
581
- Returns:
582
- Dictionary with scenario-specific metrics
583
- """
584
- config = get_scenario_config(scenario_name)
585
-
586
- # Generate dynamic values within ranges
587
- return {
588
- "detection_confidence": round(random.uniform(*config["detection_confidence_range"]), 3),
589
- "detection_time_seconds": random.randint(*config["detection_time_range"]),
590
- "accuracy": round(random.uniform(*config["accuracy_range"]), 3),
591
- "expected_similar_incidents": random.randint(*config["similar_incidents_range"]),
592
- "avg_similarity_score": round(random.uniform(*config["similarity_score_range"]), 3),
593
- "pattern_confidence": round(random.uniform(*config["pattern_confidence_range"]), 3),
594
- "success_rate": round(random.uniform(*config["success_rate_range"]), 3),
595
- "cost_savings_range": config["cost_savings_range"],
596
- "resolution_time_range": config["resolution_time_range"],
597
- "affected_users_range": config["affected_users_range"],
598
- "tags": config["tags"]
599
- }
600
-
601
- def detect_retry_amplification(telemetry_data: Dict[str, Any]) -> Dict[str, Any]:
602
- """
603
- Detect retry amplification signatures from telemetry data
604
-
605
- Doctrinal constraint: Must be REAL detection, not hardcoded in scenarios
606
-
607
- Args:
608
- telemetry_data: Dictionary containing telemetry metrics
609
-
610
- Returns:
611
- Dictionary with detection results
612
- """
613
- # Extract metrics with defaults
614
- retry_storm = telemetry_data.get("retry_storm", False)
615
- retry_count = telemetry_data.get("retry_count", 0)
616
- success_count = telemetry_data.get("success_count", 1) # Avoid division by zero
617
- database_load = telemetry_data.get("database_load_percent", 0)
618
- retry_cascade_depth = telemetry_data.get("retry_cascade_depth", 0)
619
-
620
- # Calculate amplification factor
621
- amplification_factor = 1.0
622
- if success_count > 0:
623
- amplification_factor = retry_count / success_count
624
-
625
- # Detect signatures
626
- detected = (
627
- retry_storm or
628
- amplification_factor > 2.0 or
629
- retry_cascade_depth > 2 or
630
- database_load > 85
631
- )
632
-
633
- signature = None
634
- if detected:
635
- if retry_storm and amplification_factor > 3.0:
636
- signature = "exponential_retry_cascade"
637
- elif database_load > 85 and amplification_factor > 1.5:
638
- signature = "database_amplified_retry"
639
- else:
640
- signature = "retry_amplification_detected"
641
-
642
- # Calculate confidence based on evidence strength
643
- confidence_factors = []
644
- if retry_storm:
645
- confidence_factors.append(0.3)
646
- if amplification_factor > 2.0:
647
- confidence_factors.append(0.25 * min(amplification_factor / 5.0, 1.0))
648
- if retry_cascade_depth > 2:
649
- confidence_factors.append(0.2 * min(retry_cascade_depth / 5.0, 1.0))
650
- if database_load > 85:
651
- confidence_factors.append(0.25 * min(database_load / 100.0, 1.0))
652
-
653
- confidence = min(0.98, 0.1 + sum(confidence_factors)) if confidence_factors else 0.0
654
-
655
- return {
656
- "detected": detected,
657
- "amplification_factor": round(amplification_factor, 2),
658
- "signature": signature,
659
- "confidence": round(confidence, 3),
660
- "metrics": {
661
- "retry_storm": retry_storm,
662
- "retry_count": retry_count,
663
- "success_count": success_count,
664
- "database_load_percent": database_load,
665
- "retry_cascade_depth": retry_cascade_depth
666
- },
667
- "recommendation": "implement_dampening_first" if detected else "proceed_with_caution"
668
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
demo/orchestrator.py DELETED
@@ -1,98 +0,0 @@
1
- # demo/orchestrator.py - COMPLETE FIXED VERSION
2
- from __future__ import annotations
3
-
4
- import logging
5
- import asyncio
6
- from typing import Any, Dict, Optional, List
7
- import time
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
- # Import mock ARF functions
12
- try:
13
- from demo.mock_arf import (
14
- simulate_arf_analysis,
15
- run_rag_similarity_search,
16
- create_mock_healing_intent,
17
- calculate_pattern_confidence
18
- )
19
- MOCK_ARF_AVAILABLE = True
20
- logger.info("Mock ARF functions available")
21
- except ImportError as e:
22
- logger.warning(f"Mock ARF functions not available: {e}")
23
- MOCK_ARF_AVAILABLE = False
24
-
25
-
26
- class DemoOrchestrator:
27
- """
28
- Orchestrates demo scenarios with proper agent workflow.
29
- """
30
-
31
- def __init__(self, enable_streamlit: bool = False):
32
- self.enable_streamlit = enable_streamlit
33
- logger.info("DemoOrchestrator initialized")
34
-
35
- async def analyze_incident(self, scenario_name: str, scenario_data: Dict[str, Any]) -> Dict[str, Any]:
36
- """
37
- Analyze an incident using the ARF agent workflow.
38
- This is the method called by app.py
39
- """
40
- logger.info(f"Analyzing incident: {scenario_name}")
41
-
42
- if not MOCK_ARF_AVAILABLE:
43
- logger.error("Mock ARF functions not available")
44
- return {
45
- "status": "error",
46
- "message": "Mock ARF functions not available",
47
- "scenario": scenario_name
48
- }
49
-
50
- try:
51
- # Step 1: Detection Agent
52
- logger.debug("Running detection agent...")
53
- detection_result = simulate_arf_analysis(scenario_data)
54
-
55
- # Step 2: Recall Agent
56
- logger.debug("Running recall agent...")
57
- similar_incidents = run_rag_similarity_search(scenario_data)
58
-
59
- # Step 3: Decision Agent
60
- logger.debug("Running decision agent...")
61
- confidence = calculate_pattern_confidence(scenario_data, similar_incidents)
62
- healing_intent = create_mock_healing_intent(scenario_data, similar_incidents, confidence)
63
-
64
- # Simulate processing time
65
- await asyncio.sleep(0.5)
66
-
67
- result = {
68
- "status": "success",
69
- "scenario": scenario_name,
70
- "detection": detection_result,
71
- "recall": similar_incidents,
72
- "decision": healing_intent,
73
- "confidence": confidence,
74
- "processing_time_ms": 450
75
- }
76
-
77
- logger.info(f"Analysis complete for {scenario_name}")
78
- return result
79
-
80
- except Exception as e:
81
- logger.error(f"Error analyzing incident: {e}", exc_info=True)
82
- return {
83
- "status": "error",
84
- "message": str(e),
85
- "scenario": scenario_name
86
- }
87
-
88
- def run_scenario(self, scenario: Dict[str, Any]) -> Dict[str, Any]:
89
- """
90
- Run a demo scenario (legacy method).
91
- """
92
- logger.info("Running scenario: %s", scenario.get("name", "unknown"))
93
-
94
- return {
95
- "scenario": scenario.get("name"),
96
- "status": "completed",
97
- "output": scenario,
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
demo/scenarios.py DELETED
@@ -1,334 +0,0 @@
1
- """
2
- Incident scenarios for the demo - EXPANDED VERSION WITH REALISM UPGRADES
3
- Version: 3.3.9+realism
4
- """
5
-
6
- INCIDENT_SCENARIOS = {
7
- "Cache Miss Storm": {
8
- "description": "Redis cluster experiencing 80% cache miss rate causing database overload",
9
- "severity": "CRITICAL",
10
- "component": "redis_cache",
11
- "metrics": {
12
- "cache_hit_rate": 18.5,
13
- "database_load": 92,
14
- "response_time_ms": 1850,
15
- "affected_users": 45000,
16
- "eviction_rate_per_sec": 125
17
- },
18
- "business_impact": {
19
- "revenue_loss_per_hour": 8500,
20
- "sla_violation": True,
21
- "customer_sat_change": -40,
22
- "affected_services": ["API Gateway", "User Service", "Payment"]
23
- },
24
- "roi_data": {
25
- "hourly_revenue_loss": 8500,
26
- "manual_recovery_hours": 1.0,
27
- "enterprise_recovery_hours": 0.2,
28
- "engineers_required": 4,
29
- "engineer_hourly_rate": 150,
30
- "estimated_monthly_occurrences": 2,
31
- "enterprise_savings_percentage": 0.85
32
- },
33
- # ============ REALISM UPGRADES ============
34
- "realism": {
35
- "ranked_actions": [
36
- {
37
- "rank": 1,
38
- "confidence": 87,
39
- "action": "Scale Redis cluster from 3 to 5 nodes",
40
- "rationale": "Immediate throughput increase, reduces contention",
41
- "risk": "Cold cache amplification: Medium",
42
- "tradeoff": "Adds $420/month infrastructure cost",
43
- "execution_time": "8-12 minutes",
44
- "success_rate": "94% based on 18 similar incidents"
45
- },
46
- {
47
- "rank": 2,
48
- "confidence": 62,
49
- "action": "Implement request coalescing with 500ms window",
50
- "rationale": "Reduces duplicate DB queries, lower blast radius",
51
- "risk": "Adds 150-200ms latency per request",
52
- "tradeoff": "Slower stabilization (15-20 minutes)",
53
- "rejection_note": "Secondary option if scaling unavailable"
54
- },
55
- {
56
- "rank": 3,
57
- "confidence": 34,
58
- "action": "Restart Redis cluster with warmup script",
59
- "rationale": "Clears fragmentation, resets eviction policies",
60
- "risk": "HIGH: 45-second service interruption",
61
- "rejection_reason": "Rejected: High data loss risk during peak traffic",
62
- "safety_override": "Required for Enterprise execution"
63
- }
64
- ],
65
- "risk_assessment": {
66
- "stampede_probability": "18%",
67
- "cold_cache_impact": "Medium",
68
- "data_inconsistency_risk": "Low",
69
- "recovery_complexity": "Medium"
70
- },
71
- "constraints": {
72
- "max_redis_nodes": 8,
73
- "scaling_cooldown": "30 minutes",
74
- "concurrent_connections": "25,000",
75
- "data_size_gb": 42
76
- },
77
- "confidence_degradation": {
78
- "initial": 94,
79
- "after_8_min": 71,
80
- "after_15_min": 52,
81
- "escalation_threshold": 60
82
- }
83
- }
84
- },
85
-
86
- "Database Connection Pool Exhaustion": {
87
- "description": "PostgreSQL connection pool exhausted causing API timeouts",
88
- "severity": "HIGH",
89
- "component": "postgresql_database",
90
- "metrics": {
91
- "active_connections": 98,
92
- "max_connections": 100,
93
- "api_latency_ms": 2450,
94
- "error_rate": 15.2,
95
- "queue_depth": 1250,
96
- "connection_wait_seconds": 45
97
- },
98
- "business_impact": {
99
- "revenue_loss_per_hour": 4200,
100
- "affected_services": ["API Gateway", "User Service", "Payment Service"],
101
- "sla_violation": True,
102
- "partner_api_impact": 3
103
- },
104
- "roi_data": {
105
- "hourly_revenue_loss": 4200,
106
- "manual_recovery_hours": 0.75,
107
- "enterprise_recovery_hours": 0.13,
108
- "engineers_required": 2,
109
- "engineer_hourly_rate": 150,
110
- "estimated_monthly_occurrences": 3,
111
- "enterprise_savings_percentage": 0.82
112
- },
113
- # ============ REALISM UPGRADES ============
114
- "realism": {
115
- "ranked_actions": [
116
- {
117
- "rank": 1,
118
- "confidence": 82,
119
- "action": "Increase max_connections from 100 to 115 (+15%)",
120
- "rationale": "Immediate relief, within safe operating limits",
121
- "risk": "Disk I/O contention: Medium",
122
- "constraint": "DB max_connections: 82% utilized (pre)",
123
- "monitoring": "Monitor connection churn for 30 minutes"
124
- },
125
- {
126
- "rank": 2,
127
- "confidence": 58,
128
- "action": "Enable statement timeout (5s) + connection recycling",
129
- "rationale": "Prevents runaway queries, faster pool turnover",
130
- "risk": "Query cancellation may cause application errors",
131
- "tradeoff": "Adds development/testing overhead"
132
- },
133
- {
134
- "rank": 3,
135
- "confidence": 29,
136
- "action": "Switch to pgbouncer in transaction pooling mode",
137
- "rationale": "10x connection multiplexing possible",
138
- "risk": "HIGH: Requires application changes, 2-hour migration",
139
- "rejection_reason": "Rejected: Too invasive for incident response"
140
- }
141
- ],
142
- "constraint_awareness": {
143
- "disk_io_headroom": "Low",
144
- "memory_available_gb": 8.2,
145
- "pool_increase_cap": "+15%",
146
- "monitoring_gap": "Connection churn not tracked"
147
- }
148
- }
149
- },
150
-
151
- "Kubernetes Memory Leak": {
152
- "description": "Java microservice memory leak causing pod restarts",
153
- "severity": "HIGH",
154
- "component": "java_payment_service",
155
- "metrics": {
156
- "memory_usage": 96,
157
- "gc_pause_time_ms": 4500,
158
- "error_rate": 28.5,
159
- "restart_frequency_per_hour": 12,
160
- "heap_fragmentation": 42
161
- },
162
- "business_impact": {
163
- "revenue_loss_per_hour": 5500,
164
- "session_loss": 8500,
165
- "payment_failures_percentage": 3.2,
166
- "support_tickets_increase": 300
167
- },
168
- "roi_data": {
169
- "hourly_revenue_loss": 5500,
170
- "manual_recovery_hours": 1.5,
171
- "enterprise_recovery_hours": 0.25,
172
- "engineers_required": 3,
173
- "engineer_hourly_rate": 150,
174
- "estimated_monthly_occurrences": 1,
175
- "enterprise_savings_percentage": 0.79
176
- },
177
- # ============ REALISM UPGRADES ============
178
- "realism": {
179
- "ranked_actions": [
180
- {
181
- "rank": 1,
182
- "confidence": 76,
183
- "action": "Canary restart (1/4 pods) with heap dump analysis",
184
- "rationale": "Minimizes blast radius, enables root cause capture",
185
- "risk": "Cold-start latency: +2.3s per pod",
186
- "blast_radius_economics": {
187
- "canary_restart_cost": "$850",
188
- "full_restart_cost": "$3,400",
189
- "payment_retry_risk": "Medium",
190
- "safer_order": "Canary → scale → rollout"
191
- }
192
- },
193
- {
194
- "rank": 2,
195
- "confidence": 63,
196
- "action": "Increase heap from 2GB to 3GB with monitoring",
197
- "rationale": "Buy time for analysis, reduces restart frequency",
198
- "risk": "Delays root cause identification",
199
- "tradeoff": "Temporary fix, adds memory cost"
200
- }
201
- ]
202
- }
203
- },
204
-
205
- "Network Partition": {
206
- "description": "Network partition causing split-brain in distributed database",
207
- "severity": "CRITICAL",
208
- "component": "distributed_database",
209
- "metrics": {
210
- "partition_detected": True,
211
- "write_conflicts": 1250,
212
- "data_inconsistency_percentage": 8.5,
213
- "replication_lag_seconds": 45,
214
- "quorum_lost": True
215
- },
216
- "business_impact": {
217
- "revenue_loss_per_hour": 12000,
218
- "data_corruption_risk": True,
219
- "recovery_complexity": "HIGH",
220
- "compliance_violation": True
221
- },
222
- "roi_data": {
223
- "hourly_revenue_loss": 12000,
224
- "manual_recovery_hours": 2.0,
225
- "enterprise_recovery_hours": 0.3,
226
- "engineers_required": 5,
227
- "engineer_hourly_rate": 150,
228
- "estimated_monthly_occurrences": 0.5,
229
- "enterprise_savings_percentage": 0.88
230
- },
231
- # ============ REALISM UPGRADES ============
232
- "realism": {
233
- "competing_hypotheses": [
234
- {
235
- "cause": "Network partition (control plane)",
236
- "confidence": 61,
237
- "evidence": "Quorum lost, replication lag > 30s",
238
- "investigation_path": "Check network mesh, BGP status"
239
- },
240
- {
241
- "cause": "Control plane overload",
242
- "confidence": 24,
243
- "evidence": "High CPU on orchestration nodes",
244
- "investigation_path": "Scale control plane, check etcd health"
245
- },
246
- {
247
- "cause": "Downstream timeout amplification",
248
- "confidence": 15,
249
- "evidence": "Cascading failures in 3 dependent services",
250
- "investigation_path": "Implement circuit breakers"
251
- }
252
- ]
253
- }
254
- },
255
-
256
- "API Rate Limit Storm": {
257
- "description": "Third-party API rate limiting causing cascading failures",
258
- "severity": "MEDIUM",
259
- "component": "external_api_gateway",
260
- "metrics": {
261
- "rate_limit_hits_percentage": 95,
262
- "error_rate": 42.8,
263
- "retry_storm": True,
264
- "cascade_effect_services": 3,
265
- "queue_backlog": 8500
266
- },
267
- "business_impact": {
268
- "revenue_loss_per_hour": 3800,
269
- "partner_sla_breach": True,
270
- "data_sync_delay_hours": 4,
271
- "customer_reports_delay_hours": 6
272
- },
273
- "roi_data": {
274
- "hourly_revenue_loss": 3800,
275
- "manual_recovery_hours": 1.25,
276
- "enterprise_recovery_hours": 0.17,
277
- "engineers_required": 3,
278
- "engineer_hourly_rate": 150,
279
- "estimated_monthly_occurrences": 4,
280
- "enterprise_savings_percentage": 0.85
281
- },
282
- # ============ REALISM UPGRADES ============
283
- "realism": {
284
- "contract_aware_reasoning": {
285
- "burst_limit": "1.2× allowed",
286
- "penalty_window": "15 minutes",
287
- "degradation_mode": "Non-premium users only",
288
- "contractual_limits": {
289
- "requests_per_second": 100,
290
- "monthly_overage_fee": "$0.15/request",
291
- "suspension_threshold": "3 violations/month"
292
- }
293
- }
294
- }
295
- },
296
-
297
- "Storage I/O Saturation": {
298
- "description": "Storage system I/O saturation causing application timeouts",
299
- "severity": "HIGH",
300
- "component": "storage_cluster",
301
- "metrics": {
302
- "io_utilization": 98,
303
- "latency_ms": 450,
304
- "throughput_mbps": 1250,
305
- "queue_depth": 850,
306
- "error_rate": 8.5
307
- },
308
- "business_impact": {
309
- "revenue_loss_per_hour": 6800,
310
- "data_processing_delay_hours": 3,
311
- "analytics_backlog": True,
312
- "reporting_failure": True
313
- },
314
- "roi_data": {
315
- "hourly_revenue_loss": 6800,
316
- "manual_recovery_hours": 1.75,
317
- "enterprise_recovery_hours": 0.22,
318
- "engineers_required": 3,
319
- "engineer_hourly_rate": 150,
320
- "estimated_monthly_occurrences": 1.5,
321
- "enterprise_savings_percentage": 0.83
322
- },
323
- # ============ REALISM UPGRADES ============
324
- "realism": {
325
- "irreversibility_warnings": {
326
- "rebalance_duration": "18-25 minutes",
327
- "write_amplification_risk": "High",
328
- "requires_explicit_approval": True,
329
- "approval_level": "Director+",
330
- "rollback_complexity": "High (requires snapshot restore)"
331
- }
332
- }
333
- }
334
- }