""" Gradio-only UI components for ARF Ensures full compatibility with app.py NOW WITH DOCTRINAL COMPLIANCE: Psychological Advantage Enforcement UPDATED: Language discipline, observation gate rendering, recall panel dominance UPDATED: Metric discipline, sequencing display, no early "critical" terminology DOCTRINAL VERSION: 3.3.9+restraint """ import gradio as gr from typing import Dict, List, Any import logging import datetime import time import plotly.graph_objects as go import plotly.express as px import numpy as np logger = logging.getLogger(__name__) # Try to import scenarios from registry first try: from config.scenario_registry import ScenarioRegistry INCIDENT_SCENARIOS = ScenarioRegistry.load_scenarios() logger.info(f"Loaded {len(INCIDENT_SCENARIOS)} scenarios from registry") except ImportError: logger.warning("Scenario registry not available, falling back to demo scenarios") from demo.scenarios import INCIDENT_SCENARIOS # ----------------------------- # Header & Status - DOCTRINAL LANGUAGE # ----------------------------- def create_header(version="3.3.9") -> gr.HTML: return gr.HTML(f"""
Reliability Framework

v{version} (Policy + Enterprise Edition)

Production-grade policy execution for system reliability intelligence

πŸ—οΈ Architecture: OSS advises β†’ Enterprise executes
""") def create_status_bar() -> gr.HTML: return gr.HTML("""
βœ… Policy System Online βœ… ARF OSS v3.3.9 🏒 Enterprise Execution
""") # ----------------------------- # CRITICAL RESTORED FUNCTIONS - Missing from current version # ----------------------------- def create_observation_gate_placeholder(confidence: float = 65.0) -> gr.HTML: """Observation gate that demonstrates psychological restraint. Shows 'Decision Intentionally Deferred' when confidence is below threshold. This is a critical doctrinal component showing restraint as a system choice. """ if confidence < 70.0: html_content = f"""
⚠️

Observation Gate: Awaiting confirmation.

System restraint engaged

⏸️

Decision Intentionally Deferred

The system has detected uncertainty ({confidence:.1f}% confidence) and has chosen to observe rather than act. Historical evidence suggests premature action increases risk by 47%.

Confidence Threshold
70.0%
Required for action
Current Confidence
{confidence:.1f}%
Below threshold β†’ Observe
Observe ({confidence:.1f}%) Threshold (70%) Act (75%+)
""" else: html_content = f"""
βœ…

OBSERVATION GATE CLEARED

Confidence threshold met

⚑

Proceed with Policy Action

Confidence exceeds threshold. System may proceed with sequenced actions. Historical evidence will be consulted before any execution.

Confidence Threshold
70.0%
Required for action
Current Confidence
{confidence:.1f}%
Above threshold β†’ Proceed
βœ… Safety Principle

"When confidence is high, proceed with sequenced actions. Each step is validated against historical evidence before proceeding to the next."

Ready ({confidence:.1f}%) Threshold (70%) Proceed
""" return gr.HTML(html_content) def create_sequencing_visualization() -> gr.HTML: """Creates the sequencing panel showing dampening-first progression. Doctrinal: Shows sequencing as policy, not reaction. """ return gr.HTML("""

πŸ”„ Doctrinal Sequencing: Policy Over Reaction

System enforces sequencing regardless of prediction confidence

POLICY ENFORCED
1
Dampening
Prevent amplification first
REQUIRED
β†’
2
Concurrency
Manage load, then observe
REQUIRED
β†’
3
Observe
Validate trends for 5+ minutes
REQUIRED
β†’
4
Scale
Only if all previous succeed
OPTIONAL
🎯
Doctrinal Constraint: Scaling Cannot Appear First
If retry amplification is detected, scaling is contraindicated entirely. The system must observe stabilization before considering capacity increases. Historical evidence shows scaling-first fails 76% of the time during amplification.
Current Sequence State
Waiting for detection process
πŸ”„ Sequence: 0/4
⏱️ Time: 0s
""") def create_detection_display() -> gr.HTML: """Creates detection process HTML display.""" return gr.HTML("""
πŸ•΅οΈβ€β™‚οΈ

Detection Process

Telemetry analysis & pattern recognition

STATUS: ACTIVE
Pattern Match
Retry Amplification
Confidence
92.7%
Detection Time
0.8 seconds
Severity
HIGH_VARIANCE
βœ… Detected: Retry amplification pattern with exponential growth (r=1.8)
Telemetry shows request rate doubling every 45 seconds. System flagged for sequencing enforcement.
Next Step:
Activate recall process
πŸ“Š 12 metrics ⏱️ 0.8s latency πŸ” 3 patterns
""") def create_recall_display() -> gr.HTML: """Creates recall process HTML display with historical evidence dominance.""" return gr.HTML("""
🧠

Recall Process

Historical evidence & pattern matching

STATUS: ACTIVE
πŸ† RECALL DOMINANCE: Historical Evidence > Predictive Confidence
β›” Scaling-First Failures
2024-11-15 β€’ prod-east
FAILED
Action: Scale during retry storm
Outcome: Amplification increased 300%
"Scaling during amplification worsens the problem"
2024-09-22 β€’ staging
FAILED
Action: Add capacity without dampening
Outcome: 45 min outage, $8.2K loss
"New capacity consumed by amplification loop"
βœ… Dampening-First Successes
2024-12-03 β€’ prod-west
SUCCESS
Action: Request coalescing + backoff
Outcome: Resolved in 8 min, $5.1K saved
"Dampening broke amplification cycle"
2024-10-17 β€’ prod-eu
SUCCESS
Action: Circuit breaker + observability
Outcome: 12 min recovery, 0 user impact
"Sequencing prevented escalation"
🎯 RECALL DECISION: Scaling contraindicated due to historical evidence
Historical evidence (76% failure rate) dominates predictive confidence (92%). System will enforce dampening-first sequencing.
Evidence Weight:
Historical: 85% β€’ Predictive: 15%
πŸ“š 8 incidents 🎯 76% failure rate πŸ† recall dominance
""") def create_decision_display() -> gr.HTML: """Creates decision process HTML display.""" return gr.HTML("""
🎯

Decision Process

HealingIntent creation & sequencing

STATUS: ACTIVE
Formal HealingIntent Created

Preconditions checked, contraindications listed

CONFIDENCE: 87.3%
Primary Action
Implement request coalescing with exponential backoff (jitter: 25%)
Sequencing Rule
dampening_first_then_observe_then_optional_scale
Preconditions
β€’ Retry amplification detected
β€’ Confidence > 70%
β€’ No scaling contraindicated
Contraindications
β›” Scale during retry storm
β›” Add capacity immediately
β›” Restart during amplification
Reversibility
βœ… Backoff can be adjusted
βœ… Coalescing can be disabled
βœ… No stateful changes
⏳
Observation Gate: ACTIVE (65.0% confidence)
System will observe for 5 minutes before proceeding to execution
0% Threshold: 70% 100%
Next Step:
Wait for observation gate clearance
πŸ“ formal intent πŸ”„ sequenced 🎯 87% confidence
""") def create_oss_advisory_section() -> gr.HTML: """Creates OSS advisory section showing policy vs execution boundary.""" return gr.HTML("""
πŸ†“

Policy Edition

Analysis & Advisory Only - Apache 2.0

PERMANENTLY SAFE

πŸ“ HealingIntent Created

94% confidence
βœ…
Action Recommended
Implement request coalescing with exponential backoff
🧠
Pattern Match
Similar incident resolved with dampening (87% success rate)
⚠️
Contraindications
βœ… Checked (retry amplification detected)
πŸ”„
Sequencing Rule
dampening_first_then_observe_then_optional_scale
🚫 OSS STOPS HERE - No Execution
OSS provides policy advice only. Enterprise edition required for execution.
This architectural boundary ensures safety by design.
""") def create_timeline_comparison_plot() -> go.Figure: """Creates timeline comparison plot for incident resolution.""" fig = go.Figure() # Timeline data manual_times = [0, 5, 15, 30, 45, 60] manual_users = [45000, 45000, 42000, 35000, 20000, 5000] arf_times = [0, 0.8, 1.5, 3, 8, 12] arf_users = [45000, 45000, 45000, 42000, 15000, 0] # Add traces fig.add_trace(go.Scatter( x=manual_times, y=manual_users, mode='lines+markers', name='Manual Resolution', line=dict(color='#ef4444', width=3, dash='dash'), marker=dict(size=8, color='#ef4444'), hovertemplate='Time: %{x}min
Users Affected: %{y:,}' )) fig.add_trace(go.Scatter( x=arf_times, y=arf_users, mode='lines+markers', name='ARF Policy Execution', line=dict(color='#10b981', width=4), marker=dict(size=10, color='#10b981'), hovertemplate='Time: %{x}min
Users Affected: %{y:,}' )) # Add vertical lines for key events fig.add_vline(x=0.8, line_width=2, line_dash="dot", line_color="#3b82f6", annotation_text="Detection", annotation_position="top right") fig.add_vline(x=1.5, line_width=2, line_dash="dot", line_color="#8b5cf6", annotation_text="Recall", annotation_position="top right") fig.add_vline(x=3, line_width=2, line_dash="dot", line_color="#f59e0b", annotation_text="Decision", annotation_position="top right") fig.add_vline(x=8, line_width=2, line_dash="dot", line_color="#10b981", annotation_text="Resolution", annotation_position="top right") # Update layout fig.update_layout( title=dict( text="⏰ Incident Timeline: Manual vs Policy Execution", font=dict(size=18, color='#1e293b'), x=0.5 ), xaxis=dict( title="Time (minutes)", gridcolor='#e2e8f0', tickformat=',d' ), yaxis=dict( title="Users Affected", gridcolor='#e2e8f0', tickformat=',' ), plot_bgcolor='white', paper_bgcolor='white', font=dict(family="Inter, sans-serif", color="#475569"), hovermode='x unified', legend=dict( orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5, bgcolor='rgba(255, 255, 255, 0.8)', bordercolor='#e2e8f0', borderwidth=1 ), margin=dict(l=50, r=50, t=80, b=50), height=400 ) # Add annotation for savings fig.add_annotation( x=12, y=0, text="πŸš€ 73% faster resolution
# -----------------------------
# NEW: Observation Gate Renderer - CRITICAL PSYCHOLOGICAL FIX
# -----------------------------
def _observation_countdown(frozen_until: Any, fallback: str = "5m") -> str:
    """Format the whole minutes left until *frozen_until* as e.g. ``"3m"``.

    Returns an empty string when no timestamp is supplied, and *fallback*
    when the value cannot be parsed as an ISO-8601 timestamp.
    """
    if not frozen_until:
        return ""

    try:
        # Older ``fromisoformat`` versions reject a trailing "Z", so it is
        # rewritten as an explicit UTC offset first.
        frozen_dt = datetime.datetime.fromisoformat(
            frozen_until.replace("Z", "+00:00")
        )
    except (AttributeError, TypeError, ValueError) as exc:
        # AttributeError/TypeError: value is not a string; ValueError: not
        # ISO-8601. Anything else (e.g. KeyboardInterrupt) must propagate.
        logger.warning(
            "Could not parse decision_frozen_until=%r (%s); using %s",
            frozen_until,
            exc,
            fallback,
        )
        return fallback

    # Naive timestamps are interpreted as UTC so the subtraction below never
    # mixes naive and aware datetimes.
    if frozen_dt.tzinfo is None:
        frozen_dt = frozen_dt.replace(tzinfo=datetime.timezone.utc)

    remaining = frozen_dt - datetime.datetime.now(datetime.timezone.utc)
    # Timestamps already in the past clamp to zero instead of going negative.
    return f"{max(0, int(remaining.total_seconds() / 60))}m"


def render_observation_gate(healing_intent: Dict[str, Any]) -> gr.HTML:
    """Render the "Decision Intentionally Deferred" observation-gate panel.

    Args:
        healing_intent: Mapping read for three optional keys:
            ``deferral_reason`` (snake_case string, shown title-cased),
            ``decision_frozen_until`` (ISO-8601 timestamp used for the
            countdown) and ``confidence``.

    Returns:
        A ``gr.HTML`` component with the rendered panel.
    """
    deferral_reason = healing_intent.get(
        "deferral_reason", "uncertainty_too_high_for_action"
    )
    # The ``.1%`` format below multiplies by 100, so a fraction (0-1) is
    # expected here.
    # NOTE(review): create_observation_gate_placeholder in this file takes
    # confidence in percent (65.0) - confirm which unit callers pass.
    confidence = healing_intent.get("confidence", 0.0)
    countdown_text = _observation_countdown(
        healing_intent.get("decision_frozen_until", "")
    )

    return gr.HTML(f"""
⏳

Decision Intentionally Deferred

System state: observe_only β€’ Confidence: {confidence:.1%}

ACTIVE RESTRAINT
REASON FOR DEFERRAL
{deferral_reason.replace('_', ' ').title()}
NEXT EVALUATION
System re-evaluates in: {countdown_text}
🎯
This is a System Choice, Not a Limitation
The system is choosing not to act because uncertainty exceeds policy thresholds. This restraint demonstrates operational maturityβ€”eagerness is a liability in production.

"What you are seeing is not waiting. It is judgment under uncertainty."
PREVENTED ACTIONS (CONTRAINDICATED)
scale_during_retry_storm add_capacity_during_amplification any_action_during_high_uncertainty
""")
{failure.get('date', 'Unknown')} β€’ {failure.get('environment', 'Unknown')}
FAILED
Action: {failure.get('action', 'Unknown')}
Outcome: {failure.get('outcome', 'Unknown')}
{failure.get('lesson', 'No lesson captured')}
""" # Build successes HTML successes_html = "" for i, success in enumerate(dampening_successes[:3]): # Show top 3 successes_html += f"""
{success.get('date', 'Unknown')} β€’ {success.get('environment', 'Unknown')}
SUCCESS
Action: {success.get('action', 'Unknown')}
Outcome: {success.get('outcome', 'Unknown')}
{success.get('lesson', 'No lesson captured')}
""" return gr.HTML(f"""

🧠 Historical Evidence (Why Sequencing Matters)

Real outcomes from similar incidentsβ€”this evidence dominates decision logic

Historical evidence outweighs model confidence.
β›” Scaling-First Failures
{failures_html if failures_html else """
πŸ“Š
No scaling failure evidence in memory
"""}
βœ… Dampening-First Successes
{successes_html if successes_html else """
πŸ“Š
No dampening success evidence in memory
"""}
🎯
If history shows failure, the system will not repeat it.
The system prioritizes historical evidence over predictive confidence. If scaling-first failed in similar conditions, scaling is contraindicated regardless of model confidence.

"What happened is more important than what might happen."
""") # ----------------------------- # Performance Metrics Function - DOCTRINAL METRICS # ----------------------------- # 1. First, update the update_performance_metrics function to return what app.py expects: def update_performance_metrics(scenario_name: str, scenarios=INCIDENT_SCENARIOS) -> tuple: """ Update performance metrics based on scenario Returns: (detection_time_html, recall_quality_html, confidence_score_html, sequencing_stage_html) MUST MATCH what app.py demo_btn.click() expects! """ # Scenario-specific metrics mapping WITH GRADIENT COLORS metrics_config = { "Cache": { "detection_time": ("45s", "89% faster", "linear-gradient(135deg, #3b82f6 0%, #1d4ed8 100%)"), "recall_quality": ("92%", "85% accuracy", "linear-gradient(135deg, #10b981 0%, #047857 100%)"), "confidence_score": ("87%", "High certainty", "linear-gradient(135deg, #8b5cf6 0%, #7c3aed 100%)"), "sequencing_stage": ("Dampening", "Step 1/4", "linear-gradient(135deg, #f59e0b 0%, #d97706 100%)"), "savings_multiplier": 0.85 }, "Database": { "detection_time": ("38s", "91% faster", "linear-gradient(135deg, #3b82f6 0%, #1d4ed8 100%)"), "recall_quality": ("89%", "82% accuracy", "linear-gradient(135deg, #10b981 0%, #047857 100%)"), "confidence_score": ("84%", "High certainty", "linear-gradient(135deg, #8b5cf6 0%, #7c3aed 100%)"), "sequencing_stage": ("Concurrency", "Step 2/4", "linear-gradient(135deg, #f59e0b 0%, #d97706 100%)"), "savings_multiplier": 0.82 }, "Kubernetes": { "detection_time": ("52s", "87% faster", "linear-gradient(135deg, #3b82f6 0%, #1d4ed8 100%)"), "recall_quality": ("91%", "84% accuracy", "linear-gradient(135deg, #10b981 0%, #047857 100%)"), "confidence_score": ("86%", "High certainty", "linear-gradient(135deg, #8b5cf6 0%, #7c3aed 100%)"), "sequencing_stage": ("Observe", "Step 3/4", "linear-gradient(135deg, #f59e0b 0%, #d97706 100%)"), "savings_multiplier": 0.83 }, "Network": { "detection_time": ("28s", "93% faster", "linear-gradient(135deg, #3b82f6 0%, #1d4ed8 100%)"), 
"recall_quality": ("94%", "88% accuracy", "linear-gradient(135deg, #10b981 0%, #047857 100%)"), "confidence_score": ("89%", "High certainty", "linear-gradient(135deg, #8b5cf6 0%, #7c3aed 100%)"), "sequencing_stage": ("Scale", "Step 4/4", "linear-gradient(135deg, #f59e0b 0%, #d97706 100%)"), "savings_multiplier": 0.88 }, "Default": { "detection_time": ("42s", "90% faster", "linear-gradient(135deg, #3b82f6 0%, #1d4ed8 100%)"), "recall_quality": ("90%", "85% accuracy", "linear-gradient(135deg, #10b981 0%, #047857 100%)"), "confidence_score": ("85%", "High certainty", "linear-gradient(135deg, #8b5cf6 0%, #7c3aed 100%)"), "sequencing_stage": ("Dampening", "Step 1/4", "linear-gradient(135deg, #f59e0b 0%, #d97706 100%)"), "savings_multiplier": 0.85 } } # Determine scenario type scenario_type = "Default" for key in ["Cache", "Database", "Kubernetes", "Network", "Storage"]: if key.lower() in scenario_name.lower(): scenario_type = key break # Get metrics for scenario type metrics = metrics_config.get(scenario_type, metrics_config["Default"]) # Create HTML for each metric card WITH GRADIENT BORDERS detection_time_html = f"""
⏱️

Detection Time

{metrics['detection_time'][0]}

{metrics['detection_time'][1]} than baseline

""" recall_quality_html = f"""
🧠

Recall Quality

{metrics['recall_quality'][0]}

{metrics['recall_quality'][1]}

""" confidence_score_html = f"""
🎯

Confidence Score

{metrics['confidence_score'][0]}

{metrics['confidence_score'][1]}

""" sequencing_stage_html = f"""
πŸ”

Sequencing Stage

{metrics['sequencing_stage'][0]}

{metrics['sequencing_stage'][1]}

""" logger.info(f"βœ… Updated performance metrics for {scenario_name} ({scenario_type} type)") return detection_time_html, recall_quality_html, confidence_score_html, sequencing_stage_html # 2. Update create_tab1_incident_demo to use the SAME variable names: def create_tab1_incident_demo(): """ Creates Live Demo Tab components with psychological advantage baked in. Returns EXACTLY 26 values matching app.py expectations. Canonical Question: "What should we do right now?" (Policy advice only) No business metrics allowed in this tab. """ import gradio as gr # 1. Core Scenario Selection (Psychological: User controls context) scenario_dropdown = gr.Dropdown( choices=["Retry Storm Amplification", "Cascading Dependency Failure", "Partial Regional Outage", "Latent Configuration Drift"], value="Retry Storm Amplification", label="Select Incident Scenario", info="Choose the operational context for policy evaluation" ) # 2. Historical Evidence Panel (DOMINANT VISUALLY - psychological advantage) historical_panel = gr.DataFrame( value=[ ["2024-03-15", "US-East", "Retry Storm", "Dampen β†’ Observe β†’ Scale", "Contained in 42s"], ["2024-02-28", "EU-West", "Cascading Failure", "Dampen β†’ Isolate β†’ Repair", "Contained in 3m 18s"], ["2024-01-12", "AP-South", "Config Drift", "Observe β†’ Diagnose β†’ Fix", "Prevented outage"] ], headers=["Date", "Environment", "Pattern", "Sequence Applied", "Outcome"], label="πŸ“Š Historical Evidence Dominates Models", interactive=False, height=200 ) # 3. Scenario Context Card scenario_card = gr.Markdown(""" ### Scenario Context: Retry Storm Amplification **What we're seeing:** Client retries triggering backend amplification (8x retry multiplier) **Current state:** 42% error rate, rising at 3.2%/minute **Risk profile:** Service degradation β†’ Partial outage in 8-12 minutes """) # 4. 
Telemetry Visualization (Psychological: Show what's knowable) telemetry_viz = gr.Plot( value=create_telemetry_plot(), label="Real-time Telemetry: Error Rate & Load" ) # 5. Business Impact Visualization (Psychological: Show consequences) impact_viz = gr.Plot( value=create_impact_plot(), label="Predicted Impact Trajectory" ) # 6. OBSERVATION GATE (CRITICAL PSYCHOLOGICAL ELEMENT) observation_gate_placeholder = create_observation_gate_placeholder() # 7. SEQUENCING VISUALIZATION (Policy, not reaction) sequencing_panel = create_sequencing_visualization() # 8. Workflow Header workflow_header = gr.Markdown("### Policy Execution Sequence") # 9-11. PROCESS DISPLAYS (Doctrinal: "Process" not "Agent") detection_process = create_detection_display("DETECTION_PROCESS") recall_process = create_recall_display("RECALL_PROCESS") decision_process = create_decision_display("DECISION_PROCESS") # 12-14. OSS/Enterprise Boundary oss_section = gr.Markdown("#### πŸ”“ Open Source Capabilities") enterprise_section = gr.Markdown("#### πŸ”’ Enterprise Governance") oss_btn = gr.Button("View OSS Implementation", variant="secondary", size="sm") enterprise_btn = gr.Button("Requires Enterprise License", variant="secondary", size="sm") # 15-17. Approval & MCP approval_toggle = gr.Checkbox( label="Manual Approval Required", value=True, info="Policy: Human approval required for scaling actions during retry storms" ) mcp_mode = gr.Radio( choices=["Policy Evaluation", "Manual Control", "Full Autonomous"], value="Policy Evaluation", label="Execution Mode" ) timeline_viz = gr.Plot(value=create_timeline_plot()) # 18-21. 
PERFORMANCE METRICS (CRITICAL: Must match app.py indices 18-21) detection_time = gr.Number( value=2.8, label="Detection Time (seconds)", precision=1 ) # INDEX 19: MUST BE recall_quality (not mttr) recall_quality = gr.Number( value=0.92, label="Recall Quality Score", precision=2, info="Historical match confidence" ) # INDEX 20: MUST BE confidence_score (not auto_heal) confidence_score = gr.Number( value=0.65, label="System Confidence", precision=2, info="Policy execution confidence < 70% triggers observation gate" ) # INDEX 21: MUST BE sequencing_stage (not savings) sequencing_stage = gr.Textbox( value="Dampening", label="Current Sequencing Stage", info="Policy: Dampen β†’ Observe β†’ Concurrent Fix β†’ Scale" ) # 22-24. Results Displays oss_results_display = gr.Markdown("### OSS Results Will Appear Here") enterprise_results_display = gr.Markdown("### Enterprise Results Will Appear Here") approval_display = gr.Markdown("**Status:** Awaiting manual approval per policy") # 25. DEMO BUTTON (CRITICAL: Index 25 must be demo_btn) demo_btn = gr.Button( "▢️ Run Complete Walkthrough", variant="primary", size="lg", elem_id="demo_primary_btn" ) # RETURN EXACTLY 26 VALUES IN CORRECT ORDER return ( scenario_dropdown, historical_panel, scenario_card, telemetry_viz, impact_viz, observation_gate_placeholder, sequencing_panel, workflow_header, detection_process, recall_process, decision_process, oss_section, enterprise_section, oss_btn, enterprise_btn, approval_toggle, mcp_mode, timeline_viz, detection_time, recall_quality, confidence_score, sequencing_stage, # ← CORRECT VARIABLES oss_results_display, enterprise_results_display, approval_display, demo_btn ) # ----------------------------- # Tab 1: Live Incident Demo - UPDATED WITH RESTORED FUNCTIONS # ----------------------------- def create_tab1_incident_demo(scenarios=INCIDENT_SCENARIOS, default_scenario="Cache Miss Storm") -> tuple: """ Create doctrinally compliant incident demo tab. 
Doctrinal: Language discipline, sequencing display, no early "critical" """ # Get the default scenario data default_scenario_data = scenarios.get(default_scenario, {}) business_impact = default_scenario_data.get("business_impact", {}) metrics = default_scenario_data.get("metrics", {}) with gr.Row(): # Left Column: Scenario Selection & Live Visualization with gr.Column(scale=1, variant="panel") as left_col: # Scenario Selection with rich preview scenario_dropdown = gr.Dropdown( choices=list(scenarios.keys()), value=default_scenario, label="🎯 Select Variance Scenario", info="Choose a production variance pattern to analyze", interactive=True, container=False ) # ============ HISTORICAL EVIDENCE PANEL FIRST (RECALL DOMINANCE) ============ historical_panel = create_historical_evidence_panel(default_scenario_data) # Scenario Card with doctrinally compliant language scenario_card = gr.HTML(f"""

πŸ“Š {default_scenario}

{default_scenario_data.get('severity', 'HIGH_VARIANCE')}
Component: {default_scenario_data.get('component', 'Unknown').replace('_', ' ').title()}
Users Affected: {metrics.get('affected_users', 'Unknown') if 'affected_users' in metrics else 'Unknown'}
Revenue Risk: ${business_impact.get('revenue_risk_per_hour', 0):,}/hour
Detection Time: 45 seconds (Policy System)
{default_scenario_data.get('component', 'unknown').split('_')[0]} variance production pattern
""") # Visualization section with gr.Row(): with gr.Column(scale=1): telemetry_header = gr.Markdown("### πŸ“ˆ Live Telemetry") telemetry_viz = gr.Plot( label="", show_label=False, elem_id="telemetry_plot" ) with gr.Column(scale=1): impact_header = gr.Markdown("### πŸ’° Business Impact") impact_viz = gr.Plot( label="", show_label=False, elem_id="impact_plot" ) # Middle Column: Process Workflow (NOT Agent Workflow) with gr.Column(scale=2, variant="panel") as middle_col: # ============ OBSERVATION GATE PLACEHOLDER ============ observation_gate_placeholder = create_observation_gate_placeholder() # ============ SEQUENCING VISUALIZATION ============ sequencing_header = gr.Markdown("### πŸ”„ Sequencing Logic: Dampening β†’ Concurrency β†’ Observe β†’ Scale") sequencing_panel = create_sequencing_visualization() # Process Workflow Header (NOT Agent Workflow) workflow_header = gr.Markdown("## πŸ”„ Policy Process Workflow") workflow_subheader = gr.Markdown("### How the system transforms variance into policy execution") # Process Status Cards (NOT Agent Status Cards) with gr.Row(): detection_process = create_detection_display() recall_process = create_recall_display() decision_process = create_decision_display() # Mode Selection & Safety Controls with gr.Row(): with gr.Column(scale=1): approval_toggle = gr.CheckboxGroup( choices=["πŸ‘€ Require Human Approval"], label="Safety Controls", value=[], info="Toggle human oversight" ) with gr.Column(scale=2): mcp_mode = gr.Radio( choices=["πŸ›‘οΈ Advisory (OSS Only)", "πŸ‘₯ Approval", "⚑ Autonomous"], value="πŸ›‘οΈ Advisory (OSS Only)", label="Policy Safety Mode", info="Control execution safety level", interactive=True ) # OSS vs Enterprise Boundary Visualization boundary_header = gr.Markdown("### 🎭 Policy vs Execution: The Safety Boundary") with gr.Row(): oss_section = create_oss_advisory_section() enterprise_section = gr.HTML("""
πŸ’°

Execution Edition

Full Execution & Learning - Commercial

REQUIRES LICENSE

⚑ Ready to Execute

AUTONOMOUS
βš™οΈ
Mode
Autonomous (Requires Enterprise license)
⚑
Expected Recovery
12 minutes (vs 45 min manual)
πŸ’°
Cost Avoided
$6,375
πŸ‘₯
Users Protected
45,000 β†’ 0 impacted
βœ… Enterprise executes with MCP safety
Enterprise edition adds execution, learning, and safety guarantees.
MCP safety modes: Advisory β†’ Approval β†’ Autonomous
""") # Execution Controls with gr.Row(): with gr.Column(scale=1): oss_btn = gr.Button( "πŸ†“ Run Policy Analysis", variant="secondary", size="lg" ) oss_info = gr.Markdown("*Free, policy-only analysis*") with gr.Column(scale=1): enterprise_btn = gr.Button( "πŸ’° Execute Enterprise Healing", variant="primary", size="lg" ) enterprise_info = gr.Markdown("*Requires Enterprise license*") # Timeline visualization timeline_header = gr.Markdown("### ⏰ Incident Timeline") timeline_viz = gr.Plot( create_timeline_comparison_plot(), label="", show_label=False, elem_id="timeline_plot" ) # Right Column: Results & Metrics with gr.Column(scale=1, variant="panel") as right_col: # Real-time Metrics Dashboard metrics_header = gr.Markdown("## πŸ“Š Performance Metrics") # Metric Cards Grid - MUST MATCH app.py expectations: detection_time, mttr, auto_heal, savings detection_time = gr.HTML() mttr = gr.HTML() # Mean Time to Resolve auto_heal = gr.HTML() # Auto-heal rate savings = gr.HTML() # Cost savings # Results Display Areas oss_results_header = gr.Markdown("### πŸ†“ Policy Analysis Results") oss_results_display = gr.JSON( label="", value={ "status": "Analysis Pending", "processes": ["Detection", "Recall", "Decision"], "mode": "Advisory Only", "action": "Generate Formal HealingIntent" }, height=200 ) enterprise_results_header = gr.Markdown("### πŸ’° Execution Results") enterprise_results_display = gr.JSON( label="", value={ "status": "Execution Pending", "requires_license": True, "available_modes": ["Approval", "Autonomous"], "expected_outcome": "12m MTTR, $6.3K saved" }, height=200 ) # Approval Status approval_display = gr.HTML("""

πŸ‘€ Human Approval Status

Not Required

Current Mode: Advisory (Policy Only)

Switch to "Approval" mode to enable human-in-the-loop workflows

1. System generates formal HealingIntent
2. Human reviews & approves contraindications
3. System executes with sequencing constraints
""") # Demo Actions demo_btn = gr.Button( "▢️ Run Complete Walkthrough", variant="secondary", size="lg" ) demo_info = gr.Markdown("*Experience the full workflow from detection to resolution*") # CRITICAL: Return EXACTLY 26 values that app.py expects return ( # Left column returns (5 values) scenario_dropdown, historical_panel, scenario_card, telemetry_viz, impact_viz, # Middle column returns (13 values) observation_gate_placeholder, sequencing_panel, workflow_header, detection_process, recall_process, decision_process, oss_section, enterprise_section, oss_btn, enterprise_btn, approval_toggle, mcp_mode, timeline_viz, # Right column returns (8 values - MUST BE: detection_time, mttr, auto_heal, savings, oss_results_display, enterprise_results_display, approval_display, demo_btn) detection_time, mttr, auto_heal, savings, oss_results_display, enterprise_results_display, approval_display, demo_btn # DO NOT include: oss_info, enterprise_info, demo_info - app.py doesn't expect them ) # ----------------------------- # NEW: Create Realism Panel (Updated for doctrinal compliance) # ----------------------------- def create_realism_panel(scenario_data: Dict, scenario_name: str) -> gr.HTML: """ Create doctrinally compliant realism panel. Updated to show formal HealingIntent fields and sequencing logic. 
""" ranked_actions = scenario_data.get("ranked_actions", []) # Build ranked actions HTML with formal HealingIntent fields actions_html = "" for action in ranked_actions: category = action.get("category", "unknown") category_color = { "dampening": "#3b82f6", "concurrency_control": "#10b981", "observation": "#8b5cf6", "scaling": "#f59e0b" }.get(category, "#64748b") rank_color = "#3b82f6" if action["rank"] == 1 else "#f59e0b" if action["rank"] == 2 else "#64748b" status = "βœ… RECOMMENDED" if action["rank"] == 1 else "🟑 SECONDARY" if action["rank"] == 2 else "πŸ”΄ CONTRAINDICATED" # Formal HealingIntent fields preconditions_html = "" if action.get("preconditions"): preconditions_html = f"""
Preconditions:
{"".join([f'
β€’ {pre}
' for pre in action["preconditions"]])}
""" contraindications_html = "" if action.get("contraindicated_actions"): contraindications_html = f"""
Contraindicated Actions:
{"".join([f'
β›” {contra}
' for contra in action["contraindicated_actions"]])}
""" reversibility_html = "" if action.get("reversibility_statement"): reversibility_html = f"""
Reversibility Statement:
{action["reversibility_statement"]}
""" historical_evidence_html = "" if action.get("historical_evidence"): historical_evidence_html = f"""
Historical Evidence:
{"".join([f'
πŸ“Š {evidence}
' for evidence in action["historical_evidence"]])}
""" actions_html += f"""
{action['rank']}
{status} β€’ {action.get('confidence', 0)}% confidence {category.upper().replace('_', ' ')}

{action.get('action', 'No action specified')}

{action.get('confidence', 0)}%
{preconditions_html} {contraindications_html} {reversibility_html} {historical_evidence_html}
Sequencing: {action.get('category', 'unknown').replace('_', ' ')} β€’ {action.get('constraints', ['No constraints'])[0]}
""" # Combine all panels full_html = f"""

🎯 Formal HealingIntent Sequence

Policy-generated intents with preconditions, contraindications, and reversibility statements

DOCTRINAL COMPLIANCE v3.3.9+
{actions_html if actions_html else '
No ranked actions available
'}
πŸ”„ Doctrinal Sequencing Enforcement
1
Dampening
First in sequence
2
Concurrency
Then control
3
Observe
Then validate
4
Scale
Only if necessary
Doctrinal Constraint: Scaling actions have lower confidence than dampening actions and appear last. If retry amplification is detected, scaling is contraindicated entirely.
""" return gr.HTML(full_html) # ----------------------------- # Tab 2: Business ROI - Updated # ----------------------------- def create_tab2_business_roi(scenarios=INCIDENT_SCENARIOS) -> tuple: dashboard_output = gr.Plot(label="Executive Dashboard", show_label=True) roi_scenario_dropdown = gr.Dropdown( choices=list(scenarios.keys()), value="Cache Miss Storm", label="Scenario for ROI Analysis", info="Select the primary incident type for ROI calculation" ) monthly_slider = gr.Slider( minimum=1, maximum=50, value=15, step=1, label="Monthly Incidents", info="Average number of incidents per month" ) team_slider = gr.Slider( minimum=1, maximum=50, value=5, step=1, label="Team Size", info="Number of engineers on reliability team" ) calculate_btn = gr.Button("πŸ“Š Calculate Comprehensive ROI", variant="primary", size="lg") roi_output = gr.JSON(label="ROI Analysis Results", value={}) roi_chart = gr.Plot(label="ROI Visualization") return (dashboard_output, roi_scenario_dropdown, monthly_slider, team_slider, calculate_btn, roi_output, roi_chart) # ----------------------------- # Tab 3: Enterprise Features - UPDATED WITH INSTALLATION STATUS # ----------------------------- def create_tab3_enterprise_features() -> tuple: # Get installation status try: from app import get_installation_status installation = get_installation_status() license_data = { "status": "βœ… OSS Installed" if installation["oss_installed"] else "⚠️ OSS Not Installed", "oss_version": installation["oss_version"] or "Not installed", "enterprise_installed": installation["enterprise_installed"], "enterprise_version": installation["enterprise_version"] or "Not installed", "execution_allowed": installation["execution_allowed"], "recommendations": installation["recommendations"], "badges": installation["badges"] } # Update features table based on installation features_data = [ ["ARF OSS Package", "βœ… Installed" if installation["oss_installed"] else "❌ Not Installed", "OSS"], ["Self-Healing Core", "βœ… Active", 
"Enterprise"], ["RAG Graph Memory", "βœ… Active", "Both"], ["Predictive Analytics", "πŸ”’ Enterprise" if not installation["enterprise_installed"] else "βœ… Available", "Enterprise"], ["Audit Trail", "πŸ”’ Enterprise" if not installation["enterprise_installed"] else "βœ… Available", "Enterprise"], ["Compliance (SOC2)", "πŸ”’ Enterprise" if not installation["enterprise_installed"] else "βœ… Available", "Enterprise"] ] except ImportError: # Fallback if installation check fails license_data = { "status": "⚠️ Installation Check Failed", "oss_version": "Unknown", "enterprise_installed": False, "recommendations": ["Run installation check"] } features_data = [ ["Self-Healing Core", "βœ… Active", "Enterprise"], ["RAG Graph Memory", "βœ… Active", "Both"], ["Predictive Analytics", "πŸ”’ Enterprise", "Enterprise"], ["Audit Trail", "πŸ”’ Enterprise", "Enterprise"], ["Compliance (SOC2)", "πŸ”’ Enterprise", "Enterprise"], ["Multi-Cloud", "πŸ”’ Enterprise", "Enterprise"] ] license_display = gr.JSON( value=license_data, label="πŸ“¦ Package Installation Status" ) validate_btn = gr.Button("πŸ” Validate Installation", variant="secondary") trial_btn = gr.Button("πŸ†“ Start 30-Day Trial", variant="secondary") upgrade_btn = gr.Button("πŸš€ Upgrade to Enterprise", variant="primary") mcp_mode = gr.Dropdown( choices=["advisory", "approval", "autonomous"], value="advisory", label="MCP Safety Mode" ) # Initial mode info mcp_mode_info = gr.JSON( value={ "current_mode": "advisory", "description": "OSS Edition - Analysis only, no execution", "features": ["Incident analysis", "RAG similarity", "HealingIntent creation"], "package": "agentic-reliability-framework==3.3.7", "license": "Apache 2.0" }, label="Mode Details" ) integrations_data = [ ["Prometheus", "βœ… Connected", "Monitoring"], ["Grafana", "βœ… Connected", "Visualization"], ["Slack", "πŸ”’ Enterprise", "Notifications"], ["PagerDuty", "πŸ”’ Enterprise", "Alerting"], ["Jira", "πŸ”’ Enterprise", "Ticketing"], ["Datadog", "πŸ”’ Enterprise", 
"Monitoring"] ] features_table = gr.Dataframe( headers=["Feature", "Status", "Edition"], value=features_data, label="Feature Comparison" ) integrations_table = gr.Dataframe( headers=["Integration", "Status", "Type"], value=integrations_data, label="Integration Status" ) return (license_display, validate_btn, trial_btn, upgrade_btn, mcp_mode, mcp_mode_info, features_table, integrations_table) # ----------------------------- # Tab 4: Audit Trail # ----------------------------- def create_tab4_audit_trail() -> tuple: refresh_btn = gr.Button("πŸ”„ Refresh Audit Trail", variant="secondary") clear_btn = gr.Button("πŸ—‘οΈ Clear History", variant="secondary") export_btn = gr.Button("πŸ“₯ Export as JSON", variant="primary") execution_headers = ["Time", "Scenario", "Mode", "Status", "Savings", "Details"] incident_headers = ["Time", "Component", "Scenario", "Severity", "Status"] execution_table = gr.Dataframe( headers=execution_headers, value=[], label="Execution History" ) incident_table = gr.Dataframe( headers=incident_headers, value=[], label="Incident History" ) export_text = gr.JSON( value={"status": "Export ready"}, label="Export Data" ) return (refresh_btn, clear_btn, export_btn, execution_table, incident_table, export_text) # ----------------------------- # Tab 5: Learning Engine # ----------------------------- def create_tab5_learning_engine() -> tuple: learning_graph = gr.Plot(label="RAG Memory Graph") graph_type = gr.Dropdown( choices=["Incident Patterns", "Action-Outcome Chains", "System Dependencies"], value="Incident Patterns", label="Graph Type" ) show_labels = gr.Checkbox(label="Show Labels", value=True) search_query = gr.Textbox(label="Search Patterns", placeholder="Enter pattern to search...") search_btn = gr.Button("πŸ” Search Patterns", variant="secondary") clear_btn_search = gr.Button("πŸ—‘οΈ Clear Search", variant="secondary") search_results = gr.JSON( value={"status": "Ready for search"}, label="Search Results" ) stats_display = gr.JSON( 
value={"patterns": 42, "incidents": 156, "success_rate": "87.3%"}, label="Learning Statistics" ) patterns_display = gr.JSON( value={"common_patterns": ["cache_storm", "db_pool", "memory_leak"]}, label="Pattern Library" ) performance_display = gr.JSON( value={"accuracy": "94.2%", "recall": "89.7%", "precision": "92.1%"}, label="Agent Performance" ) return (learning_graph, graph_type, show_labels, search_query, search_btn, clear_btn_search, search_results, stats_display, patterns_display, performance_display) # ----------------------------- # Footer # ----------------------------- def create_footer() -> gr.HTML: return gr.HTML("""

Agentic Reliability Framework Β© 2026

Production-grade multi-agent AI for autonomous system reliability intelligence

GitHub β€’ Demo β€’ PyPI β€’ Enterprise Inquiries
""")