"""
Gradio-only UI components for ARF.

Ensures full compatibility with app.py.
Updated with proper imports and error handling.
Now with real ARF installation detection.
Updated: added realism panel integration for Tab 1.

Every ``create_*`` function instantiates Gradio components, so the tab
builders are meant to be called inside the caller's layout context.
"""

import logging
from typing import Dict, List, Any

import gradio as gr

logger = logging.getLogger(__name__)

# Try to import scenarios from registry first
try:
    from config.scenario_registry import ScenarioRegistry
    INCIDENT_SCENARIOS = ScenarioRegistry.load_scenarios()
    logger.info("Loaded %d scenarios from registry", len(INCIDENT_SCENARIOS))
except ImportError:
    logger.warning("Scenario registry not available, falling back to demo scenarios")
    from demo.scenarios import INCIDENT_SCENARIOS


# -----------------------------
# Header & Status - UPDATED WITH INSTALLATION CHECK
# -----------------------------
def create_header(version="3.3.9") -> gr.HTML:
    """Return the page header as a ``gr.HTML`` component.

    Args:
        version: Version string interpolated into the header text.
    """
    return gr.HTML(f"""
Agentic Reliability Framework

v{version} (OSS + Enterprise Edition)

Production-grade multi-agent AI for autonomous system reliability intelligence

πŸ—οΈ Architecture: OSS advises β†’ Enterprise executes
""")


def create_status_bar() -> gr.HTML:
    """Return the static status bar as a ``gr.HTML`` component."""
    return gr.HTML("""
βœ… System Online βœ… ARF OSS v3.3.9 🏒 Enterprise Edition πŸ“ˆ ROI: 5.2Γ—
""")


# -----------------------------
# Tab 1: Live Incident Demo - UPDATED WITH REALISM PANEL
# -----------------------------
def _agent_card_html(icon: str, name: str) -> str:
    """Build the markup of one inactive agent status card."""
    return f"""
{icon}

{name}

Click "Run OSS Analysis" to activate

Status: Inactive
WAITING
"""


def _metric_card_html(icon: str, title: str, value: str, note: str) -> str:
    """Build the markup of one performance metric card."""
    return f"""
{icon}

{title}

{value}

{note}

"""


def create_tab1_incident_demo(scenarios=INCIDENT_SCENARIOS, default_scenario="Cache Miss Storm") -> tuple:
    """
    Create the incident demo tab (OSS analysis and Enterprise execution workflow).

    The tab is a single row with three columns: scenario selection and plots,
    the agent workflow with the realism panel, and results and metrics.

    Args:
        scenarios: Mapping of scenario name to scenario data. Only the
            ``severity``, ``component``, ``metrics`` and ``business_impact``
            entries of the default scenario are read here, each with a fallback.
        default_scenario: Name pre-selected in the dropdown and used to fill
            the scenario card. An unknown name yields a card built from the
            fallback values.

    Returns:
        A 24-tuple of components in this order: scenario_dropdown,
        scenario_card, telemetry_viz, impact_viz, workflow_header,
        detection_agent, recall_agent, decision_agent, oss_section,
        enterprise_section, oss_btn, enterprise_btn, approval_toggle,
        mcp_mode, timeline_viz, realism_panel, detection_time, mttr,
        auto_heal, savings, oss_results_display, enterprise_results_display,
        approval_display, demo_btn.
    """
    # Get the default scenario data
    default_scenario_data = scenarios.get(default_scenario, {})
    business_impact = default_scenario_data.get("business_impact", {})
    metrics = default_scenario_data.get("metrics", {})

    # Keep the fractional part: truncating to int before applying ".1f"
    # always rendered ".0" (e.g. 6.375 was shown as "6.0K").
    estimated_savings_k = business_impact.get('revenue_loss_per_hour', 8500) * 0.85 / 1000

    with gr.Row():
        # Left Column: Scenario Selection & Live Visualization
        with gr.Column(scale=1, variant="panel") as left_col:
            # Scenario Selection with rich preview
            scenario_dropdown = gr.Dropdown(
                choices=list(scenarios.keys()),
                value=default_scenario,
                label="🎯 Select Incident Scenario",
                info="Choose a production incident to analyze",
                interactive=True,
                container=False
            )

            # Scenario Card with rich information - USING INLINE STYLES
            scenario_card = gr.HTML(f"""

🚨 {default_scenario}

{default_scenario_data.get('severity', 'HIGH')}
Component: {default_scenario_data.get('component', 'Unknown').replace('_', ' ').title()}
Affected Users: {metrics.get('affected_users', 'Unknown')}
Revenue Risk: ${business_impact.get('revenue_loss_per_hour', 0):,}/hour
Detection Time: 45 seconds (ARF AI)
{default_scenario_data.get('component', 'unknown').split('_')[0]} {default_scenario_data.get('severity', 'high').lower()} production incident
""")

            # Visualization section - USING gr.Plot() FOR PLOTLY FIGURES
            with gr.Row():
                with gr.Column(scale=1):
                    telemetry_header = gr.Markdown("### πŸ“ˆ Live Telemetry")
                    # This expects a Plotly figure from app.py
                    telemetry_viz = gr.Plot(
                        label="",
                        show_label=False,
                        elem_id="telemetry_plot"
                    )

                with gr.Column(scale=1):
                    impact_header = gr.Markdown("### πŸ’° Business Impact")
                    impact_viz = gr.Plot(
                        label="",
                        show_label=False,
                        elem_id="impact_plot"
                    )

        # Middle Column: Agent Workflow
        with gr.Column(scale=2, variant="panel") as middle_col:
            # Agent Workflow Header
            workflow_header = gr.Markdown("## πŸ”„ ARF Agent Workflow")
            workflow_subheader = gr.Markdown("### How ARF transforms incidents into autonomous healing")

            # Agent Status Cards - USING INLINE STYLES
            with gr.Row():
                detection_agent = gr.HTML(_agent_card_html("πŸ•΅οΈβ€β™‚οΈ", "Detection Agent"))
                recall_agent = gr.HTML(_agent_card_html("🧠", "Recall Agent"))
                decision_agent = gr.HTML(_agent_card_html("🎯", "Decision Agent"))

            # OSS vs Enterprise Boundary Visualization
            boundary_header = gr.Markdown("### 🎭 OSS vs Enterprise: The Safety Boundary")

            with gr.Row():
                oss_section = gr.HTML("""
πŸ†“

OSS Edition

Apache 2.0

Analysis & Advisory Only - No execution, permanently safe

πŸ“ Healing Intent Created

94% confidence

Action: Scale Redis cluster from 3 to 5 nodes

Pattern Match: Similar incident resolved with scaling (87% success rate)

Safety Check: βœ… Passed (blast radius: 2 services)

Estimated Impact: Reduce MTTR from 45min to 12min

🚫 OSS STOPS HERE - No execution
""")

                enterprise_section = gr.HTML("""
πŸ’°

Enterprise Edition

Commercial

Full Execution & Learning - Autonomous healing with safety guarantees

⚑ Ready to Execute

AUTONOMOUS

Mode: Autonomous (Requires Enterprise license)

Expected Recovery: 12 minutes (vs 45 min manual)

Cost Saved: $6,375

Users Protected: 45,000 β†’ 0 impacted

βœ… Enterprise executes with MCP safety
""")

            # Execution Controls
            with gr.Row():
                with gr.Column(scale=1):
                    oss_btn = gr.Button(
                        "πŸ†“ Run OSS Analysis",
                        variant="secondary",
                        size="lg"
                    )
                    oss_info = gr.Markdown("*Free, open-source analysis*")

                with gr.Column(scale=1):
                    enterprise_btn = gr.Button(
                        "πŸ’° Execute Enterprise Healing",
                        variant="primary",
                        size="lg"
                    )
                    enterprise_info = gr.Markdown("*Requires Enterprise license*")

            # Mode Selection & Safety Controls
            with gr.Row():
                with gr.Column(scale=1):
                    approval_toggle = gr.CheckboxGroup(
                        choices=["πŸ‘€ Require Human Approval"],
                        label="Safety Controls",
                        value=[],
                        info="Toggle human oversight"
                    )

                with gr.Column(scale=2):
                    mcp_mode = gr.Radio(
                        choices=["πŸ›‘οΈ Advisory (OSS Only)", "πŸ‘₯ Approval", "⚑ Autonomous"],
                        value="πŸ›‘οΈ Advisory (OSS Only)",
                        label="MCP Safety Mode",
                        info="Control execution safety level",
                        interactive=True
                    )

            # Timeline visualization
            timeline_header = gr.Markdown("### ⏰ Incident Timeline")
            timeline_viz = gr.Plot(
                label="",
                show_label=False,
                elem_id="timeline_plot"
            )

            # ============ NEW: Realism Panel Section ============
            realism_header = gr.Markdown("### 🎭 Realism: Trade-offs & Uncertainty")
            realism_panel = gr.HTML(
                value="""
πŸ”§

Realism Panel

Select a scenario to see ranked actions, risks, and trade-offs

""",
                elem_id="realism_panel"
            )

        # Right Column: Results & Metrics
        with gr.Column(scale=1, variant="panel") as right_col:
            # Real-time Metrics Dashboard
            metrics_header = gr.Markdown("## πŸ“Š Performance Metrics")

            # Metric Cards Grid - USING INLINE STYLES
            with gr.Row():
                detection_time = gr.HTML(_metric_card_html(
                    "⏱️", "Detection Time", "45s", "↓ 89% faster than average"
                ))
                mttr = gr.HTML(_metric_card_html(
                    "⚑", "Mean Time to Resolve", "12m", "↓ 73% faster than manual"
                ))

            with gr.Row():
                auto_heal = gr.HTML(_metric_card_html(
                    "πŸ€–", "Auto-Heal Rate", "81.7%", "↑ 5.4Γ— industry average"
                ))
                savings = gr.HTML(_metric_card_html(
                    "πŸ’°", "Cost Saved", f"${estimated_savings_k:.1f}K", "Per incident avoided"
                ))

            # Results Display Areas
            oss_results_header = gr.Markdown("### πŸ†“ OSS Analysis Results")
            oss_results_display = gr.JSON(
                label="",
                value={
                    "status": "Analysis Pending",
                    "agents": ["Detection", "Recall", "Decision"],
                    "mode": "Advisory Only",
                    "action": "Generate HealingIntent"
                },
                height=200
            )

            enterprise_results_header = gr.Markdown("### πŸ’° Enterprise Results")
            enterprise_results_display = gr.JSON(
                label="",
                value={
                    "status": "Execution Pending",
                    "requires_license": True,
                    "available_modes": ["Approval", "Autonomous"],
                    "expected_outcome": "12m MTTR, $6.3K saved"
                },
                height=200
            )

            # Approval Status - USING INLINE STYLES
            approval_display = gr.HTML("""

πŸ‘€ Human Approval Status

Not Required

Current Mode: Advisory (OSS Only)

Switch to "Approval" mode to enable human-in-the-loop workflows

1. ARF generates intent
2. Human reviews & approves
3. ARF executes safely
""")

            # Demo Actions
            demo_btn = gr.Button(
                "▢️ Run Complete Demo Walkthrough",
                variant="secondary",
                size="lg"
            )
            demo_info = gr.Markdown("*Experience the full ARF workflow from detection to resolution*")

    return (
        # Left column returns
        scenario_dropdown, scenario_card, telemetry_viz, impact_viz,

        # Middle column returns (WITH REALISM PANEL)
        workflow_header, detection_agent, recall_agent, decision_agent,
        oss_section, enterprise_section, oss_btn, enterprise_btn,
        approval_toggle, mcp_mode, timeline_viz, realism_panel,

        # Right column returns
        detection_time, mttr, auto_heal, savings,
        oss_results_display, enterprise_results_display, approval_display, demo_btn
    )


# -----------------------------
# Tab 2: Business ROI - Updated
# -----------------------------
def create_tab2_business_roi(scenarios=INCIDENT_SCENARIOS) -> tuple:
    """Create the components of the business ROI tab.

    Args:
        scenarios: Mapping whose keys populate the scenario dropdown.

    Returns:
        ``(dashboard_output, roi_scenario_dropdown, monthly_slider,
        team_slider, calculate_btn, roi_output, roi_chart)``.
    """
    dashboard_output = gr.Plot(label="Executive Dashboard", show_label=True)

    # Prefer the usual default, but never pre-select a name that is not
    # among the choices when a caller supplies a different scenario set.
    scenario_names = list(scenarios.keys())
    if "Cache Miss Storm" in scenarios:
        initial_scenario = "Cache Miss Storm"
    elif scenario_names:
        initial_scenario = scenario_names[0]
    else:
        initial_scenario = None

    roi_scenario_dropdown = gr.Dropdown(
        choices=scenario_names,
        value=initial_scenario,
        label="Scenario for ROI Analysis",
        info="Select the primary incident type for ROI calculation"
    )

    monthly_slider = gr.Slider(
        minimum=1,
        maximum=50,
        value=15,
        step=1,
        label="Monthly Incidents",
        info="Average number of incidents per month"
    )

    team_slider = gr.Slider(
        minimum=1,
        maximum=50,
        value=5,
        step=1,
        label="Team Size",
        info="Number of engineers on reliability team"
    )

    calculate_btn = gr.Button("πŸ“Š Calculate Comprehensive ROI", variant="primary", size="lg")
    roi_output = gr.JSON(label="ROI Analysis Results", value={})
    roi_chart = gr.Plot(label="ROI Visualization")

    return (dashboard_output, roi_scenario_dropdown, monthly_slider, team_slider,
            calculate_btn, roi_output, roi_chart)


# -----------------------------
# Tab 3: Enterprise Features - UPDATED WITH INSTALLATION STATUS
# -----------------------------
def create_tab3_enterprise_features() -> tuple:
    """Create the components of the enterprise features tab.

    The installation status is read from ``app.get_installation_status``.
    When that import fails, or the returned mapping lacks an expected key,
    static fallback data is shown instead.

    Returns:
        ``(license_display, validate_btn, trial_btn, upgrade_btn, mcp_mode,
        mcp_mode_info, features_table, integrations_table)``.
    """
    # Get installation status
    try:
        # Imported lazily inside the function rather than at module load.
        from app import get_installation_status
        installation = get_installation_status()
        enterprise_status = "βœ… Available" if installation["enterprise_installed"] else "πŸ”’ Enterprise"

        license_data = {
            "status": "βœ… OSS Installed" if installation["oss_installed"] else "⚠️ OSS Not Installed",
            "oss_version": installation["oss_version"] or "Not installed",
            "enterprise_installed": installation["enterprise_installed"],
            "enterprise_version": installation["enterprise_version"] or "Not installed",
            "execution_allowed": installation["execution_allowed"],
            "recommendations": installation["recommendations"],
            "badges": installation["badges"]
        }

        # Update features table based on installation
        features_data = [
            ["ARF OSS Package", "βœ… Installed" if installation["oss_installed"] else "❌ Not Installed", "OSS"],
            ["Self-Healing Core", "βœ… Active", "Enterprise"],
            ["RAG Graph Memory", "βœ… Active", "Both"],
            ["Predictive Analytics", enterprise_status, "Enterprise"],
            ["Audit Trail", enterprise_status, "Enterprise"],
            ["Compliance (SOC2)", enterprise_status, "Enterprise"]
        ]

    except (ImportError, KeyError) as exc:
        # Fallback if installation check fails. KeyError covers a status
        # mapping that is missing one of the keys read above.
        logger.warning("Installation status unavailable, using fallback data: %r", exc)
        license_data = {
            "status": "⚠️ Installation Check Failed",
            "oss_version": "Unknown",
            "enterprise_installed": False,
            "recommendations": ["Run installation check"]
        }

        features_data = [
            ["Self-Healing Core", "βœ… Active", "Enterprise"],
            ["RAG Graph Memory", "βœ… Active", "Both"],
            ["Predictive Analytics", "πŸ”’ Enterprise", "Enterprise"],
            ["Audit Trail", "πŸ”’ Enterprise", "Enterprise"],
            ["Compliance (SOC2)", "πŸ”’ Enterprise", "Enterprise"],
            ["Multi-Cloud", "πŸ”’ Enterprise", "Enterprise"]
        ]

    license_display = gr.JSON(
        value=license_data,
        label="πŸ“¦ Package Installation Status"
    )

    validate_btn = gr.Button("πŸ” Validate Installation", variant="secondary")
    trial_btn = gr.Button("πŸ†“ Start 30-Day Trial", variant="secondary")
    upgrade_btn = gr.Button("πŸš€ Upgrade to Enterprise", variant="primary")

    mcp_mode = gr.Dropdown(
        choices=["advisory", "approval", "autonomous"],
        value="advisory",
        label="MCP Safety Mode"
    )

    # Initial mode info
    # NOTE(review): "package" pins 3.3.7 while the header and status bar
    # say 3.3.9 - confirm which version is intended.
    mcp_mode_info = gr.JSON(
        value={
            "current_mode": "advisory",
            "description": "OSS Edition - Analysis only, no execution",
            "features": ["Incident analysis", "RAG similarity", "HealingIntent creation"],
            "package": "agentic-reliability-framework==3.3.7",
            "license": "Apache 2.0"
        },
        label="Mode Details"
    )

    integrations_data = [
        ["Prometheus", "βœ… Connected", "Monitoring"],
        ["Grafana", "βœ… Connected", "Visualization"],
        ["Slack", "πŸ”’ Enterprise", "Notifications"],
        ["PagerDuty", "πŸ”’ Enterprise", "Alerting"],
        ["Jira", "πŸ”’ Enterprise", "Ticketing"],
        ["Datadog", "πŸ”’ Enterprise", "Monitoring"]
    ]

    features_table = gr.Dataframe(
        headers=["Feature", "Status", "Edition"],
        value=features_data,
        label="Feature Comparison"
    )

    integrations_table = gr.Dataframe(
        headers=["Integration", "Status", "Type"],
        value=integrations_data,
        label="Integration Status"
    )

    return (license_display, validate_btn, trial_btn, upgrade_btn,
            mcp_mode, mcp_mode_info, features_table, integrations_table)


# -----------------------------
# Tab 4: Audit Trail
# -----------------------------
def create_tab4_audit_trail() -> tuple:
    """Create the components of the audit trail tab.

    Returns:
        ``(refresh_btn, clear_btn, export_btn, execution_table,
        incident_table, export_text)``. Both tables start empty.
    """
    refresh_btn = gr.Button("πŸ”„ Refresh Audit Trail", variant="secondary")
    clear_btn = gr.Button("πŸ—‘οΈ Clear History", variant="secondary")
    export_btn = gr.Button("πŸ“₯ Export as JSON", variant="primary")

    execution_headers = ["Time", "Scenario", "Mode", "Status", "Savings", "Details"]
    incident_headers = ["Time", "Component", "Scenario", "Severity", "Status"]

    execution_table = gr.Dataframe(
        headers=execution_headers,
        value=[],
        label="Execution History"
    )

    incident_table = gr.Dataframe(
        headers=incident_headers,
        value=[],
        label="Incident History"
    )

    export_text = gr.JSON(
        value={"status": "Export ready"},
        label="Export Data"
    )

    return (refresh_btn, clear_btn, export_btn, execution_table, incident_table, export_text)


# -----------------------------
# Tab 5: Learning Engine
# -----------------------------
def create_tab5_learning_engine() -> tuple:
    """Create the components of the learning engine tab.

    Returns:
        ``(learning_graph, graph_type, show_labels, search_query, search_btn,
        clear_btn_search, search_results, stats_display, patterns_display,
        performance_display)``. The JSON panels start with fixed values.
    """
    learning_graph = gr.Plot(label="RAG Memory Graph")
    graph_type = gr.Dropdown(
        choices=["Incident Patterns", "Action-Outcome Chains", "System Dependencies"],
        value="Incident Patterns",
        label="Graph Type"
    )
    show_labels = gr.Checkbox(label="Show Labels", value=True)

    search_query = gr.Textbox(label="Search Patterns", placeholder="Enter pattern to search...")
    search_btn = gr.Button("πŸ” Search Patterns", variant="secondary")
    clear_btn_search = gr.Button("πŸ—‘οΈ Clear Search", variant="secondary")

    search_results = gr.JSON(
        value={"status": "Ready for search"},
        label="Search Results"
    )

    stats_display = gr.JSON(
        value={"patterns": 42, "incidents": 156, "success_rate": "87.3%"},
        label="Learning Statistics"
    )

    patterns_display = gr.JSON(
        value={"common_patterns": ["cache_storm", "db_pool", "memory_leak"]},
        label="Pattern Library"
    )

    performance_display = gr.JSON(
        value={"accuracy": "94.2%", "recall": "89.7%", "precision": "92.1%"},
        label="Agent Performance"
    )

    return (learning_graph, graph_type, show_labels, search_query, search_btn,
            clear_btn_search, search_results, stats_display, patterns_display, performance_display)


# -----------------------------
# Realism Panel Component
# -----------------------------
# (action key, label) pairs for the optional notes of a ranked action,
# in display order.
_ACTION_NOTE_FIELDS = (
    ("risk", "⚠️ Risk: "),
    ("tradeoff", "πŸ”„ Trade-off: "),
    ("execution_time", "⏱️ Execution: "),
    ("rejection_reason", "🚫 Rejected: "),
    ("safety_override", "πŸ›‘οΈ Safety: "),
)


def _rank_status(rank) -> str:
    """Map an action rank to its label: 1 recommended, 2 secondary, else rejected."""
    if rank == 1:
        return "βœ… RECOMMENDED"
    if rank == 2:
        return "🟑 SECONDARY"
    return "πŸ”΄ REJECTED"


def _optional_action_note(action: Dict, key: str, label: str) -> str:
    """Render one optional note of an action, or "" when it is absent or empty."""
    value = action.get(key)
    if not value:
        return ""
    return f"\n{label}{value}\n"


def _render_ranked_action(action: Dict) -> str:
    """Render one ranked action; ``rank``, ``confidence`` and ``action`` are required keys."""
    status = _rank_status(action["rank"])
    # Absent notes still contribute an empty slot, so the separators stay put.
    notes = " ".join(
        _optional_action_note(action, key, label) for key, label in _ACTION_NOTE_FIELDS
    )
    return f"""
{action['rank']}
{status} β€’ {action['confidence']}% confidence

{action['action']}

{action['confidence']}%
Rationale: {action.get('rationale', 'No rationale provided')}
{notes}
"""


def _render_hypotheses(hypotheses: List[Dict]) -> str:
    """Render the competing-hypotheses section, or "" when there are none."""
    if not hypotheses:
        return ""
    items = "".join(
        f"""
?
{hypo['cause']} ({hypo['confidence']}%)
{hypo['evidence']}
Investigation: {hypo['investigation_path']}
{hypo['confidence']}%
"""
        for hypo in hypotheses
    )
    return """
🧠 Competing Hypotheses
""" + items + "\n"


def _render_risk_assessment(risk_assessment: Dict) -> str:
    """Render the risk-assessment section, or "" when the mapping is empty."""
    if not risk_assessment:
        return ""
    items = "".join(
        f"""
{key.replace('_', ' ').title()}
{value}
"""
        for key, value in risk_assessment.items()
    )
    return """
⚠️ Risk Assessment
""" + items + "\n"


def _render_confidence_degradation(conf: Dict) -> str:
    """Render the confidence-degradation section, or "" when no data is given."""
    if not conf:
        return ""
    return f"""
⏱️ Confidence Degradation Over Time
{conf['initial']}%
Initial Confidence
β†’
{conf['after_8_min']}%
After 8 minutes
β†’
{conf['after_15_min']}%
After 15 minutes
Escalation at {conf['escalation_threshold']}%
⚠️ ARF escalates to human operators when confidence drops below {conf['escalation_threshold']}%
This prevents autonomous execution in high-uncertainty scenarios
"""


def create_realism_panel(scenario_data: Dict, scenario_name: str) -> gr.HTML:
    """
    Create a realism panel showing ranked actions, risks, and uncertainty.

    Args:
        scenario_data: Scenario mapping. Its optional ``realism`` entry may
            hold ``ranked_actions``, ``competing_hypotheses``,
            ``risk_assessment`` and ``confidence_degradation``; a missing or
            empty section is left out of the panel.
        scenario_name: Accepted for interface compatibility; not read here.

    Returns:
        A ``gr.HTML`` component with the combined panel. The safety
        boundaries and ROI range sections are always present.

    Raises:
        KeyError: If a present section lacks one of its required keys
            (for a ranked action: ``rank``, ``confidence``, ``action``).
    """
    realism = scenario_data.get("realism", {})
    ranked_actions = realism.get("ranked_actions", [])

    # Build ranked actions HTML
    actions_html = "".join(_render_ranked_action(action) for action in ranked_actions)

    # Build competing hypotheses (for Network Partition scenario)
    hypotheses_html = _render_hypotheses(realism.get("competing_hypotheses"))

    # Build risk assessment panel
    risk_html = _render_risk_assessment(realism.get("risk_assessment"))

    # Build confidence degradation panel
    confidence_html = _render_confidence_degradation(realism.get("confidence_degradation"))

    # Build "What ARF Will NOT Do" panel (global)
    wont_do_html = """
🚫 What ARF Will NOT Do (Safety Boundaries)
β›”
Restart stateful leaders
During peak traffic or elections
β›”
Apply schema changes
To production databases autonomously
β›”
Exceed API limits
Contractual or rate limits
β›”
Modify ACLs/RBAC
Security permissions autonomously
These boundaries ensure ARF operates within safe, reversible limits. Enterprise edition adds approval workflows for edge cases.
"""

    # Combine all panels
    full_html = f"""

🎯 Ranked Healing Intents

ARF evaluates multiple options with confidence scores and risk assessments

REALISM UPGRADE v3.3.9+
{actions_html} {hypotheses_html} {risk_html} {confidence_html} {wont_do_html}
πŸ“ˆ Realistic ROI Estimates (Ranges)
$5.8K – $7.2K
Cost Avoided
Estimated range
4.8Γ— – 5.6Γ—
ROI Multiplier
Confidence interval
68% – 87%
Success Rate
Based on similar incidents
πŸ“Š Real systems have ranges, not single-point estimates. ARF shows uncertainty honestly.
🎭
What Senior SREs Expect at 3 a.m.
"Real systems hesitate. Real systems explain risk. Real systems earn trust. ARF shows multiple options with confidence scores because in production, there's never a single perfect answerβ€”just trade-offs managed carefully."
"""

    return gr.HTML(full_html)


# -----------------------------
# Footer
# -----------------------------
def create_footer() -> gr.HTML:
    """Return the static page footer as a ``gr.HTML`` component."""
    return gr.HTML("""

Agentic Reliability Framework Β© 2026

Production-grade multi-agent AI for autonomous system reliability intelligence

GitHub β€’ Demo β€’ PyPI β€’ Enterprise Inquiries
""")