| import gradio as gr |
| import asyncio |
| import json |
| import logging |
| import traceback |
| import os |
| import numpy as np |
| import pandas as pd |
| from datetime import datetime |
| from typing import Dict, Any, List, Optional |
|
|
| |
| |
| |
| import plotly.graph_objects as go |
| from plotly.subplots import make_subplots |
|
|
| |
| |
| |
# Configure the root logger once at import time (INFO level, timestamped records).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Module-level logger used by the handlers defined in this file.
logger = logging.getLogger(__name__)
|
|
| |
| |
| |
| from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine, HealingPolicy |
| from agentic_reliability_framework.core.governance.risk_engine import RiskEngine, ActionCategory |
| from agentic_reliability_framework.core.governance.intents import ( |
| InfrastructureIntent, ProvisionResourceIntent, ResourceType, Environment |
| ) |
| from agentic_reliability_framework.core.adapters.azure.azure_simulator import AzureInfrastructureSimulator |
| from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction, EventSeverity |
| from agentic_reliability_framework.runtime.hmc.hmc_learner import HMCRiskLearner |
| from agentic_reliability_framework.core.config.constants import ( |
| LATENCY_CRITICAL, ERROR_RATE_HIGH, get_oss_capabilities, |
| RISK_THRESHOLD_LOW, RISK_THRESHOLD_HIGH |
| ) |
|
|
| |
| |
| |
# Fall back to hard-coded risk thresholds when the constants module does not
# export them.
# NOTE(review): the same two names are also imported unconditionally from the
# same module earlier in this file; if they were missing, that earlier import
# would already raise ImportError, so this fallback branch looks unreachable
# as written — confirm and keep only one of the two imports.
try:
    from agentic_reliability_framework.core.config.constants import RISK_THRESHOLD_LOW, RISK_THRESHOLD_HIGH
except ImportError:
    RISK_THRESHOLD_LOW = 0.2
    RISK_THRESHOLD_HIGH = 0.8
    logger.info("Using fallback risk thresholds (0.2/0.8)")
|
|
| |
| |
| |
# Shared service objects, created once at import time and reused by the handlers.
infra_sim = AzureInfrastructureSimulator()
policy_engine = PolicyEngine()
# NOTE(review): "hmc_model.json" is a relative path — presumably resolved
# against the process working directory; verify in RiskEngine. This object is
# not referenced again in the portion of the file shown here.
risk_engine = RiskEngine(hmc_model_path="hmc_model.json", use_hyperpriors=True)
|
|
| |
| |
| |
# In-memory rolling logs that feed the dashboard. They stay plain lists so the
# readers in this file can keep indexing, iterating and calling len() on them.
decision_history = []  # (iso_timestamp, decision_dict, risk) tuples
risk_history = []  # (iso_timestamp, risk) tuples

# Maximum number of entries retained in each history list.
_HISTORY_LIMIT = 100


def update_dashboard_data(decision: Dict, risk: float):
    """Record one control decision and its risk score for the dashboard.

    Appends an entry to both ``decision_history`` and ``risk_history`` and
    keeps only the newest ``_HISTORY_LIMIT`` entries of each.

    Args:
        decision: Decision payload to store; it is kept by reference.
        risk: Risk score associated with the decision.
    """
    # Take the timestamp once so the two logs agree on when the event
    # happened (two separate clock reads can differ by microseconds).
    recorded_at = datetime.utcnow().isoformat()
    decision_history.append((recorded_at, decision, risk))
    risk_history.append((recorded_at, risk))

    # Trim in place (never rebind) so other references to these lists stay
    # valid; the slice is empty while the list is within the limit.
    del decision_history[:-_HISTORY_LIMIT]
    del risk_history[:-_HISTORY_LIMIT]
|
|
| |
| |
| |
def evaluate_policies(event_type: str, severity: str, component: str) -> Dict[str, Any]:
    """Run the policy engine against a placeholder event and summarise the outcome.

    A ``ReliabilityEvent`` with zero latency, zero error rate and unit
    throughput is built for ``component`` at the requested ``severity`` and
    handed to ``policy_engine.evaluate_policies``.

    Args:
        event_type: Label echoed back in the result; it is not passed to the
            policy engine.
        severity: Value used to construct ``EventSeverity``.
        component: Component name placed on the event and echoed back.

    Returns:
        On success, a dict with ``timestamp``, ``event_type``, ``severity``,
        ``component``, ``recommended_actions`` (values of every action other
        than ``NO_ACTION``) and ``governance_status`` (``"approved"`` when the
        first returned action is not ``NO_ACTION``, otherwise ``"blocked"``).
        On any exception, a dict with ``error``, ``governance_status`` set to
        ``"error"`` and an empty ``recommended_actions`` list.
    """
    try:
        probe_event = ReliabilityEvent(
            component=component,
            latency_p99=0.0,
            error_rate=0.0,
            throughput=1.0,
            severity=EventSeverity(severity),
        )
        matched = policy_engine.evaluate_policies(probe_event)

        result = {
            "timestamp": datetime.utcnow().isoformat(),
            "event_type": event_type,
            "severity": severity,
            "component": component,
        }
        result["recommended_actions"] = [
            action.value for action in matched if action != HealingAction.NO_ACTION
        ]
        # Only the first action decides the status.
        if matched and matched[0] != HealingAction.NO_ACTION:
            result["governance_status"] = "approved"
        else:
            result["governance_status"] = "blocked"
        return result
    except Exception as e:
        logger.error(f"Policy evaluation error: {e}")
        return {
            "error": str(e),
            "governance_status": "error",
            "recommended_actions": [],
        }
|
|
| |
| |
| |
def autonomous_control_decision(analysis_result: Dict[str, Any], risk_threshold: float = 0.7) -> Dict[str, Any]:
    """Classify an analysis result into a risk band and decide on approval.

    The bands come from the module-level ``RISK_THRESHOLD_LOW`` and
    ``RISK_THRESHOLD_HIGH`` constants:

    * ``risk`` or ``risk_p95`` above the high threshold -> ``"high"``, not approved
    * ``risk`` below the low threshold -> ``"low"``, approved
    * anything else -> ``"medium"``, not approved

    Healing actions from ``analysis_result["healing_actions"]`` are attached
    only for the ``"high"`` band. Every call also records the decision through
    ``update_dashboard_data``.

    Args:
        analysis_result: Mapping read for ``"risk"`` (default 0.5),
            ``"risk_p95"`` (defaults to the risk value) and the optional
            ``"healing_actions"``.
        risk_threshold: Accepted for backward compatibility only; it is not
            consulted by the decision logic.

    Returns:
        Dict with ``timestamp``, ``approved``, ``actions``, ``reason`` and
        ``risk_level``. If the decision logic raises, ``approved`` stays
        ``False``, ``risk_level`` stays ``"unknown"`` and ``reason`` carries
        the error text.
    """
    decision = {
        "timestamp": datetime.utcnow().isoformat(),
        "approved": False,
        "actions": [],
        "reason": "",
        "risk_level": "unknown"
    }

    try:
        risk = analysis_result.get("risk", 0.5)
        p95 = analysis_result.get("risk_p95", risk)

        if risk > RISK_THRESHOLD_HIGH or p95 > RISK_THRESHOLD_HIGH:
            decision["risk_level"] = "high"
            decision["approved"] = False
            decision["reason"] = f"Risk exceeds high threshold ({RISK_THRESHOLD_HIGH})"
        elif risk < RISK_THRESHOLD_LOW:
            decision["risk_level"] = "low"
            decision["approved"] = True
            decision["reason"] = "Risk within acceptable limits"
        else:
            decision["risk_level"] = "medium"
            decision["approved"] = False
            decision["reason"] = f"Risk in escalation zone ({RISK_THRESHOLD_LOW}-{RISK_THRESHOLD_HIGH})"

        if decision["risk_level"] == "high" and "healing_actions" in analysis_result:
            decision["actions"] = analysis_result["healing_actions"]

    except Exception as e:
        logger.error(f"Control decision error: {e}")
        decision["reason"] = f"Error in decision process: {str(e)}"

    # The dashboard averages the recorded risks, so one None or non-numeric
    # entry would break every later refresh. Record a plain float and fall
    # back to the neutral default (0.5) when the value is unusable or when
    # analysis_result is not a mapping at all.
    try:
        recorded_risk = float(analysis_result.get("risk", 0.5))
    except (AttributeError, TypeError, ValueError):
        recorded_risk = 0.5
    if recorded_risk != recorded_risk:  # NaN is the only value unequal to itself
        recorded_risk = 0.5

    update_dashboard_data(decision, recorded_risk)
    return decision
|
|
| |
| |
| |
async def handle_infra_with_governance(fault_type: str, context_window: int, session_state: Dict) -> tuple:
    """Evaluate a VM provisioning intent and attach governance results.

    ``"none"`` produces a development ``Standard_D2s_v3`` request at ``"low"``
    severity. Any other fault produces a production ``custom_extra_large``
    request at ``"high"`` severity for ``"cascade"`` and ``"medium"``
    otherwise. The intent is scored by ``infra_sim``, policies are evaluated,
    and the control-plane decision is computed from the combined analysis.

    Args:
        fault_type: Selected fault scenario.
        context_window: Accepted but not used by this function.
        session_state: Returned unchanged as the second tuple element.

    Returns:
        ``(payload, session_state)``. ``payload`` is the analysis dict plus a
        ``"governance"`` entry; on any exception it instead holds ``error``,
        ``traceback`` and a ``governance`` policy evaluation for a
        ``"critical"`` ``"system"`` event.
    """
    try:
        if fault_type == "none":
            target_env, vm_size, severity = Environment.DEVELOPMENT, "Standard_D2s_v3", "low"
        else:
            target_env, vm_size = Environment.PRODUCTION, "custom_extra_large"
            severity = "high" if fault_type == "cascade" else "medium"

        intent = ProvisionResourceIntent(
            resource_type=ResourceType.VM,
            environment=target_env,
            size=vm_size,
        )

        healing_intent = infra_sim.evaluate_intent(intent)
        risk = healing_intent.risk_score

        # Prefer the p95 from the hyperprior summary; otherwise reuse the point risk.
        if healing_intent.risk_contributions:
            hyper_summary = healing_intent.risk_contributions.get("hyper_summary", {})
            risk_p95 = hyper_summary.get("p95", risk)
        else:
            risk_p95 = risk

        policy_result = evaluate_policies("infrastructure_failure", severity, "azure")

        analysis_result = {
            "intent": intent.dict(),
            "healing_intent": healing_intent.dict(),
            "risk": risk,
            "risk_p95": risk_p95,
            "decision": healing_intent.decision,
            "justification": healing_intent.justification,
            "policy_violations": healing_intent.policy_violations,
            "healing_actions": (
                [action.value for action in healing_intent.recommended_actions]
                if healing_intent.recommended_actions
                else []
            ),
            "risk_contributions": healing_intent.risk_contributions,
        }

        control_decision = autonomous_control_decision(analysis_result)

        governance = {
            "policy_evaluation": policy_result,
            "control_plane_decision": control_decision,
        }
        return {**analysis_result, "governance": governance}, session_state

    except Exception as e:
        logger.error(f"Infra task error: {e}", exc_info=True)
        failure = {
            "error": str(e),
            "traceback": traceback.format_exc(),
            "governance": evaluate_policies("infrastructure_failure", "critical", "system"),
        }
        return failure, session_state
|
|
| |
| |
| |
def run_hmc(samples: int, warmup: int) -> tuple:
    """Fit ``HMCRiskLearner`` on seeded synthetic telemetry and chart the posterior.

    Reseeds NumPy's global RNG with 42 on every call, draws 200 synthetic
    rows, trains the learner with two chains, and builds a trace plot and a
    pair plot from up to 1000 flattened posterior values of ``alpha`` and
    ``beta``.

    Args:
        samples: Forwarded to ``learner.train`` as ``draws``.
        warmup: Forwarded to ``learner.train`` as ``tune``.

    Returns:
        ``(summary, trace_figure, pair_figure)``. ``summary`` holds the
        learner's feature importances plus a ``"sample_prediction"`` entry for
        one fixed metrics vector. On any exception the result is
        ``({"error": <message>}, None, None)``.
    """

    def _synthetic_row() -> Dict[str, Any]:
        # The order of the draws fixes the seeded data set; do not reorder.
        latency = np.random.exponential(200)
        error_rate = np.random.beta(1, 10)
        throughput = np.random.normal(1000, 200)
        cpu = np.random.uniform(0.2, 0.9)
        mem = np.random.uniform(0.3, 0.8)
        return {
            "latency_p99": latency,
            "error_rate": error_rate,
            "throughput": throughput,
            "cpu_util": cpu,
            "memory_util": mem,
            "target": int(latency > LATENCY_CRITICAL or error_rate > ERROR_RATE_HIGH),
        }

    try:
        np.random.seed(42)
        row_count = 200
        training_frame = pd.DataFrame([_synthetic_row() for _ in range(row_count)])

        learner = HMCRiskLearner()
        learner.train(training_frame.to_dict('records'), draws=samples, tune=warmup, chains=2)

        summary = dict(learner.get_feature_importance().items())

        # One fixed metrics vector, scored to illustrate a prediction.
        probe_metrics = {
            "latency_p99": 350,
            "error_rate": 0.08,
            "throughput": 900,
            "cpu_util": 0.7,
            "memory_util": 0.6,
        }
        summary["sample_prediction"] = learner.predict_risk_summary(probe_metrics)

        # Collect at most 1000 flattened posterior values per plotted variable.
        plotted_vars = ('alpha', 'beta')
        trace_data = {}
        if learner.trace is not None:
            for var in learner.trace.posterior.data_vars:
                if var in plotted_vars:
                    flat_values = learner.trace.posterior[var].values.flatten()
                    trace_data[var] = flat_values[:1000]

        fig_trace = go.Figure()
        for var_name, series in trace_data.items():
            fig_trace.add_trace(go.Scatter(y=series, mode='lines', name=var_name))
        fig_trace.update_layout(title="Posterior Traces", xaxis_title="Sample", yaxis_title="Value")

        if len(trace_data) > 0:
            df_trace = pd.DataFrame(trace_data)
            splom = go.Splom(
                dimensions=[dict(label=col, values=df_trace[col]) for col in df_trace.columns],
                showupperhalf=False,
            )
            fig_pair = go.Figure(data=splom)
            # NOTE(review): indentation was lost in the reviewed copy; the title
            # is assumed to be set only when posterior samples exist — confirm.
            fig_pair.update_layout(title="Posterior Pair Plot")
        else:
            fig_pair = go.Figure()

        return summary, fig_trace, fig_pair

    except Exception as e:
        logger.error(f"HMC analysis error: {e}", exc_info=True)
        return {"error": str(e)}, None, None
|
|
| |
| |
| |
def generate_risk_gauge():
    """Return a gauge figure showing the most recently recorded risk value.

    The gauge spans 0 to 1 and is banded green / yellow / red at
    ``RISK_THRESHOLD_LOW`` and ``RISK_THRESHOLD_HIGH``. An empty figure is
    returned while ``risk_history`` has no entries.
    """
    if not risk_history:
        return go.Figure()

    bands = [
        {'range': [0, RISK_THRESHOLD_LOW], 'color': "lightgreen"},
        {'range': [RISK_THRESHOLD_LOW, RISK_THRESHOLD_HIGH], 'color': "yellow"},
        {'range': [RISK_THRESHOLD_HIGH, 1], 'color': "red"},
    ]
    indicator = go.Indicator(
        mode="gauge+number",
        value=risk_history[-1][1],  # second field of the newest entry is the risk
        title={'text': "Current Risk"},
        gauge={
            'axis': {'range': [0, 1]},
            'bar': {'color': "darkblue"},
            'steps': bands,
        },
    )
    return go.Figure(indicator)
|
|
def generate_decision_pie():
    """Return a pie chart of approved versus blocked decisions.

    A decision counts as approved when its ``"approved"`` entry is truthy;
    every other recorded decision counts as blocked. An empty figure is
    returned while ``decision_history`` has no entries.
    """
    if not decision_history:
        return go.Figure()

    approved_count = 0
    for _, record, _ in decision_history:
        if record.get("approved", False):
            approved_count += 1
    blocked_count = len(decision_history) - approved_count

    chart = go.Figure(
        data=[go.Pie(labels=["Approved", "Blocked"], values=[approved_count, blocked_count])]
    )
    chart.update_layout(title="Policy Decisions")
    return chart
|
|
def generate_action_timeline():
    """Return a timeline plotting each recorded decision as 1 (approved) or 0.

    The x values are the ``"timestamp"`` entries stored inside each decision
    dict. An empty figure is returned while ``decision_history`` has no
    entries.
    """
    if not decision_history:
        return go.Figure()

    stamps = []
    outcomes = []
    for _, record, _ in decision_history:
        stamps.append(record["timestamp"])
        outcomes.append(1 if record.get("approved", False) else 0)

    timeline = go.Figure()
    timeline.add_trace(go.Scatter(x=stamps, y=outcomes, mode='markers+lines', name='Approvals'))
    timeline.update_layout(
        title="Autonomous Actions Timeline",
        xaxis_title="Time",
        yaxis_title="Approved (1) / Blocked (0)",
    )
    return timeline
|
|
def refresh_dashboard():
    """Recompute the dashboard statistics and rebuild its three figures.

    Returns:
        ``(stats, risk_gauge, decision_pie, action_timeline)`` where ``stats``
        holds the total, approved and blocked decision counts plus the mean
        recorded risk (0.5 while no risk has been recorded).
    """
    approved_count = len(
        [record for _, record, _ in decision_history if record.get("approved", False)]
    )
    total_count = len(decision_history)

    if risk_history:
        mean_risk = np.mean([score for _, score in risk_history])
    else:
        mean_risk = 0.5

    stats = {
        "total_decisions": total_count,
        "approved_actions": approved_count,
        "blocked_actions": total_count - approved_count,
        "average_risk": float(mean_risk),
    }
    return stats, generate_risk_gauge(), generate_decision_pie(), generate_action_timeline()
|
|
| |
| |
| |
# Capability/edition metadata from the OSS package, read once at import time
# and used by the status panel and the "Enterprise / OSS" tab of the UI.
oss_caps = get_oss_capabilities()
|
|
| |
| |
| |
# Gradio UI definition and event wiring.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been lost; component placement inside rows/columns/tabs
# is a best guess — confirm against the running app.
with gr.Blocks(title="ARF v4 – OSS Reliability Control Plane", theme="soft") as demo:
    gr.Markdown("""
# 🧠 ARF v4 – OSS Reliability Control Plane
**Deterministic Probability Thresholding & Hybrid Bayesian Inference**

This demo shows the OSS core of ARF:
- **Policy‑based Governance** – Automatic evaluation and enforcement (advisory mode)
- **Hybrid Risk Engine** – Conjugate priors + HMC + hyperpriors
- **Deterministic Thresholds** – Approve (<0.2), Escalate (0.2‑0.8), Deny (>0.8)
- **Hamiltonian Monte Carlo** – Offline pattern discovery (NUTS)
""")

    with gr.Tabs():
        # Tab 1: static system status, live statistics and the three dashboard plots.
        with gr.TabItem("Control Plane Dashboard"):
            gr.Markdown("### 🎮 OSS Control Plane")
            with gr.Row():
                with gr.Column():
                    system_status = gr.JSON(label="System Status", value={
                        "edition": oss_caps["edition"],
                        "version": oss_caps["version"],
                        "governance_mode": "advisory",
                        "policies_loaded": len(policy_engine.policies),
                        "risk_threshold_low": RISK_THRESHOLD_LOW,
                        "risk_threshold_high": RISK_THRESHOLD_HIGH
                    })
                with gr.Column():
                    # Initial placeholder values; replaced by refresh_dashboard().
                    control_stats = gr.JSON(label="Control Statistics", value={
                        "total_decisions": 0,
                        "approved_actions": 0,
                        "blocked_actions": 0,
                        "average_risk": 0.5
                    })
            with gr.Row():
                risk_gauge = gr.Plot(label="Current Risk Gauge")
                decision_pie = gr.Plot(label="Policy Decisions")
            with gr.Row():
                action_timeline = gr.Plot(label="Autonomous Actions Timeline")
            with gr.Row():
                # Fixed initial value; no handler in this block updates it.
                health_score = gr.Number(label="System Health Score", value=85, precision=0)
            refresh_dash_btn = gr.Button("Refresh Dashboard")
            refresh_dash_btn.click(
                fn=refresh_dashboard,
                outputs=[control_stats, risk_gauge, decision_pie, action_timeline]
            )

        # Tab 2: fault injection feeding handle_infra_with_governance.
        with gr.TabItem("Infrastructure Reliability"):
            gr.Markdown("### 🏗️ Infrastructure Intent Evaluation with Autonomous Control")
            infra_state = gr.State(value={})

            with gr.Row():
                with gr.Column():
                    infra_fault = gr.Dropdown(
                        ["none", "switch_down", "server_overload", "cascade"],
                        value="none",
                        label="Inject Fault"
                    )
                    infra_btn = gr.Button("Evaluate Intent with Governance")
                with gr.Column():
                    infra_output = gr.JSON(label="Analysis with Control Decisions")

        # Tab 3: HMC controls and the plots produced by run_hmc.
        with gr.TabItem("Deep Analysis (HMC)"):
            gr.Markdown("### Hamiltonian Monte Carlo – Offline Pattern Discovery")
            with gr.Row():
                with gr.Column():
                    hmc_samples = gr.Slider(100, 2000, value=500, step=100, label="Number of Samples")
                    hmc_warmup = gr.Slider(50, 500, value=200, step=50, label="Warmup Steps")
                    hmc_run_btn = gr.Button("Run HMC")
                with gr.Column():
                    hmc_summary = gr.JSON(label="Posterior Summary")
            with gr.Row():
                hmc_trace_plot = gr.Plot(label="Trace Plot")
                hmc_pair_plot = gr.Plot(label="Pair Plot")

        # Tab 4: read-only snapshot of the policies loaded when the UI is built.
        with gr.TabItem("Policy Management"):
            gr.Markdown("### 📋 Execution Policies (from OSS)")

            policies_json = []
            for p in policy_engine.policies:
                policies_json.append({
                    "name": p.name,
                    "conditions": [{"metric": c.metric, "operator": c.operator, "threshold": c.threshold} for c in p.conditions],
                    "actions": [a.value for a in p.actions],
                    "priority": p.priority,
                    "cool_down_seconds": p.cool_down_seconds,
                    "enabled": p.enabled
                })
            policies_display = gr.JSON(label="Active Policies", value=policies_json)

        # Tab 5: edition/capability summary rendered from oss_caps.
        with gr.TabItem("Enterprise / OSS"):
            gr.Markdown(f"""
## 🚀 ARF {oss_caps['edition'].upper()} Edition

**Version:** {oss_caps['version']}
**License:** {oss_caps['license']}
**Constants Hash:** {oss_caps.get('constants_hash', 'N/A')}

### OSS Capabilities
- **Execution modes:** {', '.join(oss_caps['execution']['modes'])}
- **Max incident history:** {oss_caps['execution']['max_incidents']}
- **Memory storage:** {oss_caps['memory']['type']}
- **FAISS index type:** {oss_caps['memory']['faiss_index_type']}
- **Max incident nodes:** {oss_caps['memory']['max_incident_nodes']}

### Enterprise Features (not included)
{chr(10).join('- ' + f for f in oss_caps.get('enterprise_features', []))}

[📅 Book a Demo](https://calendly.com/petter2025us/30min) | [📧 Contact Sales](mailto:petter2025us@outlook.com)
""")

    # Feedback controls.
    with gr.Row():
        feedback_up = gr.Button("👍 Approve Decision")
        feedback_down = gr.Button("👎 Reject Decision")
        feedback_msg = gr.Textbox(label="Feedback", interactive=False)

    # Event wiring. The infra handler is a coroutine function, so it is driven
    # with asyncio.run from a synchronous lambda; the inline gr.State(50)
    # supplies the constant context_window argument.
    infra_btn.click(
        fn=lambda f, w, s: asyncio.run(handle_infra_with_governance(f, w, s)),
        inputs=[infra_fault, gr.State(50), infra_state],
        outputs=[infra_output, infra_state]
    )

    hmc_run_btn.click(
        fn=run_hmc,
        inputs=[hmc_samples, hmc_warmup],
        outputs=[hmc_summary, hmc_trace_plot, hmc_pair_plot]
    )

    def handle_control_feedback(approved: bool):
        """Return a confirmation message; the feedback itself is not stored."""
        return f"Feedback recorded: {'approved' if approved else 'rejected'}"

    feedback_up.click(
        fn=lambda: handle_control_feedback(True),
        outputs=feedback_msg
    )
    feedback_down.click(
        fn=lambda: handle_control_feedback(False),
        outputs=feedback_msg
    )
|
|
# Script entry point: serve the app on all network interfaces, port 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)