Spaces:

A-R-F
/

Agentic-Reliability-Framework-v4

Running

App Files Files Community

petter2025 committed on Mar 26

Commit

3b9c14e

verified ·

1 Parent(s): e13adeb

Update app.py

Browse files

Files changed (1) hide show

app.py +260 -399

app.py CHANGED Viewed

@@ -1,375 +1,251 @@
 import gradio as gr
-import asyncio
-import json
-import logging
-import traceback
-import os
 import numpy as np
 import pandas as pd
-from datetime import datetime
-from typing import Dict, Any, List, Optional
 import threading
 import urllib.request
-import time
 # ----------------------------------------------------------------------
-# Memory monitoring (no external dependencies)
 # ----------------------------------------------------------------------
-def get_memory_usage():
-    """Return current process memory usage in MB (RSS)."""
-    try:
-        # Try using resource module (Unix-like)
-        import resource
-        rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
-        # On Linux, ru_maxrss is in kilobytes; on macOS, in bytes
-        if rss < 1e9:   # likely kilobytes
-            return rss / 1024.0   # convert to MB
-        else:
-            return rss / (1024.0 * 1024.0)   # convert to MB
-    except ImportError:
-        # Fallback to reading /proc/self/status (Linux)
-        try:
-            with open("/proc/self/status") as f:
-                for line in f:
-                    if line.startswith("VmRSS:"):
-                        parts = line.split()
-                        if len(parts) >= 2:
-                            # Value in kB
-                            return int(parts[1]) / 1024.0   # convert to MB
-        except Exception:
-            pass
-    return None
-def log_memory_usage():
-    """Periodically log memory usage to help diagnose timeouts."""
-    mem_mb = get_memory_usage()
-    if mem_mb is not None:
-        logging.info(f"Process memory: {mem_mb:.1f} MB")
-    else:
-        logging.info("Process memory: unknown")
-    # Schedule next check in 60 seconds
-    threading.Timer(60, log_memory_usage).start()
 # ----------------------------------------------------------------------
-# Keep‑alive to prevent idle timeout (external pings)
 # ----------------------------------------------------------------------
 def keep_alive():
-    """Periodically ping the public Space URL to prevent idle timeout."""
-    # Determine the public URL of the Space
     space_id = os.environ.get('SPACE_ID')
     if space_id:
-        # Convert "username/space-name" to "username-space-name.hf.space"
         url = f"https://{space_id.replace('/', '-')}.hf.space/"
-        logging.info(f"Using external URL for keep‑alive: {url}")
     else:
-        # Fallback to localhost when running locally
         url = "http://127.0.0.1:7860/"
-        logging.warning("No SPACE_ID found, using localhost for keep‑alive – will not prevent sleep!")
     while True:
-        time.sleep(300)   # 5 minutes
         try:
             with urllib.request.urlopen(url, timeout=10) as response:
                 status = response.getcode()
-                logging.info(f"Keep‑alive ping: {status}")
         except Exception as e:
-            logging.warning(f"Keep‑alive failed: {e}")
-# Start keep‑alive thread (daemon so it exits when main process ends)
 threading.Thread(target=keep_alive, daemon=True).start()
 # ----------------------------------------------------------------------
-# Plotly for dashboards
-# ----------------------------------------------------------------------
-import plotly.graph_objects as go
-from plotly.subplots import make_subplots
-# ----------------------------------------------------------------------
-# Logging setup
 # ----------------------------------------------------------------------
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-# ----------------------------------------------------------------------
-# OSS Core Imports
-# ----------------------------------------------------------------------
-from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine, HealingPolicy
-from agentic_reliability_framework.core.governance.risk_engine import RiskEngine, ActionCategory
-from agentic_reliability_framework.core.governance.intents import (
-    InfrastructureIntent, ProvisionResourceIntent, ResourceType, Environment
-)
-from agentic_reliability_framework.core.governance.azure.azure_simulator import AzureInfrastructureSimulator
-from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction, EventSeverity
-from agentic_reliability_framework.runtime.hmc.hmc_learner import HMCRiskLearner
-from agentic_reliability_framework.core.config.constants import (
-    LATENCY_CRITICAL, ERROR_RATE_HIGH, get_oss_capabilities
-)
-# ----------------------------------------------------------------------
-# Fallback constants if not in OSS constants
-# ----------------------------------------------------------------------
-try:
-    from agentic_reliability_framework.core.config.constants import RISK_THRESHOLD_LOW, RISK_THRESHOLD_HIGH
-except ImportError:
-    RISK_THRESHOLD_LOW = 0.2
-    RISK_THRESHOLD_HIGH = 0.8
-    logger.info("Using fallback risk thresholds (0.2/0.8)")
-# ----------------------------------------------------------------------
-# Infrastructure simulator and engines
-# ----------------------------------------------------------------------
-# Define policy_engine first
-policy_engine = PolicyEngine()  # loads default policies
-# Now create simulator, passing the policy_engine
-infra_sim = AzureInfrastructureSimulator(policy=policy_engine)
-# Risk engine (does not depend on policy_engine)
-risk_engine = RiskEngine(hmc_model_path="hmc_model.json", use_hyperpriors=True)
-# ----------------------------------------------------------------------
-# Global history for dashboard
-# ----------------------------------------------------------------------
-decision_history = []      # list of (timestamp, decision, category)
-risk_history = []          # list of (timestamp, mean_risk)
-def update_dashboard_data(decision: Dict, risk: float):
     decision_history.append((datetime.utcnow().isoformat(), decision, risk))
     risk_history.append((datetime.utcnow().isoformat(), risk))
-    # Keep only last 100
     if len(decision_history) > 100:
         decision_history.pop(0)
     if len(risk_history) > 100:
         risk_history.pop(0)
 # ----------------------------------------------------------------------
-# Policy evaluation helper
 # ----------------------------------------------------------------------
-def evaluate_policies(event_type: str, severity: str, component: str) -> Dict[str, Any]:
     """
-    Evaluate policies against an event and return recommended actions.
-    Uses OSS PolicyEngine with a minimal ReliabilityEvent.
     """
-    try:
-        event = ReliabilityEvent(
-            component=component,
-            latency_p99=0.0,       # dummy, not used in policy conditions
-            error_rate=0.0,
-            throughput=1.0,
-            severity=EventSeverity(severity)
-        )
-        actions = policy_engine.evaluate_policies(event)
-        return {
-            "timestamp": datetime.utcnow().isoformat(),
-            "event_type": event_type,
-            "severity": severity,
-            "component": component,
-            "recommended_actions": [a.value for a in actions if a != HealingAction.NO_ACTION],
-            "governance_status": "approved" if actions and actions[0] != HealingAction.NO_ACTION else "blocked"
-        }
-    except Exception as e:
-        logger.error(f"Policy evaluation error: {e}")
-        return {
-            "error": str(e),
-            "governance_status": "error",
-            "recommended_actions": []
-        }
 # ----------------------------------------------------------------------
-# Autonomous control decision
 # ----------------------------------------------------------------------
-def autonomous_control_decision(analysis_result: Dict[str, Any], risk_threshold: float = 0.7) -> Dict[str, Any]:
-    """
-    Make autonomous control decision based on analysis and risk metrics.
-    This simulates an AI Control Plane that can take actions automatically.
-    """
     decision = {
         "timestamp": datetime.utcnow().isoformat(),
-        "approved": False,
-        "actions": [],
-        "reason": "",
-        "risk_level": "unknown"
     }
-    try:
-        # Extract risk metrics (if present)
-        risk = analysis_result.get("risk", 0.5)
-        p95 = analysis_result.get("risk_p95", risk)
-        # Determine risk level using OSS thresholds if available
-        if risk > RISK_THRESHOLD_HIGH or p95 > RISK_THRESHOLD_HIGH:
-            decision["risk_level"] = "high"
-            decision["approved"] = False
-            decision["reason"] = f"Risk exceeds high threshold ({RISK_THRESHOLD_HIGH})"
-        elif risk < RISK_THRESHOLD_LOW:
-            decision["risk_level"] = "low"
-            decision["approved"] = True
-            decision["reason"] = "Risk within acceptable limits"
-        else:
-            decision["risk_level"] = "medium"
-            decision["approved"] = False
-            decision["reason"] = f"Risk in escalation zone ({RISK_THRESHOLD_LOW}-{RISK_THRESHOLD_HIGH})"
-        # Optionally add actions based on analysis (e.g., if risk is high, suggest mitigation)
-        if decision["risk_level"] == "high" and "healing_actions" in analysis_result:
-            decision["actions"] = analysis_result["healing_actions"]
-    except Exception as e:
-        logger.error(f"Control decision error: {e}")
-        decision["reason"] = f"Error in decision process: {str(e)}"
-    update_dashboard_data(decision, analysis_result.get("risk", 0.5))
     return decision
 # ----------------------------------------------------------------------
-# Infrastructure analysis with governance
-# ----------------------------------------------------------------------
-async def handle_infra_with_governance(fault_type: str, context_window: int, session_state: Dict) -> tuple:
     """
-    Infrastructure analysis using OSS simulator and risk engine.
     """
-    try:
-        # Map fault to an intent
-        if fault_type == "none":
-            intent = ProvisionResourceIntent(
-                resource_type=ResourceType.VM,
-                environment=Environment.DEVELOPMENT,
-                size="Standard_D2s_v3"
-            )
-            severity = "low"
-        else:
-            # Simulate a failure by using production environment and risky config
-            intent = ProvisionResourceIntent(
-                resource_type=ResourceType.VM,
-                environment=Environment.PRODUCTION,
-                size="custom_extra_large"
-            )
-            severity = "high" if fault_type == "cascade" else "medium"
-        # Evaluate via simulator
-        healing_intent = infra_sim.evaluate_intent(intent)
-        # Extract risk and contributions
-        risk = healing_intent.risk_score
-        # For simplicity, we take p95 from risk_contributions if available; else assume same
-        risk_p95 = healing_intent.risk_contributions.get("hyper_summary", {}).get("p95", risk) if healing_intent.risk_contributions else risk
-        # Get policy evaluation
-        policy_result = evaluate_policies("infrastructure_failure", severity, "azure")
-        # Build analysis result
-        analysis_result = {
-            "intent": intent.dict(),
-            "healing_intent": healing_intent.dict(),
-            "risk": risk,
-            "risk_p95": risk_p95,
-            "decision": healing_intent.decision,   # "approve", "deny", "escalate"
-            "justification": healing_intent.justification,
-            "policy_violations": healing_intent.policy_violations,
-            "healing_actions": [a.value for a in healing_intent.recommended_actions] if healing_intent.recommended_actions else [],
-            "risk_contributions": healing_intent.risk_contributions
-        }
-        # Apply autonomous control decision
-        control_decision = autonomous_control_decision(analysis_result)
-        # Combine with governance
-        output = {
-            **analysis_result,
-            "governance": {
-                "policy_evaluation": policy_result,
-                "control_plane_decision": control_decision
-            }
-        }
-        return output, session_state
-    except Exception as e:
-        logger.error(f"Infra task error: {e}", exc_info=True)
-        return {
-            "error": str(e),
-            "traceback": traceback.format_exc(),
-            "governance": evaluate_policies("infrastructure_failure", "critical", "system")
-        }, session_state
 # ----------------------------------------------------------------------
-# HMC analysis using OSS HMCRiskLearner
 # ----------------------------------------------------------------------
-def run_hmc(samples: int, warmup: int) -> tuple:
-    """
-    Train HMCRiskLearner on synthetic data and return posterior summary + plots.
-    """
-    try:
-        # Generate synthetic incident data
-        np.random.seed(42)
-        n = 200
-        data = []
-        for _ in range(n):
-            latency = np.random.exponential(200)
-            error_rate = np.random.beta(1, 10)
-            throughput = np.random.normal(1000, 200)
-            cpu = np.random.uniform(0.2, 0.9)
-            mem = np.random.uniform(0.3, 0.8)
-            target = int(latency > LATENCY_CRITICAL or error_rate > ERROR_RATE_HIGH)
-            data.append({
-                "latency_p99": latency,
-                "error_rate": error_rate,
-                "throughput": throughput,
-                "cpu_util": cpu,
-                "memory_util": mem,
-                "target": target
-            })
-        df = pd.DataFrame(data)
-        learner = HMCRiskLearner()
-        learner.train(df.to_dict('records'), draws=samples, tune=warmup, chains=2)
-        # Get feature importance (coefficient summaries)
-        coeffs = learner.get_feature_importance()
-        summary = {k: v for k, v in coeffs.items()}
-        # Posterior predictive for a sample point
-        sample_metrics = {
-            "latency_p99": 350,
-            "error_rate": 0.08,
-            "throughput": 900,
-            "cpu_util": 0.7,
-            "memory_util": 0.6
         }
-        pred_summary = learner.predict_risk_summary(sample_metrics)
-        summary["sample_prediction"] = pred_summary
-        # Extract trace for plotting
-        trace_data = {}
-        if learner.trace is not None:
-            for var in learner.trace.posterior.data_vars:
-                if var in ['alpha', 'beta']:
-                    vals = learner.trace.posterior[var].values.flatten()
-                    trace_data[var] = vals[:1000]  # limit for performance
-        # Create trace plot
-        fig_trace = go.Figure()
-        for key, vals in trace_data.items():
-            fig_trace.add_trace(go.Scatter(y=vals, mode='lines', name=key))
-        fig_trace.update_layout(title="Posterior Traces", xaxis_title="Sample", yaxis_title="Value")
-        # Create pair plot (simplified)
-        fig_pair = go.Figure()
-        if len(trace_data) > 0:
-            df_trace = pd.DataFrame(trace_data)
-            fig_pair = go.Figure(data=go.Splom(
-                dimensions=[dict(label=k, values=df_trace[k]) for k in df_trace.columns],
-                showupperhalf=False
-            ))
-            fig_pair.update_layout(title="Posterior Pair Plot")
-        return summary, fig_trace, fig_pair
-    except Exception as e:
-        logger.error(f"HMC analysis error: {e}", exc_info=True)
-        return {"error": str(e)}, None, None
 # ----------------------------------------------------------------------
-# Dashboard plot generators
 # ----------------------------------------------------------------------
 def generate_risk_gauge():
     if not risk_history:
@@ -383,9 +259,9 @@ def generate_risk_gauge():
             'axis': {'range': [0, 1]},
             'bar': {'color': "darkblue"},
             'steps': [
-                {'range': [0, RISK_THRESHOLD_LOW], 'color': "lightgreen"},
-                {'range': [RISK_THRESHOLD_LOW, RISK_THRESHOLD_HIGH], 'color': "yellow"},
-                {'range': [RISK_THRESHOLD_HIGH, 1], 'color': "red"}
             ]
         }))
     return fig
@@ -410,7 +286,6 @@ def generate_action_timeline():
     return fig
 def refresh_dashboard():
-    """Compute latest stats and return updated dashboard components."""
     total = len(decision_history)
     approved = sum(1 for _, d, _ in decision_history if d.get("approved", False))
     blocked = total - approved
@@ -429,43 +304,47 @@ def refresh_dashboard():
     )
 # ----------------------------------------------------------------------
-# Start memory monitoring (non‑blocking)
-# ----------------------------------------------------------------------
-log_memory_usage()
-# ----------------------------------------------------------------------
-# OSS capabilities (for status display)
 # ----------------------------------------------------------------------
-oss_caps = get_oss_capabilities()
 # ----------------------------------------------------------------------
 # Gradio UI
 # ----------------------------------------------------------------------
-with gr.Blocks(title="ARF v4 – OSS Reliability Control Plane") as demo:
     gr.Markdown("""
-    # 🧠 ARF v4 – OSS Reliability Control Plane
-    **Deterministic Probability Thresholding & Hybrid Bayesian Inference**
-    This demo shows the OSS core of ARF:
-    - **Policy‑based Governance** – Automatic evaluation and enforcement (advisory mode)
-    - **Hybrid Risk Engine** – Conjugate priors + HMC + hyperpriors
-    - **Deterministic Thresholds** – Approve (<0.2), Escalate (0.2‑0.8), Deny (>0.8)
-    - **Hamiltonian Monte Carlo** – Offline pattern discovery (NUTS)
     """)
     with gr.Tabs():
         # Tab 1: Control Plane Dashboard
         with gr.TabItem("Control Plane Dashboard"):
-            gr.Markdown("### 🎮 OSS Control Plane")
             with gr.Row():
                 with gr.Column():
                     system_status = gr.JSON(label="System Status", value={
                         "edition": oss_caps["edition"],
                         "version": oss_caps["version"],
                         "governance_mode": "advisory",
-                        "policies_loaded": len(policy_engine.policies),
-                        "risk_threshold_low": RISK_THRESHOLD_LOW,
-                        "risk_threshold_high": RISK_THRESHOLD_HIGH
                     })
                 with gr.Column():
                     control_stats = gr.JSON(label="Control Statistics", value={
@@ -479,19 +358,23 @@ with gr.Blocks(title="ARF v4 – OSS Reliability Control Plane") as demo:
                 decision_pie = gr.Plot(label="Policy Decisions")
             with gr.Row():
                 action_timeline = gr.Plot(label="Autonomous Actions Timeline")
-            with gr.Row():
-                health_score = gr.Number(label="System Health Score", value=85, precision=0)
             refresh_dash_btn = gr.Button("Refresh Dashboard")
             refresh_dash_btn.click(
                 fn=refresh_dashboard,
                 outputs=[control_stats, risk_gauge, decision_pie, action_timeline]
             )
-        # Tab 2: Infrastructure Reliability with Governance
         with gr.TabItem("Infrastructure Reliability"):
-            gr.Markdown("### 🏗️ Infrastructure Intent Evaluation with Autonomous Control")
             infra_state = gr.State(value={})
             with gr.Row():
                 with gr.Column():
                     infra_fault = gr.Dropdown(
@@ -499,39 +382,38 @@ with gr.Blocks(title="ARF v4 – OSS Reliability Control Plane") as demo:
                         value="none",
                         label="Inject Fault"
                     )
-                    infra_btn = gr.Button("Evaluate Intent with Governance")
                 with gr.Column():
-                    infra_output = gr.JSON(label="Analysis with Control Decisions")
-        # Tab 3: Deep Analysis (HMC)
-        with gr.TabItem("Deep Analysis (HMC)"):
-            gr.Markdown("### Hamiltonian Monte Carlo – Offline Pattern Discovery")
             with gr.Row():
                 with gr.Column():
-                    hmc_samples = gr.Slider(100, 2000, value=500, step=100, label="Number of Samples")
-                    hmc_warmup = gr.Slider(50, 500, value=200, step=50, label="Warmup Steps")
-                    hmc_run_btn = gr.Button("Run HMC")
                 with gr.Column():
                     hmc_summary = gr.JSON(label="Posterior Summary")
             with gr.Row():
                 hmc_trace_plot = gr.Plot(label="Trace Plot")
-                hmc_pair_plot = gr.Plot(label="Pair Plot")
         # Tab 4: Policy Management
         with gr.TabItem("Policy Management"):
-            gr.Markdown("### 📋 Execution Policies (from OSS)")
-            # Convert policies to JSON‑serializable format
-            policies_json = []
-            for p in policy_engine.policies:
-                policies_json.append({
-                    "name": p.name,
-                    "conditions": [{"metric": c.metric, "operator": c.operator, "threshold": c.threshold} for c in p.conditions],
-                    "actions": [a.value for a in p.actions],
-                    "priority": p.priority,
-                    "cool_down_seconds": p.cool_down_seconds,
-                    "enabled": p.enabled
-                })
-            policies_display = gr.JSON(label="Active Policies", value=policies_json)
         # Tab 5: Enterprise / OSS Info
         with gr.TabItem("Enterprise / OSS"):
@@ -540,52 +422,31 @@ with gr.Blocks(title="ARF v4 – OSS Reliability Control Plane") as demo:
             **Version:** {oss_caps['version']}
             **License:** {oss_caps['license']}
-            **Constants Hash:** {oss_caps.get('constants_hash', 'N/A')}
-            ### OSS Capabilities
-            - **Execution modes:** {', '.join(oss_caps['execution']['modes'])}
-            - **Max incident history:** {oss_caps['execution']['max_incidents']}
-            - **Memory storage:** {oss_caps['memory']['type']}
-            - **FAISS index type:** {oss_caps['memory']['faiss_index_type']}
-            - **Max incident nodes:** {oss_caps['memory']['max_incident_nodes']}
             ### Enterprise Features (not included)
-            {chr(10).join('- ' + f for f in oss_caps.get('enterprise_features', []))}
             [📅 Book a Demo](https://calendly.com/petter2025us/30min) | [📧 Contact Sales](mailto:petter2025us@outlook.com)
             """)
-    # Feedback row (simplified)
-    with gr.Row():
-        feedback_up = gr.Button("👍 Approve Decision")
-        feedback_down = gr.Button("👎 Reject Decision")
-        feedback_msg = gr.Textbox(label="Feedback", interactive=False)
     # Wire events
     infra_btn.click(
-        fn=lambda f, w, s: asyncio.run(handle_infra_with_governance(f, w, s)),
-        inputs=[infra_fault, gr.State(50), infra_state],  # context_window not used, but keep for signature
         outputs=[infra_output, infra_state]
     )
     hmc_run_btn.click(
-        fn=run_hmc,
         inputs=[hmc_samples, hmc_warmup],
         outputs=[hmc_summary, hmc_trace_plot, hmc_pair_plot]
     )
-    def handle_control_feedback(approved: bool):
-        # Simple feedback placeholder
-        return f"Feedback recorded: {'approved' if approved else 'rejected'}"
-    feedback_up.click(
-        fn=lambda: handle_control_feedback(True),
-        outputs=feedback_msg
-    )
-    feedback_down.click(
-        fn=lambda: handle_control_feedback(False),
-        outputs=feedback_msg
-    )
 if __name__ == "__main__":
     demo.launch(theme="soft")

 import gradio as gr
 import numpy as np
 import pandas as pd
+import plotly.graph_objects as go
+import random
+import time
 import threading
 import urllib.request
+import os
+from datetime import datetime
+import logging
+from scipy.stats import beta, norm
 # ----------------------------------------------------------------------
+# Logging
 # ----------------------------------------------------------------------
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 # ----------------------------------------------------------------------
+# Keep‑alive (pings public URL every 5 minutes)
 # ----------------------------------------------------------------------
 def keep_alive():
     space_id = os.environ.get('SPACE_ID')
     if space_id:
         url = f"https://{space_id.replace('/', '-')}.hf.space/"
     else:
         url = "http://127.0.0.1:7860/"
     while True:
+        time.sleep(300)
         try:
             with urllib.request.urlopen(url, timeout=10) as response:
                 status = response.getcode()
+                logger.info(f"Keep‑alive ping: {status}")
         except Exception as e:
+            logger.warning(f"Keep‑alive failed: {e}")
 threading.Thread(target=keep_alive, daemon=True).start()
 # ----------------------------------------------------------------------
+# Global history
 # ----------------------------------------------------------------------
decision_history = []   # (timestamp, decision, risk)
risk_history = []       # (timestamp, risk)

# Maximum number of entries kept in each history list.
_HISTORY_LIMIT = 100


def update_dashboard_data(decision, risk):
    """Record one decision and its risk in the module-level histories.

    Args:
        decision: Decision payload, stored as-is in ``decision_history``.
        risk: Risk value, stored in both histories.

    Both entries share one UTC ISO-8601 timestamp so that the two
    histories can be lined up entry by entry. Each history keeps only its
    100 most recent entries.
    """
    stamp = datetime.utcnow().isoformat()
    decision_history.append((stamp, decision, risk))
    risk_history.append((stamp, risk))
    # Trim in place (no rebinding) so other references to these lists
    # keep seeing the live data.
    del decision_history[:-_HISTORY_LIMIT]
    del risk_history[:-_HISTORY_LIMIT]
 # ----------------------------------------------------------------------
+# Bayesian Risk Engine (Conjugate Beta-Binomial)
 # ----------------------------------------------------------------------
class BayesianRiskEngine:
    """
    Implements a Beta-Binomial conjugate prior for binary failure events.
    - Prior: Beta(alpha, beta)
    - Posterior: Beta(alpha + failures, beta + successes)
    - Predictive risk = mean of posterior.
    """

    def __init__(self, alpha=1.0, beta=1.0):
        """Start from a Beta(alpha, beta) prior.

        Raises:
            ValueError: if either shape parameter is not strictly positive
                (this also rejects NaN).
        """
        # `not x > 0` rather than `x <= 0` so that NaN is rejected too.
        if not alpha > 0 or not beta > 0:
            raise ValueError("alpha and beta must be positive")
        self.alpha = alpha
        self.beta = beta

    def update(self, failures, successes):
        """Update posterior with new observations.

        Raises:
            ValueError: if either count is negative (or NaN).
        """
        if not failures >= 0 or not successes >= 0:
            raise ValueError("failures and successes must be non-negative")
        self.alpha += failures
        self.beta += successes

    def risk(self):
        """Return current risk estimate (mean of posterior)."""
        return self.alpha / (self.alpha + self.beta)

    def risk_interval(self, prob=0.95):
        """Return the equal-tailed credible interval ``(low, high)`` for risk.

        Args:
            prob: Probability mass inside the interval, in ``[0, 1]``.

        Raises:
            ValueError: if ``prob`` lies outside ``[0, 1]``.
        """
        if not 0 <= prob <= 1:
            raise ValueError("prob must be between 0 and 1")
        lower = beta.ppf((1 - prob) / 2, self.alpha, self.beta)
        upper = beta.ppf((1 + prob) / 2, self.alpha, self.beta)
        return lower, upper

    def risk_distribution(self, x):
        """PDF of the posterior Beta distribution."""
        return beta.pdf(x, self.alpha, self.beta)
 # ----------------------------------------------------------------------
+# Policy Engine (threshold-based)
 # ----------------------------------------------------------------------
class PolicyEngine:
    """Threshold policy that maps a risk value to approve / escalate / deny."""

    def __init__(self, thresholds=None):
        """Store the ``{"low": ..., "high": ...}`` thresholds.

        Args:
            thresholds: Mapping with ``"low"`` and ``"high"`` keys. When
                omitted, each instance gets its own ``{"low": 0.2,
                "high": 0.8}`` dict, so editing one engine's thresholds
                cannot leak into another. A dict passed by the caller is
                stored as-is, not copied.
        """
        if thresholds is None:
            thresholds = {"low": 0.2, "high": 0.8}
        self.thresholds = thresholds

    def evaluate(self, risk):
        """Return ``(action, reason)`` for ``risk``.

        ``risk`` strictly below the low threshold approves, strictly above
        the high threshold denies, and anything in between (both bounds
        included) escalates.
        """
        low = self.thresholds["low"]
        high = self.thresholds["high"]
        if risk < low:
            return "approve", "Risk within safe limits"
        if risk > high:
            return "deny", f"Risk exceeds high threshold ({high})"
        return "escalate", f"Risk in escalation zone ({low}-{high})"
+# ----------------------------------------------------------------------
+# Autonomous Control Decision (approve/deny based on risk)
+# ----------------------------------------------------------------------
def autonomous_control_decision(risk, risk_engine, policy_engine):
    """Build the control-plane decision for ``risk`` and record it.

    Args:
        risk: Risk value passed to ``policy_engine.evaluate``.
        risk_engine: Accepted for interface compatibility; not used here.
        policy_engine: Object whose ``evaluate(risk)`` returns
            ``(action, reason)``.

    Returns:
        Dict with ``timestamp``, ``approved``, ``actions``, ``reason`` and
        ``risk_level``. The decision is also passed to
        ``update_dashboard_data`` before being returned.
    """
    action, reason = policy_engine.evaluate(risk)
    # Derive the level from the policy verdict rather than from fixed
    # 0.2/0.8 cut-offs, so the label always agrees with the action and
    # follows whatever thresholds the policy engine was configured with.
    level_by_action = {"approve": "low", "deny": "high"}
    decision = {
        "timestamp": datetime.utcnow().isoformat(),
        "approved": action == "approve",
        "actions": ["escalate_human"] if action == "escalate" else [],
        "reason": reason,
        "risk_level": level_by_action.get(action, "medium"),
    }
    update_dashboard_data(decision, risk)
    return decision
 # ----------------------------------------------------------------------
+# Simple Metropolis-Hastings MCMC sampler (for HMC tab)
+# ----------------------------------------------------------------------
class MHMCMC:
    """A generic Metropolis-Hastings sampler for a target log-posterior."""

    def __init__(self, log_target, proposal_sd=0.1):
        """
        Args:
            log_target: Callable taking a 1-D state array and returning the
                (unnormalised) log density at that state.
            proposal_sd: Standard deviation of the Gaussian random-walk
                proposal, applied to every coordinate.
        """
        self.log_target = log_target
        self.proposal_sd = proposal_sd

    def sample(self, n_samples, initial_state, burn_in=0):
        """Run the chain and return ``(samples, acceptance_rate)``.

        Args:
            n_samples: Number of states to keep.
            initial_state: Sequence giving the starting point.
            burn_in: Number of leading iterations to discard.

        Returns:
            ``samples`` is an array of shape ``(n_samples, len(initial_state))``;
            ``acceptance_rate`` is the fraction of accepted proposals over
            all iterations, burn-in included (``0.0`` for an empty run).
        """
        samples = np.zeros((n_samples, len(initial_state)))
        current = np.array(initial_state, dtype=float)
        current_log = self.log_target(current)
        accepted = 0
        total = n_samples + burn_in
        for i in range(total):
            # Propose
            proposal = current + np.random.normal(0, self.proposal_sd, size=len(current))
            proposal_log = self.log_target(proposal)
            log_ratio = proposal_log - current_log
            # Draw the uniform unconditionally so the random stream does
            # not depend on which branch is taken below.
            u = np.random.rand()
            # Uphill moves are always accepted without calling exp(), which
            # would overflow for large positive ratios. A NaN ratio fails
            # both comparisons and is therefore rejected.
            if log_ratio >= 0 or u < np.exp(log_ratio):
                current = proposal
                current_log = proposal_log
                accepted += 1
            if i >= burn_in:
                samples[i - burn_in] = current
        acceptance_rate = accepted / total if total else 0.0
        return samples, acceptance_rate
+# ----------------------------------------------------------------------
+# HMC analysis (MCMC on a simple model)
+# ----------------------------------------------------------------------
def run_hmc_mcmc(samples, warmup):
    """
    Simulate an HMC-like analysis using Metropolis-Hastings.
    Target: posterior of a Normal distribution with unknown mean.

    Args:
        samples: Number of posterior draws to keep (coerced to ``int``).
        warmup: Number of burn-in iterations to discard (coerced to ``int``).

    Returns:
        ``(summary, fig_trace, fig_hist)``: a dict with the posterior mean,
        median, 95% credible interval and acceptance rate, plus a trace
        plot and a histogram of the draws.
    """
    # UI widgets may hand over floats; array sizes and range() need ints.
    n_draws = int(samples)
    n_warmup = int(warmup)

    # Synthetic data: 10 points drawn from Normal(0.5, 0.2).
    data = np.random.normal(0.5, 0.2, 10)

    def log_posterior(mu):
        # Prior Normal(0, 1) on mu; likelihood Normal(data | mu, sigma=0.2).
        # Additive constants are dropped. np.sum keeps the result scalar
        # even though the sampler passes mu as a 1-element array.
        log_prior = -0.5 * np.sum(mu ** 2)
        log_likelihood = -0.5 * np.sum(((data - mu) / 0.2) ** 2)
        return log_prior + log_likelihood

    # Run MCMC
    sampler = MHMCMC(log_posterior, proposal_sd=0.05)
    mu_samples, acceptance = sampler.sample(n_draws, initial_state=[0.0], burn_in=n_warmup)

    # Summary
    mu_samples = mu_samples.flatten()
    credible_interval = np.percentile(mu_samples, [2.5, 97.5])

    # Trace plot
    fig_trace = go.Figure()
    fig_trace.add_trace(go.Scatter(y=mu_samples, mode='lines', name='μ', line=dict(width=1)))
    fig_trace.update_layout(title="Trace of μ (Metropolis-Hastings)", xaxis_title="Iteration", yaxis_title="μ")

    # Histogram, normalised so the bars match the "Density" axis label.
    fig_hist = go.Figure()
    fig_hist.add_trace(go.Histogram(x=mu_samples, nbinsx=50, name='Posterior',
                                    histnorm='probability density'))
    fig_hist.update_layout(title="Posterior Distribution of μ", xaxis_title="μ", yaxis_title="Density")

    summary = {
        # Plain floats keep the summary JSON-friendly.
        "mean": float(np.mean(mu_samples)),
        "median": float(np.median(mu_samples)),
        "credible_interval_95": f"[{credible_interval[0]:.3f}, {credible_interval[1]:.3f}]",
        "acceptance_rate": f"{acceptance:.2%}"
    }
    return summary, fig_trace, fig_hist
 # ----------------------------------------------------------------------
+# Infrastructure Analysis (uses BayesianRiskEngine)
 # ----------------------------------------------------------------------
async def handle_infra_with_governance(fault_type, context_window, session_state):
    """Score an injected fault with the Bayesian risk engine and apply policy.

    Args:
        fault_type: One of ``"none"``, ``"switch_down"``,
            ``"server_overload"`` or ``"cascade"``; any other value is
            treated like ``"none"``.
        context_window: Unused; kept so the signature stays stable.
        session_state: Returned unchanged.

    Returns:
        ``(output, session_state)`` where ``output`` holds the posterior
        risk, its 95% credible interval, the policy action and reason, the
        suggested healing actions, the posterior parameters and a
        ``governance`` section with the policy evaluation and the
        control-plane decision.
    """
    # Map fault to simulated observations (failures, successes)
    fault_map = {
        "none": (1, 99),
        "switch_down": (20, 80),
        "server_overload": (35, 65),
        "cascade": (60, 40)
    }
    failures, successes = fault_map.get(fault_type, (1, 99))

    # Beta(1, 1) prior updated with the simulated observations.
    risk_engine = BayesianRiskEngine(alpha=1, beta=1)
    risk_engine.update(failures, successes)
    risk = risk_engine.risk()
    ci_low, ci_high = risk_engine.risk_interval(0.95)

    # Policy evaluation
    policy_engine = PolicyEngine(thresholds={"low": 0.2, "high": 0.8})
    action, reason = policy_engine.evaluate(risk)

    # Autonomous decision (also records the decision for the dashboard)
    control_decision = autonomous_control_decision(risk, risk_engine, policy_engine)

    # Build output
    analysis_result = {
        "risk": risk,
        "risk_ci": [ci_low, ci_high],
        "decision": action,
        "justification": reason,
        "healing_actions": ["restart"] if action == "deny" else ["monitor"],
        "posterior_parameters": {
            "alpha": risk_engine.alpha,
            "beta": risk_engine.beta
        }
    }
    output = {
        **analysis_result,
        "governance": {
            "policy_evaluation": {
                "action": action,
                "reason": reason,
                "thresholds": policy_engine.thresholds
            },
            "control_plane_decision": control_decision
        }
    }
    return output, session_state
 # ----------------------------------------------------------------------
+# Dashboard plots
 # ----------------------------------------------------------------------
 def generate_risk_gauge():
     if not risk_history:
             'axis': {'range': [0, 1]},
             'bar': {'color': "darkblue"},
             'steps': [
+                {'range': [0, 0.2], 'color': "lightgreen"},
+                {'range': [0.2, 0.8], 'color': "yellow"},
+                {'range': [0.8, 1], 'color': "red"}
             ]
         }))
     return fig
     return fig
 def refresh_dashboard():
     total = len(decision_history)
     approved = sum(1 for _, d, _ in decision_history if d.get("approved", False))
     blocked = total - approved
     )
 # ----------------------------------------------------------------------
+# OSS capabilities (mocked)
 # ----------------------------------------------------------------------
# Static (mocked) description of the OSS edition. The UI reads the
# "edition", "version", "license" and "enterprise_features" entries.
oss_caps = {
    "edition": "OSS (Demo)",
    "version": "4.0.0-bayesian",
    "license": "Apache 2.0",
    "execution": {"modes": ["advisory"], "max_incidents": 100},
    "memory": {"type": "in-memory", "faiss_index_type": "flat", "max_incident_nodes": 100},
    "enterprise_features": ["Real-time HMC (using PyMC)", "Hyperpriors", "Decision Engine"],
}
 # ----------------------------------------------------------------------
 # Gradio UI
 # ----------------------------------------------------------------------
+with gr.Blocks(title="ARF v4 – Bayesian Risk Scoring Demo", theme="soft") as demo:
     gr.Markdown("""
+    # 🧠 ARF v4 – Bayesian Risk Scoring for AI Reliability (Demo)
+    **Mathematically rigorous risk estimation using conjugate priors and MCMC**
+    This demo showcases:
+    - **Bayesian conjugate prior (Beta-Binomial)** – online risk update from observed failures/successes.
+    - **Policy thresholds** – approve (<0.2), escalate (0.2‑0.8), deny (>0.8).
+    - **Metropolis-Hastings MCMC** – sampling from a posterior distribution (simulating HMC concepts).
+    - **Autonomous control decisions** – based on the current risk estimate.
+    All components are implemented from first principles using only `numpy` and standard libraries.
     """)
     with gr.Tabs():
         # Tab 1: Control Plane Dashboard
         with gr.TabItem("Control Plane Dashboard"):
+            gr.Markdown("### 🎮 Control Plane")
             with gr.Row():
                 with gr.Column():
                     system_status = gr.JSON(label="System Status", value={
                         "edition": oss_caps["edition"],
                         "version": oss_caps["version"],
                         "governance_mode": "advisory",
+                        "policies_loaded": 2,
+                        "risk_threshold_low": 0.2,
+                        "risk_threshold_high": 0.8
                     })
                 with gr.Column():
                     control_stats = gr.JSON(label="Control Statistics", value={
                 decision_pie = gr.Plot(label="Policy Decisions")
             with gr.Row():
                 action_timeline = gr.Plot(label="Autonomous Actions Timeline")
             refresh_dash_btn = gr.Button("Refresh Dashboard")
             refresh_dash_btn.click(
                 fn=refresh_dashboard,
                 outputs=[control_stats, risk_gauge, decision_pie, action_timeline]
             )
+        # Tab 2: Infrastructure Reliability (Bayesian Risk Update)
         with gr.TabItem("Infrastructure Reliability"):
+            gr.Markdown("### 🏗️ Infrastructure Intent Evaluation with Bayesian Risk")
+            gr.Markdown("""
+            This tab simulates evaluating an infrastructure change.
+            The risk is computed using a **Beta-Binomial conjugate prior**:
+            - Prior: Beta(α=1, β=1) (uniform)
+            - Posterior: Beta(α + failures, β + successes)
+            - Risk = mean of posterior
+            """)
             infra_state = gr.State(value={})
             with gr.Row():
                 with gr.Column():
                     infra_fault = gr.Dropdown(
                         value="none",
                         label="Inject Fault"
                     )
+                    infra_btn = gr.Button("Evaluate Intent")
                 with gr.Column():
+                    infra_output = gr.JSON(label="Analysis Result")
+        # Tab 3: Deep Analysis (MCMC)
+        with gr.TabItem("Deep Analysis (MCMC)"):
+            gr.Markdown("### Markov Chain Monte Carlo (Metropolis‑Hastings)")
+            gr.Markdown("""
+            This sampler approximates the posterior distribution of a **normal mean** given 10 observations.
+            It demonstrates how MCMC can be used for Bayesian inference without external libraries.
+            """)
             with gr.Row():
                 with gr.Column():
+                    hmc_samples = gr.Slider(500, 10000, value=5000, step=500, label="Number of Samples")
+                    hmc_warmup = gr.Slider(100, 2000, value=1000, step=100, label="Burn‑in Steps")
+                    hmc_run_btn = gr.Button("Run MCMC")
                 with gr.Column():
                     hmc_summary = gr.JSON(label="Posterior Summary")
             with gr.Row():
                 hmc_trace_plot = gr.Plot(label="Trace Plot")
+                hmc_pair_plot = gr.Plot(label="Posterior Histogram")
         # Tab 4: Policy Management
         with gr.TabItem("Policy Management"):
+            gr.Markdown("### 📋 Execution Policies")
+            gr.Markdown("Policies define risk thresholds for autonomous actions.")
+            policies_json = [
+                {"name": "Low Risk Policy", "conditions": ["risk < 0.2"], "action": "approve", "priority": 1},
+                {"name": "Medium Risk Policy", "conditions": ["0.2 ≤ risk ≤ 0.8"], "action": "escalate", "priority": 2},
+                {"name": "High Risk Policy", "conditions": ["risk > 0.8"], "action": "deny", "priority": 3}
+            ]
+            gr.JSON(label="Active Policies", value=policies_json)
         # Tab 5: Enterprise / OSS Info
         with gr.TabItem("Enterprise / OSS"):
             **Version:** {oss_caps['version']}
             **License:** {oss_caps['license']}
+            ### OSS Capabilities (Demo)
+            - **Bayesian conjugate prior** – Beta-Binomial risk scoring
+            - **Policy thresholds** – configurable approve/escalate/deny
+            - **MCMC sampling** – Metropolis-Hastings (simulates HMC concepts)
+            - **In-memory storage** – no persistence
             ### Enterprise Features (not included)
+            {chr(10).join('- ' + f for f in oss_caps['enterprise_features'])}
             [📅 Book a Demo](https://calendly.com/petter2025us/30min) | [📧 Contact Sales](mailto:petter2025us@outlook.com)
             """)
     # Wire events
     # Pass the coroutine function itself: wrapping an ``async def`` in a
     # plain lambda makes the callback synchronous, so it would hand back an
     # un-awaited coroutine object instead of the two expected outputs.
     # The inline gr.State(50) supplies the handler's context_window argument.
     infra_btn.click(
         fn=handle_infra_with_governance,
         inputs=[infra_fault, gr.State(50), infra_state],
         outputs=[infra_output, infra_state]
     )
     hmc_run_btn.click(
+        fn=run_hmc_mcmc,
         inputs=[hmc_samples, hmc_warmup],
         outputs=[hmc_summary, hmc_trace_plot, hmc_pair_plot]
     )
 if __name__ == "__main__":
     demo.launch(theme="soft")