Spaces:

A-R-F
/

Agentic-Reliability-Framework-v4

Running

App Files Files Community

petter2025 committed on Mar 26

Commit

4b27602

verified ·

1 Parent(s): a04bec8

Update app.py

Browse files

Files changed (1) hide show

app.py +125 -134

app.py CHANGED Viewed

@@ -1,21 +1,49 @@
 import gradio as gr
 import numpy as np
 import pandas as pd
-import plotly.graph_objects as go
-import random
-import time
 import threading
 import urllib.request
-import os
-from datetime import datetime
-import logging
-from scipy.stats import beta, norm
 # ----------------------------------------------------------------------
-# Logging
 # ----------------------------------------------------------------------
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
 # ----------------------------------------------------------------------
 # Keep‑alive (pings public URL every 5 minutes)
@@ -24,66 +52,56 @@ def keep_alive():
     space_id = os.environ.get('SPACE_ID')
     if space_id:
         url = f"https://{space_id.replace('/', '-')}.hf.space/"
     else:
         url = "http://127.0.0.1:7860/"
     while True:
         time.sleep(300)
         try:
             with urllib.request.urlopen(url, timeout=10) as response:
                 status = response.getcode()
-                logger.info(f"Keep‑alive ping: {status}")
         except Exception as e:
-            logger.warning(f"Keep‑alive failed: {e}")
 threading.Thread(target=keep_alive, daemon=True).start()
 # ----------------------------------------------------------------------
-# Global history
 # ----------------------------------------------------------------------
-decision_history = []   # (timestamp, decision, risk)
-risk_history = []       # (timestamp, risk)
-def update_dashboard_data(decision, risk):
-    decision_history.append((datetime.utcnow().isoformat(), decision, risk))
-    risk_history.append((datetime.utcnow().isoformat(), risk))
-    if len(decision_history) > 100:
-        decision_history.pop(0)
-    if len(risk_history) > 100:
-        risk_history.pop(0)
 # ----------------------------------------------------------------------
-# Bayesian Risk Engine (Conjugate Beta-Binomial)
 # ----------------------------------------------------------------------
 class BayesianRiskEngine:
-    """
-    Implements a Beta-Binomial conjugate prior for binary failure events.
-    - Prior: Beta(alpha, beta)
-    - Posterior: Beta(alpha + failures, beta + successes)
-    - Predictive risk = mean of posterior.
-    """
     def __init__(self, alpha=1.0, beta=1.0):
         self.alpha = alpha
         self.beta = beta
     def update(self, failures, successes):
-        """Update posterior with new observations."""
         self.alpha += failures
         self.beta += successes
     def risk(self):
-        """Return current risk estimate (mean of posterior)."""
         return self.alpha / (self.alpha + self.beta)
     def risk_interval(self, prob=0.95):
-        """Return credible interval for risk."""
-        return beta.ppf((1-prob)/2, self.alpha, self.beta), beta.ppf((1+prob)/2, self.alpha, self.beta)
-    def risk_distribution(self, x):
-        """PDF of the posterior Beta distribution."""
-        return beta.pdf(x, self.alpha, self.beta)
 # ----------------------------------------------------------------------
-# Policy Engine (threshold-based)
 # ----------------------------------------------------------------------
 class PolicyEngine:
     def __init__(self, thresholds={"low": 0.2, "high": 0.8}):
@@ -98,8 +116,19 @@ class PolicyEngine:
             return "escalate", f"Risk in escalation zone ({self.thresholds['low']}-{self.thresholds['high']})"
 # ----------------------------------------------------------------------
-# Autonomous Control Decision (approve/deny based on risk)
 # ----------------------------------------------------------------------
 def autonomous_control_decision(risk, risk_engine, policy_engine):
     action, reason = policy_engine.evaluate(risk)
     decision = {
@@ -113,10 +142,55 @@ def autonomous_control_decision(risk, risk_engine, policy_engine):
     return decision
 # ----------------------------------------------------------------------
-# Simple Metropolis-Hastings MCMC sampler (for HMC tab)
 # ----------------------------------------------------------------------
 class MHMCMC:
-    """A generic Metropolis-Hastings sampler for a target log-posterior."""
     def __init__(self, log_target, proposal_sd=0.1):
         self.log_target = log_target
         self.proposal_sd = proposal_sd
@@ -127,10 +201,8 @@ class MHMCMC:
         current_log = self.log_target(current)
         accepted = 0
         for i in range(n_samples + burn_in):
-            # Propose
             proposal = current + np.random.normal(0, self.proposal_sd, size=len(current))
             proposal_log = self.log_target(proposal)
-            # Acceptance ratio
             accept_prob = min(1, np.exp(proposal_log - current_log))
             if np.random.rand() < accept_prob:
                 current = proposal
@@ -141,44 +213,28 @@ class MHMCMC:
         acceptance_rate = accepted / (n_samples + burn_in)
         return samples, acceptance_rate
-# ----------------------------------------------------------------------
-# HMC analysis (MCMC on a simple model)
-# ----------------------------------------------------------------------
 def run_hmc_mcmc(samples, warmup):
-    """
-    Simulate an HMC-like analysis using Metropolis-Hastings.
-    Target: posterior of a Normal distribution with unknown mean.
-    """
-    # Generate some data: assume we observed 10 points with mean 0.5, std 0.2
     data = np.random.normal(0.5, 0.2, 10)
-    # Prior: Normal(0, 1) on mu
     def log_prior(mu):
-        return -0.5 * (mu ** 2)  # ignoring constant
-    # Likelihood: Normal(data | mu, sigma=0.2)
     def log_likelihood(mu):
-        return -0.5 * np.sum(((data - mu) / 0.2) ** 2)  # ignoring constant
     def log_posterior(mu):
         return log_prior(mu) + log_likelihood(mu)
-    # Run MCMC
     sampler = MHMCMC(log_posterior, proposal_sd=0.05)
     mu_samples, acceptance = sampler.sample(samples, initial_state=[0.0], burn_in=warmup)
-    # Summary
     mu_samples = mu_samples.flatten()
     mean = np.mean(mu_samples)
     median = np.median(mu_samples)
     credible_interval = np.percentile(mu_samples, [2.5, 97.5])
-    # Trace plot
     fig_trace = go.Figure()
     fig_trace.add_trace(go.Scatter(y=mu_samples, mode='lines', name='μ', line=dict(width=1)))
     fig_trace.update_layout(title="Trace of μ (Metropolis-Hastings)", xaxis_title="Iteration", yaxis_title="μ")
-    # Histogram
     fig_hist = go.Figure()
     fig_hist.add_trace(go.Histogram(x=mu_samples, nbinsx=50, name='Posterior'))
     fig_hist.update_layout(title="Posterior Distribution of μ", xaxis_title="μ", yaxis_title="Density")
@@ -191,59 +247,6 @@ def run_hmc_mcmc(samples, warmup):
     }
     return summary, fig_trace, fig_hist
-# ----------------------------------------------------------------------
-# Infrastructure Analysis (uses BayesianRiskEngine)
-# ----------------------------------------------------------------------
-async def handle_infra_with_governance(fault_type, context_window, session_state):
-    # Map fault to simulated observations (failures, successes)
-    fault_map = {
-        "none": (1, 99),
-        "switch_down": (20, 80),
-        "server_overload": (35, 65),
-        "cascade": (60, 40)
-    }
-    failures, successes = fault_map.get(fault_type, (1, 99))
-    severity = "low" if failures < 10 else "medium" if failures < 40 else "high"
-    # Create risk engine with prior Beta(1,1)
-    risk_engine = BayesianRiskEngine(alpha=1, beta=1)
-    # Update with observed data
-    risk_engine.update(failures, successes)
-    risk = risk_engine.risk()
-    ci_low, ci_high = risk_engine.risk_interval(0.95)
-    # Policy evaluation
-    policy_engine = PolicyEngine(thresholds={"low": 0.2, "high": 0.8})
-    action, reason = policy_engine.evaluate(risk)
-    # Autonomous decision
-    control_decision = autonomous_control_decision(risk, risk_engine, policy_engine)
-    # Build output
-    analysis_result = {
-        "risk": risk,
-        "risk_ci": [ci_low, ci_high],
-        "decision": action,
-        "justification": reason,
-        "healing_actions": ["restart"] if action == "deny" else ["monitor"],
-        "posterior_parameters": {
-            "alpha": risk_engine.alpha,
-            "beta": risk_engine.beta
-        }
-    }
-    output = {
-        **analysis_result,
-        "governance": {
-            "policy_evaluation": {
-                "action": action,
-                "reason": reason,
-                "thresholds": policy_engine.thresholds
-            },
-            "control_plane_decision": control_decision
-        }
-    }
-    return output, session_state
 # ----------------------------------------------------------------------
 # Dashboard plots
 # ----------------------------------------------------------------------
@@ -315,6 +318,11 @@ oss_caps = {
     "enterprise_features": ["Real-time HMC (using PyMC)", "Hyperpriors", "Decision Engine"]
 }
 # ----------------------------------------------------------------------
 # Gradio UI
 # ----------------------------------------------------------------------
@@ -329,11 +337,10 @@ with gr.Blocks(title="ARF v4 – Bayesian Risk Scoring Demo", theme="soft") as d
     - **Metropolis-Hastings MCMC** – sampling from a posterior distribution (simulating HMC concepts).
     - **Autonomous control decisions** – based on the current risk estimate.
-    All components are implemented from first principles using only `numpy` and standard libraries.
     """)
     with gr.Tabs():
-        # Tab 1: Control Plane Dashboard
         with gr.TabItem("Control Plane Dashboard"):
             gr.Markdown("### 🎮 Control Plane")
             with gr.Row():
@@ -364,16 +371,8 @@ with gr.Blocks(title="ARF v4 – Bayesian Risk Scoring Demo", theme="soft") as d
                 outputs=[control_stats, risk_gauge, decision_pie, action_timeline]
             )
-        # Tab 2: Infrastructure Reliability (Bayesian Risk Update)
         with gr.TabItem("Infrastructure Reliability"):
             gr.Markdown("### 🏗️ Infrastructure Intent Evaluation with Bayesian Risk")
-            gr.Markdown("""
-            This tab simulates evaluating an infrastructure change.
-            The risk is computed using a **Beta-Binomial conjugate prior**:
-            - Prior: Beta(α=1, β=1) (uniform)
-            - Posterior: Beta(α + failures, β + successes)
-            - Risk = mean of posterior
-            """)
             infra_state = gr.State(value={})
             with gr.Row():
                 with gr.Column():
@@ -386,13 +385,8 @@ with gr.Blocks(title="ARF v4 – Bayesian Risk Scoring Demo", theme="soft") as d
                 with gr.Column():
                     infra_output = gr.JSON(label="Analysis Result")
-        # Tab 3: Deep Analysis (MCMC)
         with gr.TabItem("Deep Analysis (MCMC)"):
             gr.Markdown("### Markov Chain Monte Carlo (Metropolis‑Hastings)")
-            gr.Markdown("""
-            This sampler approximates the posterior distribution of a **normal mean** given 10 observations.
-            It demonstrates how MCMC can be used for Bayesian inference without external libraries.
-            """)
             with gr.Row():
                 with gr.Column():
                     hmc_samples = gr.Slider(500, 10000, value=5000, step=500, label="Number of Samples")
@@ -404,10 +398,8 @@ with gr.Blocks(title="ARF v4 – Bayesian Risk Scoring Demo", theme="soft") as d
                 hmc_trace_plot = gr.Plot(label="Trace Plot")
                 hmc_pair_plot = gr.Plot(label="Posterior Histogram")
-        # Tab 4: Policy Management
         with gr.TabItem("Policy Management"):
             gr.Markdown("### 📋 Execution Policies")
-            gr.Markdown("Policies define risk thresholds for autonomous actions.")
             policies_json = [
                 {"name": "Low Risk Policy", "conditions": ["risk < 0.2"], "action": "approve", "priority": 1},
                 {"name": "Medium Risk Policy", "conditions": ["0.2 ≤ risk ≤ 0.8"], "action": "escalate", "priority": 2},
@@ -415,7 +407,6 @@ with gr.Blocks(title="ARF v4 – Bayesian Risk Scoring Demo", theme="soft") as d
             ]
             gr.JSON(label="Active Policies", value=policies_json)
-        # Tab 5: Enterprise / OSS Info
         with gr.TabItem("Enterprise / OSS"):
             gr.Markdown(f"""
             ## 🚀 ARF {oss_caps['edition'].upper()} Edition

 import gradio as gr
+import asyncio
+import json
+import logging
+import traceback
+import os
 import numpy as np
 import pandas as pd
+from datetime import datetime
+from typing import Dict, Any, List, Optional
 import threading
 import urllib.request
+import time
+from scipy.stats import beta   # only beta is used
 # ----------------------------------------------------------------------
+# Memory monitoring (no external dependencies)
 # ----------------------------------------------------------------------
+def get_memory_usage():
+    """Return current process memory usage in MB (RSS)."""
+    try:
+        import resource  # imported lazily; the ImportError branch below covers platforms without this module
+        rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss  # NOTE(review): ru_maxrss is documented as the *maximum* RSS, not the current one — confirm this is intended
+        if rss < 1e9:  # heuristic: values below 1e9 are scaled as if in KiB, larger ones as if in bytes — unit is platform-dependent, TODO confirm threshold
+            return rss / 1024.0
+        else:
+            return rss / (1024.0 * 1024.0)
+    except ImportError:
+        try:
+            with open("/proc/self/status") as f:
+                for line in f:
+                    if line.startswith("VmRSS:"):
+                        parts = line.split()
+                        if len(parts) >= 2:
+                            return int(parts[1]) / 1024.0  # presumably kB -> MB; verify against the /proc status format
+        except Exception:
+            pass  # best effort: any read/parse failure falls through to the final return
+    return None  # no measurement could be obtained
+def log_memory_usage():  # logs one memory reading, then schedules the next call of itself
+    mem_mb = get_memory_usage()  # None when no measurement is available
+    if mem_mb is not None:
+        logging.info(f"Process memory: {mem_mb:.1f} MB")
+    else:
+        logging.info("Process memory: unknown")
+    threading.Timer(60, log_memory_usage).start()  # re-arm: run again in 60 s on a fresh Timer thread; the timer is never cancelled in this function
 # ----------------------------------------------------------------------
 # Keep‑alive (pings public URL every 5 minutes)
     space_id = os.environ.get('SPACE_ID')
     if space_id:
         url = f"https://{space_id.replace('/', '-')}.hf.space/"
+        logging.info(f"Using external URL for keep‑alive: {url}")
     else:
         url = "http://127.0.0.1:7860/"
+        logging.warning("No SPACE_ID found, using localhost – will not prevent sleep!")
     while True:
         time.sleep(300)
         try:
             with urllib.request.urlopen(url, timeout=10) as response:
                 status = response.getcode()
+                logging.info(f"Keep‑alive ping: {status}")
         except Exception as e:
+            logging.warning(f"Keep‑alive failed: {e}")
 threading.Thread(target=keep_alive, daemon=True).start()
 # ----------------------------------------------------------------------
+# Plotly
 # ----------------------------------------------------------------------
+import plotly.graph_objects as go
+# ----------------------------------------------------------------------
+# Logging
+# ----------------------------------------------------------------------
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')  # NOTE(review): the keep-alive thread is started above and logs via logging.* right away; if it logs first, the root logger may already be configured and this call would have no effect — confirm ordering
+logger = logging.getLogger(__name__)  # module-level logger; the functions visible in this listing log through logging.* (root logger) instead
 # ----------------------------------------------------------------------
+# Bayesian Risk Engine (Beta‑Binomial)
 # ----------------------------------------------------------------------
 class BayesianRiskEngine:  # Beta distribution over a failure probability, updated from failure/success counts
     def __init__(self, alpha=1.0, beta=1.0):  # defaults give Beta(1, 1); the 'beta' parameter shadows the imported scipy 'beta' only inside __init__
         self.alpha = alpha
         self.beta = beta
     def update(self, failures, successes):  # conjugate update: failures are added to alpha, successes to beta
         self.alpha += failures
         self.beta += successes
     def risk(self):  # point estimate: mean of the Beta distribution, alpha / (alpha + beta)
         return self.alpha / (self.alpha + self.beta)
     def risk_interval(self, prob=0.95):  # prob is the total mass of the interval; returns a (low, high) tuple
+        """Return credible interval using scipy.stats.beta."""
+        lo = beta.ppf((1 - prob) / 2, self.alpha, self.beta)  # lower quantile: (1 - prob) / 2 of the mass lies below it
+        hi = beta.ppf((1 + prob) / 2, self.alpha, self.beta)  # upper quantile: (1 - prob) / 2 of the mass lies above it
+        return lo, hi
 # ----------------------------------------------------------------------
+# Policy Engine
 # ----------------------------------------------------------------------
 class PolicyEngine:
     def __init__(self, thresholds={"low": 0.2, "high": 0.8}):
             return "escalate", f"Risk in escalation zone ({self.thresholds['low']}-{self.thresholds['high']})"
 # ----------------------------------------------------------------------
+# History
 # ----------------------------------------------------------------------
+decision_history = []  # entries appended below: (ISO timestamp, decision, risk)
+risk_history = []  # entries appended below: (ISO timestamp, risk)
+def update_dashboard_data(decision, risk):  # record one decision/risk observation in both module-level histories
+    decision_history.append((datetime.utcnow().isoformat(), decision, risk))  # timestamp is a naive UTC datetime rendered as ISO text
+    risk_history.append((datetime.utcnow().isoformat(), risk))  # separate utcnow() call, so this timestamp can differ slightly from the one above
+    if len(decision_history) > 100:  # keep at most 100 entries, dropping the oldest first
+        decision_history.pop(0)
+    if len(risk_history) > 100:  # same 100-entry cap for the risk series
+        risk_history.pop(0)
 def autonomous_control_decision(risk, risk_engine, policy_engine):
     action, reason = policy_engine.evaluate(risk)
     decision = {
     return decision
 # ----------------------------------------------------------------------
+# Infrastructure analysis
+# ----------------------------------------------------------------------
+async def handle_infra_with_governance(fault_type, context_window, session_state):  # declared async but contains no await; context_window is not read in the body
+    fault_map = {  # fault type -> (failures, successes) fed to the risk engine
+        "none": (1, 99),
+        "switch_down": (20, 80),
+        "server_overload": (35, 65),
+        "cascade": (60, 40)
+    }
+    failures, successes = fault_map.get(fault_type, (1, 99))  # unknown fault types fall back to the same counts as "none"
+    severity = "low" if failures < 10 else "medium" if failures < 40 else "high"  # NOTE(review): computed but not read anywhere else in this function
+    risk_engine = BayesianRiskEngine(alpha=1, beta=1)  # fresh engine per call, starting from Beta(1, 1)
+    risk_engine.update(failures, successes)
+    risk = risk_engine.risk()
+    ci_low, ci_high = risk_engine.risk_interval(0.95)
+    policy_engine = PolicyEngine(thresholds={"low": 0.2, "high": 0.8})
+    action, reason = policy_engine.evaluate(risk)  # NOTE: autonomous_control_decision also calls policy_engine.evaluate(risk), so the policy is evaluated twice
+    control_decision = autonomous_control_decision(risk, risk_engine, policy_engine)
+    analysis_result = {
+        "risk": risk,
+        "risk_ci": [ci_low, ci_high],
+        "decision": action,
+        "justification": reason,
+        "healing_actions": ["restart"] if action == "deny" else ["monitor"],  # "restart" only for a "deny" action; every other action yields "monitor"
+        "posterior_parameters": {
+            "alpha": risk_engine.alpha,
+            "beta": risk_engine.beta
+        }
+    }
+    output = {
+        **analysis_result,  # flat analysis fields first, governance details nested below
+        "governance": {
+            "policy_evaluation": {
+                "action": action,
+                "reason": reason,
+                "thresholds": policy_engine.thresholds
+            },
+            "control_plane_decision": control_decision
+        }
+    }
+    return output, session_state  # session_state is passed back unchanged
+# ----------------------------------------------------------------------
+# MCMC (Metropolis‑Hastings) – no scipy needed
 # ----------------------------------------------------------------------
 class MHMCMC:
     def __init__(self, log_target, proposal_sd=0.1):
         self.log_target = log_target
         self.proposal_sd = proposal_sd
         current_log = self.log_target(current)
         accepted = 0
         for i in range(n_samples + burn_in):
             proposal = current + np.random.normal(0, self.proposal_sd, size=len(current))
             proposal_log = self.log_target(proposal)
             accept_prob = min(1, np.exp(proposal_log - current_log))
             if np.random.rand() < accept_prob:
                 current = proposal
         acceptance_rate = accepted / (n_samples + burn_in)
         return samples, acceptance_rate
 def run_hmc_mcmc(samples, warmup):
+    # Generate data: 10 observations with mean 0.5, std 0.2
     data = np.random.normal(0.5, 0.2, 10)
     def log_prior(mu):
+        return -0.5 * (mu ** 2)   # prior N(0,1)
     def log_likelihood(mu):
+        return -0.5 * np.sum(((data - mu) / 0.2) ** 2)
     def log_posterior(mu):
         return log_prior(mu) + log_likelihood(mu)
     sampler = MHMCMC(log_posterior, proposal_sd=0.05)
     mu_samples, acceptance = sampler.sample(samples, initial_state=[0.0], burn_in=warmup)
     mu_samples = mu_samples.flatten()
     mean = np.mean(mu_samples)
     median = np.median(mu_samples)
     credible_interval = np.percentile(mu_samples, [2.5, 97.5])
     fig_trace = go.Figure()
     fig_trace.add_trace(go.Scatter(y=mu_samples, mode='lines', name='μ', line=dict(width=1)))
     fig_trace.update_layout(title="Trace of μ (Metropolis-Hastings)", xaxis_title="Iteration", yaxis_title="μ")
     fig_hist = go.Figure()
     fig_hist.add_trace(go.Histogram(x=mu_samples, nbinsx=50, name='Posterior'))
     fig_hist.update_layout(title="Posterior Distribution of μ", xaxis_title="μ", yaxis_title="Density")
     }
     return summary, fig_trace, fig_hist
 # ----------------------------------------------------------------------
 # Dashboard plots
 # ----------------------------------------------------------------------
     "enterprise_features": ["Real-time HMC (using PyMC)", "Hyperpriors", "Decision Engine"]
 }
+# ----------------------------------------------------------------------
+# Start memory monitoring
+# ----------------------------------------------------------------------
+log_memory_usage()
 # ----------------------------------------------------------------------
 # Gradio UI
 # ----------------------------------------------------------------------
     - **Metropolis-Hastings MCMC** – sampling from a posterior distribution (simulating HMC concepts).
     - **Autonomous control decisions** – based on the current risk estimate.
+    All components are implemented with only `numpy`, `scipy`, and standard libraries.
     """)
     with gr.Tabs():
         with gr.TabItem("Control Plane Dashboard"):
             gr.Markdown("### 🎮 Control Plane")
             with gr.Row():
                 outputs=[control_stats, risk_gauge, decision_pie, action_timeline]
             )
         with gr.TabItem("Infrastructure Reliability"):
             gr.Markdown("### 🏗️ Infrastructure Intent Evaluation with Bayesian Risk")
             infra_state = gr.State(value={})
             with gr.Row():
                 with gr.Column():
                 with gr.Column():
                     infra_output = gr.JSON(label="Analysis Result")
         with gr.TabItem("Deep Analysis (MCMC)"):
             gr.Markdown("### Markov Chain Monte Carlo (Metropolis‑Hastings)")
             with gr.Row():
                 with gr.Column():
                     hmc_samples = gr.Slider(500, 10000, value=5000, step=500, label="Number of Samples")
                 hmc_trace_plot = gr.Plot(label="Trace Plot")
                 hmc_pair_plot = gr.Plot(label="Posterior Histogram")
         with gr.TabItem("Policy Management"):
             gr.Markdown("### 📋 Execution Policies")
             policies_json = [
                 {"name": "Low Risk Policy", "conditions": ["risk < 0.2"], "action": "approve", "priority": 1},
                 {"name": "Medium Risk Policy", "conditions": ["0.2 ≤ risk ≤ 0.8"], "action": "escalate", "priority": 2},
             ]
             gr.JSON(label="Active Policies", value=policies_json)
         with gr.TabItem("Enterprise / OSS"):
             gr.Markdown(f"""
             ## 🚀 ARF {oss_caps['edition'].upper()} Edition