petter2025 commited on
Commit
1080e05
·
verified ·
1 Parent(s): ece2830

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +252 -439
app.py CHANGED
@@ -4,7 +4,6 @@ import json
4
  import logging
5
  import traceback
6
  import os
7
- import torch
8
  import numpy as np
9
  import pandas as pd
10
  from datetime import datetime
@@ -22,179 +21,42 @@ from plotly.subplots import make_subplots
22
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
23
  logger = logging.getLogger(__name__)
24
 
25
- # Global variable for feedback
26
- last_task_category = None
27
-
28
- # ARF components
29
- from agentic_reliability_framework.runtime.engine import EnhancedReliabilityEngine
30
- from agentic_reliability_framework.core.models.event import ReliabilityEvent
31
- from policy_engine import PolicyEngine
32
-
33
- # Custom AI components
34
- from ai_event import AIEvent
35
- from ai_risk_engine import AIRiskEngine
36
- from hallucination_detective import HallucinationDetectiveAgent
37
- from memory_drift_diagnostician import MemoryDriftDiagnosticianAgent
38
- from nli_detector import NLIDetector
39
- from retrieval import SimpleRetriever
40
- from image_detector import ImageQualityDetector
41
- from audio_detector import AudioQualityDetector
42
- from iot_simulator import IoTSimulator
43
- from robotics_diagnostician import RoboticsDiagnostician
44
- from iot_event import IoTEvent
45
-
46
- # ========== Advanced Inference (HMC) ==========
47
- from advanced_inference import HMCAnalyzer
48
-
49
- # ========== Infrastructure Reliability Imports (with fallbacks) ==========
50
- INFRA_DEPS_AVAILABLE = False
51
- try:
52
- from infra_simulator import InfraSimulator
53
- from infra_graph import InfraGraph
54
- from bayesian_model import failure_model as pyro_model
55
- from gnn_predictor import FailureGNN
56
- from ontology_reasoner import InfraOntology
57
- import problog
58
- INFRA_DEPS_AVAILABLE = True
59
- logger.info("Infrastructure reliability modules loaded.")
60
- except ImportError as e:
61
- logger.warning(f"Infrastructure modules not fully available: {e}. The Infrastructure tab will use mock mode.")
62
-
63
  # ----------------------------------------------------------------------
64
- # ARF infrastructure engine
65
  # ----------------------------------------------------------------------
66
- try:
67
- logger.info("Initializing EnhancedReliabilityEngine...")
68
- infra_engine = EnhancedReliabilityEngine()
69
- policy_engine = PolicyEngine()
70
- logger.info("Policy Engine initialized with 5 policies")
71
- except Exception as e:
72
- logger.error(f"Infrastructure engine init failed: {e}")
73
- infra_engine = None
74
- policy_engine = PolicyEngine() # Fallback
 
 
 
75
 
76
  # ----------------------------------------------------------------------
77
- # Text generation model (DialoGPT-small) with logprobs
78
  # ----------------------------------------------------------------------
79
- from transformers import AutoTokenizer, AutoModelForCausalLM
80
- gen_model_name = "microsoft/DialoGPT-small"
81
  try:
82
- tokenizer = AutoTokenizer.from_pretrained(gen_model_name)
83
- model = AutoModelForCausalLM.from_pretrained(gen_model_name)
84
- model.eval()
85
- logger.info(f"Generator {gen_model_name} loaded.")
86
- except Exception as e:
87
- logger.error(f"Generator load failed: {e}")
88
- tokenizer = model = None
89
-
90
- def generate_with_logprobs(prompt, max_new_tokens=100):
91
- """Generate text and return (generated_text, avg_log_prob)."""
92
- if tokenizer is None or model is None:
93
- return "[Model not loaded]", -10.0
94
- inputs = tokenizer(prompt, return_tensors="pt")
95
- with torch.no_grad():
96
- outputs = model.generate(
97
- **inputs,
98
- max_new_tokens=max_new_tokens,
99
- return_dict_in_generate=True,
100
- output_scores=True
101
- )
102
- scores = outputs.scores
103
- log_probs = [torch.log_softmax(score, dim=-1) for score in scores]
104
- generated_ids = outputs.sequences[0][inputs['input_ids'].shape[1]:]
105
- token_log_probs = []
106
- for i, lp in enumerate(log_probs):
107
- token_id = generated_ids[i]
108
- token_log_probs.append(lp[0, token_id].item())
109
- avg_log_prob = sum(token_log_probs) / len(token_log_probs) if token_log_probs else -10.0
110
- generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
111
- return generated_text, avg_log_prob
112
-
113
- # ----------------------------------------------------------------------
114
- # NLI detector
115
- # ----------------------------------------------------------------------
116
- nli_detector = NLIDetector()
117
-
118
- # ----------------------------------------------------------------------
119
- # Retrieval
120
- # ----------------------------------------------------------------------
121
- retriever = SimpleRetriever()
122
-
123
- # ----------------------------------------------------------------------
124
- # Image generation
125
- # ----------------------------------------------------------------------
126
- from diffusers import StableDiffusionPipeline
127
- image_pipe = None
128
- try:
129
- image_pipe = StableDiffusionPipeline.from_pretrained(
130
- "hf-internal-testing/tiny-stable-diffusion-torch",
131
- safety_checker=None
132
- )
133
- if not torch.cuda.is_available():
134
- image_pipe.to("cpu")
135
- logger.info("Image pipeline loaded.")
136
- except Exception as e:
137
- logger.warning(f"Image pipeline load failed (will be disabled): {e}")
138
- image_pipe = None
139
-
140
- # ----------------------------------------------------------------------
141
- # Audio transcription
142
- # ----------------------------------------------------------------------
143
- from transformers import pipeline
144
- audio_pipe = None
145
- try:
146
- audio_pipe = pipeline(
147
- "automatic-speech-recognition",
148
- model="openai/whisper-tiny.en",
149
- device=0 if torch.cuda.is_available() else -1
150
- )
151
- logger.info("Audio pipeline loaded.")
152
- except Exception as e:
153
- logger.warning(f"Audio pipeline load failed (will be disabled): {e}")
154
-
155
- # ----------------------------------------------------------------------
156
- # AI agents
157
- # ----------------------------------------------------------------------
158
- hallucination_detective = HallucinationDetectiveAgent(nli_detector=nli_detector)
159
- memory_drift_diagnostician = MemoryDriftDiagnosticianAgent()
160
- image_quality_detector = ImageQualityDetector()
161
- audio_quality_detector = AudioQualityDetector()
162
- robotics_diagnostician = RoboticsDiagnostician()
163
-
164
- # ----------------------------------------------------------------------
165
- # Bayesian risk engine (now with hyperpriors)
166
- # ----------------------------------------------------------------------
167
- ai_risk_engine = AIRiskEngine()
168
 
169
  # ----------------------------------------------------------------------
170
- # HMC analyzer
171
  # ----------------------------------------------------------------------
172
- hmc_analyzer = HMCAnalyzer()
 
 
173
 
174
  # ----------------------------------------------------------------------
175
- # IoT simulator
176
  # ----------------------------------------------------------------------
177
- iot_sim = IoTSimulator()
178
-
179
- # ----------------------------------------------------------------------
180
- # Infrastructure components
181
- # ----------------------------------------------------------------------
182
- if INFRA_DEPS_AVAILABLE:
183
- infra_sim = InfraSimulator()
184
- infra_graph = InfraGraph(
185
- uri=os.getenv("NEO4J_URI"),
186
- user=os.getenv("NEO4J_USER"),
187
- password=os.getenv("NEO4J_PASSWORD")
188
- )
189
- gnn_model = FailureGNN()
190
- ontology = InfraOntology()
191
- else:
192
- infra_sim = InfraSimulator() if INFRA_DEPS_AVAILABLE else None
193
- infra_graph = None
194
- gnn_model = None
195
- ontology = None
196
-
197
- # ========== Global History for Dashboard ==========
198
  decision_history = [] # list of (timestamp, decision, category)
199
  risk_history = [] # list of (timestamp, mean_risk)
200
 
@@ -207,19 +69,30 @@ def update_dashboard_data(decision: Dict, risk: float):
207
  if len(risk_history) > 100:
208
  risk_history.pop(0)
209
 
210
- # ========== Execution Governance Functions ==========
211
-
 
212
  def evaluate_policies(event_type: str, severity: str, component: str) -> Dict[str, Any]:
213
- """Evaluate policies against an event and return recommended actions."""
 
 
 
214
  try:
215
- actions = policy_engine.evaluate(event_type, severity, component)
 
 
 
 
 
 
 
216
  return {
217
  "timestamp": datetime.utcnow().isoformat(),
218
  "event_type": event_type,
219
  "severity": severity,
220
  "component": component,
221
- "recommended_actions": actions,
222
- "governance_status": "approved" if actions else "blocked"
223
  }
224
  except Exception as e:
225
  logger.error(f"Policy evaluation error: {e}")
@@ -229,6 +102,9 @@ def evaluate_policies(event_type: str, severity: str, component: str) -> Dict[st
229
  "recommended_actions": []
230
  }
231
 
 
 
 
232
  def autonomous_control_decision(analysis_result: Dict[str, Any], risk_threshold: float = 0.7) -> Dict[str, Any]:
233
  """
234
  Make autonomous control decision based on analysis and risk metrics.
@@ -243,183 +119,95 @@ def autonomous_control_decision(analysis_result: Dict[str, Any], risk_threshold:
243
  }
244
 
245
  try:
246
- # Extract risk metrics
247
- risk_metrics = analysis_result.get("risk_metrics", {})
248
- mean_risk = risk_metrics.get("mean", 0.5)
249
- p95_risk = risk_metrics.get("p95", 0.7)
250
 
251
- # Determine risk level
252
- if mean_risk > risk_threshold or p95_risk > risk_threshold:
253
  decision["risk_level"] = "high"
254
  decision["approved"] = False
255
- decision["reason"] = f"Risk exceeds threshold (mean={mean_risk:.2f}, p95={p95_risk:.2f})"
256
- else:
257
  decision["risk_level"] = "low"
258
  decision["approved"] = True
259
  decision["reason"] = "Risk within acceptable limits"
 
 
 
 
260
 
261
- # Generate autonomous actions based on findings
262
- if "hallucination_detection" in analysis_result:
263
- hallu = analysis_result["hallucination_detection"]
264
- if hallu.get("findings", {}).get("is_hallucination"):
265
- decision["actions"].append({
266
- "action": "regenerate",
267
- "params": {"temperature": 0.3},
268
- "reason": "Hallucination detected"
269
- })
270
 
271
- if "memory_drift_detection" in analysis_result:
272
- drift = analysis_result["memory_drift_detection"]
273
- if drift.get("findings", {}).get("drift_detected"):
274
- decision["actions"].append({
275
- "action": "reset_context",
276
- "params": {},
277
- "reason": "Memory drift detected"
278
- })
279
  except Exception as e:
280
  logger.error(f"Control decision error: {e}")
281
  decision["reason"] = f"Error in decision process: {str(e)}"
282
 
283
- update_dashboard_data(decision, analysis_result.get("risk_metrics", {}).get("mean", 0.5))
284
  return decision
285
 
286
- # ========== Async Handlers with Governance ==========
287
-
288
- async def handle_text(task_type, prompt, context_window):
289
- """Handle text generation with governance and control plane decisions."""
290
- global last_task_category
291
- last_task_category = task_type
292
-
293
  try:
294
- logger.info(f"Handling text task: {task_type}, prompt: {prompt[:50]}...")
295
-
296
- # Generate response
297
- response, avg_log_prob = generate_with_logprobs(prompt)
298
- retrieval_score = retriever.get_similarity(prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
 
300
- # Create event
301
- event = AIEvent(
302
- timestamp=datetime.utcnow(),
303
- component="ai",
304
- service_mesh="ai",
305
- latency_p99=0,
306
- error_rate=0.0,
307
- throughput=1,
308
- cpu_util=None,
309
- memory_util=None,
310
- action_category=task_type,
311
- model_name=gen_model_name,
312
- model_version="latest",
313
- prompt=prompt,
314
- response=response,
315
- response_length=len(response),
316
- confidence=float(np.exp(avg_log_prob)),
317
- perplexity=None,
318
- retrieval_scores=[retrieval_score],
319
- user_feedback=None,
320
- latency_ms=0
321
- )
322
 
323
- # Run analysis
324
- hallu_result = await hallucination_detective.analyze(event)
325
- drift_result = await memory_drift_diagnostician.analyze(event, context_window)
326
- risk_metrics = ai_risk_engine.risk_score(task_type)
327
 
328
- # Combine results
329
  analysis_result = {
330
- "response": response,
331
- "avg_log_prob": avg_log_prob,
332
- "confidence": event.confidence,
333
- "retrieval_score": retrieval_score,
334
- "hallucination_detection": hallu_result,
335
- "memory_drift_detection": drift_result,
336
- "risk_metrics": risk_metrics
 
 
337
  }
338
 
339
- # Apply governance and control plane
340
- policy_result = evaluate_policies(
341
- event_type="text_generation",
342
- severity="medium" if hallu_result.get("findings", {}).get("is_hallucination") else "low",
343
- component="ai_service"
344
- )
345
-
346
  control_decision = autonomous_control_decision(analysis_result)
347
 
348
- # Add governance to output
349
- analysis_result["governance"] = {
350
- "policy_evaluation": policy_result,
351
- "control_plane_decision": control_decision
352
- }
353
-
354
- return analysis_result
355
-
356
- except Exception as e:
357
- logger.error(f"Text task error: {e}", exc_info=True)
358
- return {
359
- "error": str(e),
360
- "traceback": traceback.format_exc(),
361
- "governance": {
362
- "policy_evaluation": evaluate_policies("text_generation", "critical", "ai_service"),
363
- "control_plane_decision": {"approved": False, "reason": f"Error: {str(e)}"}
364
- }
365
- }
366
-
367
- async def handle_infra_with_governance(fault_type, context_window, session_state):
368
- """Infrastructure analysis with execution governance."""
369
- if not INFRA_DEPS_AVAILABLE:
370
- return {
371
- "error": "Infrastructure modules not available",
372
- "governance": evaluate_policies("infrastructure", "critical", "system")
373
- }, session_state
374
-
375
- try:
376
- # Initialize simulator
377
- if "sim" not in session_state or session_state["sim"] is None:
378
- session_state["sim"] = InfraSimulator()
379
- sim = session_state["sim"]
380
-
381
- # Inject fault
382
- sim.set_fault(fault_type if fault_type != "none" else None)
383
- components = sim.read_state()
384
-
385
- # Update graph
386
- if infra_graph:
387
- infra_graph.update_from_state(components)
388
-
389
- # Determine severity based on fault
390
- severity = "low"
391
- if fault_type != "none":
392
- severity = "high" if fault_type == "cascade" else "medium"
393
-
394
- # Evaluate policies
395
- policy_result = evaluate_policies(
396
- event_type="infrastructure_failure",
397
- severity=severity,
398
- component="data_center"
399
- )
400
-
401
- # Control plane decision
402
- control_decision = {
403
- "timestamp": datetime.utcnow().isoformat(),
404
- "approved": policy_result["governance_status"] == "approved",
405
- "actions": policy_result["recommended_actions"],
406
- "reason": "Governance approved" if policy_result["governance_status"] == "approved" else "Blocked by policy",
407
- "risk_level": severity
408
- }
409
-
410
- # Combine results
411
  output = {
412
- "topology": components,
413
- "bayesian_risk": {"switch_failure": 0.1, "server_failure": 0.05},
414
- "gnn_predictions": {"at_risk": ["server-1"] if fault_type != "none" else []},
415
- "logic_explanations": "ProbLog analysis complete",
416
- "ontology": ontology.classify("server") if ontology else {"inferred": [], "consistent": True},
417
  "governance": {
418
  "policy_evaluation": policy_result,
419
  "control_plane_decision": control_decision
420
  }
421
  }
422
-
423
  return output, session_state
424
 
425
  except Exception as e:
@@ -427,31 +215,89 @@ async def handle_infra_with_governance(fault_type, context_window, session_state
427
  return {
428
  "error": str(e),
429
  "traceback": traceback.format_exc(),
430
- "governance": evaluate_policies("infrastructure", "critical", "system")
431
  }, session_state
432
 
433
- # ========== HMC Handler ==========
434
- def run_hmc(samples, warmup):
435
- summary = hmc_analyzer.run_inference(num_samples=samples, warmup=warmup)
436
- trace_data = hmc_analyzer.get_trace_data()
437
- fig_trace, fig_pair = None, None
438
- if trace_data:
439
- # Trace plot
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  fig_trace = go.Figure()
441
  for key, vals in trace_data.items():
442
  fig_trace.add_trace(go.Scatter(y=vals, mode='lines', name=key))
443
  fig_trace.update_layout(title="Posterior Traces", xaxis_title="Sample", yaxis_title="Value")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
 
445
- # Pair plot (simplified scatter matrix)
446
- df = pd.DataFrame(trace_data)
447
- fig_pair = go.Figure(data=go.Splom(
448
- dimensions=[dict(label=k, values=df[k]) for k in df.columns],
449
- showupperhalf=False
450
- ))
451
- fig_pair.update_layout(title="Posterior Pair Plot")
452
- return summary, fig_trace, fig_pair
453
-
454
- # ========== Dashboard Plot Generators ==========
455
  def generate_risk_gauge():
456
  if not risk_history:
457
  return go.Figure()
@@ -460,12 +306,15 @@ def generate_risk_gauge():
460
  mode="gauge+number",
461
  value=latest_risk,
462
  title={'text': "Current Risk"},
463
- gauge={'axis': {'range': [0, 1]},
464
- 'bar': {'color': "darkblue"},
465
- 'steps': [
466
- {'range': [0, 0.3], 'color': "lightgreen"},
467
- {'range': [0.3, 0.7], 'color': "yellow"},
468
- {'range': [0.7, 1], 'color': "red"}]}))
 
 
 
469
  return fig
470
 
471
  def generate_decision_pie():
@@ -487,7 +336,6 @@ def generate_action_timeline():
487
  fig.update_layout(title="Autonomous Actions Timeline", xaxis_title="Time", yaxis_title="Approved (1) / Blocked (0)")
488
  return fig
489
 
490
- # ========== Dashboard Refresh Function ==========
491
  def refresh_dashboard():
492
  """Compute latest stats and return updated dashboard components."""
493
  total = len(decision_history)
@@ -508,35 +356,38 @@ def refresh_dashboard():
508
  )
509
 
510
  # ----------------------------------------------------------------------
511
- # Gradio UI with Governance Focus
 
 
 
 
 
512
  # ----------------------------------------------------------------------
513
- with gr.Blocks(title="ARF v4 – Autonomous AI Control Plane", theme="soft") as demo:
514
  gr.Markdown("""
515
- # 🧠 ARF v4 – Autonomous AI Control Plane
516
- **Execution Governance & Neuro‑Symbolic Reliability for Critical Infrastructure**
517
 
518
- This demo shows how ARF provides:
519
- - **Policy‑based Governance** – Automatic evaluation and enforcement
520
- - **Autonomous Control Decisions** – AI-driven remediation actions
521
- - **Neuro‑Symbolic Reasoning** – Combining neural networks with symbolic logic
522
- - **Real‑time Risk Assessment** – Bayesian online learning with hyperpriors
523
- - **Hamiltonian Monte Carlo** – Offline deep pattern discovery
524
  """)
525
 
526
- # Historic Context Window (shared across tabs)
527
- context_window_slider = gr.Slider(1, 200, value=50, step=1, label="Historic Context Window (readings)")
528
-
529
  with gr.Tabs():
530
  # Tab 1: Control Plane Dashboard
531
  with gr.TabItem("Control Plane Dashboard"):
532
- gr.Markdown("### 🎮 Autonomous Control Plane")
533
  with gr.Row():
534
  with gr.Column():
535
  system_status = gr.JSON(label="System Status", value={
536
- "governance_mode": "active",
537
- "policies_loaded": 5,
538
- "autonomous_actions": "enabled",
539
- "risk_threshold": 0.7
 
 
540
  })
541
  with gr.Column():
542
  control_stats = gr.JSON(label="Control Statistics", value={
@@ -552,24 +403,15 @@ with gr.Blocks(title="ARF v4 – Autonomous AI Control Plane", theme="soft") as
552
  action_timeline = gr.Plot(label="Autonomous Actions Timeline")
553
  with gr.Row():
554
  health_score = gr.Number(label="System Health Score", value=85, precision=0)
555
- # Refresh button for dashboard
556
  refresh_dash_btn = gr.Button("Refresh Dashboard")
557
  refresh_dash_btn.click(
558
  fn=refresh_dashboard,
559
  outputs=[control_stats, risk_gauge, decision_pie, action_timeline]
560
  )
561
 
562
- # Tab 2: Text Generation with Governance
563
- with gr.TabItem("Text Generation"):
564
- gr.Markdown("### AI Text Generation with Governance")
565
- text_task = gr.Dropdown(["chat", "code", "summary"], value="chat", label="Task")
566
- text_prompt = gr.Textbox(label="Prompt", value="What is the capital of France?", lines=3)
567
- text_btn = gr.Button("Generate with Governance")
568
- text_output = gr.JSON(label="Analysis with Control Decisions")
569
-
570
- # Tab 3: Infrastructure Reliability with Governance
571
  with gr.TabItem("Infrastructure Reliability"):
572
- gr.Markdown("### Neuro‑Symbolic Infrastructure with Autonomous Control")
573
  infra_state = gr.State(value={})
574
 
575
  with gr.Row():
@@ -579,11 +421,11 @@ with gr.Blocks(title="ARF v4 – Autonomous AI Control Plane", theme="soft") as
579
  value="none",
580
  label="Inject Fault"
581
  )
582
- infra_btn = gr.Button("Run Analysis with Governance")
583
  with gr.Column():
584
  infra_output = gr.JSON(label="Analysis with Control Decisions")
585
 
586
- # Tab 4: Deep Analysis (HMC)
587
  with gr.TabItem("Deep Analysis (HMC)"):
588
  gr.Markdown("### Hamiltonian Monte Carlo – Offline Pattern Discovery")
589
  with gr.Row():
@@ -597,81 +439,54 @@ with gr.Blocks(title="ARF v4 – Autonomous AI Control Plane", theme="soft") as
597
  hmc_trace_plot = gr.Plot(label="Trace Plot")
598
  hmc_pair_plot = gr.Plot(label="Pair Plot")
599
 
600
- # Tab 5: Policy Management
601
  with gr.TabItem("Policy Management"):
602
- gr.Markdown("### 📋 Execution Policies")
603
- policies = gr.JSON(label="Active Policies", value=[
604
- {
605
- "id": "POL-001",
606
- "name": "Hallucination Prevention",
607
- "condition": "confidence < 0.6",
608
- "action": "regenerate",
609
- "severity": "medium"
610
- },
611
- {
612
- "id": "POL-002",
613
- "name": "Infrastructure Cascade",
614
- "condition": "fault_type == 'cascade'",
615
- "action": "isolate_affected",
616
- "severity": "critical"
617
- },
618
- {
619
- "id": "POL-003",
620
- "name": "Memory Drift",
621
- "condition": "drift_detected == true",
622
- "action": "reset_context",
623
- "severity": "low"
624
- },
625
- {
626
- "id": "POL-004",
627
- "name": "High Risk",
628
- "condition": "risk_metrics.mean > 0.7",
629
- "action": "require_approval",
630
- "severity": "high"
631
- },
632
- {
633
- "id": "POL-005",
634
- "name": "Audio Quality",
635
- "condition": "confidence < 0.5",
636
- "action": "request_retry",
637
- "severity": "low"
638
- }
639
- ])
640
-
641
- # Tab 6: Enterprise
642
- with gr.TabItem("Enterprise"):
643
- gr.Markdown("""
644
- ## 🚀 ARF Enterprise – Autonomous Control Plane for Critical Infrastructure
645
 
646
- ### Key Enterprise Features:
647
- - **Execution Governance** Policy‑controlled autonomous actions
648
- - **Audit Trails & Compliance** – Full traceability for SOC2, HIPAA, GDPR
649
- - **Learning Loops** – Models improve over time with your data
650
- - **Multi‑Tenant Control** – Role‑based access and isolation
651
- - **Cloud Integrations** – Azure, AWS, GCP native clients
652
- - **24/7 Support & SLAs** – Enterprise‑grade reliability
653
 
654
- ### Get Started
655
- - 📅 [Book a Demo](https://calendly.com/petter2025us/30min)
656
- - 📧 [Contact Sales](mailto:petter2025us@outlook.com)
 
657
  """)
658
 
659
- # Feedback row
660
  with gr.Row():
661
  feedback_up = gr.Button("👍 Approve Decision")
662
  feedback_down = gr.Button("👎 Reject Decision")
663
  feedback_msg = gr.Textbox(label="Feedback", interactive=False)
664
 
665
  # Wire events
666
- text_btn.click(
667
- fn=lambda task, p, w: asyncio.run(handle_text(task, p, w)),
668
- inputs=[text_task, text_prompt, context_window_slider],
669
- outputs=text_output
670
- )
671
-
672
  infra_btn.click(
673
  fn=lambda f, w, s: asyncio.run(handle_infra_with_governance(f, w, s)),
674
- inputs=[infra_fault, context_window_slider, infra_state],
675
  outputs=[infra_output, infra_state]
676
  )
677
 
@@ -682,11 +497,9 @@ with gr.Blocks(title="ARF v4 – Autonomous AI Control Plane", theme="soft") as
682
  )
683
 
684
  def handle_control_feedback(approved: bool):
685
- global last_task_category
686
- if last_task_category is None:
687
- return "No recent decision to rate"
688
- return f"Control decision {'approved' if approved else 'rejected'} for {last_task_category}"
689
-
690
  feedback_up.click(
691
  fn=lambda: handle_control_feedback(True),
692
  outputs=feedback_msg
 
4
  import logging
5
  import traceback
6
  import os
 
7
  import numpy as np
8
  import pandas as pd
9
  from datetime import datetime
 
21
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
22
  logger = logging.getLogger(__name__)
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  # ----------------------------------------------------------------------
25
+ # OSS Core Imports
26
  # ----------------------------------------------------------------------
27
+ from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine, HealingPolicy
28
+ from agentic_reliability_framework.core.governance.risk_engine import RiskEngine, ActionCategory
29
+ from agentic_reliability_framework.core.governance.intents import (
30
+ InfrastructureIntent, ProvisionResourceIntent, ResourceType, Environment
31
+ )
32
+ from agentic_reliability_framework.core.adapters.azure.azure_simulator import AzureInfrastructureSimulator
33
+ from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction, EventSeverity
34
+ from agentic_reliability_framework.runtime.hmc.hmc_learner import HMCRiskLearner
35
+ from agentic_reliability_framework.core.config.constants import (
36
+ LATENCY_CRITICAL, ERROR_RATE_HIGH, get_oss_capabilities,
37
+ RISK_THRESHOLD_LOW, RISK_THRESHOLD_HIGH # Note: these may need to be added to constants if missing; fallback defined below
38
+ )
39
 
40
  # ----------------------------------------------------------------------
41
+ # Fallback constants if not in OSS constants
42
  # ----------------------------------------------------------------------
 
 
43
  try:
44
+ from agentic_reliability_framework.core.config.constants import RISK_THRESHOLD_LOW, RISK_THRESHOLD_HIGH
45
+ except ImportError:
46
+ RISK_THRESHOLD_LOW = 0.2
47
+ RISK_THRESHOLD_HIGH = 0.8
48
+ logger.info("Using fallback risk thresholds (0.2/0.8)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  # ----------------------------------------------------------------------
51
+ # Infrastructure simulator and engines
52
  # ----------------------------------------------------------------------
53
+ infra_sim = AzureInfrastructureSimulator()
54
+ policy_engine = PolicyEngine() # loads default policies
55
+ risk_engine = RiskEngine(hmc_model_path="hmc_model.json", use_hyperpriors=True)
56
 
57
  # ----------------------------------------------------------------------
58
+ # Global history for dashboard
59
  # ----------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  decision_history = [] # list of (timestamp, decision, category)
61
  risk_history = [] # list of (timestamp, mean_risk)
62
 
 
69
  if len(risk_history) > 100:
70
  risk_history.pop(0)
71
 
72
+ # ----------------------------------------------------------------------
73
+ # Policy evaluation helper
74
+ # ----------------------------------------------------------------------
75
  def evaluate_policies(event_type: str, severity: str, component: str) -> Dict[str, Any]:
76
+ """
77
+ Evaluate policies against an event and return recommended actions.
78
+ Uses OSS PolicyEngine with a minimal ReliabilityEvent.
79
+ """
80
  try:
81
+ event = ReliabilityEvent(
82
+ component=component,
83
+ latency_p99=0.0, # dummy, not used in policy conditions
84
+ error_rate=0.0,
85
+ throughput=1.0,
86
+ severity=EventSeverity(severity)
87
+ )
88
+ actions = policy_engine.evaluate_policies(event)
89
  return {
90
  "timestamp": datetime.utcnow().isoformat(),
91
  "event_type": event_type,
92
  "severity": severity,
93
  "component": component,
94
+ "recommended_actions": [a.value for a in actions if a != HealingAction.NO_ACTION],
95
+ "governance_status": "approved" if actions and actions[0] != HealingAction.NO_ACTION else "blocked"
96
  }
97
  except Exception as e:
98
  logger.error(f"Policy evaluation error: {e}")
 
102
  "recommended_actions": []
103
  }
104
 
105
+ # ----------------------------------------------------------------------
106
+ # Autonomous control decision
107
+ # ----------------------------------------------------------------------
108
  def autonomous_control_decision(analysis_result: Dict[str, Any], risk_threshold: float = 0.7) -> Dict[str, Any]:
109
  """
110
  Make autonomous control decision based on analysis and risk metrics.
 
119
  }
120
 
121
  try:
122
+ # Extract risk metrics (if present)
123
+ risk = analysis_result.get("risk", 0.5)
124
+ p95 = analysis_result.get("risk_p95", risk)
 
125
 
126
+ # Determine risk level using OSS thresholds if available
127
+ if risk > RISK_THRESHOLD_HIGH or p95 > RISK_THRESHOLD_HIGH:
128
  decision["risk_level"] = "high"
129
  decision["approved"] = False
130
+ decision["reason"] = f"Risk exceeds high threshold ({RISK_THRESHOLD_HIGH})"
131
+ elif risk < RISK_THRESHOLD_LOW:
132
  decision["risk_level"] = "low"
133
  decision["approved"] = True
134
  decision["reason"] = "Risk within acceptable limits"
135
+ else:
136
+ decision["risk_level"] = "medium"
137
+ decision["approved"] = False
138
+ decision["reason"] = f"Risk in escalation zone ({RISK_THRESHOLD_LOW}-{RISK_THRESHOLD_HIGH})"
139
 
140
+ # Optionally add actions based on analysis (e.g., if risk is high, suggest mitigation)
141
+ if decision["risk_level"] == "high" and "healing_actions" in analysis_result:
142
+ decision["actions"] = analysis_result["healing_actions"]
 
 
 
 
 
 
143
 
 
 
 
 
 
 
 
 
144
  except Exception as e:
145
  logger.error(f"Control decision error: {e}")
146
  decision["reason"] = f"Error in decision process: {str(e)}"
147
 
148
+ update_dashboard_data(decision, analysis_result.get("risk", 0.5))
149
  return decision
150
 
151
# ----------------------------------------------------------------------
# Infrastructure analysis with governance
# ----------------------------------------------------------------------
async def handle_infra_with_governance(fault_type: str, context_window: int, session_state: Dict) -> tuple:
    """Evaluate an infrastructure intent through the OSS simulator and risk engine.

    Args:
        fault_type: "none" for a healthy baseline; any other value injects a
            simulated failure ("cascade" is treated as high severity,
            everything else as medium).
        context_window: Kept only for signature compatibility with the UI
            wiring; not used by this handler.
        session_state: Gradio session state dict, returned unchanged.

    Returns:
        tuple: (analysis dict including a "governance" section, session_state).
        On failure, the dict carries "error"/"traceback" keys plus a
        critical-severity governance evaluation.
    """
    try:
        # Map the requested fault to a provisioning intent.
        if fault_type == "none":
            intent = ProvisionResourceIntent(
                resource_type=ResourceType.VM,
                environment=Environment.DEVELOPMENT,
                size="Standard_D2s_v3"
            )
            severity = "low"
        else:
            # Simulate a failure by using production environment and risky config.
            intent = ProvisionResourceIntent(
                resource_type=ResourceType.VM,
                environment=Environment.PRODUCTION,
                size="custom_extra_large"
            )
            severity = "high" if fault_type == "cascade" else "medium"

        # Evaluate via simulator.
        healing_intent = infra_sim.evaluate_intent(intent)

        # Extract risk and contributions. For p95 we prefer the hyper-summary
        # inside risk_contributions; otherwise fall back to the point estimate.
        risk = healing_intent.risk_score
        if healing_intent.risk_contributions:
            risk_p95 = healing_intent.risk_contributions.get("hyper_summary", {}).get("p95", risk)
        else:
            risk_p95 = risk

        # Get policy evaluation.
        policy_result = evaluate_policies("infrastructure_failure", severity, "azure")

        # Build analysis result.
        analysis_result = {
            "intent": intent.dict(),
            "healing_intent": healing_intent.dict(),
            "risk": risk,
            "risk_p95": risk_p95,
            "decision": healing_intent.decision,  # "approve", "deny", "escalate"
            "justification": healing_intent.justification,
            "policy_violations": healing_intent.policy_violations,
            "healing_actions": [a.value for a in healing_intent.recommended_actions] if healing_intent.recommended_actions else [],
            "risk_contributions": healing_intent.risk_contributions
        }

        # Apply autonomous control decision.
        control_decision = autonomous_control_decision(analysis_result)

        # Combine with governance.
        output = {
            **analysis_result,
            "governance": {
                "policy_evaluation": policy_result,
                "control_plane_decision": control_decision
            }
        }
        return output, session_state

    except Exception as e:
        # Log before degrading to an error payload so failures stay visible.
        logger.error(f"Infrastructure analysis error: {e}", exc_info=True)
        return {
            "error": str(e),
            "traceback": traceback.format_exc(),
            "governance": evaluate_policies("infrastructure_failure", "critical", "system")
        }, session_state
220
 
221
# ----------------------------------------------------------------------
# HMC analysis using OSS HMCRiskLearner
# ----------------------------------------------------------------------
def run_hmc(samples: int, warmup: int) -> tuple:
    """Train HMCRiskLearner on synthetic incident data and summarize the posterior.

    Args:
        samples: Number of posterior draws per chain.
        warmup: Number of tuning (warm-up) iterations per chain.

    Returns:
        tuple: (summary dict with coefficient summaries and a sample
        prediction, trace figure, pair-plot figure). On failure:
        ({"error": ...}, None, None).
    """
    try:
        # Generate synthetic incident data (fixed seed for reproducibility).
        np.random.seed(42)
        n = 200
        data = []
        for _ in range(n):
            latency = np.random.exponential(200)
            error_rate = np.random.beta(1, 10)
            throughput = np.random.normal(1000, 200)
            cpu = np.random.uniform(0.2, 0.9)
            mem = np.random.uniform(0.3, 0.8)
            # Label an incident when latency or error rate crosses the OSS thresholds.
            target = int(latency > LATENCY_CRITICAL or error_rate > ERROR_RATE_HIGH)
            data.append({
                "latency_p99": latency,
                "error_rate": error_rate,
                "throughput": throughput,
                "cpu_util": cpu,
                "memory_util": mem,
                "target": target
            })
        df = pd.DataFrame(data)

        learner = HMCRiskLearner()
        learner.train(df.to_dict('records'), draws=samples, tune=warmup, chains=2)

        # Feature importance (coefficient summaries); copy so we can extend it.
        summary = dict(learner.get_feature_importance())

        # Posterior predictive for a representative sample point.
        sample_metrics = {
            "latency_p99": 350,
            "error_rate": 0.08,
            "throughput": 900,
            "cpu_util": 0.7,
            "memory_util": 0.6
        }
        summary["sample_prediction"] = learner.predict_risk_summary(sample_metrics)

        # Extract posterior traces for plotting (alpha/beta only, truncated).
        trace_data = {}
        if learner.trace is not None:
            for var in learner.trace.posterior.data_vars:
                if var in ('alpha', 'beta'):
                    vals = learner.trace.posterior[var].values.flatten()
                    trace_data[var] = vals[:1000]  # limit for performance

        # Create trace plot.
        fig_trace = go.Figure()
        for key, vals in trace_data.items():
            fig_trace.add_trace(go.Scatter(y=vals, mode='lines', name=key))
        fig_trace.update_layout(title="Posterior Traces", xaxis_title="Sample", yaxis_title="Value")

        # Create pair plot (simplified scatter matrix of the traced variables).
        fig_pair = go.Figure()
        if trace_data:
            df_trace = pd.DataFrame(trace_data)
            fig_pair = go.Figure(data=go.Splom(
                dimensions=[dict(label=k, values=df_trace[k]) for k in df_trace.columns],
                showupperhalf=False
            ))
            fig_pair.update_layout(title="Posterior Pair Plot")

        return summary, fig_trace, fig_pair

    except Exception as e:
        logger.error(f"HMC analysis error: {e}", exc_info=True)
        return {"error": str(e)}, None, None
297
 
298
# ----------------------------------------------------------------------
# Dashboard plot generators
# ----------------------------------------------------------------------
def generate_risk_gauge():
    """Render the most recent risk score as a threshold-colored gauge.

    Returns:
        plotly Figure: an Indicator gauge of the latest risk, or an empty
        Figure when no risk history has been recorded yet.
    """
    if not risk_history:
        return go.Figure()
    # NOTE(review): assumes risk_history holds plain floats appended by
    # update_dashboard_data — confirm against that helper.
    latest_risk = risk_history[-1]
    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=latest_risk,
        title={'text': "Current Risk"},
        gauge={
            'axis': {'range': [0, 1]},
            'bar': {'color': "darkblue"},
            'steps': [
                # Green/yellow/red bands mirror the OSS decision thresholds.
                {'range': [0, RISK_THRESHOLD_LOW], 'color': "lightgreen"},
                {'range': [RISK_THRESHOLD_LOW, RISK_THRESHOLD_HIGH], 'color': "yellow"},
                {'range': [RISK_THRESHOLD_HIGH, 1], 'color': "red"}
            ]
        }))
    return fig
319
 
320
  def generate_decision_pie():
 
336
  fig.update_layout(title="Autonomous Actions Timeline", xaxis_title="Time", yaxis_title="Approved (1) / Blocked (0)")
337
  return fig
338
 
 
339
  def refresh_dashboard():
340
  """Compute latest stats and return updated dashboard components."""
341
  total = len(decision_history)
 
356
  )
357
 
358
  # ----------------------------------------------------------------------
359
+ # OSS capabilities (for status display)
360
+ # ----------------------------------------------------------------------
361
+ oss_caps = get_oss_capabilities()
362
+
363
+ # ----------------------------------------------------------------------
364
+ # Gradio UI
365
  # ----------------------------------------------------------------------
366
+ with gr.Blocks(title="ARF v4 – OSS Reliability Control Plane", theme="soft") as demo:
367
  gr.Markdown("""
368
+ # 🧠 ARF v4 – OSS Reliability Control Plane
369
+ **Deterministic Probability Thresholding & Hybrid Bayesian Inference**
370
 
371
+ This demo shows the OSS core of ARF:
372
+ - **Policy‑based Governance** – Automatic evaluation and enforcement (advisory mode)
373
+ - **Hybrid Risk Engine** – Conjugate priors + HMC + hyperpriors
374
+ - **Deterministic Thresholds** – Approve (<0.2), Escalate (0.2‑0.8), Deny (>0.8)
375
+ - **Hamiltonian Monte Carlo** – Offline pattern discovery (NUTS)
 
376
  """)
377
 
 
 
 
378
  with gr.Tabs():
379
  # Tab 1: Control Plane Dashboard
380
  with gr.TabItem("Control Plane Dashboard"):
381
+ gr.Markdown("### 🎮 OSS Control Plane")
382
  with gr.Row():
383
  with gr.Column():
384
  system_status = gr.JSON(label="System Status", value={
385
+ "edition": oss_caps["edition"],
386
+ "version": oss_caps["version"],
387
+ "governance_mode": "advisory",
388
+ "policies_loaded": len(policy_engine.policies),
389
+ "risk_threshold_low": RISK_THRESHOLD_LOW,
390
+ "risk_threshold_high": RISK_THRESHOLD_HIGH
391
  })
392
  with gr.Column():
393
  control_stats = gr.JSON(label="Control Statistics", value={
 
403
  action_timeline = gr.Plot(label="Autonomous Actions Timeline")
404
  with gr.Row():
405
  health_score = gr.Number(label="System Health Score", value=85, precision=0)
 
406
  refresh_dash_btn = gr.Button("Refresh Dashboard")
407
  refresh_dash_btn.click(
408
  fn=refresh_dashboard,
409
  outputs=[control_stats, risk_gauge, decision_pie, action_timeline]
410
  )
411
 
412
+ # Tab 2: Infrastructure Reliability with Governance
 
 
 
 
 
 
 
 
413
  with gr.TabItem("Infrastructure Reliability"):
414
+ gr.Markdown("### 🏗️ Infrastructure Intent Evaluation with Autonomous Control")
415
  infra_state = gr.State(value={})
416
 
417
  with gr.Row():
 
421
  value="none",
422
  label="Inject Fault"
423
  )
424
+ infra_btn = gr.Button("Evaluate Intent with Governance")
425
  with gr.Column():
426
  infra_output = gr.JSON(label="Analysis with Control Decisions")
427
 
428
+ # Tab 3: Deep Analysis (HMC)
429
  with gr.TabItem("Deep Analysis (HMC)"):
430
  gr.Markdown("### Hamiltonian Monte Carlo – Offline Pattern Discovery")
431
  with gr.Row():
 
439
  hmc_trace_plot = gr.Plot(label="Trace Plot")
440
  hmc_pair_plot = gr.Plot(label="Pair Plot")
441
 
442
+ # Tab 4: Policy Management
443
  with gr.TabItem("Policy Management"):
444
+ gr.Markdown("### 📋 Execution Policies (from OSS)")
445
+ # Convert policies to JSON‑serializable format
446
+ policies_json = []
447
+ for p in policy_engine.policies:
448
+ policies_json.append({
449
+ "name": p.name,
450
+ "conditions": [{"metric": c.metric, "operator": c.operator, "threshold": c.threshold} for c in p.conditions],
451
+ "actions": [a.value for a in p.actions],
452
+ "priority": p.priority,
453
+ "cool_down_seconds": p.cool_down_seconds,
454
+ "enabled": p.enabled
455
+ })
456
+ policies_display = gr.JSON(label="Active Policies", value=policies_json)
457
+
458
+ # Tab 5: Enterprise / OSS Info
459
+ with gr.TabItem("Enterprise / OSS"):
460
+ gr.Markdown(f"""
461
+ ## 🚀 ARF {oss_caps['edition'].upper()} Edition
462
+
463
+ **Version:** {oss_caps['version']}
464
+ **License:** {oss_caps['license']}
465
+ **Constants Hash:** {oss_caps.get('constants_hash', 'N/A')}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
466
 
467
+ ### OSS Capabilities
468
+ - **Execution modes:** {', '.join(oss_caps['execution']['modes'])}
469
+ - **Max incident history:** {oss_caps['execution']['max_incidents']}
470
+ - **Memory storage:** {oss_caps['memory']['type']}
471
+ - **FAISS index type:** {oss_caps['memory']['faiss_index_type']}
472
+ - **Max incident nodes:** {oss_caps['memory']['max_incident_nodes']}
 
473
 
474
+ ### Enterprise Features (not included)
475
+ {chr(10).join('- ' + f for f in oss_caps.get('enterprise_features', []))}
476
+
477
+ [📅 Book a Demo](https://calendly.com/petter2025us/30min) | [📧 Contact Sales](mailto:petter2025us@outlook.com)
478
  """)
479
 
480
+ # Feedback row (simplified)
481
  with gr.Row():
482
  feedback_up = gr.Button("👍 Approve Decision")
483
  feedback_down = gr.Button("👎 Reject Decision")
484
  feedback_msg = gr.Textbox(label="Feedback", interactive=False)
485
 
486
  # Wire events
 
 
 
 
 
 
487
  infra_btn.click(
488
  fn=lambda f, w, s: asyncio.run(handle_infra_with_governance(f, w, s)),
489
+ inputs=[infra_fault, gr.State(50), infra_state], # context_window not used, but keep for signature
490
  outputs=[infra_output, infra_state]
491
  )
492
 
 
497
  )
498
 
499
  def handle_control_feedback(approved: bool):
500
+ # Simple feedback placeholder
501
+ return f"Feedback recorded: {'approved' if approved else 'rejected'}"
502
+
 
 
503
  feedback_up.click(
504
  fn=lambda: handle_control_feedback(True),
505
  outputs=feedback_msg