petter2025 commited on
Commit
1080e05
·
verified ·
1 Parent(s): ece2830

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +252 -439
app.py CHANGED
@@ -4,7 +4,6 @@ import json
4
  import logging
5
  import traceback
6
  import os
7
- import torch
8
  import numpy as np
9
  import pandas as pd
10
  from datetime import datetime
@@ -22,179 +21,42 @@ from plotly.subplots import make_subplots
22
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
23
  logger = logging.getLogger(__name__)
24
 
25
- # Global variable for feedback
26
- last_task_category = None
27
-
28
- # ARF components
29
- from agentic_reliability_framework.runtime.engine import EnhancedReliabilityEngine
30
- from agentic_reliability_framework.core.models.event import ReliabilityEvent
31
- from policy_engine import PolicyEngine
32
-
33
- # Custom AI components
34
- from ai_event import AIEvent
35
- from ai_risk_engine import AIRiskEngine
36
- from hallucination_detective import HallucinationDetectiveAgent
37
- from memory_drift_diagnostician import MemoryDriftDiagnosticianAgent
38
- from nli_detector import NLIDetector
39
- from retrieval import SimpleRetriever
40
- from image_detector import ImageQualityDetector
41
- from audio_detector import AudioQualityDetector
42
- from iot_simulator import IoTSimulator
43
- from robotics_diagnostician import RoboticsDiagnostician
44
- from iot_event import IoTEvent
45
-
46
- # ========== Advanced Inference (HMC) ==========
47
- from advanced_inference import HMCAnalyzer
48
-
49
- # ========== Infrastructure Reliability Imports (with fallbacks) ==========
50
- INFRA_DEPS_AVAILABLE = False
51
- try:
52
- from infra_simulator import InfraSimulator
53
- from infra_graph import InfraGraph
54
- from bayesian_model import failure_model as pyro_model
55
- from gnn_predictor import FailureGNN
56
- from ontology_reasoner import InfraOntology
57
- import problog
58
- INFRA_DEPS_AVAILABLE = True
59
- logger.info("Infrastructure reliability modules loaded.")
60
- except ImportError as e:
61
- logger.warning(f"Infrastructure modules not fully available: {e}. The Infrastructure tab will use mock mode.")
62
-
63
  # ----------------------------------------------------------------------
64
- # ARF infrastructure engine
65
  # ----------------------------------------------------------------------
66
- try:
67
- logger.info("Initializing EnhancedReliabilityEngine...")
68
- infra_engine = EnhancedReliabilityEngine()
69
- policy_engine = PolicyEngine()
70
- logger.info("Policy Engine initialized with 5 policies")
71
- except Exception as e:
72
- logger.error(f"Infrastructure engine init failed: {e}")
73
- infra_engine = None
74
- policy_engine = PolicyEngine() # Fallback
 
 
 
75
 
76
  # ----------------------------------------------------------------------
77
- # Text generation model (DialoGPT-small) with logprobs
78
  # ----------------------------------------------------------------------
79
- from transformers import AutoTokenizer, AutoModelForCausalLM
80
- gen_model_name = "microsoft/DialoGPT-small"
81
  try:
82
- tokenizer = AutoTokenizer.from_pretrained(gen_model_name)
83
- model = AutoModelForCausalLM.from_pretrained(gen_model_name)
84
- model.eval()
85
- logger.info(f"Generator {gen_model_name} loaded.")
86
- except Exception as e:
87
- logger.error(f"Generator load failed: {e}")
88
- tokenizer = model = None
89
-
90
- def generate_with_logprobs(prompt, max_new_tokens=100):
91
- """Generate text and return (generated_text, avg_log_prob)."""
92
- if tokenizer is None or model is None:
93
- return "[Model not loaded]", -10.0
94
- inputs = tokenizer(prompt, return_tensors="pt")
95
- with torch.no_grad():
96
- outputs = model.generate(
97
- **inputs,
98
- max_new_tokens=max_new_tokens,
99
- return_dict_in_generate=True,
100
- output_scores=True
101
- )
102
- scores = outputs.scores
103
- log_probs = [torch.log_softmax(score, dim=-1) for score in scores]
104
- generated_ids = outputs.sequences[0][inputs['input_ids'].shape[1]:]
105
- token_log_probs = []
106
- for i, lp in enumerate(log_probs):
107
- token_id = generated_ids[i]
108
- token_log_probs.append(lp[0, token_id].item())
109
- avg_log_prob = sum(token_log_probs) / len(token_log_probs) if token_log_probs else -10.0
110
- generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
111
- return generated_text, avg_log_prob
112
-
113
- # ----------------------------------------------------------------------
114
- # NLI detector
115
- # ----------------------------------------------------------------------
116
- nli_detector = NLIDetector()
117
-
118
- # ----------------------------------------------------------------------
119
- # Retrieval
120
- # ----------------------------------------------------------------------
121
- retriever = SimpleRetriever()
122
-
123
- # ----------------------------------------------------------------------
124
- # Image generation
125
- # ----------------------------------------------------------------------
126
- from diffusers import StableDiffusionPipeline
127
- image_pipe = None
128
- try:
129
- image_pipe = StableDiffusionPipeline.from_pretrained(
130
- "hf-internal-testing/tiny-stable-diffusion-torch",
131
- safety_checker=None
132
- )
133
- if not torch.cuda.is_available():
134
- image_pipe.to("cpu")
135
- logger.info("Image pipeline loaded.")
136
- except Exception as e:
137
- logger.warning(f"Image pipeline load failed (will be disabled): {e}")
138
- image_pipe = None
139
-
140
- # ----------------------------------------------------------------------
141
- # Audio transcription
142
- # ----------------------------------------------------------------------
143
- from transformers import pipeline
144
- audio_pipe = None
145
- try:
146
- audio_pipe = pipeline(
147
- "automatic-speech-recognition",
148
- model="openai/whisper-tiny.en",
149
- device=0 if torch.cuda.is_available() else -1
150
- )
151
- logger.info("Audio pipeline loaded.")
152
- except Exception as e:
153
- logger.warning(f"Audio pipeline load failed (will be disabled): {e}")
154
-
155
- # ----------------------------------------------------------------------
156
- # AI agents
157
- # ----------------------------------------------------------------------
158
- hallucination_detective = HallucinationDetectiveAgent(nli_detector=nli_detector)
159
- memory_drift_diagnostician = MemoryDriftDiagnosticianAgent()
160
- image_quality_detector = ImageQualityDetector()
161
- audio_quality_detector = AudioQualityDetector()
162
- robotics_diagnostician = RoboticsDiagnostician()
163
-
164
- # ----------------------------------------------------------------------
165
- # Bayesian risk engine (now with hyperpriors)
166
- # ----------------------------------------------------------------------
167
- ai_risk_engine = AIRiskEngine()
168
 
169
  # ----------------------------------------------------------------------
170
- # HMC analyzer
171
  # ----------------------------------------------------------------------
172
- hmc_analyzer = HMCAnalyzer()
 
 
173
 
174
  # ----------------------------------------------------------------------
175
- # IoT simulator
176
  # ----------------------------------------------------------------------
177
- iot_sim = IoTSimulator()
178
-
179
- # ----------------------------------------------------------------------
180
- # Infrastructure components
181
- # ----------------------------------------------------------------------
182
- if INFRA_DEPS_AVAILABLE:
183
- infra_sim = InfraSimulator()
184
- infra_graph = InfraGraph(
185
- uri=os.getenv("NEO4J_URI"),
186
- user=os.getenv("NEO4J_USER"),
187
- password=os.getenv("NEO4J_PASSWORD")
188
- )
189
- gnn_model = FailureGNN()
190
- ontology = InfraOntology()
191
- else:
192
- infra_sim = InfraSimulator() if INFRA_DEPS_AVAILABLE else None
193
- infra_graph = None
194
- gnn_model = None
195
- ontology = None
196
-
197
- # ========== Global History for Dashboard ==========
198
  decision_history = [] # list of (timestamp, decision, category)
199
  risk_history = [] # list of (timestamp, mean_risk)
200
 
@@ -207,19 +69,30 @@ def update_dashboard_data(decision: Dict, risk: float):
207
  if len(risk_history) > 100:
208
  risk_history.pop(0)
209
 
210
- # ========== Execution Governance Functions ==========
211
-
 
212
  def evaluate_policies(event_type: str, severity: str, component: str) -> Dict[str, Any]:
213
- """Evaluate policies against an event and return recommended actions."""
 
 
 
214
  try:
215
- actions = policy_engine.evaluate(event_type, severity, component)
 
 
 
 
 
 
 
216
  return {
217
  "timestamp": datetime.utcnow().isoformat(),
218
  "event_type": event_type,
219
  "severity": severity,
220
  "component": component,
221
- "recommended_actions": actions,
222
- "governance_status": "approved" if actions else "blocked"
223
  }
224
  except Exception as e:
225
  logger.error(f"Policy evaluation error: {e}")
@@ -229,6 +102,9 @@ def evaluate_policies(event_type: str, severity: str, component: str) -> Dict[st
229
  "recommended_actions": []
230
  }
231
 
 
 
 
232
  def autonomous_control_decision(analysis_result: Dict[str, Any], risk_threshold: float = 0.7) -> Dict[str, Any]:
233
  """
234
  Make autonomous control decision based on analysis and risk metrics.
@@ -243,183 +119,95 @@ def autonomous_control_decision(analysis_result: Dict[str, Any], risk_threshold:
243
  }
244
 
245
  try:
246
- # Extract risk metrics
247
- risk_metrics = analysis_result.get("risk_metrics", {})
248
- mean_risk = risk_metrics.get("mean", 0.5)
249
- p95_risk = risk_metrics.get("p95", 0.7)
250
 
251
- # Determine risk level
252
- if mean_risk > risk_threshold or p95_risk > risk_threshold:
253
  decision["risk_level"] = "high"
254
  decision["approved"] = False
255
- decision["reason"] = f"Risk exceeds threshold (mean={mean_risk:.2f}, p95={p95_risk:.2f})"
256
- else:
257
  decision["risk_level"] = "low"
258
  decision["approved"] = True
259
  decision["reason"] = "Risk within acceptable limits"
 
 
 
 
260
 
261
- # Generate autonomous actions based on findings
262
- if "hallucination_detection" in analysis_result:
263
- hallu = analysis_result["hallucination_detection"]
264
- if hallu.get("findings", {}).get("is_hallucination"):
265
- decision["actions"].append({
266
- "action": "regenerate",
267
- "params": {"temperature": 0.3},
268
- "reason": "Hallucination detected"
269
- })
270
 
271
- if "memory_drift_detection" in analysis_result:
272
- drift = analysis_result["memory_drift_detection"]
273
- if drift.get("findings", {}).get("drift_detected"):
274
- decision["actions"].append({
275
- "action": "reset_context",
276
- "params": {},
277
- "reason": "Memory drift detected"
278
- })
279
  except Exception as e:
280
  logger.error(f"Control decision error: {e}")
281
  decision["reason"] = f"Error in decision process: {str(e)}"
282
 
283
- update_dashboard_data(decision, analysis_result.get("risk_metrics", {}).get("mean", 0.5))
284
  return decision
285
 
286
- # ========== Async Handlers with Governance ==========
287
-
288
- async def handle_text(task_type, prompt, context_window):
289
- """Handle text generation with governance and control plane decisions."""
290
- global last_task_category
291
- last_task_category = task_type
292
-
293
  try:
294
- logger.info(f"Handling text task: {task_type}, prompt: {prompt[:50]}...")
295
-
296
- # Generate response
297
- response, avg_log_prob = generate_with_logprobs(prompt)
298
- retrieval_score = retriever.get_similarity(prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
 
300
- # Create event
301
- event = AIEvent(
302
- timestamp=datetime.utcnow(),
303
- component="ai",
304
- service_mesh="ai",
305
- latency_p99=0,
306
- error_rate=0.0,
307
- throughput=1,
308
- cpu_util=None,
309
- memory_util=None,
310
- action_category=task_type,
311
- model_name=gen_model_name,
312
- model_version="latest",
313
- prompt=prompt,
314
- response=response,
315
- response_length=len(response),
316
- confidence=float(np.exp(avg_log_prob)),
317
- perplexity=None,
318
- retrieval_scores=[retrieval_score],
319
- user_feedback=None,
320
- latency_ms=0
321
- )
322
 
323
- # Run analysis
324
- hallu_result = await hallucination_detective.analyze(event)
325
- drift_result = await memory_drift_diagnostician.analyze(event, context_window)
326
- risk_metrics = ai_risk_engine.risk_score(task_type)
327
 
328
- # Combine results
329
  analysis_result = {
330
- "response": response,
331
- "avg_log_prob": avg_log_prob,
332
- "confidence": event.confidence,
333
- "retrieval_score": retrieval_score,
334
- "hallucination_detection": hallu_result,
335
- "memory_drift_detection": drift_result,
336
- "risk_metrics": risk_metrics
 
 
337
  }
338
 
339
- # Apply governance and control plane
340
- policy_result = evaluate_policies(
341
- event_type="text_generation",
342
- severity="medium" if hallu_result.get("findings", {}).get("is_hallucination") else "low",
343
- component="ai_service"
344
- )
345
-
346
  control_decision = autonomous_control_decision(analysis_result)
347
 
348
- # Add governance to output
349
- analysis_result["governance"] = {
350
- "policy_evaluation": policy_result,
351
- "control_plane_decision": control_decision
352
- }
353
-
354
- return analysis_result
355
-
356
- except Exception as e:
357
- logger.error(f"Text task error: {e}", exc_info=True)
358
- return {
359
- "error": str(e),
360
- "traceback": traceback.format_exc(),
361
- "governance": {
362
- "policy_evaluation": evaluate_policies("text_generation", "critical", "ai_service"),
363
- "control_plane_decision": {"approved": False, "reason": f"Error: {str(e)}"}
364
- }
365
- }
366
-
367
- async def handle_infra_with_governance(fault_type, context_window, session_state):
368
- """Infrastructure analysis with execution governance."""
369
- if not INFRA_DEPS_AVAILABLE:
370
- return {
371
- "error": "Infrastructure modules not available",
372
- "governance": evaluate_policies("infrastructure", "critical", "system")
373
- }, session_state
374
-
375
- try:
376
- # Initialize simulator
377
- if "sim" not in session_state or session_state["sim"] is None:
378
- session_state["sim"] = InfraSimulator()
379
- sim = session_state["sim"]
380
-
381
- # Inject fault
382
- sim.set_fault(fault_type if fault_type != "none" else None)
383
- components = sim.read_state()
384
-
385
- # Update graph
386
- if infra_graph:
387
- infra_graph.update_from_state(components)
388
-
389
- # Determine severity based on fault
390
- severity = "low"
391
- if fault_type != "none":
392
- severity = "high" if fault_type == "cascade" else "medium"
393
-
394
- # Evaluate policies
395
- policy_result = evaluate_policies(
396
- event_type="infrastructure_failure",
397
- severity=severity,
398
- component="data_center"
399
- )
400
-
401
- # Control plane decision
402
- control_decision = {
403
- "timestamp": datetime.utcnow().isoformat(),
404
- "approved": policy_result["governance_status"] == "approved",
405
- "actions": policy_result["recommended_actions"],
406
- "reason": "Governance approved" if policy_result["governance_status"] == "approved" else "Blocked by policy",
407
- "risk_level": severity
408
- }
409
-
410
- # Combine results
411
  output = {
412
- "topology": components,
413
- "bayesian_risk": {"switch_failure": 0.1, "server_failure": 0.05},
414
- "gnn_predictions": {"at_risk": ["server-1"] if fault_type != "none" else []},
415
- "logic_explanations": "ProbLog analysis complete",
416
- "ontology": ontology.classify("server") if ontology else {"inferred": [], "consistent": True},
417
  "governance": {
418
  "policy_evaluation": policy_result,
419
  "control_plane_decision": control_decision
420
  }
421
  }
422
-
423
  return output, session_state
424
 
425
  except Exception as e:
@@ -427,31 +215,89 @@ async def handle_infra_with_governance(fault_type, context_window, session_state
427
  return {
428
  "error": str(e),
429
  "traceback": traceback.format_exc(),
430
- "governance": evaluate_policies("infrastructure", "critical", "system")
431
  }, session_state
432
 
433
- # ========== HMC Handler ==========
434
- def run_hmc(samples, warmup):
435
- summary = hmc_analyzer.run_inference(num_samples=samples, warmup=warmup)
436
- trace_data = hmc_analyzer.get_trace_data()
437
- fig_trace, fig_pair = None, None
438
- if trace_data:
439
- # Trace plot
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  fig_trace = go.Figure()
441
  for key, vals in trace_data.items():
442
  fig_trace.add_trace(go.Scatter(y=vals, mode='lines', name=key))
443
  fig_trace.update_layout(title="Posterior Traces", xaxis_title="Sample", yaxis_title="Value")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
 
445
- # Pair plot (simplified scatter matrix)
446
- df = pd.DataFrame(trace_data)
447
- fig_pair = go.Figure(data=go.Splom(
448
- dimensions=[dict(label=k, values=df[k]) for k in df.columns],
449
- showupperhalf=False
450
- ))
451
- fig_pair.update_layout(title="Posterior Pair Plot")
452
- return summary, fig_trace, fig_pair
453
-
454
- # ========== Dashboard Plot Generators ==========
455
  def generate_risk_gauge():
456
  if not risk_history:
457
  return go.Figure()
@@ -460,12 +306,15 @@ def generate_risk_gauge():
460
  mode="gauge+number",
461
  value=latest_risk,
462
  title={'text': "Current Risk"},
463
- gauge={'axis': {'range': [0, 1]},
464
- 'bar': {'color': "darkblue"},
465
- 'steps': [
466
- {'range': [0, 0.3], 'color': "lightgreen"},
467
- {'range': [0.3, 0.7], 'color': "yellow"},
468
- {'range': [0.7, 1], 'color': "red"}]}))
 
 
 
469
  return fig
470
 
471
  def generate_decision_pie():
@@ -487,7 +336,6 @@ def generate_action_timeline():
487
  fig.update_layout(title="Autonomous Actions Timeline", xaxis_title="Time", yaxis_title="Approved (1) / Blocked (0)")
488
  return fig
489
 
490
- # ========== Dashboard Refresh Function ==========
491
  def refresh_dashboard():
492
  """Compute latest stats and return updated dashboard components."""
493
  total = len(decision_history)
@@ -508,35 +356,38 @@ def refresh_dashboard():
508
  )
509
 
510
  # ----------------------------------------------------------------------
511
- # Gradio UI with Governance Focus
 
 
 
 
 
512
  # ----------------------------------------------------------------------
513
- with gr.Blocks(title="ARF v4 – Autonomous AI Control Plane", theme="soft") as demo:
514
  gr.Markdown("""
515
- # 🧠 ARF v4 – Autonomous AI Control Plane
516
- **Execution Governance & Neuro‑Symbolic Reliability for Critical Infrastructure**
517
 
518
- This demo shows how ARF provides:
519
- - **Policy‑based Governance** – Automatic evaluation and enforcement
520
- - **Autonomous Control Decisions** – AI-driven remediation actions
521
- - **Neuro‑Symbolic Reasoning** – Combining neural networks with symbolic logic
522
- - **Real‑time Risk Assessment** – Bayesian online learning with hyperpriors
523
- - **Hamiltonian Monte Carlo** – Offline deep pattern discovery
524
  """)
525
 
526
- # Historic Context Window (shared across tabs)
527
- context_window_slider = gr.Slider(1, 200, value=50, step=1, label="Historic Context Window (readings)")
528
-
529
  with gr.Tabs():
530
  # Tab 1: Control Plane Dashboard
531
  with gr.TabItem("Control Plane Dashboard"):
532
- gr.Markdown("### 🎮 Autonomous Control Plane")
533
  with gr.Row():
534
  with gr.Column():
535
  system_status = gr.JSON(label="System Status", value={
536
- "governance_mode": "active",
537
- "policies_loaded": 5,
538
- "autonomous_actions": "enabled",
539
- "risk_threshold": 0.7
 
 
540
  })
541
  with gr.Column():
542
  control_stats = gr.JSON(label="Control Statistics", value={
@@ -552,24 +403,15 @@ with gr.Blocks(title="ARF v4 – Autonomous AI Control Plane", theme="soft") as
552
  action_timeline = gr.Plot(label="Autonomous Actions Timeline")
553
  with gr.Row():
554
  health_score = gr.Number(label="System Health Score", value=85, precision=0)
555
- # Refresh button for dashboard
556
  refresh_dash_btn = gr.Button("Refresh Dashboard")
557
  refresh_dash_btn.click(
558
  fn=refresh_dashboard,
559
  outputs=[control_stats, risk_gauge, decision_pie, action_timeline]
560
  )
561
 
562
- # Tab 2: Text Generation with Governance
563
- with gr.TabItem("Text Generation"):
564
- gr.Markdown("### AI Text Generation with Governance")
565
- text_task = gr.Dropdown(["chat", "code", "summary"], value="chat", label="Task")
566
- text_prompt = gr.Textbox(label="Prompt", value="What is the capital of France?", lines=3)
567
- text_btn = gr.Button("Generate with Governance")
568
- text_output = gr.JSON(label="Analysis with Control Decisions")
569
-
570
- # Tab 3: Infrastructure Reliability with Governance
571
  with gr.TabItem("Infrastructure Reliability"):
572
- gr.Markdown("### Neuro‑Symbolic Infrastructure with Autonomous Control")
573
  infra_state = gr.State(value={})
574
 
575
  with gr.Row():
@@ -579,11 +421,11 @@ with gr.Blocks(title="ARF v4 – Autonomous AI Control Plane", theme="soft") as
579
  value="none",
580
  label="Inject Fault"
581
  )
582
- infra_btn = gr.Button("Run Analysis with Governance")
583
  with gr.Column():
584
  infra_output = gr.JSON(label="Analysis with Control Decisions")
585
 
586
- # Tab 4: Deep Analysis (HMC)
587
  with gr.TabItem("Deep Analysis (HMC)"):
588
  gr.Markdown("### Hamiltonian Monte Carlo – Offline Pattern Discovery")
589
  with gr.Row():
@@ -597,81 +439,54 @@ with gr.Blocks(title="ARF v4 – Autonomous AI Control Plane", theme="soft") as
597
  hmc_trace_plot = gr.Plot(label="Trace Plot")
598
  hmc_pair_plot = gr.Plot(label="Pair Plot")
599
 
600
- # Tab 5: Policy Management
601
  with gr.TabItem("Policy Management"):
602
- gr.Markdown("### 📋 Execution Policies")
603
- policies = gr.JSON(label="Active Policies", value=[
604
- {
605
- "id": "POL-001",
606
- "name": "Hallucination Prevention",
607
- "condition": "confidence < 0.6",
608
- "action": "regenerate",
609
- "severity": "medium"
610
- },
611
- {
612
- "id": "POL-002",
613
- "name": "Infrastructure Cascade",
614
- "condition": "fault_type == 'cascade'",
615
- "action": "isolate_affected",
616
- "severity": "critical"
617
- },
618
- {
619
- "id": "POL-003",
620
- "name": "Memory Drift",
621
- "condition": "drift_detected == true",
622
- "action": "reset_context",
623
- "severity": "low"
624
- },
625
- {
626
- "id": "POL-004",
627
- "name": "High Risk",
628
- "condition": "risk_metrics.mean > 0.7",
629
- "action": "require_approval",
630
- "severity": "high"
631
- },
632
- {
633
- "id": "POL-005",
634
- "name": "Audio Quality",
635
- "condition": "confidence < 0.5",
636
- "action": "request_retry",
637
- "severity": "low"
638
- }
639
- ])
640
-
641
- # Tab 6: Enterprise
642
- with gr.TabItem("Enterprise"):
643
- gr.Markdown("""
644
- ## 🚀 ARF Enterprise – Autonomous Control Plane for Critical Infrastructure
645
 
646
- ### Key Enterprise Features:
647
- - **Execution Governance** Policy‑controlled autonomous actions
648
- - **Audit Trails & Compliance** – Full traceability for SOC2, HIPAA, GDPR
649
- - **Learning Loops** – Models improve over time with your data
650
- - **Multi‑Tenant Control** – Role‑based access and isolation
651
- - **Cloud Integrations** – Azure, AWS, GCP native clients
652
- - **24/7 Support & SLAs** – Enterprise‑grade reliability
653
 
654
- ### Get Started
655
- - 📅 [Book a Demo](https://calendly.com/petter2025us/30min)
656
- - 📧 [Contact Sales](mailto:petter2025us@outlook.com)
 
657
  """)
658
 
659
- # Feedback row
660
  with gr.Row():
661
  feedback_up = gr.Button("👍 Approve Decision")
662
  feedback_down = gr.Button("👎 Reject Decision")
663
  feedback_msg = gr.Textbox(label="Feedback", interactive=False)
664
 
665
  # Wire events
666
- text_btn.click(
667
- fn=lambda task, p, w: asyncio.run(handle_text(task, p, w)),
668
- inputs=[text_task, text_prompt, context_window_slider],
669
- outputs=text_output
670
- )
671
-
672
  infra_btn.click(
673
  fn=lambda f, w, s: asyncio.run(handle_infra_with_governance(f, w, s)),
674
- inputs=[infra_fault, context_window_slider, infra_state],
675
  outputs=[infra_output, infra_state]
676
  )
677
 
@@ -682,11 +497,9 @@ with gr.Blocks(title="ARF v4 – Autonomous AI Control Plane", theme="soft") as
682
  )
683
 
684
  def handle_control_feedback(approved: bool):
685
- global last_task_category
686
- if last_task_category is None:
687
- return "No recent decision to rate"
688
- return f"Control decision {'approved' if approved else 'rejected'} for {last_task_category}"
689
-
690
  feedback_up.click(
691
  fn=lambda: handle_control_feedback(True),
692
  outputs=feedback_msg
 
4
  import logging
5
  import traceback
6
  import os
 
7
  import numpy as np
8
  import pandas as pd
9
  from datetime import datetime
 
21
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
22
  logger = logging.getLogger(__name__)
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  # ----------------------------------------------------------------------
25
+ # OSS Core Imports
26
  # ----------------------------------------------------------------------
27
+ from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine, HealingPolicy
28
+ from agentic_reliability_framework.core.governance.risk_engine import RiskEngine, ActionCategory
29
+ from agentic_reliability_framework.core.governance.intents import (
30
+ InfrastructureIntent, ProvisionResourceIntent, ResourceType, Environment
31
+ )
32
+ from agentic_reliability_framework.core.adapters.azure.azure_simulator import AzureInfrastructureSimulator
33
+ from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction, EventSeverity
34
+ from agentic_reliability_framework.runtime.hmc.hmc_learner import HMCRiskLearner
35
+ from agentic_reliability_framework.core.config.constants import (
36
+ LATENCY_CRITICAL, ERROR_RATE_HIGH, get_oss_capabilities,
37
+ RISK_THRESHOLD_LOW, RISK_THRESHOLD_HIGH # Note: these may need to be added to constants if missing; fallback defined below
38
+ )
39
 
40
  # ----------------------------------------------------------------------
41
+ # Fallback constants if not in OSS constants
42
  # ----------------------------------------------------------------------
 
 
43
  try:
44
+ from agentic_reliability_framework.core.config.constants import RISK_THRESHOLD_LOW, RISK_THRESHOLD_HIGH
45
+ except ImportError:
46
+ RISK_THRESHOLD_LOW = 0.2
47
+ RISK_THRESHOLD_HIGH = 0.8
48
+ logger.info("Using fallback risk thresholds (0.2/0.8)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  # ----------------------------------------------------------------------
51
+ # Infrastructure simulator and engines
52
  # ----------------------------------------------------------------------
53
+ infra_sim = AzureInfrastructureSimulator()
54
+ policy_engine = PolicyEngine() # loads default policies
55
+ risk_engine = RiskEngine(hmc_model_path="hmc_model.json", use_hyperpriors=True)
56
 
57
  # ----------------------------------------------------------------------
58
+ # Global history for dashboard
59
  # ----------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  decision_history = [] # list of (timestamp, decision, category)
61
  risk_history = [] # list of (timestamp, mean_risk)
62
 
 
69
  if len(risk_history) > 100:
70
  risk_history.pop(0)
71
 
72
+ # ----------------------------------------------------------------------
73
+ # Policy evaluation helper
74
+ # ----------------------------------------------------------------------
75
  def evaluate_policies(event_type: str, severity: str, component: str) -> Dict[str, Any]:
76
+ """
77
+ Evaluate policies against an event and return recommended actions.
78
+ Uses OSS PolicyEngine with a minimal ReliabilityEvent.
79
+ """
80
  try:
81
+ event = ReliabilityEvent(
82
+ component=component,
83
+ latency_p99=0.0, # dummy, not used in policy conditions
84
+ error_rate=0.0,
85
+ throughput=1.0,
86
+ severity=EventSeverity(severity)
87
+ )
88
+ actions = policy_engine.evaluate_policies(event)
89
  return {
90
  "timestamp": datetime.utcnow().isoformat(),
91
  "event_type": event_type,
92
  "severity": severity,
93
  "component": component,
94
+ "recommended_actions": [a.value for a in actions if a != HealingAction.NO_ACTION],
95
+ "governance_status": "approved" if actions and actions[0] != HealingAction.NO_ACTION else "blocked"
96
  }
97
  except Exception as e:
98
  logger.error(f"Policy evaluation error: {e}")
 
102
  "recommended_actions": []
103
  }
104
 
105
+ # ----------------------------------------------------------------------
106
+ # Autonomous control decision
107
+ # ----------------------------------------------------------------------
108
  def autonomous_control_decision(analysis_result: Dict[str, Any], risk_threshold: float = 0.7) -> Dict[str, Any]:
109
  """
110
  Make autonomous control decision based on analysis and risk metrics.
 
119
  }
120
 
121
  try:
122
+ # Extract risk metrics (if present)
123
+ risk = analysis_result.get("risk", 0.5)
124
+ p95 = analysis_result.get("risk_p95", risk)
 
125
 
126
+ # Determine risk level using OSS thresholds if available
127
+ if risk > RISK_THRESHOLD_HIGH or p95 > RISK_THRESHOLD_HIGH:
128
  decision["risk_level"] = "high"
129
  decision["approved"] = False
130
+ decision["reason"] = f"Risk exceeds high threshold ({RISK_THRESHOLD_HIGH})"
131
+ elif risk < RISK_THRESHOLD_LOW:
132
  decision["risk_level"] = "low"
133
  decision["approved"] = True
134
  decision["reason"] = "Risk within acceptable limits"
135
+ else:
136
+ decision["risk_level"] = "medium"
137
+ decision["approved"] = False
138
+ decision["reason"] = f"Risk in escalation zone ({RISK_THRESHOLD_LOW}-{RISK_THRESHOLD_HIGH})"
139
 
140
+ # Optionally add actions based on analysis (e.g., if risk is high, suggest mitigation)
141
+ if decision["risk_level"] == "high" and "healing_actions" in analysis_result:
142
+ decision["actions"] = analysis_result["healing_actions"]
 
 
 
 
 
 
143
 
 
 
 
 
 
 
 
 
144
  except Exception as e:
145
  logger.error(f"Control decision error: {e}")
146
  decision["reason"] = f"Error in decision process: {str(e)}"
147
 
148
+ update_dashboard_data(decision, analysis_result.get("risk", 0.5))
149
  return decision
150
 
151
# ----------------------------------------------------------------------
# Infrastructure analysis with governance
# ----------------------------------------------------------------------
async def handle_infra_with_governance(fault_type: str, context_window: int, session_state: Dict) -> tuple:
    """Evaluate an infrastructure intent through the OSS simulator and risk engine.

    Args:
        fault_type: "none" for a healthy baseline; any other value injects a
            simulated failure ("cascade" is treated as high severity,
            everything else as medium).
        context_window: Kept only for signature compatibility with the UI
            wiring; not used by this handler.
        session_state: Gradio session state dict, returned unchanged.

    Returns:
        tuple: (analysis dict including a "governance" section, session_state).
        On failure, the dict carries "error"/"traceback" keys plus a
        critical-severity governance evaluation.
    """
    try:
        # Map the requested fault to a provisioning intent.
        if fault_type == "none":
            intent = ProvisionResourceIntent(
                resource_type=ResourceType.VM,
                environment=Environment.DEVELOPMENT,
                size="Standard_D2s_v3"
            )
            severity = "low"
        else:
            # Simulate a failure by using production environment and risky config.
            intent = ProvisionResourceIntent(
                resource_type=ResourceType.VM,
                environment=Environment.PRODUCTION,
                size="custom_extra_large"
            )
            severity = "high" if fault_type == "cascade" else "medium"

        # Evaluate via simulator.
        healing_intent = infra_sim.evaluate_intent(intent)

        # Extract risk and contributions. For p95 we prefer the hyper-summary
        # inside risk_contributions; otherwise fall back to the point estimate.
        risk = healing_intent.risk_score
        if healing_intent.risk_contributions:
            risk_p95 = healing_intent.risk_contributions.get("hyper_summary", {}).get("p95", risk)
        else:
            risk_p95 = risk

        # Get policy evaluation.
        policy_result = evaluate_policies("infrastructure_failure", severity, "azure")

        # Build analysis result.
        analysis_result = {
            "intent": intent.dict(),
            "healing_intent": healing_intent.dict(),
            "risk": risk,
            "risk_p95": risk_p95,
            "decision": healing_intent.decision,  # "approve", "deny", "escalate"
            "justification": healing_intent.justification,
            "policy_violations": healing_intent.policy_violations,
            "healing_actions": [a.value for a in healing_intent.recommended_actions] if healing_intent.recommended_actions else [],
            "risk_contributions": healing_intent.risk_contributions
        }

        # Apply autonomous control decision.
        control_decision = autonomous_control_decision(analysis_result)

        # Combine with governance.
        output = {
            **analysis_result,
            "governance": {
                "policy_evaluation": policy_result,
                "control_plane_decision": control_decision
            }
        }
        return output, session_state

    except Exception as e:
        # Log before degrading to an error payload so failures stay visible.
        logger.error(f"Infrastructure analysis error: {e}", exc_info=True)
        return {
            "error": str(e),
            "traceback": traceback.format_exc(),
            "governance": evaluate_policies("infrastructure_failure", "critical", "system")
        }, session_state
220
 
221
# ----------------------------------------------------------------------
# HMC analysis using OSS HMCRiskLearner
# ----------------------------------------------------------------------
def run_hmc(samples: int, warmup: int) -> tuple:
    """Train HMCRiskLearner on synthetic incident data and summarize the posterior.

    Args:
        samples: Number of posterior draws per chain.
        warmup: Number of tuning (warm-up) iterations per chain.

    Returns:
        tuple: (summary dict with coefficient summaries and a sample
        prediction, trace figure, pair-plot figure). On failure:
        ({"error": ...}, None, None).
    """
    try:
        # Generate synthetic incident data (fixed seed for reproducibility).
        np.random.seed(42)
        n = 200
        data = []
        for _ in range(n):
            latency = np.random.exponential(200)
            error_rate = np.random.beta(1, 10)
            throughput = np.random.normal(1000, 200)
            cpu = np.random.uniform(0.2, 0.9)
            mem = np.random.uniform(0.3, 0.8)
            # Label an incident when latency or error rate crosses the OSS thresholds.
            target = int(latency > LATENCY_CRITICAL or error_rate > ERROR_RATE_HIGH)
            data.append({
                "latency_p99": latency,
                "error_rate": error_rate,
                "throughput": throughput,
                "cpu_util": cpu,
                "memory_util": mem,
                "target": target
            })
        df = pd.DataFrame(data)

        learner = HMCRiskLearner()
        learner.train(df.to_dict('records'), draws=samples, tune=warmup, chains=2)

        # Feature importance (coefficient summaries); copy so we can extend it.
        summary = dict(learner.get_feature_importance())

        # Posterior predictive for a representative sample point.
        sample_metrics = {
            "latency_p99": 350,
            "error_rate": 0.08,
            "throughput": 900,
            "cpu_util": 0.7,
            "memory_util": 0.6
        }
        summary["sample_prediction"] = learner.predict_risk_summary(sample_metrics)

        # Extract posterior traces for plotting (alpha/beta only, truncated).
        trace_data = {}
        if learner.trace is not None:
            for var in learner.trace.posterior.data_vars:
                if var in ('alpha', 'beta'):
                    vals = learner.trace.posterior[var].values.flatten()
                    trace_data[var] = vals[:1000]  # limit for performance

        # Create trace plot.
        fig_trace = go.Figure()
        for key, vals in trace_data.items():
            fig_trace.add_trace(go.Scatter(y=vals, mode='lines', name=key))
        fig_trace.update_layout(title="Posterior Traces", xaxis_title="Sample", yaxis_title="Value")

        # Create pair plot (simplified scatter matrix of the traced variables).
        fig_pair = go.Figure()
        if trace_data:
            df_trace = pd.DataFrame(trace_data)
            fig_pair = go.Figure(data=go.Splom(
                dimensions=[dict(label=k, values=df_trace[k]) for k in df_trace.columns],
                showupperhalf=False
            ))
            fig_pair.update_layout(title="Posterior Pair Plot")

        return summary, fig_trace, fig_pair

    except Exception as e:
        logger.error(f"HMC analysis error: {e}", exc_info=True)
        return {"error": str(e)}, None, None
297
 
298
# ----------------------------------------------------------------------
# Dashboard plot generators
# ----------------------------------------------------------------------
def generate_risk_gauge():
    """Render the most recent risk score as a threshold-colored gauge.

    Returns:
        plotly Figure: an Indicator gauge of the latest risk, or an empty
        Figure when no risk history has been recorded yet.
    """
    if not risk_history:
        return go.Figure()
    # NOTE(review): assumes risk_history holds plain floats appended by
    # update_dashboard_data — confirm against that helper.
    latest_risk = risk_history[-1]
    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=latest_risk,
        title={'text': "Current Risk"},
        gauge={
            'axis': {'range': [0, 1]},
            'bar': {'color': "darkblue"},
            'steps': [
                # Green/yellow/red bands mirror the OSS decision thresholds.
                {'range': [0, RISK_THRESHOLD_LOW], 'color': "lightgreen"},
                {'range': [RISK_THRESHOLD_LOW, RISK_THRESHOLD_HIGH], 'color': "yellow"},
                {'range': [RISK_THRESHOLD_HIGH, 1], 'color': "red"}
            ]
        }))
    return fig
319
 
320
  def generate_decision_pie():
 
336
  fig.update_layout(title="Autonomous Actions Timeline", xaxis_title="Time", yaxis_title="Approved (1) / Blocked (0)")
337
  return fig
338
 
 
339
  def refresh_dashboard():
340
  """Compute latest stats and return updated dashboard components."""
341
  total = len(decision_history)
 
356
  )
357
 
358
  # ----------------------------------------------------------------------
359
+ # OSS capabilities (for status display)
360
+ # ----------------------------------------------------------------------
361
+ oss_caps = get_oss_capabilities()
362
+
363
+ # ----------------------------------------------------------------------
364
+ # Gradio UI
365
  # ----------------------------------------------------------------------
366
+ with gr.Blocks(title="ARF v4 – OSS Reliability Control Plane", theme="soft") as demo:
367
  gr.Markdown("""
368
+ # 🧠 ARF v4 – OSS Reliability Control Plane
369
+ **Deterministic Probability Thresholding & Hybrid Bayesian Inference**
370
 
371
+ This demo shows the OSS core of ARF:
372
+ - **Policy‑based Governance** – Automatic evaluation and enforcement (advisory mode)
373
+ - **Hybrid Risk Engine** – Conjugate priors + HMC + hyperpriors
374
+ - **Deterministic Thresholds** – Approve (<0.2), Escalate (0.2‑0.8), Deny (>0.8)
375
+ - **Hamiltonian Monte Carlo** – Offline pattern discovery (NUTS)
 
376
  """)
377
 
 
 
 
378
  with gr.Tabs():
379
  # Tab 1: Control Plane Dashboard
380
  with gr.TabItem("Control Plane Dashboard"):
381
+ gr.Markdown("### 🎮 OSS Control Plane")
382
  with gr.Row():
383
  with gr.Column():
384
  system_status = gr.JSON(label="System Status", value={
385
+ "edition": oss_caps["edition"],
386
+ "version": oss_caps["version"],
387
+ "governance_mode": "advisory",
388
+ "policies_loaded": len(policy_engine.policies),
389
+ "risk_threshold_low": RISK_THRESHOLD_LOW,
390
+ "risk_threshold_high": RISK_THRESHOLD_HIGH
391
  })
392
  with gr.Column():
393
  control_stats = gr.JSON(label="Control Statistics", value={
 
403
  action_timeline = gr.Plot(label="Autonomous Actions Timeline")
404
  with gr.Row():
405
  health_score = gr.Number(label="System Health Score", value=85, precision=0)
 
406
  refresh_dash_btn = gr.Button("Refresh Dashboard")
407
  refresh_dash_btn.click(
408
  fn=refresh_dashboard,
409
  outputs=[control_stats, risk_gauge, decision_pie, action_timeline]
410
  )
411
 
412
+ # Tab 2: Infrastructure Reliability with Governance
 
 
 
 
 
 
 
 
413
  with gr.TabItem("Infrastructure Reliability"):
414
+ gr.Markdown("### 🏗️ Infrastructure Intent Evaluation with Autonomous Control")
415
  infra_state = gr.State(value={})
416
 
417
  with gr.Row():
 
421
  value="none",
422
  label="Inject Fault"
423
  )
424
+ infra_btn = gr.Button("Evaluate Intent with Governance")
425
  with gr.Column():
426
  infra_output = gr.JSON(label="Analysis with Control Decisions")
427
 
428
+ # Tab 3: Deep Analysis (HMC)
429
  with gr.TabItem("Deep Analysis (HMC)"):
430
  gr.Markdown("### Hamiltonian Monte Carlo – Offline Pattern Discovery")
431
  with gr.Row():
 
439
  hmc_trace_plot = gr.Plot(label="Trace Plot")
440
  hmc_pair_plot = gr.Plot(label="Pair Plot")
441
 
442
+ # Tab 4: Policy Management
443
  with gr.TabItem("Policy Management"):
444
+ gr.Markdown("### 📋 Execution Policies (from OSS)")
445
+ # Convert policies to JSON‑serializable format
446
+ policies_json = []
447
+ for p in policy_engine.policies:
448
+ policies_json.append({
449
+ "name": p.name,
450
+ "conditions": [{"metric": c.metric, "operator": c.operator, "threshold": c.threshold} for c in p.conditions],
451
+ "actions": [a.value for a in p.actions],
452
+ "priority": p.priority,
453
+ "cool_down_seconds": p.cool_down_seconds,
454
+ "enabled": p.enabled
455
+ })
456
+ policies_display = gr.JSON(label="Active Policies", value=policies_json)
457
+
458
+ # Tab 5: Enterprise / OSS Info
459
+ with gr.TabItem("Enterprise / OSS"):
460
+ gr.Markdown(f"""
461
+ ## 🚀 ARF {oss_caps['edition'].upper()} Edition
462
+
463
+ **Version:** {oss_caps['version']}
464
+ **License:** {oss_caps['license']}
465
+ **Constants Hash:** {oss_caps.get('constants_hash', 'N/A')}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
466
 
467
+ ### OSS Capabilities
468
+ - **Execution modes:** {', '.join(oss_caps['execution']['modes'])}
469
+ - **Max incident history:** {oss_caps['execution']['max_incidents']}
470
+ - **Memory storage:** {oss_caps['memory']['type']}
471
+ - **FAISS index type:** {oss_caps['memory']['faiss_index_type']}
472
+ - **Max incident nodes:** {oss_caps['memory']['max_incident_nodes']}
 
473
 
474
+ ### Enterprise Features (not included)
475
+ {chr(10).join('- ' + f for f in oss_caps.get('enterprise_features', []))}
476
+
477
+ [📅 Book a Demo](https://calendly.com/petter2025us/30min) | [📧 Contact Sales](mailto:petter2025us@outlook.com)
478
  """)
479
 
480
+ # Feedback row (simplified)
481
  with gr.Row():
482
  feedback_up = gr.Button("👍 Approve Decision")
483
  feedback_down = gr.Button("👎 Reject Decision")
484
  feedback_msg = gr.Textbox(label="Feedback", interactive=False)
485
 
486
  # Wire events
 
 
 
 
 
 
487
  infra_btn.click(
488
  fn=lambda f, w, s: asyncio.run(handle_infra_with_governance(f, w, s)),
489
+ inputs=[infra_fault, gr.State(50), infra_state], # context_window not used, but keep for signature
490
  outputs=[infra_output, infra_state]
491
  )
492
 
 
497
  )
498
 
499
  def handle_control_feedback(approved: bool):
500
+ # Simple feedback placeholder
501
+ return f"Feedback recorded: {'approved' if approved else 'rejected'}"
502
+
 
 
503
  feedback_up.click(
504
  fn=lambda: handle_control_feedback(True),
505
  outputs=feedback_msg