petter2025 committed on
Commit
3b9c14e
·
verified ·
1 Parent(s): e13adeb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +260 -399
app.py CHANGED
@@ -1,375 +1,251 @@
1
  import gradio as gr
2
- import asyncio
3
- import json
4
- import logging
5
- import traceback
6
- import os
7
  import numpy as np
8
  import pandas as pd
9
- from datetime import datetime
10
- from typing import Dict, Any, List, Optional
 
11
  import threading
12
  import urllib.request
13
- import time
 
 
 
14
 
15
  # ----------------------------------------------------------------------
16
- # Memory monitoring (no external dependencies)
17
  # ----------------------------------------------------------------------
18
- def get_memory_usage():
19
- """Return current process memory usage in MB (RSS)."""
20
- try:
21
- # Try using resource module (Unix-like)
22
- import resource
23
- rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
24
- # On Linux, ru_maxrss is in kilobytes; on macOS, in bytes
25
- if rss < 1e9: # likely kilobytes
26
- return rss / 1024.0 # convert to MB
27
- else:
28
- return rss / (1024.0 * 1024.0) # convert to MB
29
- except ImportError:
30
- # Fallback to reading /proc/self/status (Linux)
31
- try:
32
- with open("/proc/self/status") as f:
33
- for line in f:
34
- if line.startswith("VmRSS:"):
35
- parts = line.split()
36
- if len(parts) >= 2:
37
- # Value in kB
38
- return int(parts[1]) / 1024.0 # convert to MB
39
- except Exception:
40
- pass
41
- return None
42
-
43
- def log_memory_usage():
44
- """Periodically log memory usage to help diagnose timeouts."""
45
- mem_mb = get_memory_usage()
46
- if mem_mb is not None:
47
- logging.info(f"Process memory: {mem_mb:.1f} MB")
48
- else:
49
- logging.info("Process memory: unknown")
50
- # Schedule next check in 60 seconds
51
- threading.Timer(60, log_memory_usage).start()
52
 
53
  # ----------------------------------------------------------------------
54
- # Keep‑alive to prevent idle timeout (external pings)
55
  # ----------------------------------------------------------------------
56
  def keep_alive():
57
- """Periodically ping the public Space URL to prevent idle timeout."""
58
- # Determine the public URL of the Space
59
  space_id = os.environ.get('SPACE_ID')
60
  if space_id:
61
- # Convert "username/space-name" to "username-space-name.hf.space"
62
  url = f"https://{space_id.replace('/', '-')}.hf.space/"
63
- logging.info(f"Using external URL for keep‑alive: {url}")
64
  else:
65
- # Fallback to localhost when running locally
66
  url = "http://127.0.0.1:7860/"
67
- logging.warning("No SPACE_ID found, using localhost for keep‑alive – will not prevent sleep!")
68
-
69
  while True:
70
- time.sleep(300) # 5 minutes
71
  try:
72
  with urllib.request.urlopen(url, timeout=10) as response:
73
  status = response.getcode()
74
- logging.info(f"Keep‑alive ping: {status}")
75
  except Exception as e:
76
- logging.warning(f"Keep‑alive failed: {e}")
77
 
78
- # Start keep‑alive thread (daemon so it exits when main process ends)
79
  threading.Thread(target=keep_alive, daemon=True).start()
80
 
81
  # ----------------------------------------------------------------------
82
- # Plotly for dashboards
83
- # ----------------------------------------------------------------------
84
- import plotly.graph_objects as go
85
- from plotly.subplots import make_subplots
86
-
87
- # ----------------------------------------------------------------------
88
- # Logging setup
89
  # ----------------------------------------------------------------------
90
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
91
- logger = logging.getLogger(__name__)
92
 
93
- # ----------------------------------------------------------------------
94
- # OSS Core Imports
95
- # ----------------------------------------------------------------------
96
- from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine, HealingPolicy
97
- from agentic_reliability_framework.core.governance.risk_engine import RiskEngine, ActionCategory
98
- from agentic_reliability_framework.core.governance.intents import (
99
- InfrastructureIntent, ProvisionResourceIntent, ResourceType, Environment
100
- )
101
- from agentic_reliability_framework.core.governance.azure.azure_simulator import AzureInfrastructureSimulator
102
- from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction, EventSeverity
103
- from agentic_reliability_framework.runtime.hmc.hmc_learner import HMCRiskLearner
104
- from agentic_reliability_framework.core.config.constants import (
105
- LATENCY_CRITICAL, ERROR_RATE_HIGH, get_oss_capabilities
106
- )
107
-
108
- # ----------------------------------------------------------------------
109
- # Fallback constants if not in OSS constants
110
- # ----------------------------------------------------------------------
111
- try:
112
- from agentic_reliability_framework.core.config.constants import RISK_THRESHOLD_LOW, RISK_THRESHOLD_HIGH
113
- except ImportError:
114
- RISK_THRESHOLD_LOW = 0.2
115
- RISK_THRESHOLD_HIGH = 0.8
116
- logger.info("Using fallback risk thresholds (0.2/0.8)")
117
-
118
- # ----------------------------------------------------------------------
119
- # Infrastructure simulator and engines
120
- # ----------------------------------------------------------------------
121
- # Define policy_engine first
122
- policy_engine = PolicyEngine() # loads default policies
123
-
124
- # Now create simulator, passing the policy_engine
125
- infra_sim = AzureInfrastructureSimulator(policy=policy_engine)
126
-
127
- # Risk engine (does not depend on policy_engine)
128
- risk_engine = RiskEngine(hmc_model_path="hmc_model.json", use_hyperpriors=True)
129
-
130
- # ----------------------------------------------------------------------
131
- # Global history for dashboard
132
- # ----------------------------------------------------------------------
133
- decision_history = [] # list of (timestamp, decision, category)
134
- risk_history = [] # list of (timestamp, mean_risk)
135
-
136
- def update_dashboard_data(decision: Dict, risk: float):
137
  decision_history.append((datetime.utcnow().isoformat(), decision, risk))
138
  risk_history.append((datetime.utcnow().isoformat(), risk))
139
- # Keep only last 100
140
  if len(decision_history) > 100:
141
  decision_history.pop(0)
142
  if len(risk_history) > 100:
143
  risk_history.pop(0)
144
 
145
  # ----------------------------------------------------------------------
146
- # Policy evaluation helper
147
  # ----------------------------------------------------------------------
148
- def evaluate_policies(event_type: str, severity: str, component: str) -> Dict[str, Any]:
149
  """
150
- Evaluate policies against an event and return recommended actions.
151
- Uses OSS PolicyEngine with a minimal ReliabilityEvent.
 
 
152
  """
153
- try:
154
- event = ReliabilityEvent(
155
- component=component,
156
- latency_p99=0.0, # dummy, not used in policy conditions
157
- error_rate=0.0,
158
- throughput=1.0,
159
- severity=EventSeverity(severity)
160
- )
161
- actions = policy_engine.evaluate_policies(event)
162
- return {
163
- "timestamp": datetime.utcnow().isoformat(),
164
- "event_type": event_type,
165
- "severity": severity,
166
- "component": component,
167
- "recommended_actions": [a.value for a in actions if a != HealingAction.NO_ACTION],
168
- "governance_status": "approved" if actions and actions[0] != HealingAction.NO_ACTION else "blocked"
169
- }
170
- except Exception as e:
171
- logger.error(f"Policy evaluation error: {e}")
172
- return {
173
- "error": str(e),
174
- "governance_status": "error",
175
- "recommended_actions": []
176
- }
177
 
178
  # ----------------------------------------------------------------------
179
- # Autonomous control decision
180
  # ----------------------------------------------------------------------
181
- def autonomous_control_decision(analysis_result: Dict[str, Any], risk_threshold: float = 0.7) -> Dict[str, Any]:
182
- """
183
- Make autonomous control decision based on analysis and risk metrics.
184
- This simulates an AI Control Plane that can take actions automatically.
185
- """
 
 
 
 
 
 
 
 
 
 
 
 
186
  decision = {
187
  "timestamp": datetime.utcnow().isoformat(),
188
- "approved": False,
189
- "actions": [],
190
- "reason": "",
191
- "risk_level": "unknown"
192
  }
193
-
194
- try:
195
- # Extract risk metrics (if present)
196
- risk = analysis_result.get("risk", 0.5)
197
- p95 = analysis_result.get("risk_p95", risk)
198
-
199
- # Determine risk level using OSS thresholds if available
200
- if risk > RISK_THRESHOLD_HIGH or p95 > RISK_THRESHOLD_HIGH:
201
- decision["risk_level"] = "high"
202
- decision["approved"] = False
203
- decision["reason"] = f"Risk exceeds high threshold ({RISK_THRESHOLD_HIGH})"
204
- elif risk < RISK_THRESHOLD_LOW:
205
- decision["risk_level"] = "low"
206
- decision["approved"] = True
207
- decision["reason"] = "Risk within acceptable limits"
208
- else:
209
- decision["risk_level"] = "medium"
210
- decision["approved"] = False
211
- decision["reason"] = f"Risk in escalation zone ({RISK_THRESHOLD_LOW}-{RISK_THRESHOLD_HIGH})"
212
-
213
- # Optionally add actions based on analysis (e.g., if risk is high, suggest mitigation)
214
- if decision["risk_level"] == "high" and "healing_actions" in analysis_result:
215
- decision["actions"] = analysis_result["healing_actions"]
216
-
217
- except Exception as e:
218
- logger.error(f"Control decision error: {e}")
219
- decision["reason"] = f"Error in decision process: {str(e)}"
220
-
221
- update_dashboard_data(decision, analysis_result.get("risk", 0.5))
222
  return decision
223
 
224
  # ----------------------------------------------------------------------
225
- # Infrastructure analysis with governance
226
- # ----------------------------------------------------------------------
227
- async def handle_infra_with_governance(fault_type: str, context_window: int, session_state: Dict) -> tuple:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  """
229
- Infrastructure analysis using OSS simulator and risk engine.
 
230
  """
231
- try:
232
- # Map fault to an intent
233
- if fault_type == "none":
234
- intent = ProvisionResourceIntent(
235
- resource_type=ResourceType.VM,
236
- environment=Environment.DEVELOPMENT,
237
- size="Standard_D2s_v3"
238
- )
239
- severity = "low"
240
- else:
241
- # Simulate a failure by using production environment and risky config
242
- intent = ProvisionResourceIntent(
243
- resource_type=ResourceType.VM,
244
- environment=Environment.PRODUCTION,
245
- size="custom_extra_large"
246
- )
247
- severity = "high" if fault_type == "cascade" else "medium"
248
-
249
- # Evaluate via simulator
250
- healing_intent = infra_sim.evaluate_intent(intent)
251
-
252
- # Extract risk and contributions
253
- risk = healing_intent.risk_score
254
- # For simplicity, we take p95 from risk_contributions if available; else assume same
255
- risk_p95 = healing_intent.risk_contributions.get("hyper_summary", {}).get("p95", risk) if healing_intent.risk_contributions else risk
256
-
257
- # Get policy evaluation
258
- policy_result = evaluate_policies("infrastructure_failure", severity, "azure")
259
-
260
- # Build analysis result
261
- analysis_result = {
262
- "intent": intent.dict(),
263
- "healing_intent": healing_intent.dict(),
264
- "risk": risk,
265
- "risk_p95": risk_p95,
266
- "decision": healing_intent.decision, # "approve", "deny", "escalate"
267
- "justification": healing_intent.justification,
268
- "policy_violations": healing_intent.policy_violations,
269
- "healing_actions": [a.value for a in healing_intent.recommended_actions] if healing_intent.recommended_actions else [],
270
- "risk_contributions": healing_intent.risk_contributions
271
- }
272
-
273
- # Apply autonomous control decision
274
- control_decision = autonomous_control_decision(analysis_result)
275
-
276
- # Combine with governance
277
- output = {
278
- **analysis_result,
279
- "governance": {
280
- "policy_evaluation": policy_result,
281
- "control_plane_decision": control_decision
282
- }
283
- }
284
- return output, session_state
285
-
286
- except Exception as e:
287
- logger.error(f"Infra task error: {e}", exc_info=True)
288
- return {
289
- "error": str(e),
290
- "traceback": traceback.format_exc(),
291
- "governance": evaluate_policies("infrastructure_failure", "critical", "system")
292
- }, session_state
293
 
294
  # ----------------------------------------------------------------------
295
- # HMC analysis using OSS HMCRiskLearner
296
  # ----------------------------------------------------------------------
297
- def run_hmc(samples: int, warmup: int) -> tuple:
298
- """
299
- Train HMCRiskLearner on synthetic data and return posterior summary + plots.
300
- """
301
- try:
302
- # Generate synthetic incident data
303
- np.random.seed(42)
304
- n = 200
305
- data = []
306
- for _ in range(n):
307
- latency = np.random.exponential(200)
308
- error_rate = np.random.beta(1, 10)
309
- throughput = np.random.normal(1000, 200)
310
- cpu = np.random.uniform(0.2, 0.9)
311
- mem = np.random.uniform(0.3, 0.8)
312
- target = int(latency > LATENCY_CRITICAL or error_rate > ERROR_RATE_HIGH)
313
- data.append({
314
- "latency_p99": latency,
315
- "error_rate": error_rate,
316
- "throughput": throughput,
317
- "cpu_util": cpu,
318
- "memory_util": mem,
319
- "target": target
320
- })
321
- df = pd.DataFrame(data)
322
-
323
- learner = HMCRiskLearner()
324
- learner.train(df.to_dict('records'), draws=samples, tune=warmup, chains=2)
325
-
326
- # Get feature importance (coefficient summaries)
327
- coeffs = learner.get_feature_importance()
328
- summary = {k: v for k, v in coeffs.items()}
329
-
330
- # Posterior predictive for a sample point
331
- sample_metrics = {
332
- "latency_p99": 350,
333
- "error_rate": 0.08,
334
- "throughput": 900,
335
- "cpu_util": 0.7,
336
- "memory_util": 0.6
 
 
 
 
 
 
337
  }
338
- pred_summary = learner.predict_risk_summary(sample_metrics)
339
- summary["sample_prediction"] = pred_summary
340
-
341
- # Extract trace for plotting
342
- trace_data = {}
343
- if learner.trace is not None:
344
- for var in learner.trace.posterior.data_vars:
345
- if var in ['alpha', 'beta']:
346
- vals = learner.trace.posterior[var].values.flatten()
347
- trace_data[var] = vals[:1000] # limit for performance
348
-
349
- # Create trace plot
350
- fig_trace = go.Figure()
351
- for key, vals in trace_data.items():
352
- fig_trace.add_trace(go.Scatter(y=vals, mode='lines', name=key))
353
- fig_trace.update_layout(title="Posterior Traces", xaxis_title="Sample", yaxis_title="Value")
354
-
355
- # Create pair plot (simplified)
356
- fig_pair = go.Figure()
357
- if len(trace_data) > 0:
358
- df_trace = pd.DataFrame(trace_data)
359
- fig_pair = go.Figure(data=go.Splom(
360
- dimensions=[dict(label=k, values=df_trace[k]) for k in df_trace.columns],
361
- showupperhalf=False
362
- ))
363
- fig_pair.update_layout(title="Posterior Pair Plot")
364
-
365
- return summary, fig_trace, fig_pair
366
-
367
- except Exception as e:
368
- logger.error(f"HMC analysis error: {e}", exc_info=True)
369
- return {"error": str(e)}, None, None
370
 
371
  # ----------------------------------------------------------------------
372
- # Dashboard plot generators
373
  # ----------------------------------------------------------------------
374
  def generate_risk_gauge():
375
  if not risk_history:
@@ -383,9 +259,9 @@ def generate_risk_gauge():
383
  'axis': {'range': [0, 1]},
384
  'bar': {'color': "darkblue"},
385
  'steps': [
386
- {'range': [0, RISK_THRESHOLD_LOW], 'color': "lightgreen"},
387
- {'range': [RISK_THRESHOLD_LOW, RISK_THRESHOLD_HIGH], 'color': "yellow"},
388
- {'range': [RISK_THRESHOLD_HIGH, 1], 'color': "red"}
389
  ]
390
  }))
391
  return fig
@@ -410,7 +286,6 @@ def generate_action_timeline():
410
  return fig
411
 
412
  def refresh_dashboard():
413
- """Compute latest stats and return updated dashboard components."""
414
  total = len(decision_history)
415
  approved = sum(1 for _, d, _ in decision_history if d.get("approved", False))
416
  blocked = total - approved
@@ -429,43 +304,47 @@ def refresh_dashboard():
429
  )
430
 
431
  # ----------------------------------------------------------------------
432
- # Start memory monitoring (non‑blocking)
433
- # ----------------------------------------------------------------------
434
- log_memory_usage()
435
-
436
- # ----------------------------------------------------------------------
437
- # OSS capabilities (for status display)
438
  # ----------------------------------------------------------------------
439
- oss_caps = get_oss_capabilities()
 
 
 
 
 
 
 
440
 
441
  # ----------------------------------------------------------------------
442
  # Gradio UI
443
  # ----------------------------------------------------------------------
444
- with gr.Blocks(title="ARF v4 – OSS Reliability Control Plane") as demo:
445
  gr.Markdown("""
446
- # 🧠 ARF v4 – OSS Reliability Control Plane
447
- **Deterministic Probability Thresholding & Hybrid Bayesian Inference**
448
-
449
- This demo shows the OSS core of ARF:
450
- - **Policy‑based Governance** – Automatic evaluation and enforcement (advisory mode)
451
- - **Hybrid Risk Engine** – Conjugate priors + HMC + hyperpriors
452
- - **Deterministic Thresholds** – Approve (<0.2), Escalate (0.2‑0.8), Deny (>0.8)
453
- - **Hamiltonian Monte Carlo** – Offline pattern discovery (NUTS)
 
 
454
  """)
455
 
456
  with gr.Tabs():
457
  # Tab 1: Control Plane Dashboard
458
  with gr.TabItem("Control Plane Dashboard"):
459
- gr.Markdown("### 🎮 OSS Control Plane")
460
  with gr.Row():
461
  with gr.Column():
462
  system_status = gr.JSON(label="System Status", value={
463
  "edition": oss_caps["edition"],
464
  "version": oss_caps["version"],
465
  "governance_mode": "advisory",
466
- "policies_loaded": len(policy_engine.policies),
467
- "risk_threshold_low": RISK_THRESHOLD_LOW,
468
- "risk_threshold_high": RISK_THRESHOLD_HIGH
469
  })
470
  with gr.Column():
471
  control_stats = gr.JSON(label="Control Statistics", value={
@@ -479,19 +358,23 @@ with gr.Blocks(title="ARF v4 – OSS Reliability Control Plane") as demo:
479
  decision_pie = gr.Plot(label="Policy Decisions")
480
  with gr.Row():
481
  action_timeline = gr.Plot(label="Autonomous Actions Timeline")
482
- with gr.Row():
483
- health_score = gr.Number(label="System Health Score", value=85, precision=0)
484
  refresh_dash_btn = gr.Button("Refresh Dashboard")
485
  refresh_dash_btn.click(
486
  fn=refresh_dashboard,
487
  outputs=[control_stats, risk_gauge, decision_pie, action_timeline]
488
  )
489
 
490
- # Tab 2: Infrastructure Reliability with Governance
491
  with gr.TabItem("Infrastructure Reliability"):
492
- gr.Markdown("### 🏗️ Infrastructure Intent Evaluation with Autonomous Control")
 
 
 
 
 
 
 
493
  infra_state = gr.State(value={})
494
-
495
  with gr.Row():
496
  with gr.Column():
497
  infra_fault = gr.Dropdown(
@@ -499,39 +382,38 @@ with gr.Blocks(title="ARF v4 – OSS Reliability Control Plane") as demo:
499
  value="none",
500
  label="Inject Fault"
501
  )
502
- infra_btn = gr.Button("Evaluate Intent with Governance")
503
  with gr.Column():
504
- infra_output = gr.JSON(label="Analysis with Control Decisions")
505
-
506
- # Tab 3: Deep Analysis (HMC)
507
- with gr.TabItem("Deep Analysis (HMC)"):
508
- gr.Markdown("### Hamiltonian Monte Carlo – Offline Pattern Discovery")
 
 
 
 
509
  with gr.Row():
510
  with gr.Column():
511
- hmc_samples = gr.Slider(100, 2000, value=500, step=100, label="Number of Samples")
512
- hmc_warmup = gr.Slider(50, 500, value=200, step=50, label="Warmup Steps")
513
- hmc_run_btn = gr.Button("Run HMC")
514
  with gr.Column():
515
  hmc_summary = gr.JSON(label="Posterior Summary")
516
  with gr.Row():
517
  hmc_trace_plot = gr.Plot(label="Trace Plot")
518
- hmc_pair_plot = gr.Plot(label="Pair Plot")
519
 
520
  # Tab 4: Policy Management
521
  with gr.TabItem("Policy Management"):
522
- gr.Markdown("### 📋 Execution Policies (from OSS)")
523
- # Convert policies to JSON‑serializable format
524
- policies_json = []
525
- for p in policy_engine.policies:
526
- policies_json.append({
527
- "name": p.name,
528
- "conditions": [{"metric": c.metric, "operator": c.operator, "threshold": c.threshold} for c in p.conditions],
529
- "actions": [a.value for a in p.actions],
530
- "priority": p.priority,
531
- "cool_down_seconds": p.cool_down_seconds,
532
- "enabled": p.enabled
533
- })
534
- policies_display = gr.JSON(label="Active Policies", value=policies_json)
535
 
536
  # Tab 5: Enterprise / OSS Info
537
  with gr.TabItem("Enterprise / OSS"):
@@ -540,52 +422,31 @@ with gr.Blocks(title="ARF v4 – OSS Reliability Control Plane") as demo:
540
 
541
  **Version:** {oss_caps['version']}
542
  **License:** {oss_caps['license']}
543
- **Constants Hash:** {oss_caps.get('constants_hash', 'N/A')}
544
 
545
- ### OSS Capabilities
546
- - **Execution modes:** {', '.join(oss_caps['execution']['modes'])}
547
- - **Max incident history:** {oss_caps['execution']['max_incidents']}
548
- - **Memory storage:** {oss_caps['memory']['type']}
549
- - **FAISS index type:** {oss_caps['memory']['faiss_index_type']}
550
- - **Max incident nodes:** {oss_caps['memory']['max_incident_nodes']}
551
 
552
  ### Enterprise Features (not included)
553
- {chr(10).join('- ' + f for f in oss_caps.get('enterprise_features', []))}
554
 
555
  [📅 Book a Demo](https://calendly.com/petter2025us/30min) | [📧 Contact Sales](mailto:petter2025us@outlook.com)
556
  """)
557
 
558
- # Feedback row (simplified)
559
- with gr.Row():
560
- feedback_up = gr.Button("👍 Approve Decision")
561
- feedback_down = gr.Button("👎 Reject Decision")
562
- feedback_msg = gr.Textbox(label="Feedback", interactive=False)
563
-
564
  # Wire events
565
  infra_btn.click(
566
- fn=lambda f, w, s: asyncio.run(handle_infra_with_governance(f, w, s)),
567
- inputs=[infra_fault, gr.State(50), infra_state], # context_window not used, but keep for signature
568
  outputs=[infra_output, infra_state]
569
  )
570
-
571
  hmc_run_btn.click(
572
- fn=run_hmc,
573
  inputs=[hmc_samples, hmc_warmup],
574
  outputs=[hmc_summary, hmc_trace_plot, hmc_pair_plot]
575
  )
576
-
577
- def handle_control_feedback(approved: bool):
578
- # Simple feedback placeholder
579
- return f"Feedback recorded: {'approved' if approved else 'rejected'}"
580
-
581
- feedback_up.click(
582
- fn=lambda: handle_control_feedback(True),
583
- outputs=feedback_msg
584
- )
585
- feedback_down.click(
586
- fn=lambda: handle_control_feedback(False),
587
- outputs=feedback_msg
588
- )
589
 
590
  if __name__ == "__main__":
591
  demo.launch(theme="soft")
 
1
  import gradio as gr
 
 
 
 
 
2
  import numpy as np
3
  import pandas as pd
4
+ import plotly.graph_objects as go
5
+ import random
6
+ import time
7
  import threading
8
  import urllib.request
9
+ import os
10
+ from datetime import datetime
11
+ import logging
12
+ from scipy.stats import beta, norm
13
 
14
  # ----------------------------------------------------------------------
15
+ # Logging
16
  # ----------------------------------------------------------------------
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  # ----------------------------------------------------------------------
21
+ # Keep‑alive (pings public URL every 5 minutes)
22
  # ----------------------------------------------------------------------
23
  def keep_alive():
 
 
24
  space_id = os.environ.get('SPACE_ID')
25
  if space_id:
 
26
  url = f"https://{space_id.replace('/', '-')}.hf.space/"
 
27
  else:
 
28
  url = "http://127.0.0.1:7860/"
 
 
29
  while True:
30
+ time.sleep(300)
31
  try:
32
  with urllib.request.urlopen(url, timeout=10) as response:
33
  status = response.getcode()
34
+ logger.info(f"Keep‑alive ping: {status}")
35
  except Exception as e:
36
+ logger.warning(f"Keep‑alive failed: {e}")
37
 
 
38
  threading.Thread(target=keep_alive, daemon=True).start()
39
 
40
  # ----------------------------------------------------------------------
41
+ # Global history
 
 
 
 
 
 
42
  # ----------------------------------------------------------------------
43
# Rolling histories read by the dashboard plots.
decision_history = []  # (timestamp, decision, risk)
risk_history = []  # (timestamp, risk)

# Maximum number of entries kept in each history list.
_HISTORY_LIMIT = 100


def update_dashboard_data(decision, risk):
    """Record one control decision and its risk in the rolling histories.

    Args:
        decision: Decision payload; stored unchanged in ``decision_history``.
        risk: Risk value stored next to the decision and in ``risk_history``.

    Both lists receive an entry with the same ISO-formatted UTC timestamp
    and are then trimmed in place to the newest ``_HISTORY_LIMIT`` entries.
    """
    # Read the clock once so the two entries for this event carry an
    # identical timestamp (two separate reads differ by microseconds).
    timestamp = datetime.utcnow().isoformat()
    decision_history.append((timestamp, decision, risk))
    risk_history.append((timestamp, risk))
    # Slice deletion removes every surplus entry at once and keeps the same
    # list objects, so other references to the histories stay valid.
    del decision_history[:-_HISTORY_LIMIT]
    del risk_history[:-_HISTORY_LIMIT]
53
 
54
  # ----------------------------------------------------------------------
55
+ # Bayesian Risk Engine (Conjugate Beta-Binomial)
56
  # ----------------------------------------------------------------------
57
class BayesianRiskEngine:
    """
    Implements a Beta-Binomial conjugate prior for binary failure events.
    - Prior: Beta(alpha, beta)
    - Posterior: Beta(alpha + failures, beta + successes)
    - Predictive risk = mean of posterior.
    """

    def __init__(self, alpha=1.0, beta=1.0):
        """Store the Beta prior parameters.

        Args:
            alpha: Prior pseudo-count of failures.
            beta: Prior pseudo-count of successes.

        Raises:
            ValueError: If either parameter is negative.
        """
        if alpha < 0 or beta < 0:
            raise ValueError("alpha and beta must be non-negative")
        self.alpha = alpha
        self.beta = beta

    def update(self, failures, successes):
        """Update posterior with new observations.

        Args:
            failures: Number of observed failures, added to ``alpha``.
            successes: Number of observed successes, added to ``beta``.

        Raises:
            ValueError: If either count is negative; the posterior is left
                untouched in that case.
        """
        # Validate before mutating so a bad call cannot corrupt the state.
        if failures < 0 or successes < 0:
            raise ValueError("failures and successes must be non-negative")
        self.alpha += failures
        self.beta += successes

    def risk(self):
        """Return current risk estimate (mean of posterior)."""
        return self.alpha / (self.alpha + self.beta)

    def risk_interval(self, prob=0.95):
        """Return the equal-tailed credible interval for risk.

        Args:
            prob: Probability mass inside the interval, between 0 and 1.

        Returns:
            A ``(lower, upper)`` tuple of posterior quantiles.

        Raises:
            ValueError: If ``prob`` lies outside [0, 1]; the quantile
                function would otherwise return NaN bounds.
        """
        if not 0 <= prob <= 1:
            raise ValueError("prob must be between 0 and 1")
        lower = beta.ppf((1 - prob) / 2, self.alpha, self.beta)
        upper = beta.ppf((1 + prob) / 2, self.alpha, self.beta)
        return lower, upper

    def risk_distribution(self, x):
        """PDF of the posterior Beta distribution evaluated at ``x``."""
        return beta.pdf(x, self.alpha, self.beta)
 
 
 
 
84
 
85
  # ----------------------------------------------------------------------
86
+ # Policy Engine (threshold-based)
87
  # ----------------------------------------------------------------------
88
class PolicyEngine:
    """Threshold-based policy that maps a risk value to an action."""

    def __init__(self, thresholds=None):
        """Store the decision thresholds.

        Args:
            thresholds: Mapping with ``"low"`` and ``"high"`` keys. When
                omitted, ``{"low": 0.2, "high": 0.8}`` is used.
        """
        # A dict literal in the signature would be created once and shared
        # by every instance, so mutating one engine's thresholds would leak
        # into all the others. Build a fresh dict per instance instead.
        if thresholds is None:
            thresholds = {"low": 0.2, "high": 0.8}
        self.thresholds = thresholds

    def evaluate(self, risk):
        """Return an ``(action, reason)`` pair for the given risk.

        ``"approve"`` below the low threshold, ``"deny"`` above the high
        threshold, ``"escalate"`` for everything in between (both
        thresholds themselves included).
        """
        low = self.thresholds["low"]
        high = self.thresholds["high"]
        if risk < low:
            return "approve", "Risk within safe limits"
        if risk > high:
            return "deny", f"Risk exceeds high threshold ({high})"
        return "escalate", f"Risk in escalation zone ({low}-{high})"
99
+
100
+ # ----------------------------------------------------------------------
101
+ # Autonomous Control Decision (approve/deny based on risk)
102
+ # ----------------------------------------------------------------------
103
def autonomous_control_decision(risk, risk_engine, policy_engine):
    """Turn a risk value into a control-plane decision and record it.

    Args:
        risk: Risk value passed to the policy.
        risk_engine: Accepted for interface compatibility; not read here.
        policy_engine: Object whose ``evaluate(risk)`` returns an
            ``(action, reason)`` pair.

    Returns:
        A dict with ``timestamp``, ``approved``, ``actions``, ``reason`` and
        ``risk_level`` keys. The same dict is passed to
        ``update_dashboard_data`` before it is returned.
    """
    action, reason = policy_engine.evaluate(risk)
    # Derive the level from the policy's own verdict so the two can never
    # disagree (a fixed 0.2/0.8 cut-off does at the boundary and whenever the
    # policy uses other thresholds). The fixed cut-offs remain only as a
    # fallback for an action name this function does not know.
    fallback_level = "low" if risk < 0.2 else "medium" if risk < 0.8 else "high"
    level_by_action = {"approve": "low", "escalate": "medium", "deny": "high"}
    decision = {
        "timestamp": datetime.utcnow().isoformat(),
        "approved": action == "approve",
        "actions": ["escalate_human"] if action == "escalate" else [],
        "reason": reason,
        "risk_level": level_by_action.get(action, fallback_level),
    }
    update_dashboard_data(decision, risk)
    return decision
114
 
115
  # ----------------------------------------------------------------------
116
+ # Simple Metropolis-Hastings MCMC sampler (for HMC tab)
117
+ # ----------------------------------------------------------------------
118
class MHMCMC:
    """A generic Metropolis-Hastings sampler for a target log-posterior."""

    def __init__(self, log_target, proposal_sd=0.1):
        """Store the target and the random-walk step size.

        Args:
            log_target: Callable taking a 1-D state array and returning the
                log density (up to an additive constant).
            proposal_sd: Standard deviation of the Gaussian proposal.
        """
        self.log_target = log_target
        self.proposal_sd = proposal_sd

    def sample(self, n_samples, initial_state, burn_in=0):
        """Run the chain and return the retained draws.

        Args:
            n_samples: Number of draws to keep.
            initial_state: Sequence holding the starting point.
            burn_in: Number of initial iterations to discard.

        Returns:
            ``(samples, acceptance_rate)`` where ``samples`` has shape
            ``(n_samples, len(initial_state))`` and the rate is computed
            over all iterations, burn-in included (0.0 when no iteration
            was run).
        """
        dim = len(initial_state)
        samples = np.zeros((n_samples, dim))
        current = np.array(initial_state, dtype=float)
        current_log = self.log_target(current)
        total_steps = n_samples + burn_in
        accepted = 0
        for i in range(total_steps):
            # Propose
            proposal = current + np.random.normal(0, self.proposal_sd, size=dim)
            proposal_log = self.log_target(proposal)
            log_ratio = proposal_log - current_log
            # Draw on every iteration so the random stream does not depend
            # on which branch decides the move.
            u = np.random.rand()
            # An uphill move is accepted outright, which also avoids
            # exponentiating a large positive number. A NaN ratio fails both
            # comparisons and is rejected; ``min(1, nan)`` would evaluate to
            # 1 and accept it.
            if log_ratio >= 0 or u < np.exp(log_ratio):
                current = proposal
                current_log = proposal_log
                accepted += 1
            if i >= burn_in:
                samples[i - burn_in] = current
        acceptance_rate = accepted / total_steps if total_steps else 0.0
        return samples, acceptance_rate
143
+
144
+ # ----------------------------------------------------------------------
145
+ # HMC analysis (MCMC on a simple model)
146
+ # ----------------------------------------------------------------------
147
def run_hmc_mcmc(samples, warmup):
    """
    Simulate an HMC-like analysis using Metropolis-Hastings.
    Target: posterior of a Normal distribution with unknown mean.

    Args:
        samples: Number of posterior draws to keep.
        warmup: Number of burn-in iterations to discard.

    Returns:
        ``(summary, fig_trace, fig_hist)``: a dict of posterior statistics,
        a trace plot of the draws, and a histogram of the draws.
    """
    # The values may arrive as floats from the UI; array shapes and loop
    # counts need integers.
    n_draws = int(samples)
    n_warmup = int(warmup)

    # Generate some data: 10 points drawn from Normal(0.5, 0.2)
    sigma = 0.2
    data = np.random.normal(0.5, sigma, 10)

    def log_posterior(theta):
        # The sampler passes a length-1 state array; work on the scalar.
        mu = theta[0]
        log_prior = -0.5 * (mu ** 2)  # Normal(0, 1) on mu, constant dropped
        log_likelihood = -0.5 * np.sum(((data - mu) / sigma) ** 2)  # constant dropped
        return log_prior + log_likelihood

    # Run MCMC
    sampler = MHMCMC(log_posterior, proposal_sd=0.05)
    chain, acceptance = sampler.sample(n_draws, initial_state=[0.0], burn_in=n_warmup)

    # Summary
    mu_samples = chain.flatten()
    interval_low, interval_high = np.percentile(mu_samples, [2.5, 97.5])

    # Trace plot
    fig_trace = go.Figure()
    fig_trace.add_trace(go.Scatter(y=mu_samples, mode='lines', name='μ', line=dict(width=1)))
    fig_trace.update_layout(title="Trace of μ (Metropolis-Hastings)", xaxis_title="Iteration", yaxis_title="μ")

    # Histogram, normalised so the y-axis really is the density its label
    # claims rather than raw bin counts.
    fig_hist = go.Figure()
    fig_hist.add_trace(go.Histogram(x=mu_samples, nbinsx=50, name='Posterior',
                                    histnorm='probability density'))
    fig_hist.update_layout(title="Posterior Distribution of μ", xaxis_title="μ", yaxis_title="Density")

    summary = {
        # Plain floats keep the payload independent of NumPy scalar types.
        "mean": float(np.mean(mu_samples)),
        "median": float(np.median(mu_samples)),
        "credible_interval_95": f"[{interval_low:.3f}, {interval_high:.3f}]",
        "acceptance_rate": f"{acceptance:.2%}"
    }
    return summary, fig_trace, fig_hist
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
  # ----------------------------------------------------------------------
195
+ # Infrastructure Analysis (uses BayesianRiskEngine)
196
  # ----------------------------------------------------------------------
197
async def handle_infra_with_governance(fault_type, context_window, session_state):
    """Evaluate a simulated infrastructure fault and attach governance data.

    Args:
        fault_type: Name of the injected fault; names missing from the
            table below fall back to the ``"none"`` observations.
        context_window: Accepted for interface compatibility; not read here.
        session_state: Returned unchanged.

    Returns:
        ``(output, session_state)`` where ``output`` holds the risk
        estimate, its 95% credible interval, the policy action and reason,
        suggested healing actions, the posterior parameters, and a
        ``governance`` section with the policy evaluation and the recorded
        control-plane decision.
    """
    # Map fault to simulated observations (failures, successes)
    fault_map = {
        "none": (1, 99),
        "switch_down": (20, 80),
        "server_overload": (35, 65),
        "cascade": (60, 40)
    }
    failures, successes = fault_map.get(fault_type, (1, 99))

    # Start from a uniform Beta(1,1) prior and fold in the observations.
    risk_engine = BayesianRiskEngine(alpha=1, beta=1)
    risk_engine.update(failures, successes)
    risk = risk_engine.risk()
    ci_low, ci_high = risk_engine.risk_interval(0.95)

    # Policy evaluation
    policy_engine = PolicyEngine(thresholds={"low": 0.2, "high": 0.8})
    action, reason = policy_engine.evaluate(risk)

    # Autonomous decision (this call also records the decision for the dashboard)
    control_decision = autonomous_control_decision(risk, risk_engine, policy_engine)

    # Build output
    analysis_result = {
        "risk": risk,
        # Plain floats keep the payload independent of the numeric type
        # returned by the interval computation.
        "risk_ci": [float(ci_low), float(ci_high)],
        "decision": action,
        "justification": reason,
        "healing_actions": ["restart"] if action == "deny" else ["monitor"],
        "posterior_parameters": {
            "alpha": risk_engine.alpha,
            "beta": risk_engine.beta
        }
    }
    output = {
        **analysis_result,
        "governance": {
            "policy_evaluation": {
                "action": action,
                "reason": reason,
                "thresholds": policy_engine.thresholds
            },
            "control_plane_decision": control_decision
        }
    }
    return output, session_state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
  # ----------------------------------------------------------------------
248
+ # Dashboard plots
249
  # ----------------------------------------------------------------------
250
  def generate_risk_gauge():
251
  if not risk_history:
 
259
  'axis': {'range': [0, 1]},
260
  'bar': {'color': "darkblue"},
261
  'steps': [
262
+ {'range': [0, 0.2], 'color': "lightgreen"},
263
+ {'range': [0.2, 0.8], 'color': "yellow"},
264
+ {'range': [0.8, 1], 'color': "red"}
265
  ]
266
  }))
267
  return fig
 
286
  return fig
287
 
288
  def refresh_dashboard():
 
289
  total = len(decision_history)
290
  approved = sum(1 for _, d, _ in decision_history if d.get("approved", False))
291
  blocked = total - approved
 
304
  )
305
 
306
  # ----------------------------------------------------------------------
307
+ # OSS capabilities (mocked)
 
 
 
 
 
308
  # ----------------------------------------------------------------------
309
# Static descriptor of the demo edition. The UI reads "edition" and "version"
# for the system-status panel and "version", "license" and
# "enterprise_features" for the Enterprise / OSS tab.
oss_caps = dict(
    edition="OSS (Demo)",
    version="4.0.0-bayesian",
    license="Apache 2.0",
    execution=dict(modes=["advisory"], max_incidents=100),
    memory=dict(
        type="in-memory",
        faiss_index_type="flat",
        max_incident_nodes=100,
    ),
    enterprise_features=[
        "Real-time HMC (using PyMC)",
        "Hyperpriors",
        "Decision Engine",
    ],
)
317
 
318
  # ----------------------------------------------------------------------
319
  # Gradio UI
320
  # ----------------------------------------------------------------------
321
+ with gr.Blocks(title="ARF v4 – Bayesian Risk Scoring Demo", theme="soft") as demo:
322
  gr.Markdown("""
323
+ # 🧠 ARF v4 – Bayesian Risk Scoring for AI Reliability (Demo)
324
+ **Mathematically rigorous risk estimation using conjugate priors and MCMC**
325
+
326
+ This demo showcases:
327
+ - **Bayesian conjugate prior (Beta-Binomial)** – online risk update from observed failures/successes.
328
+ - **Policy thresholds** – approve (<0.2), escalate (0.2‑0.8), deny (>0.8).
329
+ - **Metropolis-Hastings MCMC** – sampling from a posterior distribution (simulating HMC concepts).
330
+ - **Autonomous control decisions** – based on the current risk estimate.
331
+
332
+ All components are implemented from first principles using only `numpy` and standard libraries.
333
  """)
334
 
335
  with gr.Tabs():
336
  # Tab 1: Control Plane Dashboard
337
  with gr.TabItem("Control Plane Dashboard"):
338
+ gr.Markdown("### 🎮 Control Plane")
339
  with gr.Row():
340
  with gr.Column():
341
  system_status = gr.JSON(label="System Status", value={
342
  "edition": oss_caps["edition"],
343
  "version": oss_caps["version"],
344
  "governance_mode": "advisory",
345
+ "policies_loaded": 2,
346
+ "risk_threshold_low": 0.2,
347
+ "risk_threshold_high": 0.8
348
  })
349
  with gr.Column():
350
  control_stats = gr.JSON(label="Control Statistics", value={
 
358
  decision_pie = gr.Plot(label="Policy Decisions")
359
  with gr.Row():
360
  action_timeline = gr.Plot(label="Autonomous Actions Timeline")
 
 
361
  refresh_dash_btn = gr.Button("Refresh Dashboard")
362
  refresh_dash_btn.click(
363
  fn=refresh_dashboard,
364
  outputs=[control_stats, risk_gauge, decision_pie, action_timeline]
365
  )
366
 
367
+ # Tab 2: Infrastructure Reliability (Bayesian Risk Update)
368
  with gr.TabItem("Infrastructure Reliability"):
369
+ gr.Markdown("### 🏗️ Infrastructure Intent Evaluation with Bayesian Risk")
370
+ gr.Markdown("""
371
+ This tab simulates evaluating an infrastructure change.
372
+ The risk is computed using a **Beta-Binomial conjugate prior**:
373
+ - Prior: Beta(α=1, β=1) (uniform)
374
+ - Posterior: Beta(α + failures, β + successes)
375
+ - Risk = mean of posterior
376
+ """)
377
  infra_state = gr.State(value={})
 
378
  with gr.Row():
379
  with gr.Column():
380
  infra_fault = gr.Dropdown(
 
382
  value="none",
383
  label="Inject Fault"
384
  )
385
+ infra_btn = gr.Button("Evaluate Intent")
386
  with gr.Column():
387
+ infra_output = gr.JSON(label="Analysis Result")
388
+
389
+ # Tab 3: Deep Analysis (MCMC)
390
+ with gr.TabItem("Deep Analysis (MCMC)"):
391
+ gr.Markdown("### Markov Chain Monte Carlo (Metropolis‑Hastings)")
392
+ gr.Markdown("""
393
+ This sampler approximates the posterior distribution of a **normal mean** given 10 observations.
394
+ It demonstrates how MCMC can be used for Bayesian inference without external libraries.
395
+ """)
396
  with gr.Row():
397
  with gr.Column():
398
+ hmc_samples = gr.Slider(500, 10000, value=5000, step=500, label="Number of Samples")
399
+ hmc_warmup = gr.Slider(100, 2000, value=1000, step=100, label="Burn‑in Steps")
400
+ hmc_run_btn = gr.Button("Run MCMC")
401
  with gr.Column():
402
  hmc_summary = gr.JSON(label="Posterior Summary")
403
  with gr.Row():
404
  hmc_trace_plot = gr.Plot(label="Trace Plot")
405
+ hmc_pair_plot = gr.Plot(label="Posterior Histogram")
406
 
407
  # Tab 4: Policy Management
408
  with gr.TabItem("Policy Management"):
409
+ gr.Markdown("### 📋 Execution Policies")
410
+ gr.Markdown("Policies define risk thresholds for autonomous actions.")
411
+ policies_json = [
412
+ {"name": "Low Risk Policy", "conditions": ["risk < 0.2"], "action": "approve", "priority": 1},
413
+ {"name": "Medium Risk Policy", "conditions": ["0.2 ≤ risk ≤ 0.8"], "action": "escalate", "priority": 2},
414
+ {"name": "High Risk Policy", "conditions": ["risk > 0.8"], "action": "deny", "priority": 3}
415
+ ]
416
+ gr.JSON(label="Active Policies", value=policies_json)
 
 
 
 
 
417
 
418
  # Tab 5: Enterprise / OSS Info
419
  with gr.TabItem("Enterprise / OSS"):
 
422
 
423
  **Version:** {oss_caps['version']}
424
  **License:** {oss_caps['license']}
 
425
 
426
+ ### OSS Capabilities (Demo)
427
+ - **Bayesian conjugate prior** Beta-Binomial risk scoring
428
+ - **Policy thresholds** – configurable approve/escalate/deny
429
+ - **MCMC sampling** – Metropolis-Hastings (simulates HMC concepts)
430
+ - **In-memory storage** – no persistence
 
431
 
432
  ### Enterprise Features (not included)
433
+ {chr(10).join('- ' + f for f in oss_caps['enterprise_features'])}
434
 
435
  [📅 Book a Demo](https://calendly.com/petter2025us/30min) | [📧 Contact Sales](mailto:petter2025us@outlook.com)
436
  """)
437
 
 
 
 
 
 
 
438
  # Wire events
439
  infra_btn.click(
440
+ fn=lambda f, w, s: handle_infra_with_governance(f, w, s),
441
+ inputs=[infra_fault, gr.State(50), infra_state],
442
  outputs=[infra_output, infra_state]
443
  )
444
+
445
  hmc_run_btn.click(
446
+ fn=run_hmc_mcmc,
447
  inputs=[hmc_samples, hmc_warmup],
448
  outputs=[hmc_summary, hmc_trace_plot, hmc_pair_plot]
449
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
450
 
451
  if __name__ == "__main__":
452
  demo.launch(theme="soft")