Spaces:

A-R-F
/

Agentic-Reliability-Framework-v4

Running

App Files Community

petter2025 committed on Apr 1

Commit

8c057f3

verified ·

1 Parent(s): 3854e5f

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -20

app.py CHANGED Viewed

@@ -3,7 +3,6 @@ import json
 import logging
 import logging.handlers
 import numpy as np
-import pandas as pd
 from datetime import datetime
 from typing import Dict, Any, List, Optional, Tuple
 import threading
@@ -14,6 +13,7 @@ import contextlib
 import signal
 import sys
 import functools
 from scipy.stats import beta
 import plotly.graph_objects as go
@@ -69,6 +69,7 @@ console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(level
 logger.addHandler(file_handler)
 logger.addHandler(console_handler)
 # ----------------------------------------------------------------------
 # SQLite persistence with secure permissions
@@ -89,9 +90,12 @@ def init_db():
             )
         ''')
         conn.commit()
-    # Restrict permissions (owner read/write only)
-    os.chmod(DB_PATH, 0o600)
-    logger.info(f"Database initialized at {DB_PATH} with secure permissions")
 def save_decision_to_db(decision: dict, risk: float):
     """Insert a decision into the database."""
@@ -181,8 +185,9 @@ def refresh_history_from_db():
         for ts, dec, risk in decisions:
             decision_history.append((ts, dec, risk))
             risk_history.append((ts, risk))
-            if PROMETHEUS_AVAILABLE:
-                prom_risk_gauge.set(risk)  # update gauge with latest risk
 # ----------------------------------------------------------------------
 # Memory monitoring (daemon thread with graceful stop)
@@ -219,19 +224,42 @@ def memory_monitor_loop():
                 logger.info("Process memory: unknown")
         except Exception as e:
             logger.error(f"Memory logging error: {e}")
-        time.sleep(60)
 # ----------------------------------------------------------------------
-# Bayesian Risk Engine (Beta‑Binomial)
 # ----------------------------------------------------------------------
 class BayesianRiskEngine:
-    def __init__(self, alpha=ALPHA_PRIOR, beta=BETA_PRIOR):
         self.alpha = alpha
         self.beta = beta
     def update(self, failures, successes):
-        self.alpha += failures
-        self.beta += successes
     def risk(self):
         return self.alpha / (self.alpha + self.beta)
@@ -259,7 +287,7 @@ class PolicyEngine:
             return "escalate", f"Risk in escalation zone ({self.thresholds['low']}-{self.thresholds['high']})"
 # ----------------------------------------------------------------------
-# Infrastructure analysis (synchronous, with validation)
 # ----------------------------------------------------------------------
 def handle_infra_with_governance(fault_type: str, context_window: int, session_state: dict):
     start_time = time.time()
@@ -278,7 +306,9 @@ def handle_infra_with_governance(fault_type: str, context_window: int, session_s
         }
         failures, successes = fault_map.get(fault_type, (1, 99))
-        risk_engine = BayesianRiskEngine()
         risk_engine.update(failures, successes)
         risk = risk_engine.risk()
         ci_low, ci_high = risk_engine.risk_interval(0.95)
@@ -296,7 +326,8 @@ def handle_infra_with_governance(fault_type: str, context_window: int, session_s
             "posterior_parameters": {
                 "alpha": risk_engine.alpha,
                 "beta": risk_engine.beta
-            }
         }
         output = {
             **analysis_result,
@@ -319,12 +350,14 @@ def handle_infra_with_governance(fault_type: str, context_window: int, session_s
 def autonomous_control_decision(risk, risk_engine, policy_engine):
     action, reason = policy_engine.evaluate(risk)
     decision = {
         "timestamp": datetime.utcnow().isoformat(),
         "approved": action == "approve",
         "actions": ["escalate_human"] if action == "escalate" else [],
         "reason": reason,
-        "risk_level": "low" if risk < 0.2 else "medium" if risk < 0.8 else "high"
     }
     update_dashboard_data(decision, risk)
     return decision
@@ -602,6 +635,15 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo") as demo:
                         value="none",
                         label="Inject Fault"
                     )
                     infra_btn = gr.Button("Evaluate Intent")
                 with gr.Column():
                     infra_output = gr.JSON(label="Analysis Result")
@@ -674,14 +716,14 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo") as demo:
             <div style="text-align: center; margin-top: 2rem;">
                 <a href="https://calendly.com/petter2025us/30min" target="_blank" style="background: #764ba2; color: white; padding: 12px 24px; text-decoration: none; border-radius: 8px; font-weight: bold; margin-right: 1rem;">📅 Book a Demo</a>
-                <a href="mailto:petter2025us@outlook.com" style="background: #667eea; color: white; padding: 12px 24px; text-decoration: none; border-radius: 8px; font-weight: bold;">📧 Contact Sales</a>
             </div>
             """)
     # Wire events
     infra_btn.click(
         fn=handle_infra_with_governance,
-        inputs=[infra_fault, gr.State(50), infra_state],
         outputs=[infra_output, infra_state]
     )
@@ -695,9 +737,7 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo") as demo:
 # Launch
 # ----------------------------------------------------------------------
 if __name__ == "__main__":
-    # Enable queue and add metrics route if available
     demo.queue()
-    if PROMETHEUS_AVAILABLE:
-        # Access the underlying FastAPI app after queueing
         demo.app.add_api_route("/metrics", lambda: (generate_latest(), 200, {"Content-Type": CONTENT_TYPE_LATEST}), methods=["GET"])
     demo.launch(theme="soft", server_name="0.0.0.0", server_port=7860)

 import logging
 import logging.handlers
 import numpy as np
 from datetime import datetime
 from typing import Dict, Any, List, Optional, Tuple
 import threading
 import signal
 import sys
 import functools
+from collections import deque
 from scipy.stats import beta
 import plotly.graph_objects as go
 logger.addHandler(file_handler)
 logger.addHandler(console_handler)
+logger.propagate = False  # Prevent duplicate logs
 # ----------------------------------------------------------------------
 # SQLite persistence with secure permissions
             )
         ''')
         conn.commit()
+    # Restrict permissions (owner read/write only) – best effort
+    try:
+        os.chmod(DB_PATH, 0o600)
+    except Exception as e:
+        logger.warning(f"Could not set secure permissions on DB: {e}")
+    logger.info(f"Database initialized at {DB_PATH}")
 def save_decision_to_db(decision: dict, risk: float):
     """Insert a decision into the database."""
         for ts, dec, risk in decisions:
             decision_history.append((ts, dec, risk))
             risk_history.append((ts, risk))
+    # After loading, set the Prometheus gauge to the latest risk
+    if PROMETHEUS_AVAILABLE and risk_history:
+        prom_risk_gauge.set(risk_history[-1][1])
 # ----------------------------------------------------------------------
 # Memory monitoring (daemon thread with graceful stop)
                 logger.info("Process memory: unknown")
         except Exception as e:
             logger.error(f"Memory logging error: {e}")
+        # Sleep in small intervals to react quickly to shutdown
+        for _ in range(60):
+            if shutdown_event.is_set():
+                break
+            time.sleep(1)
 # ----------------------------------------------------------------------
+# Bayesian Risk Engine (Beta‑Binomial) with sliding window
 # ----------------------------------------------------------------------
 class BayesianRiskEngine:
+    def __init__(self, alpha=ALPHA_PRIOR, beta=BETA_PRIOR, maxlen=None):
         self.alpha = alpha
         self.beta = beta
+        self.maxlen = maxlen
+        self.events = deque(maxlen=maxlen)  # store (failures, successes)
+        self.total_failures = 0
+        self.total_successes = 0
     def update(self, failures, successes):
+        # Add new event
+        self.events.append((failures, successes))
+        self.total_failures += failures
+        self.total_successes += successes
+        # A bounded deque silently evicts its oldest entry once it is full, so the
+        # running totals must be corrected to exclude any evicted event.
+        if self.maxlen is not None and len(self.events) == self.maxlen:
+            # deque.append() does not return the entry it evicts, so the evicted counts
+            # cannot simply be subtracted; recompute both totals from the events that
+            # remain in the window instead.
+            self.total_failures = sum(f for f, _ in self.events)
+            self.total_successes = sum(s for _, s in self.events)
+        # Set alpha,beta = prior + totals
+        self.alpha = ALPHA_PRIOR + self.total_failures
+        self.beta = BETA_PRIOR + self.total_successes
     def risk(self):
         return self.alpha / (self.alpha + self.beta)
             return "escalate", f"Risk in escalation zone ({self.thresholds['low']}-{self.thresholds['high']})"
 # ----------------------------------------------------------------------
+# Infrastructure analysis (synchronous, with validation and sliding window)
 # ----------------------------------------------------------------------
 def handle_infra_with_governance(fault_type: str, context_window: int, session_state: dict):
     start_time = time.time()
         }
         failures, successes = fault_map.get(fault_type, (1, 99))
+        # Use context_window: if >0, limit to last N events; else unlimited
+        maxlen = context_window if context_window > 0 else None
+        risk_engine = BayesianRiskEngine(maxlen=maxlen)
         risk_engine.update(failures, successes)
         risk = risk_engine.risk()
         ci_low, ci_high = risk_engine.risk_interval(0.95)
             "posterior_parameters": {
                 "alpha": risk_engine.alpha,
                 "beta": risk_engine.beta
+            },
+            "context_window": context_window
         }
         output = {
             **analysis_result,
 def autonomous_control_decision(risk, risk_engine, policy_engine):
     action, reason = policy_engine.evaluate(risk)
+    # Use configurable thresholds for risk level
+    risk_level = "low" if risk < LOW_THRESHOLD else "medium" if risk < HIGH_THRESHOLD else "high"
     decision = {
         "timestamp": datetime.utcnow().isoformat(),
         "approved": action == "approve",
         "actions": ["escalate_human"] if action == "escalate" else [],
         "reason": reason,
+        "risk_level": risk_level
     }
     update_dashboard_data(decision, risk)
     return decision
                         value="none",
                         label="Inject Fault"
                     )
+                    # Use a Number component to allow user to set context window
+                    context_window_input = gr.Number(
+                        value=50,
+                        label="Context Window (number of recent events)",
+                        minimum=1,
+                        maximum=1000,
+                        step=1,
+                        info="How many past incidents to consider for risk calculation (0 = unlimited)"
+                    )
                     infra_btn = gr.Button("Evaluate Intent")
                 with gr.Column():
                     infra_output = gr.JSON(label="Analysis Result")
             <div style="text-align: center; margin-top: 2rem;">
                 <a href="https://calendly.com/petter2025us/30min" target="_blank" style="background: #764ba2; color: white; padding: 12px 24px; text-decoration: none; border-radius: 8px; font-weight: bold; margin-right: 1rem;">📅 Book a Demo</a>
+                <a href="mailto:petter2025us@outlook.com" style="background: #667eea; color: white; padding: 12px 24px; text-decoration: none; border-radius: 8px; font-weight: bold;">📧 Email me</a>
             </div>
             """)
     # Wire events
     infra_btn.click(
         fn=handle_infra_with_governance,
+        inputs=[infra_fault, context_window_input, infra_state],
         outputs=[infra_output, infra_state]
     )
 # Launch
 # ----------------------------------------------------------------------
 if __name__ == "__main__":
     demo.queue()
+    if PROMETHEUS_AVAILABLE and hasattr(demo, 'app') and demo.app:
         demo.app.add_api_route("/metrics", lambda: (generate_latest(), 200, {"Content-Type": CONTENT_TYPE_LATEST}), methods=["GET"])
     demo.launch(theme="soft", server_name="0.0.0.0", server_port=7860)