Spaces:

A-R-F
/

Agentic-Reliability-Framework-API

Running

App Files Files Community

petter2025 committed on Nov 24, 2025

Commit

644fff6

verified ·

1 Parent(s): 00342ac

Update app.py

Browse files

Files changed (1) hide show

app.py +314 -170

app.py CHANGED Viewed

@@ -1,59 +1,141 @@
 import os
 import json
-import random
-import time
-import datetime
 import numpy as np
 import gradio as gr
 import requests
-from sentence_transformers import SentenceTransformer
-import faiss
-# === Config ===
-HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
-if not HF_TOKEN:
-    print("⚠️ No Hugging Face token found. Running in fallback/local mode.")
-else:
-    print("✅ Hugging Face token loaded successfully.")
 HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
 HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
-# === FAISS Setup ===
-VECTOR_DIM = 384
-INDEX_FILE = "incident_vectors.index"
-TEXTS_FILE = "incident_texts.json"
-model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-if os.path.exists(INDEX_FILE):
-    index = faiss.read_index(INDEX_FILE)
-    with open(TEXTS_FILE, "r") as f:
-        incident_texts = json.load(f)
-else:
-    index = faiss.IndexFlatL2(VECTOR_DIM)
     incident_texts = []
 def save_index():
-    faiss.write_index(index, INDEX_FILE)
-    with open(TEXTS_FILE, "w") as f:
-        json.dump(incident_texts, f)
-# === Event Memory ===
-events = []
-def detect_anomaly(event):
-    """Adaptive threshold-based anomaly detection."""
-    latency = event["latency"]
-    error_rate = event["error_rate"]
-    # Remove random forcing for production - use actual thresholds only
-    latency_anomaly = latency > 150
-    error_anomaly = error_rate > 0.05
-    return latency_anomaly or error_anomaly
-def call_huggingface_analysis(prompt):
-    """Use HF Inference API or fallback simulation."""
     if not HF_TOKEN:
         # Enhanced fallback analysis
         fallback_insights = [
@@ -63,10 +145,10 @@ def call_huggingface_analysis(prompt):
             "Intermittent failures indicate potential dependency service degradation",
             "Performance degradation detected - consider scaling compute resources"
         ]
         return random.choice(fallback_insights)
     try:
-        # Enhanced prompt for better analysis
         enhanced_prompt = f"""
         As a senior reliability engineer, analyze this telemetry event and provide a concise root cause analysis:
@@ -91,159 +173,221 @@ def call_huggingface_analysis(prompt):
         if response.status_code == 200:
             result = response.json()
             analysis_text = result.get("choices", [{}])[0].get("text", "").strip()
-            # Clean up any extra formatting from the response
             if analysis_text and len(analysis_text) > 10:
-                return analysis_text.split('\n')[0]  # Take first line if multiple
             return analysis_text
         else:
             return f"API Error {response.status_code}: Service temporarily unavailable"
     except Exception as e:
         return f"Analysis service error: {str(e)}"
-def simulate_healing(event):
-    actions = [
-        "Restarted container",
-        "Scaled up instance",
-        "Cleared queue backlog",
-        "No actionable step detected."
-    ]
-    return random.choice(actions)
-def analyze_event(component, latency, error_rate):
-    # Ensure unique timestamps with higher precision
-    event = {
-        "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3],
-        "component": component,
-        "latency": latency,
-        "error_rate": error_rate
-    }
-    is_anomaly = detect_anomaly(event)
-    event["anomaly"] = is_anomaly
-    event["status"] = "Anomaly" if is_anomaly else "Normal"
-    # Build enhanced textual prompt
     prompt = (
         f"Component: {component}\nLatency: {latency:.2f}ms\nError Rate: {error_rate:.3f}\n"
-        f"Status: {event['status']}\n\n"
         "Provide a one-line reliability insight or root cause analysis."
     )
-    # Analysis
     analysis = call_huggingface_analysis(prompt)
-    event["analysis"] = analysis
-    # Healing simulation
-    healing_action = simulate_healing(event)
-    event["healing_action"] = healing_action
-    # === Vector learning ===
-    vector_text = f"{component} {latency} {error_rate} {analysis}"
-    vec = model.encode([vector_text])
-    index.add(np.array(vec, dtype=np.float32))
-    incident_texts.append(vector_text)
-    save_index()
-    # Find similar incidents
-    if len(incident_texts) > 1:
-        D, I = index.search(vec, k=min(3, len(incident_texts)))
-        similar = [incident_texts[i] for i in I[0] if i < len(incident_texts)]
-        if similar:
-            # Extract meaningful part from similar incident
-            similar_preview = similar[0][:100] + "..." if len(similar[0]) > 100 else similar[0]
-            event["healing_action"] += f" Found {len(similar)} similar incidents (e.g., {similar_preview})."
-    else:
-        event["healing_action"] += " - Not enough incidents stored yet."
-    events.append(event)
-    return json.dumps(event, indent=2)
-# === UI ===
-def submit_event(component, latency, error_rate):
-    result = analyze_event(component, latency, error_rate)
-    parsed = json.loads(result)
-    # Display last 15 events to keep table manageable
-    table = [
-        [e["timestamp"], e["component"], e["latency"], e["error_rate"],
-         e["status"], e["analysis"], e["healing_action"]]
-        for e in events[-15:]
-    ]
-    return (
-        f"✅ Event Processed ({parsed['status']})",
-        gr.Dataframe(
-            headers=["timestamp", "component", "latency", "error_rate", "status", "analysis", "healing_action"],
-            value=table
-        )
-    )
-with gr.Blocks(title="🧠 Agentic Reliability Framework MVP", theme="soft") as demo:
-    gr.Markdown("""
-    # 🧠 Agentic Reliability Framework MVP
-    **Adaptive anomaly detection + AI-driven self-healing + persistent FAISS memory**
-    *Monitor your services in real-time with AI-powered reliability engineering*
-    """)
-    with gr.Row():
-        with gr.Column(scale=1):
-            gr.Markdown("### 📊 Telemetry Input")
-            component = gr.Textbox(
-                label="Component",
-                value="api-service",
-                info="Name of the service being monitored"
-            )
-            latency = gr.Slider(
-                minimum=10,
-                maximum=400,
-                value=100,
-                step=1,
-                label="Latency (ms)",
-                info="Alert threshold: >150ms"
-            )
-            error_rate = gr.Slider(
-                minimum=0,
-                maximum=0.2,
-                value=0.02,
-                step=0.001,
-                label="Error Rate",
-                info="Alert threshold: >0.05"
-            )
-            submit = gr.Button("🚀 Submit Telemetry Event", variant="primary")
-        with gr.Column(scale=2):
-            gr.Markdown("### 🔍 Live Analysis")
-            output_text = gr.Textbox(
-                label="Detection Output",
-                placeholder="Submit an event to see analysis results...",
-                lines=2
-            )
-            gr.Markdown("### 📈 Recent Events")
-            table_output = gr.Dataframe(
-                headers=["timestamp", "component", "latency", "error_rate", "status", "analysis", "healing_action"],
-                label="Event History",
                 wrap=True
             )
-    # Add some explanation
-    with gr.Accordion("ℹ️ How it works", open=False):
         gr.Markdown("""
-        - **Anomaly Detection**: Flags events with latency >150ms or error rate >5%
-        - **AI Analysis**: Uses Mistral-8x7B for root cause analysis via Hugging Face
-        - **Vector Memory**: Stores incidents in FAISS for similarity search
-        - **Self-Healing**: Simulates automated recovery actions based on historical patterns
         """)
-    submit.click(
-        fn=submit_event,
-        inputs=[component, latency, error_rate],
-        outputs=[output_text, table_output]
-    )
 if __name__ == "__main__":
     demo.launch(
-        server_name="0.0.0.0",
         server_port=7860,
         share=False
     )

 import os
 import json
 import numpy as np
 import gradio as gr
 import requests
+import pandas as pd
+import datetime
+from typing import List, Dict, Any
+import hashlib
+# Import our new modules
+from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
+from healing_policies import PolicyEngine
+# === Configuration ===
+HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
 HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
 HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
+# === FAISS & Embeddings Setup ===
# Vector-memory configuration. Defined outside the try block so these names exist
# even when the optional dependencies are missing.
VECTOR_DIM = 384
INDEX_FILE = "incident_vectors.index"
TEXTS_FILE = "incident_texts.json"

# Safe defaults: `index is None` is the signal the rest of the module uses to
# skip vector-memory features.
model = None
index = None
incident_texts = []

try:
    from sentence_transformers import SentenceTransformer
    import faiss

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    # Only reload persisted state when BOTH files are present; an index without its
    # texts cannot be mapped back to incidents, and opening a missing texts file
    # would abort startup with FileNotFoundError.
    if os.path.exists(INDEX_FILE) and os.path.exists(TEXTS_FILE):
        index = faiss.read_index(INDEX_FILE)
        with open(TEXTS_FILE, "r") as f:
            incident_texts = json.load(f)
    else:
        index = faiss.IndexFlatL2(VECTOR_DIM)
        incident_texts = []
except ImportError as e:
    print(f"Warning: FAISS or SentenceTransformers not available: {e}")
    model = None
    index = None
    incident_texts = []
 def save_index():
+    """Save FAISS index and incident texts"""
+    if index is not None:
+        faiss.write_index(index, INDEX_FILE)
+        with open(TEXTS_FILE, "w") as f:
+            json.dump(incident_texts, f)
+# === Core Engine Components ===
+policy_engine = PolicyEngine()
+events_history: List[ReliabilityEvent] = []
class BusinessImpactCalculator:
    """Estimate revenue loss, affected users and a severity label for an anomaly.

    Args:
        revenue_per_request: Revenue attributed to a single request.
        normal_throughput: Baseline throughput the event is compared against.
            Defaults to 1000 (ideally this comes from a historical baseline).
    """

    def __init__(self, revenue_per_request: float = 0.01,
                 normal_throughput: float = 1000):
        self.revenue_per_request = revenue_per_request
        self.normal_throughput = normal_throughput

    def calculate_impact(self, event: "ReliabilityEvent",
                         duration_minutes: int = 5) -> Dict[str, Any]:
        """Calculate the business impact of an anomaly.

        Args:
            event: Object exposing ``throughput`` and ``user_impact``.
            duration_minutes: Assumed duration of the anomaly.

        Returns:
            Dict with ``revenue_loss_estimate``, ``affected_users_estimate``,
            ``severity_level`` and ``throughput_reduction_pct``.
        """
        baseline = self.normal_throughput

        # Lost traffic is the gap between the baseline and what is being served now.
        # Scaling the *current* throughput by the reduction fraction would report
        # zero loss for a total outage (throughput == 0).
        lost_throughput = max(0, baseline - event.throughput)
        # Guard against a zero/negative baseline instead of dividing by zero.
        throughput_reduction = lost_throughput / baseline if baseline > 0 else 0.0

        # NOTE(review): the duration factor is minutes / 60, which only yields a
        # request count if throughput is per hour - confirm the intended unit.
        revenue_loss = (lost_throughput * self.revenue_per_request *
                        (duration_minutes / 60))

        # Prefer the event's own user-impact figure; otherwise estimate it as 10%
        # of the requests handled during the window.
        affected_users = event.user_impact or int(event.throughput * duration_minutes * 0.1)

        # Severity classification: first matching tier wins.
        if revenue_loss > 1000 or affected_users > 10000:
            severity = "CRITICAL"
        elif revenue_loss > 100 or affected_users > 1000:
            severity = "HIGH"
        elif revenue_loss > 10 or affected_users > 100:
            severity = "MEDIUM"
        else:
            severity = "LOW"

        return {
            'revenue_loss_estimate': round(revenue_loss, 2),
            'affected_users_estimate': affected_users,
            'severity_level': severity,
            'throughput_reduction_pct': round(throughput_reduction * 100, 1)
        }
+business_calculator = BusinessImpactCalculator()
class AdvancedAnomalyDetector:
    """Threshold-based anomaly detector whose latency threshold adapts to history.

    Args:
        max_history: Maximum number of events kept in ``historical_data``
            (expected to be positive).
        window: Number of most recent events used to recompute the latency threshold.
        min_samples: The latency threshold is only recomputed once more than this
            many events are stored.
        percentile: Percentile of the recent latencies used as the new threshold.
    """

    def __init__(self, max_history: int = 100, window: int = 20,
                 min_samples: int = 10, percentile: float = 90):
        self.max_history = max_history
        self.window = window
        self.min_samples = min_samples
        self.percentile = percentile
        self.historical_data = []
        self.adaptive_thresholds = {
            'latency_p99': 150,  # Starting value; replaced as history accumulates
            'error_rate': 0.05
        }

    def detect_anomaly(self, event: "ReliabilityEvent") -> bool:
        """Return True when latency, error rate or resource usage exceeds its threshold.

        The event is judged against the thresholds in force *before* it is added
        to the history, then the thresholds are updated.
        """
        latency_anomaly = event.latency_p99 > self.adaptive_thresholds['latency_p99']
        error_anomaly = event.error_rate > self.adaptive_thresholds['error_rate']

        # Missing (None) or zero utilisation never counts as a resource anomaly.
        resource_anomaly = any(
            util and util > 0.9
            for util in (event.cpu_util, event.memory_util)
        )

        self._update_thresholds(event)

        # Coerce to a built-in bool: comparing against a NumPy threshold would
        # otherwise leak numpy.bool_ to callers despite the annotation.
        return bool(latency_anomaly or error_anomaly or resource_anomaly)

    def _update_thresholds(self, event: "ReliabilityEvent"):
        """Record the event and recompute the adaptive latency threshold."""
        self.historical_data.append(event)

        # Keep only the most recent `max_history` events.
        if len(self.historical_data) > self.max_history:
            del self.historical_data[:-self.max_history]

        if len(self.historical_data) > self.min_samples:
            recent_latencies = [e.latency_p99 for e in self.historical_data[-self.window:]]
            # Store a plain float rather than numpy.float64.
            self.adaptive_thresholds['latency_p99'] = float(
                np.percentile(recent_latencies, self.percentile)
            )
+anomaly_detector = AdvancedAnomalyDetector()
+def call_huggingface_analysis(prompt: str) -> str:
+    """Use HF Inference API or fallback simulation"""
     if not HF_TOKEN:
         # Enhanced fallback analysis
         fallback_insights = [
             "Intermittent failures indicate potential dependency service degradation",
             "Performance degradation detected - consider scaling compute resources"
         ]
+        import random
         return random.choice(fallback_insights)
     try:
         enhanced_prompt = f"""
         As a senior reliability engineer, analyze this telemetry event and provide a concise root cause analysis:
         if response.status_code == 200:
             result = response.json()
             analysis_text = result.get("choices", [{}])[0].get("text", "").strip()
             if analysis_text and len(analysis_text) > 10:
+                return analysis_text.split('\n')[0]
             return analysis_text
         else:
             return f"API Error {response.status_code}: Service temporarily unavailable"
     except Exception as e:
         return f"Analysis service error: {str(e)}"
def analyze_event(component: str, latency: float, error_rate: float,
                  throughput: float = 1000, cpu_util: float = None,
                  memory_util: float = None) -> Dict[str, Any]:
    """Analyze one telemetry sample and return a summary dict.

    Builds a ReliabilityEvent, runs anomaly detection, requests a textual
    analysis, evaluates healing policies, estimates business impact (anomalies
    only), stores anomalies in the vector memory when it is available, and
    appends the event to ``events_history``.

    Args:
        component: Name of the monitored component.
        latency: Latency value, stored on the event as ``latency_p99``.
        error_rate: Error rate of the component.
        throughput: Request throughput.
        cpu_util: CPU utilisation, or None when unknown.
        memory_util: Memory utilisation, or None when unknown.

    Returns:
        Dict with the event fields, status, analysis text, healing actions,
        business impact (None when not anomalous) and severity.
    """
    event = ReliabilityEvent(
        component=component,
        latency_p99=latency,
        error_rate=error_rate,
        throughput=throughput,
        cpu_util=cpu_util,
        memory_util=memory_util,
        upstream_deps=["auth-service", "database"] if component == "api-service" else []
    )

    is_anomaly = anomaly_detector.detect_anomaly(event)
    event.severity = EventSeverity.HIGH if is_anomaly else EventSeverity.LOW

    # Only a missing reading is "N/A"; a genuine 0.0 utilisation must be reported
    # as 0.0, which a plain `value or 'N/A'` would hide.
    cpu_text = 'N/A' if cpu_util is None else cpu_util
    memory_text = 'N/A' if memory_util is None else memory_util

    prompt = (
        f"Component: {component}\nLatency: {latency:.2f}ms\nError Rate: {error_rate:.3f}\n"
        f"Throughput: {throughput:.0f}\nCPU: {cpu_text}\nMemory: {memory_text}\n"
        f"Status: {'ANOMALY' if is_anomaly else 'NORMAL'}\n\n"
        "Provide a one-line reliability insight or root cause analysis."
    )

    analysis = call_huggingface_analysis(prompt)

    healing_actions = policy_engine.evaluate_policies(event)

    # Business impact is only estimated for anomalous events.
    business_impact = business_calculator.calculate_impact(event) if is_anomaly else None

    # Vector memory learning: only anomalies are embedded and persisted.
    if index is not None and is_anomaly:
        vector_text = f"{component} {latency} {error_rate} {analysis}"
        vec = model.encode([vector_text])
        index.add(np.array(vec, dtype=np.float32))
        incident_texts.append(vector_text)
        save_index()

    result = {
        "timestamp": event.timestamp,
        "component": component,
        "latency_p99": latency,
        "error_rate": error_rate,
        "throughput": throughput,
        "status": "ANOMALY" if is_anomaly else "NORMAL",
        "analysis": analysis,
        "healing_actions": [action.value for action in healing_actions],
        "business_impact": business_impact,
        "severity": event.severity.value,
        # NOTE(review): this is the total number of stored incidents, not the
        # result of a similarity search - confirm the intended meaning.
        "similar_incidents_count": len(incident_texts) if is_anomaly else 0
    }

    events_history.append(event)
    return result
+# === Gradio UI ===
def _optional_float(value, default=None):
    """Convert a UI value to float; return *default* only when the field is empty."""
    if value is None or value == "":
        return default
    return float(value)


def _preview(text, limit=50):
    """Return *text* cut to *limit* characters, adding "..." only when truncated."""
    if not text:
        return 'N/A'
    return text[:limit] + "..." if len(text) > limit else text


def submit_event(component, latency, error_rate, throughput, cpu_util, memory_util):
    """Handle an event submission from the UI.

    Converts the raw inputs, runs ``analyze_event`` and returns a
    ``(message, gr.Dataframe)`` pair: the message summarises the analysis and
    the dataframe lists the last 15 events. Any exception is reported in the
    message alongside an empty dataframe.
    """
    try:
        latency = float(latency)
        error_rate = float(error_rate)
        # Test for "empty" explicitly: a reported value of 0 is real data. A
        # truthiness check would turn a throughput of 0 into the default 1000
        # and a utilisation of 0 into "unknown".
        throughput = _optional_float(throughput, default=1000)
        cpu_util = _optional_float(cpu_util)
        memory_util = _optional_float(memory_util)

        result = analyze_event(component, latency, error_rate, throughput, cpu_util, memory_util)

        # One table row per event, most recent 15 only.
        table_data = [
            [
                event.timestamp[:19],  # Trim microseconds
                event.component,
                event.latency_p99,
                f"{event.error_rate:.3f}",
                event.throughput,
                event.severity.value.upper(),
                # Events may not carry an `analysis` attribute; show N/A then.
                _preview(getattr(event, 'analysis', None)),
            ]
            for event in events_history[-15:]
        ]

        status_emoji = "🚨" if result["status"] == "ANOMALY" else "✅"
        output_msg = f"{status_emoji} {result['status']} - {result['analysis']}"

        if result["business_impact"]:
            impact = result["business_impact"]
            output_msg += f"\n💰 Business Impact: ${impact['revenue_loss_estimate']} | 👥 {impact['affected_users_estimate']} users"

        if result["healing_actions"]:
            actions = ", ".join(result["healing_actions"])
            output_msg += f"\n🔧 Auto-Actions: {actions}"

        return (
            output_msg,
            gr.Dataframe(
                headers=["Timestamp", "Component", "Latency", "Error Rate", "Throughput", "Severity", "Analysis"],
                value=table_data,
                wrap=True
            )
        )
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing the app.
        return f"❌ Error processing event: {str(e)}", gr.Dataframe(value=[])
def create_ui():
    """Build and return the Gradio Blocks app (telemetry form, results, info panels)."""
    service_choices = ["api-service", "auth-service", "payment-service", "database", "cache-service"]
    table_headers = ["Timestamp", "Component", "Latency", "Error Rate", "Throughput", "Severity", "Analysis"]

    with gr.Blocks(title="🧠 Agentic Reliability Framework v2", theme="soft") as ui:
        gr.Markdown("""
        # 🧠 Agentic Reliability Framework v2
        **Production-Grade Self-Healing AI Systems**
        *Advanced anomaly detection + AI-driven root cause analysis + Business impact quantification*
        """)

        with gr.Row():
            # Left column: telemetry inputs.
            with gr.Column(scale=1):
                gr.Markdown("### 📊 Telemetry Input")
                component = gr.Dropdown(
                    choices=service_choices,
                    value="api-service",
                    label="Component",
                    info="Select the service being monitored"
                )
                latency = gr.Slider(
                    minimum=10,
                    maximum=1000,
                    value=100,
                    step=1,
                    label="Latency P99 (ms)",
                    info="Alert threshold: >150ms (adaptive)"
                )
                error_rate = gr.Slider(
                    minimum=0,
                    maximum=0.5,
                    value=0.02,
                    step=0.001,
                    label="Error Rate",
                    info="Alert threshold: >0.05"
                )
                throughput = gr.Number(
                    value=1000,
                    label="Throughput (req/sec)",
                    info="Current request rate"
                )
                cpu_util = gr.Slider(
                    minimum=0,
                    maximum=1,
                    value=0.4,
                    step=0.01,
                    label="CPU Utilization",
                    info="0.0 - 1.0 scale"
                )
                memory_util = gr.Slider(
                    minimum=0,
                    maximum=1,
                    value=0.3,
                    step=0.01,
                    label="Memory Utilization",
                    info="0.0 - 1.0 scale"
                )
                submit_btn = gr.Button("🚀 Submit Telemetry Event", variant="primary", size="lg")

            # Right column: analysis output and recent-event table.
            with gr.Column(scale=2):
                gr.Markdown("### 🔍 Live Analysis & Healing")
                output_text = gr.Textbox(
                    label="Analysis Results",
                    placeholder="Submit an event to see AI-powered analysis...",
                    lines=4
                )
                gr.Markdown("### 📈 Recent Events (Last 15)")
                events_table = gr.Dataframe(
                    headers=table_headers,
                    label="Event History",
                    wrap=True,
                    max_height="400px"
                )

        # Collapsible information sections.
        with gr.Accordion("ℹ️ Framework Capabilities", open=False):
            gr.Markdown("""
            - **🤖 AI-Powered Analysis**: Mistral-8x7B for intelligent root cause analysis
            - **🔧 Policy-Based Healing**: Automated recovery actions based on severity and context
            - **💰 Business Impact**: Revenue and user impact quantification
            - **🎯 Adaptive Detection**: ML-powered thresholds that learn from your environment
            - **📚 Vector Memory**: FAISS-based incident memory for similarity detection
            - **⚡ Production Ready**: Circuit breakers, cooldowns, and enterprise features
            """)

        with gr.Accordion("🔧 Healing Policies", open=False):
            # One markdown paragraph per enabled policy.
            policy_lines = [
                f"**{policy.name}**: {', '.join([action.value for action in policy.actions])} (Priority: {policy.priority})"
                for policy in policy_engine.policies
                if policy.enabled
            ]
            gr.Markdown("\n\n".join(policy_lines))

        # Wire the submit button to the event handler.
        submit_btn.click(
            fn=submit_event,
            inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
            outputs=[output_text, events_table]
        )

    return ui
 if __name__ == "__main__":
+    demo = create_ui()
     demo.launch(
+        server_name="0.0.0.0",
         server_port=7860,
         share=False
     )