File size: 5,009 Bytes
ba59239
 
5c55cb5
e94f0ea
ba59239
644fff6
3e50ac5
644fff6
 
775c77d
82009c8
775c77d
644fff6
 
775c77d
1eb0dc5
644fff6
 
e94f0ea
 
 
775c77d
644fff6
775c77d
644fff6
ba59239
775c77d
1eb0dc5
775c77d
644fff6
775c77d
 
 
 
644fff6
 
775c77d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a81efd4
775c77d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e94f0ea
775c77d
 
 
82009c8
775c77d
 
d97b7c8
775c77d
e94f0ea
775c77d
 
644fff6
 
 
 
 
 
 
 
775c77d
 
 
 
644fff6
775c77d
644fff6
775c77d
644fff6
775c77d
644fff6
775c77d
 
 
 
 
 
 
644fff6
775c77d
644fff6
775c77d
644fff6
 
775c77d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
import json
import numpy as np
import gradio as gr
import requests
import pandas as pd
import datetime
from typing import List, Dict, Any
import hashlib
import asyncio

# Import enhanced modules
from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
from healing_policies import PolicyEngine
from agent_orchestrator import OrchestrationManager, AgentSpecialization

# === Configuration ===
# The token is read once at import time; surrounding whitespace is dropped so
# a value pasted with a trailing newline still works.
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"

# Only send an Authorization header when a token is actually configured.
HEADERS = {}
if HF_TOKEN:
    HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}

# === Initialize Enhanced Components ===
# Shared module-level singletons used by the engine below.
policy_engine = PolicyEngine()
orchestration_manager = OrchestrationManager()
# Every processed event is appended here, in processing order.
events_history: List[ReliabilityEvent] = []

# [Keep existing FAISS setup and BusinessImpactCalculator...]

class EnhancedReliabilityEngine:
    """Run reliability events through orchestration, policies and impact scoring.

    The engine keeps running counters in ``performance_metrics``:

    * ``total_incidents_processed`` - number of events fully processed.
    * ``multi_agent_analyses`` - number of orchestrated analyses completed.
    * ``average_processing_time`` - mean processing time in seconds.
    """

    def __init__(self):
        self.performance_metrics = {
            'total_incidents_processed': 0,
            'multi_agent_analyses': 0,
            'average_processing_time': 0.0
        }

    def _update_performance_metrics(self, processing_time: float) -> None:
        """Fold the duration of one processed event into the running metrics.

        Args:
            processing_time: Seconds spent processing the event.
        """
        metrics = self.performance_metrics
        processed = metrics['total_incidents_processed'] + 1
        metrics['total_incidents_processed'] = processed
        # Every processed event runs exactly one orchestrated analysis, so the
        # two counters advance together.
        metrics['multi_agent_analyses'] += 1
        # Incremental mean: avoids storing every sample.
        previous_average = metrics['average_processing_time']
        metrics['average_processing_time'] = (
            previous_average + (processing_time - previous_average) / processed
        )

    async def process_event_enhanced(self, component: str, latency: float, error_rate: float,
                                   throughput: float = 1000, cpu_util: float = None,
                                   memory_util: float = None) -> Dict[str, Any]:
        """Enhanced event processing with multi-agent orchestration.

        Builds a ``ReliabilityEvent`` from the arguments, awaits the
        orchestration manager's analysis, evaluates healing policies,
        computes business impact, updates ``performance_metrics`` and appends
        the event to the module-level ``events_history``.

        Args:
            component: Name of the component the metrics belong to.
            latency: Value stored as the event's ``latency_p99``.
            error_rate: Error rate of the component.
            throughput: Throughput of the component (defaults to 1000).
            cpu_util: Optional CPU utilisation; ``None`` when unknown.
            memory_util: Optional memory utilisation; ``None`` when unknown.

        Returns:
            A dict with the event's inputs and timestamp, a ``status`` of
            ``"ANOMALY"`` (analysis confidence above 0.5) or ``"NORMAL"``, the
            raw ``multi_agent_analysis``, the ``healing_actions`` values, the
            ``business_impact`` and a ``processing_metadata`` sub-dict.
        """
        # Inside a coroutine the running loop is always available; its
        # monotonic clock is used to time the processing.
        loop = asyncio.get_running_loop()
        start_time = loop.time()

        # Create event
        event = ReliabilityEvent(
            component=component,
            latency_p99=latency,
            error_rate=error_rate,
            throughput=throughput,
            cpu_util=cpu_util,
            memory_util=memory_util,
            # Only "api-service" has hard-coded upstream dependencies.
            upstream_deps=["auth-service", "database"] if component == "api-service" else []
        )

        # Multi-agent analysis
        agent_analysis = await orchestration_manager.orchestrate_analysis(event)

        # Policy evaluation
        healing_actions = policy_engine.evaluate_policies(event)

        # Business impact
        # NOTE(review): `business_calculator` is not defined in the visible part
        # of this file; presumably it comes from the elided
        # BusinessImpactCalculator setup - confirm it exists at module level.
        business_impact = business_calculator.calculate_impact(event)

        # Update metrics
        processing_time = loop.time() - start_time
        self._update_performance_metrics(processing_time)

        # Read the confidence once; it drives both the status and the metadata.
        anomaly_confidence = agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
        participating_agents = agent_analysis.get('agent_metadata', {}).get('participating_agents', [])

        # Prepare comprehensive result
        result = {
            "timestamp": event.timestamp,
            "component": component,
            "latency_p99": latency,
            "error_rate": error_rate,
            "throughput": throughput,
            "status": "ANOMALY" if anomaly_confidence > 0.5 else "NORMAL",
            "multi_agent_analysis": agent_analysis,
            "healing_actions": [action.value for action in healing_actions],
            "business_impact": business_impact,
            "processing_metadata": {
                "processing_time_seconds": round(processing_time, 3),
                "agents_used": participating_agents,
                "analysis_confidence": anomaly_confidence
            }
        }

        events_history.append(event)
        return result

# Module-level engine instance; submit_event_enhanced sends its events through it.
enhanced_engine = EnhancedReliabilityEngine()

# [Keep existing UI setup, but enhance the submission function...]

def _parse_optional_float(raw, default=None):
    """Convert a UI input to float, treating only None or blank text as missing."""
    if raw is None:
        return default
    if isinstance(raw, str) and not raw.strip():
        return default
    return float(raw)


async def submit_event_enhanced(component, latency, error_rate, throughput, cpu_util, memory_util):
    """Enhanced event submission with async processing.

    Converts the raw UI inputs to floats, runs them through
    ``enhanced_engine.process_event_enhanced`` and formats the outcome.

    Args:
        component: Name of the component being reported.
        latency: Latency value; must be convertible to float.
        error_rate: Error rate; must be convertible to float.
        throughput: Optional throughput; 1000 is used when it is missing.
        cpu_util: Optional CPU utilisation; ``None`` when missing.
        memory_util: Optional memory utilisation; ``None`` when missing.

    Returns:
        A ``(message, table)`` tuple. On any failure the message describes
        the error and the table is an empty ``gr.Dataframe``.
    """
    try:
        # Convert inputs. A literal 0 is a real measurement, so only None or
        # blank text falls back to the default.
        latency = float(latency)
        error_rate = float(error_rate)
        throughput = _parse_optional_float(throughput, 1000)
        cpu_util = _parse_optional_float(cpu_util)
        memory_util = _parse_optional_float(memory_util)

        # Process with enhanced engine
        result = await enhanced_engine.process_event_enhanced(
            component, latency, error_rate, throughput, cpu_util, memory_util
        )

        # NOTE(review): the original table formatting is elided in this file,
        # which left `table_output` undefined and made every submission fail
        # with a NameError. This one-row table built from the result is a
        # stand-in - confirm the columns against the UI's Dataframe.
        table_output = gr.Dataframe(
            headers=["Timestamp", "Component", "Latency", "Error Rate", "Throughput", "Status"],
            value=[[
                str(result["timestamp"]),
                result["component"],
                result["latency_p99"],
                result["error_rate"],
                result["throughput"],
                result["status"],
            ]],
        )

        # Enhanced output formatting
        status_emoji = "🚨" if result["status"] == "ANOMALY" else "✅"
        output_msg = f"{status_emoji} {result['status']}"

        # Add multi-agent insights
        if "multi_agent_analysis" in result:
            analysis = result["multi_agent_analysis"]
            output_msg += f"\n🎯 Confidence: {analysis.get('incident_summary', {}).get('anomaly_confidence', 0)*100:.1f}%"

            # Show at most the first two recommended actions.
            if analysis.get('recommended_actions'):
                output_msg += f"\n💡 Insights: {', '.join(analysis['recommended_actions'][:2])}"

        # [Keep existing business impact and healing actions display...]

        return (output_msg, table_output)

    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        return f"❌ Error processing event: {str(e)}", gr.Dataframe(value=[])