| import os |
| import json |
| import numpy as np |
| import gradio as gr |
| import requests |
| import pandas as pd |
| import datetime |
| from typing import List, Dict, Any |
| import hashlib |
| import asyncio |
|
|
| |
| from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction |
| from healing_policies import PolicyEngine |
| from agent_orchestrator import OrchestrationManager, AgentSpecialization |
|
|
| |
# --- Hugging Face Inference API configuration ---
# Token is optional: when HF_TOKEN is unset/empty, HEADERS is left empty so
# requests fall back to unauthenticated access instead of sending "Bearer ".
HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}


# --- Module-level singletons shared by the UI handlers below ---
policy_engine = PolicyEngine()                    # maps events to healing actions
orchestration_manager = OrchestrationManager()    # coordinates multi-agent analysis
events_history: List[ReliabilityEvent] = []       # in-memory log of processed events (unbounded)
|
|
| |
|
|
class EnhancedReliabilityEngine:
    """Processes reliability events through multi-agent analysis, healing
    policies, and business-impact estimation, tracking aggregate timing
    metrics across calls."""

    def __init__(self):
        # Running counters; average_processing_time is an incremental mean
        # maintained by _update_performance_metrics.
        self.performance_metrics = {
            'total_incidents_processed': 0,
            'multi_agent_analyses': 0,
            'average_processing_time': 0.0
        }

    async def process_event_enhanced(self, component: str, latency: float, error_rate: float,
                                     throughput: float = 1000, cpu_util: Optional[float] = None,
                                     memory_util: Optional[float] = None) -> Dict[str, Any]:
        """Enhanced event processing with multi-agent orchestration.

        Args:
            component: Name of the service/component the event belongs to.
            latency: Observed p99 latency for the component.
            error_rate: Observed error rate.
            throughput: Requests/sec; defaults to 1000 when unknown.
            cpu_util: Optional CPU utilisation sample.
            memory_util: Optional memory utilisation sample.

        Returns:
            Dict with the raw metrics, ANOMALY/NORMAL status (anomaly
            confidence > 0.5 from the agent analysis), healing actions,
            business impact, and processing metadata. Also appends the
            event to the module-level events_history.
        """
        # get_running_loop() is the supported call inside a coroutine
        # (get_event_loop() here is deprecated since Python 3.10).
        start_time = asyncio.get_running_loop().time()

        event = ReliabilityEvent(
            component=component,
            latency_p99=latency,
            error_rate=error_rate,
            throughput=throughput,
            cpu_util=cpu_util,
            memory_util=memory_util,
            # Only the api-service has modelled upstream dependencies here.
            upstream_deps=["auth-service", "database"] if component == "api-service" else []
        )

        # Fan the event out to the specialist agents.
        agent_analysis = await orchestration_manager.orchestrate_analysis(event)

        # Rule-based remediation suggestions.
        healing_actions = policy_engine.evaluate_policies(event)

        # NOTE(review): business_calculator is not defined or imported in this
        # part of the file — it must be provided elsewhere in the module, or
        # this call raises NameError at runtime. Verify it exists.
        business_impact = business_calculator.calculate_impact(event)

        processing_time = asyncio.get_running_loop().time() - start_time
        self._update_performance_metrics(processing_time)

        result = {
            "timestamp": event.timestamp,
            "component": component,
            "latency_p99": latency,
            "error_rate": error_rate,
            "throughput": throughput,
            "status": "ANOMALY" if agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0) > 0.5 else "NORMAL",
            "multi_agent_analysis": agent_analysis,
            "healing_actions": [action.value for action in healing_actions],
            "business_impact": business_impact,
            "processing_metadata": {
                "processing_time_seconds": round(processing_time, 3),
                "agents_used": agent_analysis.get('agent_metadata', {}).get('participating_agents', []),
                "analysis_confidence": agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
            }
        }

        events_history.append(event)
        return result

    def _update_performance_metrics(self, processing_time: float) -> None:
        """Update aggregate counters and the incremental mean processing time.

        This method was called but missing in the original class, causing an
        AttributeError on every processed event.
        """
        metrics = self.performance_metrics
        metrics['total_incidents_processed'] += 1
        # Every event currently goes through the multi-agent path.
        metrics['multi_agent_analyses'] += 1
        n = metrics['total_incidents_processed']
        # Welford-style incremental mean: avg += (x - avg) / n
        metrics['average_processing_time'] += (
            (processing_time - metrics['average_processing_time']) / n
        )
|
|
| |
| enhanced_engine = EnhancedReliabilityEngine() |
|
|
| |
|
|
async def submit_event_enhanced(component, latency, error_rate, throughput, cpu_util, memory_util):
    """Gradio handler: parse raw form inputs, run the enhanced engine, and
    format a status message plus a one-row metrics table.

    Returns:
        (status_message: str, table: pd.DataFrame | gr.Dataframe) — on error
        the message starts with "❌" and the table is an empty gr.Dataframe.
    """
    try:
        # Explicit None/"" checks so a legitimate 0 value is not silently
        # replaced (the original truthiness tests treated 0 as "missing").
        latency = float(latency)
        error_rate = float(error_rate)
        throughput = float(throughput) if throughput not in (None, "") else 1000.0
        cpu_util = float(cpu_util) if cpu_util not in (None, "") else None
        memory_util = float(memory_util) if memory_util not in (None, "") else None

        result = await enhanced_engine.process_event_enhanced(
            component, latency, error_rate, throughput, cpu_util, memory_util
        )

        status_emoji = "🚨" if result["status"] == "ANOMALY" else "✅"
        output_msg = f"{status_emoji} {result['status']}"

        if "multi_agent_analysis" in result:
            analysis = result["multi_agent_analysis"]
            output_msg += f"\n🎯 Confidence: {analysis.get('incident_summary', {}).get('anomaly_confidence', 0)*100:.1f}%"

            if analysis.get('recommended_actions'):
                output_msg += f"\n💡 Insights: {', '.join(analysis['recommended_actions'][:2])}"

        # BUG FIX: table_output was referenced here without ever being
        # assigned, so every successful call raised NameError (masked by the
        # except below). Build the table from the processing result instead.
        table_output = pd.DataFrame([{
            "timestamp": result["timestamp"],
            "component": result["component"],
            "latency_p99": result["latency_p99"],
            "error_rate": result["error_rate"],
            "throughput": result["throughput"],
            "status": result["status"],
        }])

        return (output_msg, table_output)

    except Exception as e:
        # Top-level UI boundary: surface the failure to the user rather
        # than crashing the Gradio callback.
        return f"❌ Error processing event: {str(e)}", gr.Dataframe(value=[])