"""Reliability event processing with multi-agent analysis and a UI submit handler."""

import asyncio
import datetime
import hashlib
import json
import os
from typing import Any, Dict, List, Optional

import gradio as gr
import numpy as np
import pandas as pd
import requests

# Import enhanced modules
from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
from healing_policies import PolicyEngine
from agent_orchestrator import OrchestrationManager, AgentSpecialization

# === Configuration ===
HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
# Only send an Authorization header when a token is actually configured.
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

# Throughput assumed when the caller does not supply one.
DEFAULT_THROUGHPUT = 1000
# An event is reported as "ANOMALY" when the analysis confidence exceeds this.
ANOMALY_CONFIDENCE_THRESHOLD = 0.5

# === Initialize Enhanced Components ===
policy_engine = PolicyEngine()
orchestration_manager = OrchestrationManager()
events_history: List[ReliabilityEvent] = []

# [Keep existing FAISS setup and BusinessImpactCalculator...]
# NOTE(review): `business_calculator` is used by EnhancedReliabilityEngine below
# but is not defined in the visible code; the elided setup above must create it.


class EnhancedReliabilityEngine:
    """Processes reliability events through agents, policies and impact scoring."""

    def __init__(self):
        # Running counters maintained by _update_performance_metrics().
        self.performance_metrics = {
            'total_incidents_processed': 0,
            'multi_agent_analyses': 0,
            'average_processing_time': 0.0
        }

    def _update_performance_metrics(self, processing_time: float) -> None:
        """Record one processed event and fold its duration into the average.

        Args:
            processing_time: Seconds spent processing the event.
        """
        metrics = self.performance_metrics
        metrics['total_incidents_processed'] += 1
        # Every processed event goes through orchestrate_analysis(), so the
        # multi-agent counter advances together with the incident counter.
        metrics['multi_agent_analyses'] += 1
        processed = metrics['total_incidents_processed']
        # Incremental mean: avoids keeping every sample around.
        metrics['average_processing_time'] += (
            processing_time - metrics['average_processing_time']
        ) / processed

    async def process_event_enhanced(self, component: str, latency: float,
                                     error_rate: float,
                                     throughput: float = DEFAULT_THROUGHPUT,
                                     cpu_util: Optional[float] = None,
                                     memory_util: Optional[float] = None) -> Dict[str, Any]:
        """Enhanced event processing with multi-agent orchestration.

        Builds a ReliabilityEvent from the inputs, runs the orchestration
        manager, the policy engine and the business impact calculator on it,
        updates the engine's performance metrics, and appends the event to
        the module-level ``events_history``.

        Args:
            component: Name of the component the metrics belong to.
            latency: Value stored as the event's ``latency_p99``.
            error_rate: Value stored as the event's ``error_rate``.
            throughput: Value stored as the event's ``throughput``.
            cpu_util: Optional CPU utilisation for the event.
            memory_util: Optional memory utilisation for the event.

        Returns:
            A dict with the event inputs, a "status" of "ANOMALY" or "NORMAL",
            the raw agent analysis, healing action values, business impact and
            processing metadata.
        """
        loop = asyncio.get_running_loop()
        start_time = loop.time()

        # Create event
        event = ReliabilityEvent(
            component=component,
            latency_p99=latency,
            error_rate=error_rate,
            throughput=throughput,
            cpu_util=cpu_util,
            memory_util=memory_util,
            upstream_deps=["auth-service", "database"] if component == "api-service" else []
        )

        # Multi-agent analysis
        agent_analysis = await orchestration_manager.orchestrate_analysis(event)

        # Policy evaluation
        healing_actions = policy_engine.evaluate_policies(event)

        # Business impact
        business_impact = business_calculator.calculate_impact(event)

        # Update metrics
        processing_time = loop.time() - start_time
        self._update_performance_metrics(processing_time)

        # Looked up once; drives both the status and the reported confidence.
        anomaly_confidence = agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
        is_anomaly = anomaly_confidence > ANOMALY_CONFIDENCE_THRESHOLD

        # Prepare comprehensive result
        result = {
            "timestamp": event.timestamp,
            "component": component,
            "latency_p99": latency,
            "error_rate": error_rate,
            "throughput": throughput,
            "status": "ANOMALY" if is_anomaly else "NORMAL",
            "multi_agent_analysis": agent_analysis,
            "healing_actions": [action.value for action in healing_actions],
            "business_impact": business_impact,
            "processing_metadata": {
                "processing_time_seconds": round(processing_time, 3),
                "agents_used": agent_analysis.get('agent_metadata', {}).get('participating_agents', []),
                "analysis_confidence": anomaly_confidence
            }
        }

        events_history.append(event)
        return result


# Initialize enhanced engine
enhanced_engine = EnhancedReliabilityEngine()

# [Keep existing UI setup, but enhance the submission function...]


def _parse_optional_float(value: Any, default: Optional[float] = None) -> Optional[float]:
    """Convert a form value to float, treating None and blank text as missing.

    Unlike a truthiness test, a genuine reading of 0 is preserved.
    """
    if value is None:
        return default
    if isinstance(value, str) and not value.strip():
        return default
    return float(value)


async def submit_event_enhanced(component, latency, error_rate, throughput, cpu_util, memory_util):
    """Enhanced event submission with async processing.

    Converts the raw inputs to numbers, processes them with the enhanced
    engine and formats a short status message.

    Returns:
        A ``(message, table)`` tuple. On any exception the message describes
        the error and the table is an empty ``gr.Dataframe``.
    """
    try:
        # Convert inputs
        latency = float(latency)
        error_rate = float(error_rate)
        # Only a missing value falls back to the default; 0 is a real reading.
        throughput = _parse_optional_float(throughput, default=DEFAULT_THROUGHPUT)
        cpu_util = _parse_optional_float(cpu_util)
        memory_util = _parse_optional_float(memory_util)

        # Process with enhanced engine
        result = await enhanced_engine.process_event_enhanced(
            component, latency, error_rate, throughput, cpu_util, memory_util
        )

        # [Keep existing table formatting...]
        # NOTE(review): `table_output` is returned below but is not assigned in
        # the visible code; the elided table formatting above must define it.

        # Enhanced output formatting
        status_emoji = "🚨" if result["status"] == "ANOMALY" else "✅"
        output_msg = f"{status_emoji} {result['status']}"

        # Add multi-agent insights
        if "multi_agent_analysis" in result:
            analysis = result["multi_agent_analysis"]
            confidence = analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
            output_msg += f"\n🎯 Confidence: {confidence * 100:.1f}%"

            if analysis.get('recommended_actions'):
                # Show at most the first two recommendations.
                top_actions = analysis['recommended_actions'][:2]
                output_msg += f"\n💡 Insights: {', '.join(top_actions)}"

        # [Keep existing business impact and healing actions display...]

        return (output_msg, table_output)

    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        return f"❌ Error processing event: {str(e)}", gr.Dataframe(value=[])