petter2025 commited on
Commit
1299bba
·
verified ·
1 Parent(s): 9514e94

Create engine.py

Browse files
Files changed (1) hide show
  1. engine.py +199 -0
engine.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enhanced Reliability Engine – main entry point for processing reliability events.
3
+ """
4
+
5
+ import asyncio
6
+ import threading
7
+ import logging
8
+ import datetime
9
+ import json
10
+ import numpy as np
11
+ from typing import Optional, Dict, Any, List
12
+
13
+ from agentic_reliability_framework.core.models.event import ReliabilityEvent, EventSeverity, HealingAction
14
+ from agentic_reliability_framework.core.governance.policies import PolicyEngine # FIXED IMPORT
15
+ from agentic_reliability_framework.runtime.analytics.anomaly import AdvancedAnomalyDetector
16
+ from agentic_reliability_framework.runtime.analytics.predictive import BusinessImpactCalculator
17
+ from agentic_reliability_framework.runtime.orchestration.manager import OrchestrationManager
18
+ from agentic_reliability_framework.runtime.hmc.hmc_learner import HMCRiskLearner
19
+ from agentic_reliability_framework.core.adapters.claude import ClaudeAdapter
20
+ from agentic_reliability_framework.core.config.constants import (
21
+ MAX_EVENTS_STORED, AGENT_TIMEOUT_SECONDS
22
+ )
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ class ThreadSafeEventStore:
28
+ """Simple thread-safe event store for recent events."""
29
+ def __init__(self, max_size: int = MAX_EVENTS_STORED):
30
+ from collections import deque
31
+ self._events = deque(maxlen=max_size)
32
+ self._lock = threading.RLock()
33
+
34
+ def add(self, event: ReliabilityEvent):
35
+ with self._lock:
36
+ self._events.append(event)
37
+
38
+ def get_recent(self, n: int = 15) -> List[ReliabilityEvent]:
39
+ with self._lock:
40
+ return list(self._events)[-n:] if self._events else []
41
+
42
+
43
+ class EnhancedReliabilityEngine:
44
+ def __init__(self, orchestrator: Optional[OrchestrationManager] = None,
45
+ policy_engine: Optional[PolicyEngine] = None,
46
+ event_store: Optional[ThreadSafeEventStore] = None,
47
+ anomaly_detector: Optional[AdvancedAnomalyDetector] = None,
48
+ business_calculator: Optional[BusinessImpactCalculator] = None,
49
+ hmc_learner: Optional[HMCRiskLearner] = None,
50
+ claude_adapter: Optional[ClaudeAdapter] = None):
51
+ self.orchestrator = orchestrator or OrchestrationManager()
52
+ self.policy_engine = policy_engine or PolicyEngine()
53
+ self.event_store = event_store or ThreadSafeEventStore()
54
+ self.anomaly_detector = anomaly_detector or AdvancedAnomalyDetector()
55
+ self.business_calculator = business_calculator or BusinessImpactCalculator()
56
+ self.hmc_learner = hmc_learner or HMCRiskLearner()
57
+ self.claude_adapter = claude_adapter or ClaudeAdapter()
58
+ self.performance_metrics = {
59
+ 'total_incidents_processed': 0,
60
+ 'multi_agent_analyses': 0,
61
+ 'anomalies_detected': 0
62
+ }
63
+ self._lock = threading.RLock()
64
+ logger.info("Initialized EnhancedReliabilityEngine")
65
+
66
+ async def process_event_enhanced(self, component: str, latency: float, error_rate: float,
67
+ throughput: float = 1000, cpu_util: Optional[float] = None,
68
+ memory_util: Optional[float] = None) -> Dict[str, Any]:
69
+ logger.info(f"Processing event for {component}: latency={latency}ms, error_rate={error_rate*100:.1f}%")
70
+ from agentic_reliability_framework.core.models.event import validate_component_id
71
+ is_valid, error_msg = validate_component_id(component)
72
+ if not is_valid:
73
+ return {'error': error_msg, 'status': 'INVALID'}
74
+
75
+ try:
76
+ event = ReliabilityEvent(
77
+ component=component,
78
+ latency_p99=latency,
79
+ error_rate=error_rate,
80
+ throughput=throughput,
81
+ cpu_util=cpu_util,
82
+ memory_util=memory_util
83
+ )
84
+ except Exception as e:
85
+ logger.error(f"Event creation error: {e}")
86
+ return {'error': f'Invalid event data: {str(e)}', 'status': 'INVALID'}
87
+
88
+ # Multi-agent analysis
89
+ agent_analysis = await self.orchestrator.orchestrate_analysis(event)
90
+
91
+ # Anomaly detection
92
+ is_anomaly = self.anomaly_detector.detect_anomaly(event)
93
+
94
+ # Determine severity based on agent confidence
95
+ agent_confidence = agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0.0) if agent_analysis else 0.0
96
+ if is_anomaly and agent_confidence > 0.8:
97
+ severity = EventSeverity.CRITICAL
98
+ elif is_anomaly and agent_confidence > 0.6:
99
+ severity = EventSeverity.HIGH
100
+ elif is_anomaly and agent_confidence > 0.4:
101
+ severity = EventSeverity.MEDIUM
102
+ else:
103
+ severity = EventSeverity.LOW
104
+ event = event.model_copy(update={'severity': severity})
105
+
106
+ # Evaluate healing policies
107
+ healing_actions = self.policy_engine.evaluate_policies(event)
108
+
109
+ # Calculate business impact
110
+ business_impact = self.business_calculator.calculate_impact(event) if is_anomaly else None
111
+
112
+ # HMC analysis (if available)
113
+ hmc_analysis = None
114
+ if self.hmc_learner.is_ready:
115
+ try:
116
+ risk_samples = self.hmc_learner.posterior_predictive(event.component, event.model_dump())
117
+ hmc_analysis = {
118
+ 'mean_risk': float(np.mean(risk_samples)),
119
+ 'std_risk': float(np.std(risk_samples)),
120
+ 'samples': risk_samples.tolist()[:5]
121
+ }
122
+ except Exception as e:
123
+ logger.error(f"HMC analysis error: {e}")
124
+
125
+ # Build result
126
+ result = {
127
+ "timestamp": event.timestamp.isoformat(),
128
+ "component": component,
129
+ "latency_p99": latency,
130
+ "error_rate": error_rate,
131
+ "throughput": throughput,
132
+ "status": "ANOMALY" if is_anomaly else "NORMAL",
133
+ "multi_agent_analysis": agent_analysis,
134
+ "healing_actions": [a.value for a in healing_actions],
135
+ "business_impact": business_impact,
136
+ "severity": event.severity.value,
137
+ "hmc_analysis": hmc_analysis,
138
+ "processing_metadata": {
139
+ "agents_used": agent_analysis.get('agent_metadata', {}).get('participating_agents', []),
140
+ "analysis_confidence": agent_confidence
141
+ }
142
+ }
143
+
144
+ self.event_store.add(event)
145
+ with self._lock:
146
+ self.performance_metrics['total_incidents_processed'] += 1
147
+ self.performance_metrics['multi_agent_analyses'] += 1
148
+ if is_anomaly:
149
+ self.performance_metrics['anomalies_detected'] += 1
150
+
151
+ # Enhance with Claude (optional)
152
+ try:
153
+ result = await self.enhance_with_claude(event, result)
154
+ except Exception as e:
155
+ logger.error(f"Claude enhancement failed: {e}")
156
+
157
+ return result
158
+
159
+ async def enhance_with_claude(self, event: ReliabilityEvent, agent_results: Dict[str, Any]) -> Dict[str, Any]:
160
+ context_parts = []
161
+ context_parts.append("INCIDENT SUMMARY:")
162
+ context_parts.append(f"Component: {event.component}")
163
+ context_parts.append(f"Timestamp: {event.timestamp.isoformat()}")
164
+ context_parts.append(f"Severity: {event.severity.value}")
165
+ context_parts.append("")
166
+ context_parts.append("METRICS:")
167
+ context_parts.append(f"• Latency P99: {event.latency_p99}ms")
168
+ context_parts.append(f"• Error Rate: {event.error_rate:.1%}")
169
+ context_parts.append(f"• Throughput: {event.throughput} req/s")
170
+ if event.cpu_util:
171
+ context_parts.append(f"• CPU: {event.cpu_util:.1%}")
172
+ if event.memory_util:
173
+ context_parts.append(f"• Memory: {event.memory_util:.1%}")
174
+ context_parts.append("")
175
+ if agent_results.get('multi_agent_analysis'):
176
+ context_parts.append("AGENT ANALYSIS:")
177
+ context_parts.append(json.dumps(agent_results['multi_agent_analysis'], indent=2))
178
+ context = "\n".join(context_parts)
179
+
180
+ prompt = f"""{context}
181
+ TASK: Provide an executive summary synthesizing all agent analyses.
182
+ Include:
183
+ 1. Concise incident description
184
+ 2. Most likely root cause
185
+ 3. Single best recovery action
186
+ 4. Estimated impact and recovery time
187
+ Be specific and actionable."""
188
+
189
+ system_prompt = """You are a senior Site Reliability Engineer synthesizing
190
+ multiple AI agent analyses into clear, actionable guidance for incident response.
191
+ Focus on clarity, accuracy, and decisive recommendations."""
192
+
193
+ claude_synthesis = self.claude_adapter.generate_completion(prompt, system_prompt)
194
+ agent_results['claude_synthesis'] = {
195
+ 'summary': claude_synthesis,
196
+ 'timestamp': datetime.datetime.now(datetime.timezone.utc).isoformat(),
197
+ 'source': 'claude-opus-4'
198
+ }
199
+ return agent_results