petter2025 commited on
Commit
f4dfe34
·
verified ·
1 Parent(s): f4f95a3

Delete engine.py

Browse files
Files changed (1) hide show
  1. engine.py +0 -222
engine.py DELETED
@@ -1,222 +0,0 @@
1
- """
2
- Enhanced Reliability Engine – main entry point for processing reliability events.
3
- """
4
-
5
- import asyncio
6
- import threading
7
- import logging
8
- import datetime
9
- import json
10
- import numpy as np
11
- from typing import Optional, Dict, Any, List
12
-
13
- from agentic_reliability_framework.core.models.event import ReliabilityEvent, EventSeverity, HealingAction
14
- from policy_engine import PolicyEngine # local patched version (see Dockerfile)
15
- from agentic_reliability_framework.runtime.analytics.anomaly import AdvancedAnomalyDetector
16
- from agentic_reliability_framework.runtime.analytics.predictive import BusinessImpactCalculator
17
- from agentic_reliability_framework.runtime.orchestration.manager import OrchestrationManager
18
- from agentic_reliability_framework.runtime.hmc.hmc_learner import HMCRiskLearner
19
- from agentic_reliability_framework.core.adapters.claude import ClaudeAdapter
20
- from agentic_reliability_framework.core.config.constants import (
21
- MAX_EVENTS_STORED, AGENT_TIMEOUT_SECONDS
22
- )
23
-
24
- logger = logging.getLogger(__name__)
25
-
26
-
27
- class ThreadSafeEventStore:
28
- """Simple thread-safe event store for recent events."""
29
- def __init__(self, max_size: int = MAX_EVENTS_STORED):
30
- from collections import deque
31
- self._events = deque(maxlen=max_size)
32
- self._lock = threading.RLock()
33
-
34
- def add(self, event: ReliabilityEvent):
35
- with self._lock:
36
- self._events.append(event)
37
-
38
- def get_recent(self, n: int = 15) -> List[ReliabilityEvent]:
39
- with self._lock:
40
- return list(self._events)[-n:] if self._events else []
41
-
42
-
43
- class EnhancedReliabilityEngine:
44
- """
45
- Main engine for processing infrastructure events.
46
- Orchestrates agents, policy evaluation, risk scoring, and optional Claude enhancement.
47
- """
48
-
49
- def __init__(self, orchestrator: Optional[OrchestrationManager] = None,
50
- policy_engine: Optional[PolicyEngine] = None,
51
- event_store: Optional[ThreadSafeEventStore] = None,
52
- anomaly_detector: Optional[AdvancedAnomalyDetector] = None,
53
- business_calculator: Optional[BusinessImpactCalculator] = None,
54
- hmc_learner: Optional[HMCRiskLearner] = None,
55
- claude_adapter: Optional[ClaudeAdapter] = None):
56
- self.orchestrator = orchestrator or OrchestrationManager()
57
- self.policy_engine = policy_engine or PolicyEngine()
58
- self.event_store = event_store or ThreadSafeEventStore()
59
- self.anomaly_detector = anomaly_detector or AdvancedAnomalyDetector()
60
- self.business_calculator = business_calculator or BusinessImpactCalculator()
61
- self.hmc_learner = hmc_learner or HMCRiskLearner()
62
- self.claude_adapter = claude_adapter or ClaudeAdapter()
63
- self.performance_metrics = {
64
- 'total_incidents_processed': 0,
65
- 'multi_agent_analyses': 0,
66
- 'anomalies_detected': 0
67
- }
68
- self._lock = threading.RLock()
69
- logger.info("Initialized EnhancedReliabilityEngine")
70
-
71
- async def process_event_enhanced(self, component: str, latency: float, error_rate: float,
72
- throughput: float = 1000, cpu_util: Optional[float] = None,
73
- memory_util: Optional[float] = None) -> Dict[str, Any]:
74
- """
75
- Process a single telemetry event and return analysis results.
76
-
77
- Args:
78
- component: Name of the component (e.g., "api-service").
79
- latency: P99 latency in milliseconds.
80
- error_rate: Error rate between 0 and 1.
81
- throughput: Requests per second.
82
- cpu_util: CPU utilization (0-1), optional.
83
- memory_util: Memory utilization (0-1), optional.
84
-
85
- Returns:
86
- Dictionary containing analysis results.
87
- """
88
- logger.info(f"Processing event for {component}: latency={latency}ms, error_rate={error_rate*100:.1f}%")
89
- from agentic_reliability_framework.core.models.event import validate_component_id
90
- is_valid, error_msg = validate_component_id(component)
91
- if not is_valid:
92
- return {'error': error_msg, 'status': 'INVALID'}
93
-
94
- try:
95
- event = ReliabilityEvent(
96
- component=component,
97
- latency_p99=latency,
98
- error_rate=error_rate,
99
- throughput=throughput,
100
- cpu_util=cpu_util,
101
- memory_util=memory_util
102
- )
103
- except Exception as e:
104
- logger.error(f"Event creation error: {e}")
105
- return {'error': f'Invalid event data: {str(e)}', 'status': 'INVALID'}
106
-
107
- # Multi-agent analysis
108
- agent_analysis = await self.orchestrator.orchestrate_analysis(event)
109
-
110
- # Anomaly detection
111
- is_anomaly = self.anomaly_detector.detect_anomaly(event)
112
-
113
- # Determine severity based on agent confidence
114
- agent_confidence = agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0.0) if agent_analysis else 0.0
115
- if is_anomaly and agent_confidence > 0.8:
116
- severity = EventSeverity.CRITICAL
117
- elif is_anomaly and agent_confidence > 0.6:
118
- severity = EventSeverity.HIGH
119
- elif is_anomaly and agent_confidence > 0.4:
120
- severity = EventSeverity.MEDIUM
121
- else:
122
- severity = EventSeverity.LOW
123
- event = event.model_copy(update={'severity': severity})
124
-
125
- # Evaluate healing policies
126
- healing_actions = self.policy_engine.evaluate_policies(event)
127
-
128
- # Calculate business impact
129
- business_impact = self.business_calculator.calculate_impact(event) if is_anomaly else None
130
-
131
- # HMC analysis (if available)
132
- hmc_analysis = None
133
- if self.hmc_learner.is_ready:
134
- try:
135
- risk_samples = self.hmc_learner.posterior_predictive(event.component, event.model_dump())
136
- hmc_analysis = {
137
- 'mean_risk': float(np.mean(risk_samples)),
138
- 'std_risk': float(np.std(risk_samples)),
139
- 'samples': risk_samples.tolist()[:5]
140
- }
141
- except Exception as e:
142
- logger.error(f"HMC analysis error: {e}")
143
-
144
- # Build result
145
- result = {
146
- "timestamp": event.timestamp.isoformat(),
147
- "component": component,
148
- "latency_p99": latency,
149
- "error_rate": error_rate,
150
- "throughput": throughput,
151
- "status": "ANOMALY" if is_anomaly else "NORMAL",
152
- "multi_agent_analysis": agent_analysis,
153
- "healing_actions": [a.value for a in healing_actions],
154
- "business_impact": business_impact,
155
- "severity": event.severity.value,
156
- "hmc_analysis": hmc_analysis,
157
- "processing_metadata": {
158
- "agents_used": agent_analysis.get('agent_metadata', {}).get('participating_agents', []),
159
- "analysis_confidence": agent_confidence
160
- }
161
- }
162
-
163
- self.event_store.add(event)
164
- with self._lock:
165
- self.performance_metrics['total_incidents_processed'] += 1
166
- self.performance_metrics['multi_agent_analyses'] += 1
167
- if is_anomaly:
168
- self.performance_metrics['anomalies_detected'] += 1
169
-
170
- # Enhance with Claude (optional)
171
- try:
172
- result = await self.enhance_with_claude(event, result)
173
- except Exception as e:
174
- logger.error(f"Claude enhancement failed: {e}")
175
-
176
- return result
177
-
178
- async def enhance_with_claude(self, event: ReliabilityEvent, agent_results: Dict[str, Any]) -> Dict[str, Any]:
179
- """
180
- Enhance agent results with a Claude‑generated executive summary.
181
- Falls back gracefully if Claude is unavailable.
182
- """
183
- context_parts = []
184
- context_parts.append("INCIDENT SUMMARY:")
185
- context_parts.append(f"Component: {event.component}")
186
- context_parts.append(f"Timestamp: {event.timestamp.isoformat()}")
187
- context_parts.append(f"Severity: {event.severity.value}")
188
- context_parts.append("")
189
- context_parts.append("METRICS:")
190
- context_parts.append(f"• Latency P99: {event.latency_p99}ms")
191
- context_parts.append(f"• Error Rate: {event.error_rate:.1%}")
192
- context_parts.append(f"• Throughput: {event.throughput} req/s")
193
- if event.cpu_util:
194
- context_parts.append(f"• CPU: {event.cpu_util:.1%}")
195
- if event.memory_util:
196
- context_parts.append(f"• Memory: {event.memory_util:.1%}")
197
- context_parts.append("")
198
- if agent_results.get('multi_agent_analysis'):
199
- context_parts.append("AGENT ANALYSIS:")
200
- context_parts.append(json.dumps(agent_results['multi_agent_analysis'], indent=2))
201
- context = "\n".join(context_parts)
202
-
203
- prompt = f"""{context}
204
- TASK: Provide an executive summary synthesizing all agent analyses.
205
- Include:
206
- 1. Concise incident description
207
- 2. Most likely root cause
208
- 3. Single best recovery action
209
- 4. Estimated impact and recovery time
210
- Be specific and actionable."""
211
-
212
- system_prompt = """You are a senior Site Reliability Engineer synthesizing
213
- multiple AI agent analyses into clear, actionable guidance for incident response.
214
- Focus on clarity, accuracy, and decisive recommendations."""
215
-
216
- claude_synthesis = self.claude_adapter.generate_completion(prompt, system_prompt)
217
- agent_results['claude_synthesis'] = {
218
- 'summary': claude_synthesis,
219
- 'timestamp': datetime.datetime.now(datetime.timezone.utc).isoformat(),
220
- 'source': 'claude-opus-4'
221
- }
222
- return agent_results