petter2025 commited on
Commit
775c77d
·
verified ·
1 Parent(s): e78c83f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -365
app.py CHANGED
@@ -7,264 +7,91 @@ import pandas as pd
7
  import datetime
8
  from typing import List, Dict, Any
9
  import hashlib
 
10
 
11
- # Import our new modules
12
  from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
13
  from healing_policies import PolicyEngine
 
14
 
15
  # === Configuration ===
16
  HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
17
  HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
18
  HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
19
 
20
- # === FAISS & Embeddings Setup ===
21
- try:
22
- from sentence_transformers import SentenceTransformer
23
- import faiss
24
-
25
- VECTOR_DIM = 384
26
- INDEX_FILE = "incident_vectors.index"
27
- TEXTS_FILE = "incident_texts.json"
28
-
29
- # Try to load model with error handling
30
- try:
31
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
32
- except Exception as e:
33
- print(f"Model loading warning: {e}")
34
- # Fallback to direct loading
35
- from sentence_transformers import SentenceTransformer as ST
36
- model = ST("sentence-transformers/all-MiniLM-L6-v2")
37
-
38
- if os.path.exists(INDEX_FILE):
39
- index = faiss.read_index(INDEX_FILE)
40
- with open(TEXTS_FILE, "r") as f:
41
- incident_texts = json.load(f)
42
- else:
43
- index = faiss.IndexFlatL2(VECTOR_DIM)
44
- incident_texts = []
45
-
46
- except ImportError as e:
47
- print(f"Warning: FAISS or SentenceTransformers not available: {e}")
48
- index = None
49
- incident_texts = []
50
- model = None
51
-
52
- def save_index():
53
- """Save FAISS index and incident texts"""
54
- if index is not None:
55
- faiss.write_index(index, INDEX_FILE)
56
- with open(TEXTS_FILE, "w") as f:
57
- json.dump(incident_texts, f)
58
-
59
- # === Core Engine Components ===
60
  policy_engine = PolicyEngine()
 
61
  events_history: List[ReliabilityEvent] = []
62
 
63
- class BusinessImpactCalculator:
64
- """Calculate business impact of anomalies"""
65
-
66
- def __init__(self, revenue_per_request: float = 0.01):
67
- self.revenue_per_request = revenue_per_request
68
-
69
- def calculate_impact(self, event: ReliabilityEvent, duration_minutes: int = 5) -> Dict[str, Any]:
70
- """Enhanced business impact calculation"""
71
-
72
- # More realistic impact calculation
73
- base_revenue_per_minute = 100 # Base revenue per minute for the service
74
-
75
- # Calculate impact based on severity of anomalies
76
- impact_multiplier = 1.0
77
-
78
- if event.latency_p99 > 300:
79
- impact_multiplier += 0.5 # High latency impact
80
- if event.error_rate > 0.1:
81
- impact_multiplier += 0.8 # High error rate impact
82
- if event.cpu_util and event.cpu_util > 0.9:
83
- impact_multiplier += 0.3 # Resource exhaustion impact
84
-
85
- revenue_loss = base_revenue_per_minute * impact_multiplier * (duration_minutes / 60)
86
-
87
- # More realistic user impact
88
- base_users_affected = 1000 # Base user count
89
- user_impact_multiplier = (event.error_rate * 10) + (max(0, event.latency_p99 - 100) / 500)
90
- affected_users = int(base_users_affected * user_impact_multiplier)
91
-
92
- # Severity classification
93
- if revenue_loss > 500 or affected_users > 5000:
94
- severity = "CRITICAL"
95
- elif revenue_loss > 100 or affected_users > 1000:
96
- severity = "HIGH"
97
- elif revenue_loss > 50 or affected_users > 500:
98
- severity = "MEDIUM"
99
- else:
100
- severity = "LOW"
101
-
102
- return {
103
- 'revenue_loss_estimate': round(revenue_loss, 2),
104
- 'affected_users_estimate': affected_users,
105
- 'severity_level': severity,
106
- 'throughput_reduction_pct': round(min(100, user_impact_multiplier * 100), 1)
107
- }
108
-
109
- business_calculator = BusinessImpactCalculator()
110
 
111
- class AdvancedAnomalyDetector:
112
- """Enhanced anomaly detection with adaptive thresholds"""
113
-
114
  def __init__(self):
115
- self.historical_data = []
116
- self.adaptive_thresholds = {
117
- 'latency_p99': 150, # Will adapt based on history
118
- 'error_rate': 0.05
119
  }
120
 
121
- def detect_anomaly(self, event: ReliabilityEvent) -> bool:
122
- """Enhanced anomaly detection with adaptive thresholds"""
123
-
124
- # Basic threshold checks
125
- latency_anomaly = event.latency_p99 > self.adaptive_thresholds['latency_p99']
126
- error_anomaly = event.error_rate > self.adaptive_thresholds['error_rate']
127
-
128
- # Resource-based anomalies
129
- resource_anomaly = False
130
- if event.cpu_util and event.cpu_util > 0.9:
131
- resource_anomaly = True
132
- if event.memory_util and event.memory_util > 0.9:
133
- resource_anomaly = True
134
-
135
- # Update adaptive thresholds (simplified)
136
- self._update_thresholds(event)
137
-
138
- return latency_anomaly or error_anomaly or resource_anomaly
139
-
140
- def _update_thresholds(self, event: ReliabilityEvent):
141
- """Update adaptive thresholds based on historical data"""
142
- self.historical_data.append(event)
143
-
144
- # Keep only recent history
145
- if len(self.historical_data) > 100:
146
- self.historical_data.pop(0)
147
-
148
- # Update latency threshold to 90th percentile of recent data
149
- if len(self.historical_data) > 10:
150
- recent_latencies = [e.latency_p99 for e in self.historical_data[-20:]]
151
- self.adaptive_thresholds['latency_p99'] = np.percentile(recent_latencies, 90)
152
-
153
- anomaly_detector = AdvancedAnomalyDetector()
154
-
155
- def call_huggingface_analysis(prompt: str) -> str:
156
- """Use HF Inference API or fallback simulation"""
157
- if not HF_TOKEN:
158
- # Enhanced fallback analysis
159
- fallback_insights = [
160
- "High latency detected - possible resource contention or network issues",
161
- "Error rate increase suggests recent deployment instability",
162
- "Latency spike correlates with increased user traffic patterns",
163
- "Intermittent failures indicate potential dependency service degradation",
164
- "Performance degradation detected - consider scaling compute resources"
165
- ]
166
- import random
167
- return random.choice(fallback_insights)
168
-
169
- try:
170
- enhanced_prompt = f"""
171
- As a senior reliability engineer, analyze this telemetry event and provide a concise root cause analysis:
172
-
173
- {prompt}
174
-
175
- Focus on:
176
- - Potential infrastructure or application issues
177
- - Correlation between metrics
178
- - Business impact assessment
179
- - Recommended investigation areas
180
-
181
- Provide 1-2 sentences maximum with actionable insights.
182
- """
183
 
184
- payload = {
185
- "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
186
- "prompt": enhanced_prompt,
187
- "max_tokens": 150,
188
- "temperature": 0.4,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  }
190
- response = requests.post(HF_API_URL, headers=HEADERS, json=payload, timeout=15)
191
- if response.status_code == 200:
192
- result = response.json()
193
- analysis_text = result.get("choices", [{}])[0].get("text", "").strip()
194
- if analysis_text and len(analysis_text) > 10:
195
- return analysis_text.split('\n')[0]
196
- return analysis_text
197
- else:
198
- return f"API Error {response.status_code}: Service temporarily unavailable"
199
- except Exception as e:
200
- return f"Analysis service error: {str(e)}"
201
 
202
- def analyze_event(component: str, latency: float, error_rate: float,
203
- throughput: float = 1000, cpu_util: float = None,
204
- memory_util: float = None) -> Dict[str, Any]:
205
- """Main event analysis function"""
206
-
207
- # Create enhanced event
208
- event = ReliabilityEvent(
209
- component=component,
210
- latency_p99=latency,
211
- error_rate=error_rate,
212
- throughput=throughput,
213
- cpu_util=cpu_util,
214
- memory_util=memory_util,
215
- upstream_deps=["auth-service", "database"] if component == "api-service" else []
216
- )
217
-
218
- # Detect anomaly
219
- is_anomaly = anomaly_detector.detect_anomaly(event)
220
- event.severity = EventSeverity.HIGH if is_anomaly else EventSeverity.LOW
221
-
222
- # Build analysis prompt
223
- prompt = (
224
- f"Component: {component}\nLatency: {latency:.2f}ms\nError Rate: {error_rate:.3f}\n"
225
- f"Throughput: {throughput:.0f}\nCPU: {cpu_util or 'N/A'}\nMemory: {memory_util or 'N/A'}\n"
226
- f"Status: {'ANOMALY' if is_anomaly else 'NORMAL'}\n\n"
227
- "Provide a one-line reliability insight or root cause analysis."
228
- )
229
 
230
- # Get AI analysis
231
- analysis = call_huggingface_analysis(prompt)
232
-
233
- # Evaluate healing policies
234
- healing_actions = policy_engine.evaluate_policies(event)
235
-
236
- # Calculate business impact
237
- business_impact = business_calculator.calculate_impact(event) if is_anomaly else None
238
-
239
- # Vector memory learning
240
- if index is not None and is_anomaly:
241
- vector_text = f"{component} {latency} {error_rate} {analysis}"
242
- vec = model.encode([vector_text])
243
- index.add(np.array(vec, dtype=np.float32))
244
- incident_texts.append(vector_text)
245
- save_index()
246
-
247
- # Prepare result
248
- result = {
249
- "timestamp": event.timestamp,
250
- "component": component,
251
- "latency_p99": latency,
252
- "error_rate": error_rate,
253
- "throughput": throughput,
254
- "status": "ANOMALY" if is_anomaly else "NORMAL",
255
- "analysis": analysis,
256
- "healing_actions": [action.value for action in healing_actions],
257
- "business_impact": business_impact,
258
- "severity": event.severity.value,
259
- "similar_incidents_count": len(incident_texts) if is_anomaly else 0
260
- }
261
-
262
- events_history.append(event)
263
- return result
264
 
265
- # === Gradio UI ===
266
- def submit_event(component, latency, error_rate, throughput, cpu_util, memory_util):
267
- """Handle event submission from UI"""
268
  try:
269
  # Convert inputs
270
  latency = float(latency)
@@ -273,139 +100,28 @@ def submit_event(component, latency, error_rate, throughput, cpu_util, memory_ut
273
  cpu_util = float(cpu_util) if cpu_util else None
274
  memory_util = float(memory_util) if memory_util else None
275
 
276
- result = analyze_event(component, latency, error_rate, throughput, cpu_util, memory_util)
 
 
 
277
 
278
- # Prepare table data
279
- table_data = []
280
- for event in events_history[-15:]:
281
- table_data.append([
282
- event.timestamp[:19], # Trim microseconds
283
- event.component,
284
- event.latency_p99,
285
- f"{event.error_rate:.3f}",
286
- event.throughput,
287
- event.severity.value.upper(),
288
- getattr(event, 'analysis', 'N/A')[:50] + "..." if getattr(event, 'analysis', 'N/A') else 'N/A'
289
- ])
290
 
291
- # Format output message
292
  status_emoji = "🚨" if result["status"] == "ANOMALY" else "✅"
293
- output_msg = f"{status_emoji} {result['status']} - {result['analysis']}"
294
 
295
- if result["business_impact"]:
296
- impact = result["business_impact"]
297
- output_msg += f"\n💰 Business Impact: ${impact['revenue_loss_estimate']} | 👥 {impact['affected_users_estimate']} users | 🚨 {impact['severity_level']}"
 
 
 
 
298
 
299
- if result["healing_actions"]:
300
- actions = ", ".join(result["healing_actions"])
301
- output_msg += f"\n🔧 Auto-Actions: {actions}"
302
 
303
- return (
304
- output_msg,
305
- gr.Dataframe(
306
- headers=["Timestamp", "Component", "Latency", "Error Rate", "Throughput", "Severity", "Analysis"],
307
- value=table_data,
308
- wrap=True
309
- )
310
- )
311
 
312
  except Exception as e:
313
- return f"❌ Error processing event: {str(e)}", gr.Dataframe(value=[])
314
-
315
- def create_ui():
316
- """Create the Gradio interface"""
317
- with gr.Blocks(title="🧠 Agentic Reliability Framework v2", theme="soft") as demo:
318
- gr.Markdown("""
319
- # 🧠 Agentic Reliability Framework v2
320
- **Production-Grade Self-Healing AI Systems**
321
-
322
- *Advanced anomaly detection + AI-driven root cause analysis + Business impact quantification*
323
- """)
324
-
325
- with gr.Row():
326
- with gr.Column(scale=1):
327
- gr.Markdown("### 📊 Telemetry Input")
328
- component = gr.Dropdown(
329
- choices=["api-service", "auth-service", "payment-service", "database", "cache-service"],
330
- value="api-service",
331
- label="Component",
332
- info="Select the service being monitored"
333
- )
334
- latency = gr.Slider(
335
- minimum=10, maximum=1000, value=100, step=1,
336
- label="Latency P99 (ms)",
337
- info="Alert threshold: >150ms (adaptive)"
338
- )
339
- error_rate = gr.Slider(
340
- minimum=0, maximum=0.5, value=0.02, step=0.001,
341
- label="Error Rate",
342
- info="Alert threshold: >0.05"
343
- )
344
- throughput = gr.Number(
345
- value=1000,
346
- label="Throughput (req/sec)",
347
- info="Current request rate"
348
- )
349
- cpu_util = gr.Slider(
350
- minimum=0, maximum=1, value=0.4, step=0.01,
351
- label="CPU Utilization",
352
- info="0.0 - 1.0 scale"
353
- )
354
- memory_util = gr.Slider(
355
- minimum=0, maximum=1, value=0.3, step=0.01,
356
- label="Memory Utilization",
357
- info="0.0 - 1.0 scale"
358
- )
359
- submit_btn = gr.Button("🚀 Submit Telemetry Event", variant="primary", size="lg")
360
-
361
- with gr.Column(scale=2):
362
- gr.Markdown("### 🔍 Live Analysis & Healing")
363
- output_text = gr.Textbox(
364
- label="Analysis Results",
365
- placeholder="Submit an event to see AI-powered analysis...",
366
- lines=4
367
- )
368
- gr.Markdown("### 📈 Recent Events (Last 15)")
369
- events_table = gr.Dataframe(
370
- headers=["Timestamp", "Component", "Latency", "Error Rate", "Throughput", "Severity", "Analysis"],
371
- label="Event History",
372
- wrap=True,
373
- max_height="400px"
374
- )
375
-
376
- # Information sections
377
- with gr.Accordion("ℹ️ Framework Capabilities", open=False):
378
- gr.Markdown("""
379
- - **🤖 AI-Powered Analysis**: Mistral-8x7B for intelligent root cause analysis
380
- - **🔧 Policy-Based Healing**: Automated recovery actions based on severity and context
381
- - **💰 Business Impact**: Revenue and user impact quantification
382
- - **🎯 Adaptive Detection**: ML-powered thresholds that learn from your environment
383
- - **📚 Vector Memory**: FAISS-based incident memory for similarity detection
384
- - **⚡ Production Ready**: Circuit breakers, cooldowns, and enterprise features
385
- """)
386
-
387
- with gr.Accordion("🔧 Healing Policies", open=False):
388
- policy_info = []
389
- for policy in policy_engine.policies:
390
- if policy.enabled:
391
- actions = ", ".join([action.value for action in policy.actions])
392
- policy_info.append(f"**{policy.name}**: {actions} (Priority: {policy.priority})")
393
-
394
- gr.Markdown("\n\n".join(policy_info))
395
-
396
- # Event handling
397
- submit_btn.click(
398
- fn=submit_event,
399
- inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
400
- outputs=[output_text, events_table]
401
- )
402
-
403
- return demo
404
-
405
- if __name__ == "__main__":
406
- demo = create_ui()
407
- demo.launch(
408
- server_name="0.0.0.0",
409
- server_port=7860,
410
- share=False
411
- )
 
7
  import datetime
8
  from typing import List, Dict, Any
9
  import hashlib
10
+ import asyncio
11
 
12
+ # Import enhanced modules
13
  from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
14
  from healing_policies import PolicyEngine
15
+ from agent_orchestrator import OrchestrationManager, AgentSpecialization
16
 
17
  # === Configuration ===
18
  HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
19
  HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
20
  HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
21
 
22
+ # === Initialize Enhanced Components ===
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  policy_engine = PolicyEngine()
24
+ orchestration_manager = OrchestrationManager()
25
  events_history: List[ReliabilityEvent] = []
26
 
27
+ # [Keep existing FAISS setup and BusinessImpactCalculator...]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
+ class EnhancedReliabilityEngine:
 
 
30
  def __init__(self):
31
+ self.performance_metrics = {
32
+ 'total_incidents_processed': 0,
33
+ 'multi_agent_analyses': 0,
34
+ 'average_processing_time': 0.0
35
  }
36
 
37
+ async def process_event_enhanced(self, component: str, latency: float, error_rate: float,
38
+ throughput: float = 1000, cpu_util: float = None,
39
+ memory_util: float = None) -> Dict[str, Any]:
40
+ """Enhanced event processing with multi-agent orchestration"""
41
+ start_time = asyncio.get_event_loop().time()
42
+
43
+ # Create event
44
+ event = ReliabilityEvent(
45
+ component=component,
46
+ latency_p99=latency,
47
+ error_rate=error_rate,
48
+ throughput=throughput,
49
+ cpu_util=cpu_util,
50
+ memory_util=memory_util,
51
+ upstream_deps=["auth-service", "database"] if component == "api-service" else []
52
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
+ # Multi-agent analysis
55
+ agent_analysis = await orchestration_manager.orchestrate_analysis(event)
56
+
57
+ # Policy evaluation
58
+ healing_actions = policy_engine.evaluate_policies(event)
59
+
60
+ # Business impact
61
+ business_impact = business_calculator.calculate_impact(event)
62
+
63
+ # Update metrics
64
+ processing_time = asyncio.get_event_loop().time() - start_time
65
+ self._update_performance_metrics(processing_time)
66
+
67
+ # Prepare comprehensive result
68
+ result = {
69
+ "timestamp": event.timestamp,
70
+ "component": component,
71
+ "latency_p99": latency,
72
+ "error_rate": error_rate,
73
+ "throughput": throughput,
74
+ "status": "ANOMALY" if agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0) > 0.5 else "NORMAL",
75
+ "multi_agent_analysis": agent_analysis,
76
+ "healing_actions": [action.value for action in healing_actions],
77
+ "business_impact": business_impact,
78
+ "processing_metadata": {
79
+ "processing_time_seconds": round(processing_time, 3),
80
+ "agents_used": agent_analysis.get('agent_metadata', {}).get('participating_agents', []),
81
+ "analysis_confidence": agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
82
+ }
83
  }
84
+
85
+ events_history.append(event)
86
+ return result
 
 
 
 
 
 
 
 
87
 
88
+ # Initialize enhanced engine
89
+ enhanced_engine = EnhancedReliabilityEngine()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
+ # [Keep existing UI setup, but enhance the submission function...]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
+ async def submit_event_enhanced(component, latency, error_rate, throughput, cpu_util, memory_util):
94
+ """Enhanced event submission with async processing"""
 
95
  try:
96
  # Convert inputs
97
  latency = float(latency)
 
100
  cpu_util = float(cpu_util) if cpu_util else None
101
  memory_util = float(memory_util) if memory_util else None
102
 
103
+ # Process with enhanced engine
104
+ result = await enhanced_engine.process_event_enhanced(
105
+ component, latency, error_rate, throughput, cpu_util, memory_util
106
+ )
107
 
108
+ # [Keep existing table formatting...]
 
 
 
 
 
 
 
 
 
 
 
109
 
110
+ # Enhanced output formatting
111
  status_emoji = "🚨" if result["status"] == "ANOMALY" else "✅"
112
+ output_msg = f"{status_emoji} {result['status']}"
113
 
114
+ # Add multi-agent insights
115
+ if "multi_agent_analysis" in result:
116
+ analysis = result["multi_agent_analysis"]
117
+ output_msg += f"\n🎯 Confidence: {analysis.get('incident_summary', {}).get('anomaly_confidence', 0)*100:.1f}%"
118
+
119
+ if analysis.get('recommended_actions'):
120
+ output_msg += f"\n💡 Insights: {', '.join(analysis['recommended_actions'][:2])}"
121
 
122
+ # [Keep existing business impact and healing actions display...]
 
 
123
 
124
+ return (output_msg, table_output)
 
 
 
 
 
 
 
125
 
126
  except Exception as e:
127
+ return f"❌ Error processing event: {str(e)}", gr.Dataframe(value=[])