petter2025 commited on
Commit
3c1311b
·
verified ·
1 Parent(s): 714bfce

Create ml_models.py

Browse files
Files changed (1) hide show
  1. ml_models.py +526 -0
ml_models.py ADDED
@@ -0,0 +1,526 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Machine Learning Models for Advanced Anomaly Detection
3
+ Includes ensemble methods, causal inference, and adaptive thresholds
4
+ """
5
+
6
+ import numpy as np
7
+ from typing import Tuple, Optional, Dict, List
8
+ import logging
9
+ import datetime
10
+
11
+ # Try importing optional ML libraries
12
+ try:
13
+ from sklearn.ensemble import IsolationForest
14
+ from sklearn.preprocessing import StandardScaler
15
+ SKLEARN_AVAILABLE = True
16
+ except ImportError:
17
+ SKLEARN_AVAILABLE = False
18
+ logging.warning("scikit-learn not available. Using fallback detection only.")
19
+
20
+ try:
21
+ import torch
22
+ import torch.nn as nn
23
+ PYTORCH_AVAILABLE = True
24
+ except ImportError:
25
+ PYTORCH_AVAILABLE = False
26
+ logging.warning("PyTorch not available. LSTM detector disabled.")
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+ # === LSTM Model (Optional - Only if PyTorch available) ===
31
+
32
if PYTORCH_AVAILABLE:
    class LSTMAnomalyDetector(nn.Module):
        """
        LSTM-based anomaly detector for time-series analysis.
        Uses sequence-to-sequence learning to predict next values
        and flag anomalies based on prediction error.
        """

        def __init__(self, input_size: int = 5, hidden_size: int = 64, num_layers: int = 2):
            super().__init__()

            self.hidden_size = hidden_size
            self.num_layers = num_layers

            # Recurrent encoder over the metric sequence
            self.lstm = nn.LSTM(
                input_size=input_size,
                hidden_size=hidden_size,
                num_layers=num_layers,
                batch_first=True,
                dropout=0.2,
            )

            # Projection head: final hidden state -> predicted next feature vector
            self.fc1 = nn.Linear(hidden_size, 32)
            self.fc2 = nn.Linear(32, input_size)
            self.relu = nn.ReLU()

        def forward(self, x):
            """Predict the next feature vector from a (batch, seq, features) input."""
            sequence_states, _ = self.lstm(x)
            # Only the representation of the last time step feeds the head
            final_state = sequence_states[:, -1, :]
            hidden = self.relu(self.fc1(final_state))
            return self.fc2(hidden)
else:
    class LSTMAnomalyDetector:
        """Stub used when PyTorch is missing; construction only logs a warning."""

        def __init__(self, *args, **kwargs):
            logger.warning("LSTM detector not available (PyTorch not installed)")
78
+
79
+ # === Ensemble Anomaly Detector ===
80
+
81
class EnsembleAnomalyDetector:
    """
    Ensemble of multiple anomaly detection algorithms for robust detection.
    Gracefully degrades if ML libraries aren't available.
    """

    # Upper bound on the rolling training buffer so a long-running detector
    # does not grow memory without limit (fix: buffer was previously unbounded).
    MAX_TRAINING_SAMPLES = 1000

    def __init__(self):
        self.isolation_forest = None
        self.lstm_model = None
        self.scaler = None
        self.is_trained = False
        self.training_data = []

        # Initialize models if libraries are available
        if SKLEARN_AVAILABLE:
            try:
                self.isolation_forest = IsolationForest(
                    contamination=0.1,
                    random_state=42,
                    n_estimators=100
                )
                self.scaler = StandardScaler()
                logger.info("Initialized Isolation Forest detector")
            except Exception as e:
                logger.error(f"Failed to initialize Isolation Forest: {e}")

        if PYTORCH_AVAILABLE:
            try:
                self.lstm_model = LSTMAnomalyDetector()
                logger.info("Initialized LSTM detector")
            except Exception as e:
                logger.error(f"Failed to initialize LSTM: {e}")

        logger.info(f"EnsembleAnomalyDetector initialized (sklearn={SKLEARN_AVAILABLE}, pytorch={PYTORCH_AVAILABLE})")

    def add_sample(self, features: np.ndarray) -> None:
        """
        Add training sample

        Args:
            features: numpy array of [latency, error_rate, cpu, memory, throughput]
        """
        if not isinstance(features, np.ndarray):
            features = np.array(features)

        self.training_data.append(features)

        # Fix: evict the oldest sample once the buffer is full (memory leak otherwise)
        if len(self.training_data) > self.MAX_TRAINING_SAMPLES:
            del self.training_data[0]

        # Auto-train when we have enough data
        if len(self.training_data) >= 100 and not self.is_trained:
            self.train()

    def train(self) -> None:
        """Train all available models in the ensemble"""
        if len(self.training_data) < 50:
            logger.warning(f"Insufficient data for training: {len(self.training_data)} samples (need 50+)")
            return

        try:
            X = np.array(self.training_data)

            # Train Isolation Forest if available
            if self.isolation_forest is not None and SKLEARN_AVAILABLE:
                self.isolation_forest.fit(X)
                logger.info(f"Trained Isolation Forest on {len(self.training_data)} samples")

            # Fix: fit the scaler whenever it exists. Previously it was only
            # fitted when PyTorch happened to be installed, and the scaled
            # result was computed into an unused local and discarded.
            if self.scaler is not None:
                self.scaler.fit(X)

            # Train LSTM if available (placeholder for now)
            if self.lstm_model is not None and PYTORCH_AVAILABLE:
                # TODO: Implement full LSTM training loop
                logger.info("LSTM training not yet implemented (using fallback)")

            self.is_trained = True
            logger.info(f"✅ Ensemble trained on {len(self.training_data)} samples")

        except Exception as e:
            logger.error(f"Training failed: {e}", exc_info=True)
            self.is_trained = False

    def predict_anomaly(self, features: np.ndarray) -> Tuple[bool, float, Dict]:
        """
        Predict if features represent an anomaly

        Args:
            features: numpy array of [latency, error_rate, cpu, memory, throughput]

        Returns:
            Tuple of (is_anomaly: bool, confidence: float, explanation: dict)
        """
        if not isinstance(features, np.ndarray):
            features = np.array(features)

        # Fallback when untrained, sklearn is missing, or the forest failed to
        # initialize (fix: the None case previously relied on the broad except).
        if not self.is_trained or not SKLEARN_AVAILABLE or self.isolation_forest is None:
            return self._fallback_detection(features)

        try:
            # Isolation Forest prediction (reshape once for both calls)
            sample = features.reshape(1, -1)
            if_score = self.isolation_forest.score_samples(sample)[0]
            if_anomaly = self.isolation_forest.predict(sample)[0] == -1

            # LSTM prediction (placeholder for now)
            lstm_score = 0.5  # TODO: Implement actual LSTM prediction

            # Statistical tests
            stat_score = self._statistical_tests(features)

            # Ensemble voting (weighted average)
            confidence = np.mean([
                abs(if_score),
                lstm_score,
                stat_score
            ])

            is_anomaly = if_anomaly or confidence > 0.7

            explanation = {
                'isolation_forest_score': float(if_score),
                'isolation_forest_anomaly': bool(if_anomaly),
                'lstm_reconstruction_error': float(lstm_score),
                'statistical_score': float(stat_score),
                'ensemble_confidence': float(confidence),
                'primary_detector': 'isolation_forest' if if_anomaly else 'ensemble',
                'models_used': ['isolation_forest', 'statistical']
            }

            return is_anomaly, confidence, explanation

        except Exception as e:
            logger.error(f"Prediction failed, using fallback: {e}", exc_info=True)
            return self._fallback_detection(features)

    def _statistical_tests(self, features: np.ndarray) -> float:
        """
        Perform statistical tests for anomaly detection using z-scores

        Args:
            features: Current feature values

        Returns:
            Anomaly probability (0-1)
        """
        if len(self.training_data) < 10:
            return 0.5  # Neutral score when history is too short

        try:
            # Z-scores against the rolling training history
            historical = np.array(self.training_data)
            mean = np.mean(historical, axis=0)
            std = np.std(historical, axis=0)

            # Epsilon avoids division by zero on constant features
            z_scores = np.abs((features - mean) / (std + 1e-8))
            max_z_score = np.max(z_scores)

            # Convert z-score to probability (3-sigma rule): z >= 3 saturates at 1.0
            anomaly_prob = min(1.0, max_z_score / 3.0)

            return anomaly_prob

        except Exception as e:
            logger.error(f"Statistical test failed: {e}")
            return 0.5

    def _fallback_detection(self, features: np.ndarray) -> Tuple[bool, float, Dict]:
        """
        Fallback detection when ML models aren't trained or available
        Uses simple threshold-based detection

        Args:
            features: [latency, error_rate, cpu, memory, throughput]

        Returns:
            Tuple of (is_anomaly, confidence, explanation)
        """
        latency_threshold = 150
        error_rate_threshold = 0.05
        cpu_threshold = 0.8
        memory_threshold = 0.8

        # Missing trailing features default to 0 (treated as healthy)
        latency = features[0] if len(features) > 0 else 0
        error_rate = features[1] if len(features) > 1 else 0
        cpu = features[2] if len(features) > 2 else 0
        memory = features[3] if len(features) > 3 else 0

        is_anomaly = (
            latency > latency_threshold or
            error_rate > error_rate_threshold or
            cpu > cpu_threshold or
            memory > memory_threshold
        )

        # Threshold hits are only moderately confident; clean samples score low
        confidence = 0.5 if is_anomaly else 0.1

        explanation = {
            'method': 'fallback_threshold',
            'latency_exceeded': latency > latency_threshold,
            'error_rate_exceeded': error_rate > error_rate_threshold,
            'cpu_exceeded': cpu > cpu_threshold,
            'memory_exceeded': memory > memory_threshold
        }

        return is_anomaly, confidence, explanation
286
+
287
+ # === Causal Inference Engine ===
288
+
289
class CausalInferenceEngine:
    """
    Bayesian causal inference for root cause analysis.
    Uses probabilistic graphical models to infer causality.
    """

    def __init__(self):
        # Causal graph edges: each root cause maps to the symptoms it produces
        self.causal_graph = {
            'database_latency': ['api_latency', 'error_rate'],
            'network_issues': ['api_latency', 'timeout_errors'],
            'memory_leak': ['memory_util', 'gc_time', 'response_time'],
            'cpu_saturation': ['cpu_util', 'queue_length', 'latency'],
            'traffic_spike': ['throughput', 'latency', 'error_rate']
        }

        # Prior belief P(cause) held before observing any symptoms
        self.prior_probabilities = {
            'database_latency': 0.3,
            'network_issues': 0.2,
            'memory_leak': 0.15,
            'cpu_saturation': 0.2,
            'traffic_spike': 0.15
        }

        logger.info("Initialized CausalInferenceEngine")

    def infer_root_cause(self, symptoms: Dict[str, float]) -> List[Tuple[str, float]]:
        """
        Rank candidate root causes with Bayesian inference.

        Args:
            symptoms: Mapping of observed symptom names to their values,
                      e.g. {'api_latency': 500, 'error_rate': 0.15, 'cpu_util': 0.9}

        Returns:
            List of (root_cause, probability) tuples, most probable first
        """
        # Unnormalized posterior: P(cause|symptoms) ∝ P(symptoms|cause) * P(cause)
        posterior_probs = {
            cause: self._calculate_likelihood(symptoms, effects) * self.prior_probabilities[cause]
            for cause, effects in self.causal_graph.items()
        }

        # Normalize to a proper distribution; degenerate all-zero case -> uniform
        mass = sum(posterior_probs.values())
        if mass > 0:
            posterior_probs = {cause: p / mass for cause, p in posterior_probs.items()}
        else:
            uniform = 1.0 / len(posterior_probs)
            posterior_probs = {cause: uniform for cause in posterior_probs}

        ranked_causes = sorted(
            posterior_probs.items(),
            key=lambda item: item[1],
            reverse=True,
        )

        logger.info(f"Inferred root causes: {ranked_causes[:3]}")

        return ranked_causes

    def _calculate_likelihood(self, symptoms: Dict[str, float], effects: List[str]) -> float:
        """
        Estimate the likelihood of the observed symptoms given a cause.

        Args:
            symptoms: Observed symptoms
            effects: Effects this cause is expected to produce

        Returns:
            Likelihood score (0-1)
        """
        observed = [effect for effect in effects if effect in symptoms]

        if not observed:
            return 0.1  # Low but non-zero probability

        # The more of the cause's expected effects we see, the higher the likelihood
        return len(observed) / len(effects)
378
+
379
+ # === Adaptive Threshold Learner ===
380
+
381
class AdaptiveThresholdLearner:
    """
    Online learning system that adapts thresholds based on historical patterns.
    Uses exponential moving averages and seasonality detection.
    """

    def __init__(self, window_size: int = 100):
        self.window_size = window_size
        # metric name -> rolling list of {'value': float, 'timestamp': datetime}
        self.historical_data: Dict[str, List[Dict]] = {}
        # metric name -> most recently computed threshold statistics
        self.thresholds: Dict[str, Dict] = {}
        self.seasonality_patterns: Dict[str, Dict] = {}

        logger.info(f"Initialized AdaptiveThresholdLearner with window_size={window_size}")

    def update(self, metric: str, value: float, timestamp: datetime.datetime) -> None:
        """
        Record a new observation and refresh the metric's adaptive threshold.

        Args:
            metric: Metric name (e.g., 'latency', 'error_rate')
            value: Metric value
            timestamp: Timestamp of the measurement
        """
        series = self.historical_data.setdefault(metric, [])
        series.append({'value': value, 'timestamp': timestamp})

        # Rolling window: evict the oldest sample once full
        if len(series) > self.window_size:
            series.pop(0)

        self._update_threshold(metric)

    def _update_threshold(self, metric: str) -> None:
        """
        Recompute the adaptive threshold for a metric from its rolling window.

        Args:
            metric: Metric name
        """
        samples = self.historical_data[metric]
        if len(samples) < 10:
            return  # Too little history for meaningful statistics

        try:
            values = [s['value'] for s in samples]

            # Window statistics
            mean = np.mean(values)
            std = np.std(values)
            p90 = np.percentile(values, 90)
            p95 = np.percentile(values, 95)

            # Seasonality: scale by the time slot of the latest observation
            latest_ts = samples[-1]['timestamp']
            multiplier = self._get_time_multiplier(latest_ts.hour, latest_ts.weekday())

            # Adaptive threshold: mean + 2 sigma, adjusted for time of day
            threshold = (mean + 2 * std) * multiplier

            self.thresholds[metric] = {
                'value': threshold,
                'mean': mean,
                'std': std,
                'p90': p90,
                'p95': p95,
                'last_updated': datetime.datetime.now(),
                'time_multiplier': multiplier
            }

            logger.debug(f"Updated threshold for {metric}: {threshold:.2f}")

        except Exception as e:
            logger.error(f"Failed to update threshold for {metric}: {e}")

    def _get_time_multiplier(self, hour: int, day_of_week: int) -> float:
        """
        Return the seasonal threshold multiplier for a time slot.

        Args:
            hour: Hour of day (0-23)
            day_of_week: Day of week (0=Monday, 6=Sunday)

        Returns:
            Multiplier for threshold adjustment
        """
        weekday = day_of_week < 5
        business_hours = 9 <= hour <= 17

        # Weekday business hours carry naturally higher load: tolerate more
        if business_hours and weekday:
            return 1.2

        # Off hours / weekends: tighten the threshold (more sensitive)
        return 0.8

    def get_threshold(self, metric: str) -> Optional[float]:
        """
        Get the current adaptive threshold for a metric.

        Args:
            metric: Metric name

        Returns:
            Current threshold value or None if not available
        """
        entry = self.thresholds.get(metric)
        return entry['value'] if entry is not None else None

    def get_statistics(self, metric: str) -> Optional[Dict]:
        """
        Get the full statistics for a metric.

        Args:
            metric: Metric name

        Returns:
            Dictionary of statistics or None
        """
        return self.thresholds.get(metric)
507
+
508
+ # === Utility Functions ===
509
+
510
def create_feature_vector(event) -> np.ndarray:
    """
    Convert a ReliabilityEvent into the feature vector consumed by the ML models.

    Args:
        event: ReliabilityEvent object

    Returns:
        numpy array of [latency, error_rate, cpu, memory, throughput]
    """
    # Missing utilization readings default to a neutral 0.5
    cpu = 0.5 if event.cpu_util is None else event.cpu_util
    memory = 0.5 if event.memory_util is None else event.memory_util

    return np.array([event.latency_p99, event.error_rate, cpu, memory, event.throughput])