likhonsheikh committed (verified)
Commit 0efaf6e · 1 parent: 2093744

Upload folder using huggingface_hub
monitoring/__pycache__/monitoring.cpython-312.pyc ADDED
Binary file (32.4 kB).
 
monitoring/dashboard.py ADDED
@@ -0,0 +1,378 @@
"""
Sheikh-Kitty Monitoring Dashboard
Real-time system monitoring and visualization

Features:
- System resource monitoring (CPU, memory, disk)
- API performance metrics
- Security alerts display
- Execution history tracking
- Health status indicators

Author: MiniMax Agent
Date: 2025-11-14
"""

import json
import time
import psutil
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Any
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class SimpleDashboard:
    """Simple terminal-based dashboard for monitoring"""

    def __init__(self, log_dir: str = "logs"):
        self.log_dir = Path(log_dir)
        self.log_dir.mkdir(exist_ok=True)
        self.state_file = self.log_dir / "dashboard_state.json"

        # System thresholds
        self.thresholds = {
            'cpu_warning': 70.0,
            'cpu_critical': 90.0,
            'memory_warning': 75.0,
            'memory_critical': 90.0,
            'disk_warning': 80.0,
            'disk_critical': 95.0
        }

    def get_system_status(self) -> Dict[str, Any]:
        """Get current system status"""
        try:
            # CPU usage
            cpu_percent = psutil.cpu_percent(interval=1)
            cpu_count = psutil.cpu_count()

            # Memory usage
            memory = psutil.virtual_memory()

            # Disk usage
            disk = psutil.disk_usage('/')

            # Load average (Unix systems)
            try:
                load_avg = psutil.getloadavg()[0] if hasattr(psutil, 'getloadavg') else 0.0
            except AttributeError:
                load_avg = 0.0

            return {
                'timestamp': datetime.now().isoformat(),
                'cpu': {
                    'usage_percent': cpu_percent,
                    'count': cpu_count,
                    'load_average': load_avg,
                    'status': self._get_status_level(cpu_percent, 'cpu')
                },
                'memory': {
                    'usage_percent': memory.percent,
                    'available_gb': memory.available / (1024**3),
                    'total_gb': memory.total / (1024**3),
                    'status': self._get_status_level(memory.percent, 'memory')
                },
                'disk': {
                    'usage_percent': (disk.used / disk.total) * 100,
                    'free_gb': disk.free / (1024**3),
                    'total_gb': disk.total / (1024**3),
                    'status': self._get_status_level((disk.used / disk.total) * 100, 'disk')
                }
            }
        except Exception as e:
            logger.error(f"Failed to get system status: {e}")
            return {}

    def _get_status_level(self, value: float, resource_type: str) -> str:
        """Determine status level based on thresholds"""
        if resource_type == 'cpu':
            if value >= self.thresholds['cpu_critical']:
                return 'critical'
            elif value >= self.thresholds['cpu_warning']:
                return 'warning'
        elif resource_type == 'memory':
            if value >= self.thresholds['memory_critical']:
                return 'critical'
            elif value >= self.thresholds['memory_warning']:
                return 'warning'
        elif resource_type == 'disk':
            if value >= self.thresholds['disk_critical']:
                return 'critical'
            elif value >= self.thresholds['disk_warning']:
                return 'warning'

        return 'healthy'

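    # Worked example (illustrative values): with the default thresholds above,
    #   _get_status_level(75.0, 'cpu')  -> 'warning'   (>= 70.0)
    #   _get_status_level(92.0, 'cpu')  -> 'critical'  (>= 90.0)
    #   _get_status_level(50.0, 'disk') -> 'healthy'   (< 80.0)
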
    def get_api_metrics(self) -> Dict[str, Any]:
        """Get API metrics from log files"""
        try:
            api_log = self.log_dir / "api_requests.jsonl"
            if not api_log.exists():
                return {}

            # Read recent API requests
            recent_requests = []
            with open(api_log, 'r') as f:
                for line in f:
                    try:
                        request = json.loads(line.strip())
                        recent_requests.append(request)
                    except json.JSONDecodeError:
                        continue

            # Keep only requests from the last hour
            one_hour_ago = datetime.now() - timedelta(hours=1)
            recent_requests = [
                req for req in recent_requests
                if datetime.fromisoformat(req['timestamp']) > one_hour_ago
            ]

            if not recent_requests:
                return {}

            # Calculate metrics
            execution_times = [req['execution_time'] for req in recent_requests]
            successes = [req['response_data']['success'] for req in recent_requests]

            # Aggregate per endpoint; a plain dict comprehension keyed on the
            # endpoint would overwrite duplicates and always report count=1
            endpoints: Dict[str, Dict[str, Any]] = {}
            for req in recent_requests:
                stats = endpoints.setdefault(req['endpoint'], {'count': 0, 'successes': 0})
                stats['count'] += 1
                stats['successes'] += int(req['response_data']['success'])
            for stats in endpoints.values():
                stats['success_rate'] = stats['successes'] / stats['count']

            return {
                'total_requests': len(recent_requests),
                'successful_requests': sum(successes),
                'success_rate': sum(successes) / len(successes),
                'average_execution_time': sum(execution_times) / len(execution_times),
                'p95_execution_time': sorted(execution_times)[int(len(execution_times) * 0.95)],
                'endpoints': endpoints
            }

        except Exception as e:
            logger.error(f"Failed to get API metrics: {e}")
            return {}

    def get_alerts(self) -> List[Dict[str, Any]]:
        """Get recent alerts"""
        try:
            alerts_file = self.log_dir / "alerts.jsonl"
            if not alerts_file.exists():
                return []

            alerts = []
            with open(alerts_file, 'r') as f:
                for line in f:
                    try:
                        alert = json.loads(line.strip())
                        alerts.append(alert)
                    except json.JSONDecodeError:
                        continue

            # Return recent alerts (last 24 hours)
            one_day_ago = datetime.now() - timedelta(days=1)
            recent_alerts = [
                alert for alert in alerts
                if datetime.fromisoformat(alert['timestamp']) > one_day_ago
            ]

            return sorted(recent_alerts, key=lambda x: x['timestamp'], reverse=True)[:10]

        except Exception as e:
            logger.error(f"Failed to get alerts: {e}")
            return []

    def display_dashboard(self):
        """Display dashboard in terminal"""
        # Clear screen (ANSI escape code)
        print("\033[2J\033[H")

        print("=" * 60)
        print("🏗️ SHEIKH-KITTY MONITORING DASHBOARD")
        print("=" * 60)
        print(f"📅 {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print()

        # System Status
        system_status = self.get_system_status()
        if system_status:
            print("🖥️ SYSTEM STATUS")
            print("-" * 20)

            # CPU
            cpu = system_status['cpu']
            status_icon = self._get_status_icon(cpu['status'])
            print(f"{status_icon} CPU: {cpu['usage_percent']:6.1f}% (Cores: {cpu['count']}, Load: {cpu['load_average']:.2f})")

            # Memory
            memory = system_status['memory']
            status_icon = self._get_status_icon(memory['status'])
            print(f"{status_icon} Memory: {memory['usage_percent']:6.1f}% (Available: {memory['available_gb']:.1f}GB)")

            # Disk
            disk = system_status['disk']
            status_icon = self._get_status_icon(disk['status'])
            print(f"{status_icon} Disk: {disk['usage_percent']:6.1f}% (Free: {disk['free_gb']:.1f}GB)")
            print()

        # API Metrics
        api_metrics = self.get_api_metrics()
        if api_metrics:
            print("🌐 API METRICS (Last Hour)")
            print("-" * 25)
            print(f"📊 Requests: {api_metrics['total_requests']}")
            print(f"✅ Success: {api_metrics['successful_requests']} ({api_metrics['success_rate']:.1%})")
            print(f"⏱️ Avg Time: {api_metrics['average_execution_time']:.3f}s")
            print(f"🚀 P95 Time: {api_metrics['p95_execution_time']:.3f}s")

            # Endpoint breakdown
            if api_metrics['endpoints']:
                print("🔗 Endpoints:")
                for endpoint, stats in api_metrics['endpoints'].items():
                    print(f"   {endpoint}: {stats['count']} requests, {stats['success_rate']:.1%} success")
            print()

        # Recent Alerts
        alerts = self.get_alerts()
        if alerts:
            print("🚨 RECENT ALERTS")
            print("-" * 15)
            for alert in alerts[:5]:  # Show last 5 alerts
                severity_icon = self._get_alert_icon(alert['severity'])
                print(f"{severity_icon} {alert['severity'].upper()}: {alert['message']}")
                print(f"   📅 {alert['timestamp']}")
            print()

        # Health Summary
        print("💚 SYSTEM HEALTH")
        print("-" * 15)
        health_score = self._calculate_health_score(system_status, api_metrics, alerts)
        health_status = self._get_health_status(health_score)
        print(f"Overall: {health_status} ({health_score:.1%})")
        print()

        print("Press Ctrl+C to exit")

    def _get_status_icon(self, status: str) -> str:
        """Get icon for status"""
        icons = {
            'healthy': '🟢',
            'warning': '🟡',
            'critical': '🔴'
        }
        return icons.get(status, '⚪')

    def _get_alert_icon(self, severity: str) -> str:
        """Get icon for alert severity"""
        icons = {
            'info': 'ℹ️',
            'warning': '⚠️',
            'error': '❌',
            'critical': '🚨'
        }
        return icons.get(severity, '📢')

    def _calculate_health_score(self, system_status: Dict, api_metrics: Dict, alerts: List) -> float:
        """Calculate overall health score"""
        score = 1.0

        # Deduct for system resource issues
        if system_status:
            if system_status['cpu']['status'] == 'warning':
                score -= 0.1
            elif system_status['cpu']['status'] == 'critical':
                score -= 0.2

            if system_status['memory']['status'] == 'warning':
                score -= 0.1
            elif system_status['memory']['status'] == 'critical':
                score -= 0.2

            if system_status['disk']['status'] == 'warning':
                score -= 0.1
            elif system_status['disk']['status'] == 'critical':
                score -= 0.2

        # Deduct for API issues
        if api_metrics:
            success_rate = api_metrics.get('success_rate', 1.0)
            if success_rate < 0.95:
                score -= (0.95 - success_rate)

        # Deduct for recent alerts
        recent_critical_alerts = sum(1 for alert in alerts if alert['severity'] == 'critical')
        if recent_critical_alerts > 0:
            score -= min(0.3, recent_critical_alerts * 0.1)

        return max(0.0, score)

    def _get_health_status(self, score: float) -> str:
        """Get health status text"""
        if score >= 0.9:
            return "Excellent"
        elif score >= 0.8:
            return "Good"
        elif score >= 0.7:
            return "Fair"
        elif score >= 0.5:
            return "Poor"
        else:
            return "Critical"

    def save_dashboard_state(self):
        """Save current dashboard state"""
        try:
            state = {
                'timestamp': datetime.now().isoformat(),
                'system_status': self.get_system_status(),
                'api_metrics': self.get_api_metrics(),
                'alerts': self.get_alerts()
            }

            with open(self.state_file, 'w') as f:
                json.dump(state, f, indent=2)

        except Exception as e:
            logger.error(f"Failed to save dashboard state: {e}")

    def run_continuous_monitoring(self, update_interval: int = 30):
        """Run continuous dashboard monitoring"""
        try:
            while True:
                self.display_dashboard()
                self.save_dashboard_state()
                time.sleep(update_interval)
        except KeyboardInterrupt:
            print("\n👋 Monitoring dashboard stopped")
        except Exception as e:
            logger.error(f"Dashboard error: {e}")


def main():
    """Main dashboard execution"""
    import argparse

    parser = argparse.ArgumentParser(description="Sheikh-Kitty Monitoring Dashboard")
    parser.add_argument('--interval', type=int, default=30, help='Update interval in seconds')
    parser.add_argument('--once', action='store_true', help='Display once and exit')

    args = parser.parse_args()

    dashboard = SimpleDashboard()

    if args.once:
        dashboard.display_dashboard()
    else:
        print("Starting Sheikh-Kitty monitoring dashboard...")
        print("Press Ctrl+C to exit")
        dashboard.run_continuous_monitoring(args.interval)


if __name__ == "__main__":
    main()
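
The dashboard reads per-request records from logs/api_requests.jsonl; get_api_metrics() above expects each line to carry at least timestamp, endpoint, execution_time, and response_data.success. A minimal producer sketch writing a compatible record (the endpoint name and timing value are illustrative, not part of the file):

import json
from datetime import datetime

# Append one request record in the shape get_api_metrics() parses.
record = {
    "timestamp": datetime.now().isoformat(),  # read back via datetime.fromisoformat
    "endpoint": "/generate",                  # illustrative endpoint
    "execution_time": 1.42,                   # seconds, illustrative
    "response_data": {"success": True},
}
with open("logs/api_requests.jsonl", "a") as f:
    f.write(json.dumps(record) + "\n")

Once records exist, python monitoring/dashboard.py --once renders a single snapshot, and --interval N refreshes every N seconds.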
monitoring/monitoring.py ADDED
@@ -0,0 +1,742 @@
"""
Sheikh-Kitty Monitoring System
Real-time metrics aggregation and system health monitoring

Features:
- API request metrics tracking
- Sandbox execution monitoring
- System resource monitoring
- Security violation alerts
- Performance analytics
- Health check endpoints

Author: MiniMax Agent
Date: 2025-11-14
"""

import json
import time
import psutil
import threading
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict
from enum import Enum
import logging
import statistics
from collections import deque, defaultdict

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class MetricType(Enum):
    """Types of metrics to track"""
    COUNTER = "counter"
    GAUGE = "gauge"
    HISTOGRAM = "histogram"
    TIMER = "timer"


class AlertSeverity(Enum):
    """Alert severity levels"""
    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
    CRITICAL = "critical"


@dataclass
class Metric:
    """Individual metric data point"""
    name: str
    value: float
    metric_type: MetricType
    timestamp: datetime
    labels: Optional[Dict[str, str]] = None
    tags: Optional[List[str]] = None


@dataclass
class Alert:
    """System alert"""
    id: str
    severity: AlertSeverity
    message: str
    timestamp: datetime
    metric_name: str
    threshold: float
    current_value: float
    resolved: bool = False
    resolved_at: Optional[datetime] = None


class MetricCollector:
    """Collect and store metrics"""

    def __init__(self, max_history: int = 10000):
        self.max_history = max_history
        self.metrics = deque(maxlen=max_history)
        self.current_values = {}  # For gauge metrics
        self.counters = defaultdict(float)  # For counter metrics
        self.lock = threading.Lock()

    def record(self, metric: Metric):
        """Record a metric"""
        with self.lock:
            self.metrics.append(metric)

            # Update current values for gauge metrics
            if metric.metric_type == MetricType.GAUGE:
                self.current_values[metric.name] = metric.value
            elif metric.metric_type == MetricType.COUNTER:
                self.counters[metric.name] += metric.value

    def get_metrics(self, name: Optional[str] = None, since: Optional[datetime] = None) -> List[Metric]:
        """Get metrics by name and time range"""
        with self.lock:
            filtered_metrics = []

            for metric in self.metrics:
                # Filter by name
                if name and metric.name != name:
                    continue

                # Filter by time
                if since and metric.timestamp < since:
                    continue

                filtered_metrics.append(metric)

            return filtered_metrics

    def get_current_value(self, name: str) -> Optional[float]:
        """Get current value for gauge metric"""
        with self.lock:
            return self.current_values.get(name)

    def get_counter(self, name: str) -> float:
        """Get counter value"""
        with self.lock:
            return self.counters.get(name, 0.0)

    def get_stats(self, name: str, window_minutes: int = 60) -> Dict[str, float]:
        """Get statistics for a metric over time window"""
        since = datetime.now() - timedelta(minutes=window_minutes)
        metrics = self.get_metrics(name, since)

        if not metrics:
            return {}

        values = [m.value for m in metrics]

        return {
            'count': len(values),
            'min': min(values),
            'max': max(values),
            'avg': statistics.mean(values),
            'median': statistics.median(values),
            'p95': self._percentile(values, 95),
            'p99': self._percentile(values, 99),
            'latest': values[-1] if values else 0.0
        }

    def _percentile(self, values: List[float], percentile: int) -> float:
        """Calculate percentile"""
        if not values:
            return 0.0

        sorted_values = sorted(values)
        index = int(len(sorted_values) * percentile / 100)
        return sorted_values[min(index, len(sorted_values) - 1)]

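# Usage sketch (illustrative metric name and value, not part of the module API):
#
#   collector = MetricCollector()
#   collector.record(Metric(
#       name="api.latency",
#       value=0.42,
#       metric_type=MetricType.TIMER,
#       timestamp=datetime.now(),
#   ))
#   collector.get_stats("api.latency", window_minutes=5)
#   # -> {'count': 1, 'min': 0.42, ..., 'p95': 0.42, 'latest': 0.42}
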
class AlertManager:
    """Manage system alerts and notifications"""

    def __init__(self, storage_path: str = "logs/alerts.jsonl"):
        self.storage_path = Path(storage_path)
        self.storage_path.parent.mkdir(parents=True, exist_ok=True)

        self.active_alerts = {}
        self.alert_history = deque(maxlen=1000)
        self.rules = []  # Alert rules
        self.lock = threading.Lock()

    def add_rule(self, name: str, metric_name: str, threshold: float,
                 comparison: str = "greater_than", severity: AlertSeverity = AlertSeverity.WARNING):
        """Add alert rule"""
        rule = {
            'name': name,
            'metric_name': metric_name,
            'threshold': threshold,
            'comparison': comparison,
            'severity': severity,
            'enabled': True
        }
        self.rules.append(rule)
        logger.info(f"Added alert rule: {name}")

    def check_alerts(self, metric_collector: MetricCollector):
        """Check metrics against alert rules"""
        for rule in self.rules:
            if not rule['enabled']:
                continue

            try:
                current_value = metric_collector.get_current_value(rule['metric_name'])
                if current_value is None:
                    continue

                triggered = self._evaluate_condition(
                    current_value, rule['threshold'], rule['comparison']
                )

                if triggered:
                    self._trigger_alert(rule, current_value)
                else:
                    self._resolve_alert(rule['name'])

            except Exception as e:
                logger.error(f"Alert check failed for {rule['name']}: {e}")

    def _evaluate_condition(self, value: float, threshold: float, comparison: str) -> bool:
        """Evaluate if condition is met"""
        if comparison == "greater_than":
            return value > threshold
        elif comparison == "less_than":
            return value < threshold
        elif comparison == "equals":
            return abs(value - threshold) < 0.001
        elif comparison == "greater_equal":
            return value >= threshold
        elif comparison == "less_equal":
            return value <= threshold
        else:
            return False

    def _trigger_alert(self, rule: Dict[str, Any], current_value: float):
        """Trigger an alert"""
        alert_id = rule['name']

        # Check if alert is already active
        if alert_id in self.active_alerts:
            return

        # Create new alert
        alert = Alert(
            id=alert_id,
            severity=rule['severity'],
            message=f"{rule['metric_name']} is {current_value:.2f} (threshold: {rule['threshold']})",
            timestamp=datetime.now(),
            metric_name=rule['metric_name'],
            threshold=rule['threshold'],
            current_value=current_value
        )

        with self.lock:
            self.active_alerts[alert_id] = alert
            self.alert_history.append(alert)
            self._save_alert(alert)

        logger.warning(f"Alert triggered: {alert.message}")

    def _resolve_alert(self, alert_id: str):
        """Resolve an active alert"""
        if alert_id not in self.active_alerts:
            return

        with self.lock:
            alert = self.active_alerts[alert_id]
            alert.resolved = True
            alert.resolved_at = datetime.now()

            # Move to history
            del self.active_alerts[alert_id]
            self._save_alert(alert)

        logger.info(f"Alert resolved: {alert_id}")

    def _save_alert(self, alert: Alert):
        """Save alert to persistent storage"""
        try:
            with open(self.storage_path, 'a') as f:
                alert_data = asdict(alert)
                # Enums and datetimes are not JSON-serializable as-is
                alert_data['severity'] = alert.severity.value
                alert_data['timestamp'] = alert.timestamp.isoformat()
                if alert.resolved_at:
                    alert_data['resolved_at'] = alert.resolved_at.isoformat()
                f.write(json.dumps(alert_data) + '\n')
        except Exception as e:
            logger.error(f"Failed to save alert: {e}")

    def get_active_alerts(self) -> List[Alert]:
        """Get currently active alerts"""
        with self.lock:
            return list(self.active_alerts.values())

    def get_alert_history(self, limit: int = 100) -> List[Alert]:
        """Get alert history"""
        with self.lock:
            return list(self.alert_history)[-limit:]

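# Usage sketch (illustrative rule; the metric name matches SystemMonitor below):
#
#   manager = AlertManager()
#   manager.add_rule(
#       name="high_cpu_usage",
#       metric_name="system.cpu.usage",
#       threshold=80.0,
#       comparison="greater_than",
#       severity=AlertSeverity.WARNING,
#   )
#   manager.check_alerts(collector)  # triggers/resolves against current gauge values
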
class SystemMonitor:
    """Monitor system resources and health"""

    def __init__(self, check_interval: int = 30):
        self.check_interval = check_interval
        self.running = False
        self.monitor_thread = None

        # System thresholds
        self.thresholds = {
            'cpu_usage': 80.0,     # %
            'memory_usage': 85.0,  # %
            'disk_usage': 90.0,    # %
            'temperature': 70.0,   # Celsius
            'load_average': 2.0    # per CPU core
        }

    def start(self, metric_collector: MetricCollector):
        """Start system monitoring"""
        if self.running:
            return

        self.running = True
        self.monitor_thread = threading.Thread(
            target=self._monitor_loop,
            args=(metric_collector,),
            daemon=True
        )
        self.monitor_thread.start()
        logger.info("System monitoring started")

    def stop(self):
        """Stop system monitoring"""
        self.running = False
        if self.monitor_thread:
            # Don't block longer than one full check cycle
            self.monitor_thread.join(timeout=self.check_interval + 5)
        logger.info("System monitoring stopped")

    def _monitor_loop(self, metric_collector: MetricCollector):
        """Main monitoring loop"""
        while self.running:
            try:
                self._collect_system_metrics(metric_collector)
                time.sleep(self.check_interval)
            except Exception as e:
                logger.error(f"System monitoring error: {e}")
                time.sleep(5)  # Brief pause on error

    def _collect_system_metrics(self, metric_collector: MetricCollector):
        """Collect system resource metrics"""
        timestamp = datetime.now()

        try:
            # CPU metrics
            cpu_percent = psutil.cpu_percent(interval=1)
            cpu_count = psutil.cpu_count()
            load_avg = psutil.getloadavg()[0] if hasattr(psutil, 'getloadavg') else 0.0

            metric_collector.record(Metric(
                name="system.cpu.usage",
                value=cpu_percent,
                metric_type=MetricType.GAUGE,
                timestamp=timestamp,
                labels={"core": "total"}
            ))

            metric_collector.record(Metric(
                name="system.cpu.count",
                value=cpu_count,
                metric_type=MetricType.GAUGE,
                timestamp=timestamp
            ))

            if load_avg > 0:
                metric_collector.record(Metric(
                    name="system.load.average",
                    value=load_avg,
                    metric_type=MetricType.GAUGE,
                    timestamp=timestamp
                ))

            # Memory metrics
            memory = psutil.virtual_memory()
            metric_collector.record(Metric(
                name="system.memory.usage",
                value=memory.percent,
                metric_type=MetricType.GAUGE,
                timestamp=timestamp
            ))

            metric_collector.record(Metric(
                name="system.memory.available",
                value=memory.available / (1024**3),  # GB
                metric_type=MetricType.GAUGE,
                timestamp=timestamp
            ))

            # Disk metrics
            disk = psutil.disk_usage('/')
            metric_collector.record(Metric(
                name="system.disk.usage",
                value=(disk.used / disk.total) * 100,
                metric_type=MetricType.GAUGE,
                timestamp=timestamp
            ))

            # Network metrics (if available)
            try:
                network = psutil.net_io_counters()
                metric_collector.record(Metric(
                    name="system.network.bytes_sent",
                    value=network.bytes_sent,
                    metric_type=MetricType.COUNTER,
                    timestamp=timestamp
                ))

                metric_collector.record(Metric(
                    name="system.network.bytes_recv",
                    value=network.bytes_recv,
                    metric_type=MetricType.COUNTER,
                    timestamp=timestamp
                ))
            except Exception:
                pass

            # Process metrics
            process_count = len(psutil.pids())
            metric_collector.record(Metric(
                name="system.processes.count",
                value=process_count,
                metric_type=MetricType.GAUGE,
                timestamp=timestamp
            ))

        except Exception as e:
            logger.error(f"Failed to collect system metrics: {e}")


class APIMonitor:
    """Monitor API performance and usage"""

    def __init__(self):
        self.request_times = deque(maxlen=1000)
        # Lifetime aggregates, bounded so per-endpoint history cannot grow without limit
        self.endpoint_stats = defaultdict(lambda: deque(maxlen=1000))
        self.error_counts = defaultdict(int)
        self.lock = threading.Lock()

    def record_request(self, endpoint: str, response_time: float, status_code: int):
        """Record API request metrics"""
        timestamp = datetime.now()

        with self.lock:
            self.request_times.append({
                'timestamp': timestamp,
                'endpoint': endpoint,
                'response_time': response_time,
                'status_code': status_code
            })

            self.endpoint_stats[endpoint].append(response_time)

            if status_code >= 400:
                self.error_counts[endpoint] += 1

    def get_api_stats(self, window_minutes: int = 60) -> Dict[str, Any]:
        """Get API statistics for the recent time window"""
        since = datetime.now() - timedelta(minutes=window_minutes)

        with self.lock:
            recent_requests = [
                req for req in self.request_times
                if req['timestamp'] >= since
            ]

            if not recent_requests:
                return {}

            response_times = [req['response_time'] for req in recent_requests]
            error_requests = [req for req in recent_requests if req['status_code'] >= 400]

            # Per-endpoint stats are computed over the same window; the
            # endpoint_stats/error_counts attributes hold lifetime aggregates
            per_endpoint: Dict[str, Dict[str, Any]] = {}
            for req in recent_requests:
                stats = per_endpoint.setdefault(req['endpoint'], {'times': [], 'errors': 0})
                stats['times'].append(req['response_time'])
                if req['status_code'] >= 400:
                    stats['errors'] += 1

            return {
                'total_requests': len(recent_requests),
                'error_requests': len(error_requests),
                'error_rate': len(error_requests) / len(recent_requests),
                'avg_response_time': statistics.mean(response_times),
                'p95_response_time': self._percentile(response_times, 95),
                'endpoints': {
                    endpoint: {
                        'count': len(stats['times']),
                        'avg_time': statistics.mean(stats['times']),
                        'errors': stats['errors']
                    }
                    for endpoint, stats in per_endpoint.items()
                }
            }

    def _percentile(self, values: List[float], percentile: int) -> float:
        """Calculate percentile"""
        if not values:
            return 0.0

        sorted_values = sorted(values)
        index = int(len(sorted_values) * percentile / 100)
        return sorted_values[min(index, len(sorted_values) - 1)]

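# Usage sketch (illustrative endpoint and timings):
#
#   api = APIMonitor()
#   api.record_request('/generate', 1.5, 200)
#   api.record_request('/generate', 0.8, 500)
#   api.get_api_stats(window_minutes=60)
#   # -> {'total_requests': 2, 'error_rate': 0.5, ...,
#   #     'endpoints': {'/generate': {'count': 2, 'avg_time': 1.15, 'errors': 1}}}
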
class MonitoringDashboard:
    """Real-time monitoring dashboard"""

    def __init__(self, data_dir: str = "logs"):
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(exist_ok=True)

        self.metric_collector = MetricCollector()
        self.alert_manager = AlertManager(str(self.data_dir / "alerts.jsonl"))
        self.system_monitor = SystemMonitor()
        self.api_monitor = APIMonitor()

        # Setup default alert rules
        self._setup_default_alerts()

        self.running = False
        self.dashboard_thread = None

    def _setup_default_alerts(self):
        """Setup default alert rules"""
        # High CPU usage
        self.alert_manager.add_rule(
            name="high_cpu_usage",
            metric_name="system.cpu.usage",
            threshold=80.0,
            comparison="greater_than",
            severity=AlertSeverity.WARNING
        )

        # High memory usage
        self.alert_manager.add_rule(
            name="high_memory_usage",
            metric_name="system.memory.usage",
            threshold=85.0,
            comparison="greater_than",
            severity=AlertSeverity.WARNING
        )

        # High disk usage
        self.alert_manager.add_rule(
            name="high_disk_usage",
            metric_name="system.disk.usage",
            threshold=90.0,
            comparison="greater_than",
            severity=AlertSeverity.CRITICAL
        )

        # High API response time
        self.alert_manager.add_rule(
            name="high_api_response_time",
            metric_name="api.response.time",
            threshold=5.0,
            comparison="greater_than",
            severity=AlertSeverity.WARNING
        )

        # High error rate
        self.alert_manager.add_rule(
            name="high_error_rate",
            metric_name="api.error.rate",
            threshold=0.1,  # 10%
            comparison="greater_than",
            severity=AlertSeverity.ERROR
        )

    def start(self):
        """Start monitoring dashboard"""
        if self.running:
            return

        self.running = True

        # Start system monitoring
        self.system_monitor.start(self.metric_collector)

        # Start dashboard update thread
        self.dashboard_thread = threading.Thread(
            target=self._dashboard_loop,
            daemon=True
        )
        self.dashboard_thread.start()

        logger.info("Monitoring dashboard started")

    def stop(self):
        """Stop monitoring dashboard"""
        self.running = False
        self.system_monitor.stop()

        if self.dashboard_thread:
            # Don't block longer than one full update cycle
            self.dashboard_thread.join(timeout=35)

        logger.info("Monitoring dashboard stopped")

    def _dashboard_loop(self):
        """Main dashboard update loop"""
        while self.running:
            try:
                # Update metrics
                self._update_api_metrics()

                # Check alerts
                self.alert_manager.check_alerts(self.metric_collector)

                # Save dashboard state
                self._save_dashboard_state()

                time.sleep(30)  # Update every 30 seconds

            except Exception as e:
                logger.error(f"Dashboard update error: {e}")
                time.sleep(10)

    def _update_api_metrics(self):
        """Update API-related metrics"""
        timestamp = datetime.now()

        # Get API stats
        api_stats = self.api_monitor.get_api_stats(window_minutes=5)

        if 'avg_response_time' in api_stats:
            self.metric_collector.record(Metric(
                name="api.response.time",
                value=api_stats['avg_response_time'],
                metric_type=MetricType.GAUGE,
                timestamp=timestamp
            ))

        if 'error_rate' in api_stats:
            self.metric_collector.record(Metric(
                name="api.error.rate",
                value=api_stats['error_rate'],
                metric_type=MetricType.GAUGE,
                timestamp=timestamp
            ))

    def _save_dashboard_state(self):
        """Save current dashboard state to file"""
        try:
            state = {
                'timestamp': datetime.now().isoformat(),
                'active_alerts': [
                    asdict(alert) for alert in self.alert_manager.get_active_alerts()
                ],
                'system_metrics': {
                    name: self.metric_collector.get_current_value(name)
                    for name in [
                        'system.cpu.usage',
                        'system.memory.usage',
                        'system.disk.usage'
                    ]
                },
                'api_stats': self.api_monitor.get_api_stats()
            }

            # Convert enum and datetime objects for JSON serialization
            for alert in state['active_alerts']:
                alert['severity'] = alert['severity'].value
                alert['timestamp'] = alert['timestamp'].isoformat()
                if alert['resolved_at']:
                    alert['resolved_at'] = alert['resolved_at'].isoformat()

            state_file = self.data_dir / "dashboard_state.json"
            with open(state_file, 'w') as f:
                json.dump(state, f, indent=2)

        except Exception as e:
            logger.error(f"Failed to save dashboard state: {e}")

    def record_api_request(self, endpoint: str, response_time: float, status_code: int):
        """Record API request for monitoring"""
        self.api_monitor.record_request(endpoint, response_time, status_code)

    def get_dashboard_data(self) -> Dict[str, Any]:
        """Get current dashboard data"""
        return {
            'active_alerts': [
                asdict(alert) for alert in self.alert_manager.get_active_alerts()
            ],
            'system_health': {
                'cpu_usage': self.metric_collector.get_current_value('system.cpu.usage'),
                'memory_usage': self.metric_collector.get_current_value('system.memory.usage'),
                'disk_usage': self.metric_collector.get_current_value('system.disk.usage'),
            },
            'api_performance': self.api_monitor.get_api_stats(),
            'recent_alerts': self.alert_manager.get_alert_history(limit=10)
        }

    def export_metrics(self, format: str = "json", hours: int = 24) -> str:
        """Export metrics in specified format"""
        since = datetime.now() - timedelta(hours=hours)

        if format.lower() == "json":
            metrics_data = {
                'export_timestamp': datetime.now().isoformat(),
                'time_range': f"last_{hours}_hours",
                'metrics': [
                    {
                        'name': metric.name,
                        'value': metric.value,
                        'timestamp': metric.timestamp.isoformat(),
                        'labels': metric.labels,
                        'type': metric.metric_type.value
                    }
                    for metric in self.metric_collector.get_metrics(since=since)
                ]
            }
            return json.dumps(metrics_data, indent=2)

        else:
            raise ValueError(f"Unsupported export format: {format}")

# Global dashboard instance
monitoring_dashboard = MonitoringDashboard()


# Utility functions
def test_monitoring_system():
    """Test the monitoring system"""
    print("Testing monitoring system...")

    dashboard = MonitoringDashboard()

    # Record some test metrics
    dashboard.record_api_request('/generate', 1.5, 200)
    dashboard.record_api_request('/generate', 2.1, 200)
    dashboard.record_api_request('/generate', 0.8, 500)

    # Get dashboard data
    data = dashboard.get_dashboard_data()
    print(f"Active alerts: {len(data['active_alerts'])}")
    print(f"API performance: {data['api_performance']}")

    # Export metrics
    exported = dashboard.export_metrics(format="json", hours=1)
    print(f"Exported metrics: {len(exported)} characters")

    print("Monitoring system test complete")


if __name__ == "__main__":
    # Create logs directory
    Path("logs").mkdir(exist_ok=True)

    # Test monitoring functionality
    test_monitoring_system()

    print("\nMonitoring system ready for integration")
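
To feed the monitor from an application, time each request and pass the result to record_api_request; the module-level monitoring_dashboard instance above can be shared across modules. A minimal integration sketch, assuming the package layout in this commit (monitoring/monitoring.py) is importable and using an illustrative handler and endpoint name:

import time
from monitoring.monitoring import monitoring_dashboard

def handle_generate(payload):
    """Stand-in for a real request handler (illustrative)."""
    return {"ok": True}

monitoring_dashboard.start()  # spawns system-metrics and dashboard threads

start = time.perf_counter()
result = handle_generate({"prompt": "hello"})
elapsed = time.perf_counter() - start
monitoring_dashboard.record_api_request('/generate', elapsed, 200)

print(monitoring_dashboard.get_dashboard_data()['api_performance'])
monitoring_dashboard.stop()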