# sheikh-kitty/monitoring/dashboard.py
# Repository page header (kept for provenance): "likhonsheikh's picture",
# "Upload folder using huggingface_hub", commit 0efaf6e (verified).
"""
Sheikh-Kitty Monitoring Dashboard
Real-time system monitoring and visualization
Features:
- System resource monitoring (CPU, memory, disk)
- API performance metrics
- Security alerts display
- Execution history tracking
- Health status indicators
Author: MiniMax Agent
Date: 2025-11-14
"""
import json
import time
import psutil
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Any, Optional
from dataclasses import asdict
import logging
# Configure logging.
# NOTE: basicConfig() is called at module level, so it runs as soon as this
# module is imported (not only when it is executed as a script) and requests
# INFO-level logging for the process.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by SimpleDashboard to report collection failures.
logger = logging.getLogger(__name__)
class SimpleDashboard:
    """Simple terminal-based dashboard for monitoring.

    Collects three kinds of data and renders them as text:

    * host resources (CPU, memory, disk) via ``psutil``;
    * API request metrics read from ``<log_dir>/api_requests.jsonl``;
    * alerts read from ``<log_dir>/alerts.jsonl``.

    Both log files are expected to hold one JSON object per line with an
    ISO-8601 ``timestamp`` field. Lines or records that cannot be used are
    skipped individually instead of invalidating the whole result.
    """

    def __init__(self, log_dir: str = "logs"):
        """Create the dashboard and make sure the log directory exists.

        Args:
            log_dir: Directory holding the JSONL logs; the state snapshot
                (``dashboard_state.json``) is written there as well.
        """
        self.log_dir = Path(log_dir)
        # parents=True so that a nested path such as "var/logs" also works.
        self.log_dir.mkdir(parents=True, exist_ok=True)
        self.state_file = self.log_dir / "dashboard_state.json"

        # System thresholds (percent of the resource in use).
        self.thresholds = {
            'cpu_warning': 70.0,
            'cpu_critical': 90.0,
            'memory_warning': 75.0,
            'memory_critical': 90.0,
            'disk_warning': 80.0,
            'disk_critical': 95.0
        }

    # ------------------------------------------------------------------
    # Data collection
    # ------------------------------------------------------------------

    def get_system_status(self) -> Dict[str, Any]:
        """Get current system status.

        Returns:
            A dict with ``timestamp``, ``cpu``, ``memory`` and ``disk``
            entries, or an empty dict if collection failed (the failure is
            logged).

        Note:
            ``psutil.cpu_percent(interval=1)`` blocks for one second.
        """
        try:
            cpu_percent = psutil.cpu_percent(interval=1)
            cpu_count = psutil.cpu_count()
            memory = psutil.virtual_memory()
            disk = psutil.disk_usage('/')

            # Compute once; guard against a zero-sized filesystem.
            disk_percent = (disk.used / disk.total) * 100 if disk.total else 0.0

            # Load average is not available on every platform/psutil build;
            # fall back to 0.0 rather than losing the whole status.
            try:
                load_avg = psutil.getloadavg()[0]
            except (AttributeError, OSError):
                load_avg = 0.0

            return {
                'timestamp': datetime.now().isoformat(),
                'cpu': {
                    'usage_percent': cpu_percent,
                    'count': cpu_count,
                    'load_average': load_avg,
                    'status': self._get_status_level(cpu_percent, 'cpu')
                },
                'memory': {
                    'usage_percent': memory.percent,
                    'available_gb': memory.available / (1024**3),
                    'total_gb': memory.total / (1024**3),
                    'status': self._get_status_level(memory.percent, 'memory')
                },
                'disk': {
                    'usage_percent': disk_percent,
                    'free_gb': disk.free / (1024**3),
                    'total_gb': disk.total / (1024**3),
                    'status': self._get_status_level(disk_percent, 'disk')
                }
            }
        except Exception as e:
            logger.error(f"Failed to get system status: {e}")
            return {}

    def _get_status_level(self, value: float, resource_type: str) -> str:
        """Determine status level based on thresholds.

        Looks up ``<resource_type>_critical`` and ``<resource_type>_warning``
        in ``self.thresholds``. A resource type without thresholds is
        reported as ``'healthy'``.

        Returns:
            ``'critical'``, ``'warning'`` or ``'healthy'``.
        """
        critical = self.thresholds.get(f'{resource_type}_critical')
        if critical is not None and value >= critical:
            return 'critical'
        warning = self.thresholds.get(f'{resource_type}_warning')
        if warning is not None and value >= warning:
            return 'warning'
        return 'healthy'

    @staticmethod
    def _read_jsonl(path: Path) -> List[Dict[str, Any]]:
        """Read a JSON-lines file, keeping only lines that decode to objects."""
        records = []
        # JSON text is assumed to be UTF-8; errors='replace' keeps one badly
        # encoded line from making the whole file unreadable.
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                try:
                    record = json.loads(line.strip())
                except json.JSONDecodeError:
                    continue
                if isinstance(record, dict):
                    records.append(record)
        return records

    @staticmethod
    def _parse_timestamp(raw: Any) -> Optional[datetime]:
        """Parse an ISO-8601 string into a naive local-time ``datetime``.

        Timezone-aware values are converted to local time and stripped of
        their tzinfo so they can be compared with ``datetime.now()``; naive
        values are taken as local time already. Returns ``None`` when the
        value is missing or not parseable.
        """
        if not isinstance(raw, str):
            return None
        # Older Python versions do not accept a trailing "Z" in fromisoformat().
        if raw.endswith(('Z', 'z')):
            raw = raw[:-1] + '+00:00'
        try:
            parsed = datetime.fromisoformat(raw)
        except ValueError:
            return None
        if parsed.tzinfo is not None:
            parsed = parsed.astimezone().replace(tzinfo=None)
        return parsed

    def _recent_records(self, records: List[Dict[str, Any]],
                        max_age: timedelta) -> List[Dict[str, Any]]:
        """Return the records whose ``timestamp`` is newer than now - max_age."""
        cutoff = datetime.now() - max_age
        recent = []
        for record in records:
            stamp = self._parse_timestamp(record.get('timestamp'))
            if stamp is not None and stamp > cutoff:
                recent.append(record)
        return recent

    def get_api_metrics(self) -> Dict[str, Any]:
        """Get API metrics from log files.

        Aggregates the requests of the last hour found in
        ``api_requests.jsonl``. Records lacking ``execution_time``,
        ``response_data.success`` or ``endpoint`` are skipped.

        Returns:
            A dict with ``total_requests``, ``successful_requests``,
            ``success_rate`` (0..1), ``average_execution_time``,
            ``p95_execution_time`` and ``endpoints`` (per endpoint: request
            ``count`` and ``success`` rate in 0..1). Empty dict when the log
            is missing, holds no usable recent requests, or reading failed.
        """
        try:
            api_log = self.log_dir / "api_requests.jsonl"
            if not api_log.exists():
                return {}

            recent_requests = self._recent_records(
                self._read_jsonl(api_log), timedelta(hours=1)
            )

            execution_times = []
            successes = []
            per_endpoint: Dict[Any, Dict[str, int]] = {}
            for req in recent_requests:
                try:
                    elapsed = float(req['execution_time'])
                    succeeded = bool(req['response_data']['success'])
                    # Last step of the try: nothing is recorded for a record
                    # unless every field was usable.
                    stats = per_endpoint.setdefault(
                        req['endpoint'], {'count': 0, 'successes': 0}
                    )
                except (KeyError, TypeError, ValueError):
                    continue
                execution_times.append(elapsed)
                successes.append(succeeded)
                stats['count'] += 1
                stats['successes'] += succeeded

            if not execution_times:
                return {}

            total = len(execution_times)
            successful = sum(successes)
            ordered_times = sorted(execution_times)

            return {
                'total_requests': total,
                'successful_requests': successful,
                'success_rate': successful / total,
                'average_execution_time': sum(execution_times) / total,
                # int(total * 0.95) is always < total, so the index is valid.
                'p95_execution_time': ordered_times[int(total * 0.95)],
                'endpoints': {
                    endpoint: {
                        'count': stats['count'],
                        'success': stats['successes'] / stats['count']
                    }
                    for endpoint, stats in per_endpoint.items()
                }
            }
        except Exception as e:
            logger.error(f"Failed to get API metrics: {e}")
            return {}

    def get_alerts(self) -> List[Dict[str, Any]]:
        """Get recent alerts.

        Returns:
            Up to 10 alerts from the last 24 hours, newest first. Alerts
            without a parseable ``timestamp`` are skipped. Empty list when
            the alerts file is missing or reading failed.
        """
        try:
            alerts_file = self.log_dir / "alerts.jsonl"
            if not alerts_file.exists():
                return []

            recent_alerts = self._recent_records(
                self._read_jsonl(alerts_file), timedelta(days=1)
            )
            # Sort on the parsed time rather than the raw string so that
            # timestamps written with different UTC offsets order correctly.
            recent_alerts.sort(
                key=lambda alert: self._parse_timestamp(alert['timestamp']),
                reverse=True
            )
            return recent_alerts[:10]
        except Exception as e:
            logger.error(f"Failed to get alerts: {e}")
            return []

    def collect_state(self) -> Dict[str, Any]:
        """Collect system status, API metrics and alerts in one snapshot."""
        return {
            'timestamp': datetime.now().isoformat(),
            'system_status': self.get_system_status(),
            'api_metrics': self.get_api_metrics(),
            'alerts': self.get_alerts()
        }

    # ------------------------------------------------------------------
    # Rendering
    # ------------------------------------------------------------------

    def display_dashboard(self, state: Optional[Dict[str, Any]] = None):
        """Display dashboard in terminal.

        Args:
            state: Snapshot as returned by :meth:`collect_state`. When
                omitted, a fresh snapshot is collected.
        """
        if state is None:
            state = self.collect_state()
        system_status = state.get('system_status') or {}
        api_metrics = state.get('api_metrics') or {}
        alerts = state.get('alerts') or []

        # Clear screen (ANSI escape code)
        print("\033[2J\033[H")
        print("=" * 60)
        print("🏗️ SHEIKH-KITTY MONITORING DASHBOARD")
        print("=" * 60)
        print(f"📅 {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print()

        # System Status
        if system_status:
            print("🖥️ SYSTEM STATUS")
            print("-" * 20)

            cpu = system_status['cpu']
            status_icon = self._get_status_icon(cpu['status'])
            print(f"{status_icon} CPU: {cpu['usage_percent']:6.1f}% (Cores: {cpu['count']}, Load: {cpu['load_average']:.2f})")

            memory = system_status['memory']
            status_icon = self._get_status_icon(memory['status'])
            print(f"{status_icon} Memory: {memory['usage_percent']:6.1f}% (Available: {memory['available_gb']:.1f}GB)")

            disk = system_status['disk']
            status_icon = self._get_status_icon(disk['status'])
            print(f"{status_icon} Disk: {disk['usage_percent']:6.1f}% (Free: {disk['free_gb']:.1f}GB)")
            print()

        # API Metrics
        if api_metrics:
            print("🌐 API METRICS (Last Hour)")
            print("-" * 25)
            print(f"📊 Requests: {api_metrics['total_requests']}")
            print(f"✅ Success: {api_metrics['successful_requests']} ({api_metrics['success_rate']:.1%})")
            print(f"⏱️ Avg Time: {api_metrics['average_execution_time']:.3f}s")
            print(f"🚀 P95 Time: {api_metrics['p95_execution_time']:.3f}s")

            # Endpoint breakdown
            endpoints = api_metrics.get('endpoints')
            if endpoints:
                print("🔗 Endpoints:")
                for endpoint, stats in endpoints.items():
                    print(f" {endpoint}: {stats['count']} requests, {stats['success']:.1%} success")
            print()

        # Recent Alerts
        if alerts:
            print("🚨 RECENT ALERTS")
            print("-" * 15)
            for alert in alerts[:5]:  # Show last 5 alerts
                # Alerts come from an external log; tolerate missing fields
                # instead of aborting the whole render.
                severity = str(alert.get('severity', 'unknown'))
                severity_icon = self._get_alert_icon(severity)
                print(f"{severity_icon} {severity.upper()}: {alert.get('message', '')}")
                print(f" 📅 {alert.get('timestamp', '')}")
            print()

        # Health Summary
        print("💚 SYSTEM HEALTH")
        print("-" * 15)
        health_score = self._calculate_health_score(system_status, api_metrics, alerts)
        health_status = self._get_health_status(health_score)
        print(f"Overall: {health_status} ({health_score:.1%})")
        print()
        print("Press Ctrl+C to exit")

    def _get_status_icon(self, status: str) -> str:
        """Get icon for status (white circle for an unknown status)."""
        icons = {
            'healthy': '🟢',
            'warning': '🟡',
            'critical': '🔴'
        }
        return icons.get(status, '⚪')

    def _get_alert_icon(self, severity: str) -> str:
        """Get icon for alert severity (loudspeaker for an unknown severity)."""
        icons = {
            'info': 'ℹ️',
            'warning': '⚠️',
            'error': '❌',
            'critical': '🚨'
        }
        return icons.get(severity, '📢')

    def _calculate_health_score(self, system_status: Dict, api_metrics: Dict, alerts: List) -> float:
        """Calculate overall health score.

        Starts at 1.0 and deducts:

        * 0.1 / 0.2 for each of cpu, memory, disk in warning / critical;
        * the shortfall of the API success rate below 0.95;
        * 0.1 per critical alert, capped at 0.3.

        Empty ``system_status`` / ``api_metrics`` deduct nothing.

        Returns:
            Score clamped to a minimum of 0.0.
        """
        score = 1.0
        status_penalty = {'warning': 0.1, 'critical': 0.2}

        # Deduct for system resource issues
        if system_status:
            for resource in ('cpu', 'memory', 'disk'):
                status = (system_status.get(resource) or {}).get('status')
                score -= status_penalty.get(status, 0.0)

        # Deduct for API issues
        if api_metrics:
            success_rate = api_metrics.get('success_rate', 1.0)
            if success_rate < 0.95:
                score -= (0.95 - success_rate)

        # Deduct for recent alerts (alerts without a severity are ignored)
        recent_critical_alerts = sum(
            1 for alert in alerts if alert.get('severity') == 'critical'
        )
        if recent_critical_alerts > 0:
            score -= min(0.3, recent_critical_alerts * 0.1)

        return max(0.0, score)

    def _get_health_status(self, score: float) -> str:
        """Get health status text for a score in 0..1."""
        if score >= 0.9:
            return "Excellent"
        elif score >= 0.8:
            return "Good"
        elif score >= 0.7:
            return "Fair"
        elif score >= 0.5:
            return "Poor"
        else:
            return "Critical"

    # ------------------------------------------------------------------
    # Persistence and main loop
    # ------------------------------------------------------------------

    def save_dashboard_state(self, state: Optional[Dict[str, Any]] = None):
        """Save current dashboard state to ``self.state_file`` as JSON.

        Args:
            state: Snapshot as returned by :meth:`collect_state`. When
                omitted, a fresh snapshot is collected.

        Failures are logged, not raised.
        """
        try:
            if state is None:
                state = self.collect_state()
            with open(self.state_file, 'w', encoding='utf-8') as f:
                json.dump(state, f, indent=2)
        except Exception as e:
            logger.error(f"Failed to save dashboard state: {e}")

    def run_continuous_monitoring(self, update_interval: int = 30):
        """Run continuous dashboard monitoring until interrupted.

        Args:
            update_interval: Seconds to sleep between refreshes; negative
                values are treated as 0.
        """
        try:
            while True:
                # Collect once per cycle so that the saved snapshot is exactly
                # what was displayed and the blocking CPU sample runs once.
                state = self.collect_state()
                self.display_dashboard(state)
                self.save_dashboard_state(state)
                time.sleep(max(0, update_interval))
        except KeyboardInterrupt:
            print("\n👋 Monitoring dashboard stopped")
        except Exception as e:
            logger.error(f"Dashboard error: {e}")
def main(argv: Optional[List[str]] = None):
    """Main dashboard execution.

    Parses the command line and either renders the dashboard once
    (``--once``) or runs the continuous monitoring loop.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``; ``None``
            (the default) keeps the normal command-line behaviour.
    """
    import argparse

    def positive_int(raw: str) -> int:
        """argparse type: accept only integers greater than zero."""
        try:
            value = int(raw)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid int value: {raw!r}")
        if value <= 0:
            raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
        return value

    parser = argparse.ArgumentParser(description="Sheikh-Kitty Monitoring Dashboard")
    # Reject zero/negative intervals up front: a negative value would make
    # time.sleep() raise inside the monitoring loop.
    parser.add_argument('--interval', type=positive_int, default=30, help='Update interval in seconds')
    parser.add_argument('--log-dir', default='logs', help='Directory containing the JSONL logs (default: logs)')
    parser.add_argument('--once', action='store_true', help='Display once and exit')
    args = parser.parse_args(argv)

    dashboard = SimpleDashboard(log_dir=args.log_dir)
    if args.once:
        dashboard.display_dashboard()
    else:
        print("Starting Sheikh-Kitty monitoring dashboard...")
        print("Press Ctrl+C to exit")
        dashboard.run_continuous_monitoring(args.interval)


if __name__ == "__main__":
    main()