zenith-backend / monitoring /enhanced_dashboard_with_history.py
teoat's picture
Upload folder using huggingface_hub
4ae946d verified
#!/usr/bin/env python3
"""
Enhanced Production Monitoring Dashboard with Historical Data
Real-time monitoring with historical trends and analytics
"""
import json
import os
import sys
import time
import urllib.parse
import urllib.request
from datetime import datetime
from http.server import BaseHTTPRequestHandler, HTTPServer
# Import metrics database
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from metrics_database import metrics_db
# Alert bookkeeping for frontend error reports.
# Rolling log of recorded alerts; new entries are appended at the end.
alert_history = []

# (warning, critical) percentage limits for each monitored resource.
_THRESHOLD_LIMITS = {
    "cpu": (70, 90),
    "memory": (80, 95),
    "disk": (85, 95),
}

# Expanded into flat "<resource>_<level>" keys, e.g. "cpu_warning".
alert_config = {
    "enabled": True,
    "thresholds": {
        f"{resource}_{level}": limit
        for resource, limits in _THRESHOLD_LIMITS.items()
        for level, limit in zip(("warning", "critical"), limits)
    },
}
def send_console_alert(alert):
    """Print a human-readable alert banner to stdout.

    Args:
        alert: Mapping describing the alert. ``type``, ``severity`` and
            ``message`` are required. ``value`` is printed when the key is
            present; ``context`` is printed only when present and truthy.

    Raises:
        KeyError: If ``type``, ``severity`` or ``message`` is missing.
    """
    severity_emoji = {"warning": "⚠️", "critical": "🚨"}
    # Any other severity (for example "medium") gets a generic loudspeaker.
    emoji = severity_emoji.get(alert["severity"], "📢")
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # A real newline separates consecutive banners; the escaped "\\n" used
    # previously printed the two characters backslash and "n" instead.
    print(f"\n{emoji} PRODUCTION ALERT - {timestamp}")
    print(f" Type: {alert['type'].upper()}")
    print(f" Severity: {alert['severity'].upper()}")
    print(f" Message: {alert['message']}")
    if "value" in alert:
        print(f" Value: {alert['value']}")
    if alert.get("context"):
        print(f" Context: {alert['context']}")
    print("-" * 50)
class EnhancedMonitoringHandler(BaseHTTPRequestHandler):
def do_GET(self):
if self.path == "/":
self.serve_dashboard()
elif self.path == "/api/metrics":
self.serve_metrics()
elif self.path == "/api/health":
self.serve_health()
elif self.path == "/api/environments":
self.serve_environments()
elif self.path == "/api/historical":
self.serve_historical_data()
elif self.path == "/api/summary":
self.serve_metrics_summary()
else:
self.send_404()
def serve_dashboard(self):
self.send_response(200)
self.send_header("Content-type", "text/html")
self.end_headers()
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Enhanced Production Monitoring Dashboard</title>
<style>
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%);
color: white; min-height: 100vh;
}
.container { max-width: 1400px; margin: 0 auto; padding: 20px; }
.header { text-align: center; margin-bottom: 40px; }
.header h1 { font-size: 2.5rem; margin-bottom: 10px; }
.header p { font-size: 1.1rem; opacity: 0.8; }
.section-title { font-size: 1.4rem; margin: 30px 0 20px 0; color: #ffd700; }
.metrics-grid {
display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 20px; margin-bottom: 40px;
}
.metric-card {
background: rgba(255, 255, 255, 0.1); backdrop-filter: blur(10px);
border-radius: 15px; padding: 25px; border: 1px solid rgba(255, 255, 255, 0.2);
transition: transform 0.3s ease;
}
.metric-card:hover { transform: translateY(-5px); }
.metric-title { font-size: 1.2rem; margin-bottom: 15px; color: #64b5f6; }
.metric-value { font-size: 2rem; font-weight: bold; margin-bottom: 5px; }
.metric-label { font-size: 0.9rem; opacity: 0.7; }
.env-card {
background: rgba(255, 255, 255, 0.15); backdrop-filter: blur(10px);
border-radius: 15px; padding: 20px; margin-bottom: 20px;
border: 2px solid rgba(255, 255, 255, 0.3);
}
.env-blue { border-color: #4fc3f7; }
.env-green { border-color: #66bb6a; }
.env-status { display: flex; align-items: center; margin-bottom: 10px; }
.env-indicator { width: 16px; height: 16px; border-radius: 50%; margin-right: 10px; }
.status-healthy { background: #4caf50; }
.status-unhealthy { background: #f44336; }
.status-unknown { background: #ff9800; }
.chart-container {
background: rgba(255, 255, 255, 0.1); backdrop-filter: blur(10px);
border-radius: 15px; padding: 25px; margin-bottom: 20px;
border: 1px solid rgba(255, 255, 255, 0.2);
}
.refresh-btn {
background: #64b5f6; color: white; border: none; padding: 10px 20px;
border-radius: 25px; cursor: pointer; font-size: 1rem;
transition: background 0.3s ease;
}
.refresh-btn:hover { background: #42a5f5; }
.last-updated { text-align: center; opacity: 0.7; margin-top: 20px; }
.two-column { display: grid; grid-template-columns: 1fr 1fr; gap: 20px; }
.summary-card {
background: rgba(255, 255, 255, 0.1); backdrop-filter: blur(10px);
border-radius: 15px; padding: 20px; margin-bottom: 20px;
border: 1px solid rgba(255, 255, 255, 0.2);
}
.summary-stat { font-size: 1.1rem; margin: 5px 0; }
.trend-up { color: #4caf50; }
.trend-down { color: #f44336; }
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1>πŸš€ Enhanced Production Monitoring</h1>
<p>Real-time monitoring with historical data and analytics</p>
<button class="refresh-btn" onclick="refreshData()">πŸ”„ Refresh Data</button>
</div>
<div class="section-title">πŸ’» System Resources</div>
<div class="metrics-grid">
<div class="metric-card">
<div class="metric-title">πŸ–₯️ CPU Usage</div>
<div class="metric-value" id="cpu-value">--%</div>
<div class="metric-label">Processing load</div>
</div>
<div class="metric-card">
<div class="metric-title">🧠 Memory Usage</div>
<div class="metric-value" id="memory-value">--%</div>
<div class="metric-label">Memory consumption</div>
</div>
<div class="metric-card">
<div class="metric-title">πŸ’Ύ Disk Usage</div>
<div class="metric-value" id="disk-value">--%</div>
<div class="metric-label">Storage utilization</div>
</div>
<div class="metric-card">
<div class="metric-title">⏱️ Uptime</div>
<div class="metric-value" id="uptime-value">--</div>
<div class="metric-label">System running time</div>
</div>
</div>
<div class="section-title">🌍 Deployment Environments</div>
<div class="two-column">
<div class="env-card env-blue">
<div style="font-size: 1.3rem; margin-bottom: 15px;">πŸ”΅ Blue Environment</div>
<div class="env-status">
<div id="blue-status-indicator" class="env-indicator status-unknown"></div>
<span id="blue-status-text">Checking...</span>
</div>
<div style="font-size: 1.1rem;" id="blue-response-time">Response Time: --ms</div>
<div style="font-size: 1.1rem;" id="blue-availability">24h Availability: --%</div>
</div>
<div class="env-card env-green">
<div style="font-size: 1.3rem; margin-bottom: 15px;">🟒 Green Environment</div>
<div class="env-status">
<div id="green-status-indicator" class="env-indicator status-unknown"></div>
<span id="green-status-text">Checking...</span>
</div>
<div style="font-size: 1.1rem;" id="green-response-time">Response Time: --ms</div>
<div style="font-size: 1.1rem;" id="green-availability">24h Availability: --%</div>
</div>
</div>
<div class="section-title">πŸ“Š Historical Summary (Last 24 Hours)</div>
<div class="summary-card">
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px;">
<div>
<div style="font-size: 1.2rem; margin-bottom: 10px;">πŸ“ˆ System Performance</div>
<div class="summary-stat" id="avg-cpu">Avg CPU: --%</div>
<div class="summary-stat" id="max-cpu">Peak CPU: --%</div>
<div class="summary-stat" id="avg-memory">Avg Memory: --%</div>
<div class="summary-stat" id="max-memory">Peak Memory: --%</div>
</div>
<div>
<div style="font-size: 1.2rem; margin-bottom: 10px;">🌐 Environment Health</div>
<div class="summary-stat" id="blue-availability-summary">Blue Availability: --%</div>
<div class="summary-stat" id="green-availability-summary">Green Availability: --%</div>
<div class="summary-stat" id="total-readings">Total Readings: --</div>
</div>
<div>
<div style="font-size: 1.2rem; margin-bottom: 10px;">πŸ’Ύ Database Health</div>
<div class="summary-stat" id="db-availability">DB Availability: --%</div>
<div class="summary-stat" id="db-checks">Health Checks: --</div>
<div class="summary-stat" id="data-points">Stored Data Points: --</div>
</div>
</div>
</div>
<div class="chart-container">
<div class="chart-title">πŸ“ˆ Performance Trends (Last 2 Hours)</div>
<canvas id="performance-canvas" style="width: 100%; height: 200px;"></canvas>
</div>
<div class="last-updated" id="last-updated">
Last updated: Loading...
</div>
</div>
<script>
let metricsData = [];
let historicalData = [];
async function fetchEndpoint(endpoint) {
try {
const response = await fetch(endpoint);
return await response.json();
} catch (error) {
console.error('Error fetching', endpoint, ':', error);
return null;
}
}
function updateDashboard(data) {
if (!data) return;
document.getElementById('cpu-value').textContent = data.cpu_percent?.toFixed(1) + '%' || '--%';
document.getElementById('memory-value').textContent = data.memory_percent?.toFixed(1) + '%' || '--%';
document.getElementById('disk-value').textContent = data.disk_percent?.toFixed(1) + '%' || '--%';
document.getElementById('uptime-value').textContent = data.uptime || '--';
document.getElementById('last-updated').textContent =
'Last updated: ' + new Date().toLocaleTimeString();
}
function updateEnvironments(data) {
if (!data) return;
const blueHealthy = data.blue_environment?.healthy || false;
const blueTime = data.blue_environment?.response_time || 0;
document.getElementById('blue-status-indicator').className =
'env-indicator ' + (blueHealthy ? 'status-healthy' : 'status-unhealthy');
document.getElementById('blue-status-text').textContent =
blueHealthy ? 'Healthy' : 'Unhealthy';
document.getElementById('blue-response-time').textContent =
'Response Time: ' + blueTime + 'ms';
const greenHealthy = data.green_environment?.healthy || false;
const greenTime = data.green_environment?.response_time || 0;
document.getElementById('green-status-indicator').className =
'env-indicator ' + (greenHealthy ? 'status-healthy' : 'status-unhealthy');
document.getElementById('green-status-text').textContent =
greenHealthy ? 'Healthy' : 'Unhealthy';
document.getElementById('green-response-time').textContent =
'Response Time: ' + greenTime + 'ms';
}
function updateSummary(data) {
if (!data) return;
const system = data.system_metrics || {};
document.getElementById('avg-cpu').textContent = 'Avg CPU: ' + (system.avg_cpu?.toFixed(1) || '--') + '%';
document.getElementById('max-cpu').textContent = 'Peak CPU: ' + (system.max_cpu?.toFixed(1) || '--') + '%';
document.getElementById('avg-memory').textContent = 'Avg Memory: ' + (system.avg_memory?.toFixed(1) || '--') + '%';
document.getElementById('max-memory').textContent = 'Peak Memory: ' + (system.max_memory?.toFixed(1) || '--') + '%';
const envAvailability = data.environment_availability || [];
const blueEnv = envAvailability.find(e => e.environment === 'blue') || {};
const greenEnv = envAvailability.find(e => e.environment === 'green') || {};
document.getElementById('blue-availability').textContent = '24h Availability: ' + (blueEnv.availability_percent || '--') + '%';
document.getElementById('green-availability').textContent = '24h Availability: ' + (greenEnv.availability_percent || '--') + '%';
document.getElementById('blue-availability-summary').textContent = 'Blue Availability: ' + (blueEnv.availability_percent || '--') + '%';
document.getElementById('green-availability-summary').textContent = 'Green Availability: ' + (greenEnv.availability_percent || '--') + '%';
document.getElementById('total-readings').textContent = 'Total Readings: ' + (blueEnv.total_checks || 0) + ' (Blue) + ' + (greenEnv.total_checks || 0) + ' (Green)';
const db = data.database_health || {};
document.getElementById('db-availability').textContent = 'DB Availability: ' + (db.db_availability_percent || '--') + '%';
document.getElementById('db-checks').textContent = 'Health Checks: ' + (db.total_checks || '--');
document.getElementById('data-points').textContent = 'Stored Data Points: ' + ((system.total_readings || 0) + (db.total_checks || 0) + (blueEnv.total_checks || 0) + (greenEnv.total_checks || 0));
}
function drawChart() {
const canvas = document.getElementById('performance-canvas');
const ctx = canvas.getContext('2d');
canvas.width = canvas.offsetWidth;
canvas.height = canvas.offsetHeight;
if (historicalData.length < 2) return;
const width = canvas.width;
const height = canvas.height;
const padding = 20;
ctx.clearRect(0, 0, width, height);
ctx.strokeStyle = 'rgba(255, 255, 255, 0.1)';
ctx.lineWidth = 1;
for (let i = 0; i <= 4; i++) {
const y = padding + (height - 2 * padding) * i / 4;
ctx.beginPath();
ctx.moveTo(padding, y);
ctx.lineTo(width - padding, y);
ctx.stroke();
}
// Draw CPU line
ctx.strokeStyle = '#ff6b6b';
ctx.lineWidth = 2;
ctx.beginPath();
for (let i = 0; i < historicalData.length; i++) {
const x = padding + (width - 2 * padding) * i / (historicalData.length - 1);
const y = height - padding - (height - 2 * padding) * (historicalData[i].cpu_percent || 0) / 100;
if (i === 0) ctx.moveTo(x, y);
else ctx.lineTo(x, y);
}
ctx.stroke();
// Draw Memory line
ctx.strokeStyle = '#4ecdc4';
ctx.lineWidth = 2;
ctx.beginPath();
for (let i = 0; i < historicalData.length; i++) {
const x = padding + (width - 2 * padding) * i / (historicalData.length - 1);
const y = height - padding - (height - 2 * padding) * (historicalData[i].memory_percent || 0) / 100;
if (i === 0) ctx.moveTo(x, y);
else ctx.lineTo(x, y);
}
ctx.stroke();
}
async function refreshData() {
const [metrics, environments, historical, summary] = await Promise.all([
fetchEndpoint('/api/metrics'),
fetchEndpoint('/api/environments'),
fetchEndpoint('/api/historical'),
fetchEndpoint('/api/summary')
]);
updateDashboard(metrics);
updateEnvironments(environments);
updateSummary(summary);
if (historical && historical.system_metrics) {
historicalData = historical.system_metrics.slice(0, 50); // Last 50 readings
drawChart();
}
}
setInterval(refreshData, 10000); // Refresh every 10 seconds
refreshData();
</script>
</body>
</html>
"""
self.wfile.write(html.encode())
def serve_metrics(self):
self.send_response(200)
self.send_header("Content-type", "application/json")
self.end_headers()
current_metrics = collect_system_metrics()
response = json.dumps(current_metrics, indent=2)
self.wfile.write(response.encode())
def serve_health(self):
self.send_response(200)
self.send_header("Content-type", "application/json")
self.end_headers()
health_data = {
"status": "healthy",
"timestamp": datetime.now().isoformat(),
"version": "2.0.0",
"uptime": time.time() - start_time,
}
response = json.dumps(health_data, indent=2)
self.wfile.write(response.encode())
def serve_environments(self):
self.send_response(200)
self.send_header("Content-type", "application/json")
self.end_headers()
env_data = collect_environment_metrics()
response = json.dumps(env_data, indent=2)
self.wfile.write(response.encode())
def serve_historical_data(self):
self.send_response(200)
self.send_header("Content-type", "application/json")
self.end_headers()
historical_data = {
"system_metrics": metrics_db.get_system_metrics(hours=2), # Last 2 hours
"blue_environment": metrics_db.get_environment_metrics("blue", hours=2),
"green_environment": metrics_db.get_environment_metrics("green", hours=2),
"database_metrics": metrics_db.get_database_metrics(hours=2),
}
response = json.dumps(historical_data, indent=2)
self.wfile.write(response.encode())
def serve_metrics_summary(self):
self.send_response(200)
self.send_header("Content-type", "application/json")
self.end_headers()
summary = metrics_db.get_metrics_summary(hours=24) # Last 24 hours
response = json.dumps(summary, indent=2)
self.wfile.write(response.encode())
def send_404(self):
self.send_response(404)
self.send_header("Content-type", "text/html")
self.end_headers()
self.wfile.write(b"404 Not Found")
def log_message(self, format, *args):
pass
def _format_uptime(elapsed_seconds):
    """Render a duration in seconds as HH:MM:SS without wrapping at 24 hours."""
    total = max(0, int(elapsed_seconds))
    hours, remainder = divmod(total, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


def collect_system_metrics():
    """Collect a snapshot of host resource usage.

    Uses ``psutil`` when it is installed. When it is not, the numeric
    readings are random placeholder values, not measurements.

    Returns:
        dict with the keys ``cpu_percent``, ``memory_percent``,
        ``memory_available_gb``, ``disk_percent``, ``disk_free_gb``,
        ``uptime`` (HH:MM:SS since the module-level ``start_time``; hours
        keep counting past 23), ``timestamp`` (ISO 8601, local time) and
        ``process_count``.
    """
    try:
        import psutil
    except ImportError:
        psutil = None

    if psutil is not None:
        # interval=1 blocks for one second while CPU usage is sampled.
        cpu_percent = psutil.cpu_percent(interval=1)
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage("/")
        return {
            "cpu_percent": cpu_percent,
            "memory_percent": memory.percent,
            "memory_available_gb": memory.available / (1024**3),
            "disk_percent": disk.percent,
            "disk_free_gb": disk.free / (1024**3),
            "uptime": _format_uptime(time.time() - start_time),
            "timestamp": datetime.now().isoformat(),
            "process_count": len(psutil.pids()),
        }

    # psutil is unavailable: fall back to random placeholder readings.
    import random

    cpu = random.uniform(20, 80)
    memory = random.uniform(30, 70)
    disk = random.uniform(40, 60)
    return {
        "cpu_percent": cpu,
        "memory_percent": memory,
        "memory_available_gb": random.uniform(2, 8),
        "disk_percent": disk,
        "disk_free_gb": random.uniform(10, 50),
        "uptime": _format_uptime(time.time() - start_time),
        "timestamp": datetime.now().isoformat(),
        "process_count": random.randint(100, 300),
    }
def collect_environment_metrics():
    """Probe both deployment environments and return their health records.

    Returns:
        dict with ``blue_environment`` (probe of localhost:5002),
        ``green_environment`` (probe of localhost:5003) and ``timestamp``
        (ISO 8601 time taken after both probes have finished).
    """
    # Dict displays evaluate their values in order: blue, green, timestamp.
    return {
        "blue_environment": check_environment_health("localhost", 5002),
        "green_environment": check_environment_health("localhost", 5003),
        "timestamp": datetime.now().isoformat(),
    }
def check_environment_health(host, port, timeout=5):
    """Probe ``http://{host}:{port}/api/health`` and describe the outcome.

    Args:
        host: Host name or address of the environment.
        port: TCP port of the environment.
        timeout: Seconds to wait for the probe before giving up.

    Returns:
        dict with the keys ``healthy`` (bool), ``response_time``
        (milliseconds, int), ``status`` and ``endpoint``, plus
        ``last_check`` (ISO 8601, local time). ``status`` is:

        * ``"healthy"``     the endpoint answered with HTTP 200;
        * ``"unhealthy"``   the endpoint answered with any other status;
        * ``"unreachable"`` no HTTP response was obtained, in which case
          ``response_time`` is reported as the full timeout.
    """
    # Local import: the file only imports urllib.request and urllib.parse.
    import urllib.error

    endpoint = f"http://{host}:{port}"
    began = time.perf_counter()  # monotonic, unaffected by clock adjustments
    try:
        request = urllib.request.Request(f"{endpoint}/api/health")
        with urllib.request.urlopen(request, timeout=timeout) as response:
            status_code = response.status
    except urllib.error.HTTPError as http_err:
        # urlopen raises for 4xx/5xx responses. The server *was* reached, so
        # report it as unhealthy with the real latency, not as unreachable.
        status_code = http_err.code
        http_err.close()
    except Exception:
        # Best-effort probe: any other failure (refused connection, timeout,
        # DNS error, malformed response) means no usable answer was obtained.
        return {
            "healthy": False,
            "response_time": int(timeout * 1000),
            "status": "unreachable",
            "endpoint": endpoint,
            "last_check": datetime.now().isoformat(),
        }

    elapsed_ms = int((time.perf_counter() - began) * 1000)
    is_healthy = status_code == 200
    return {
        "healthy": is_healthy,
        "response_time": elapsed_ms,
        "status": "healthy" if is_healthy else "unhealthy",
        "endpoint": endpoint,
        "last_check": datetime.now().isoformat(),
    }
def do_post(self):
    """Route a POST request received by an HTTP request handler.

    Args:
        self: The request handler instance. It must expose ``path`` and
            ``send_404()``.

    ``/api/errors`` is passed to ``handle_error_report``; every other path
    is answered with 404.

    NOTE(review): ``BaseHTTPRequestHandler`` dispatches POST requests to a
    method named ``do_POST``. This is a module-level function with a
    different name, so it only runs when called explicitly with a handler.
    """
    print(f"POST request received: {self.path}")  # Debug logging
    if self.path == "/api/errors":
        # handle_error_report is a module-level function, not a handler
        # method, so the handler is passed in rather than looked up on self.
        handle_error_report(self)
    else:
        self.send_404()
# Upper bound on an accepted error-report body, so that a bogus
# Content-Length cannot make the server read an arbitrarily large payload.
_MAX_ERROR_REPORT_BYTES = 64 * 1024


def _reply_json(handler, status, body):
    """Send ``body`` (bytes) as a complete JSON response with ``status``."""
    handler.send_response(status)
    handler.send_header("Content-type", "application/json")
    handler.end_headers()
    handler.wfile.write(body)


def handle_error_report(self):
    """Record a frontend error report posted as JSON.

    Args:
        self: The request handler instance; its ``headers``, ``rfile`` and
            response methods are used.

    The body must be a JSON object containing ``error.message``;
    ``severity`` (default ``"medium"``) and ``context`` (default ``{}``)
    are optional. The report is appended to ``alert_history`` (trimmed to
    the 100 most recent entries) and echoed to the console.

    Responses:
        200 ``{"status": "recorded"}``             report stored
        400 ``{"error": "Malformed error report"}`` missing, oversized or
                                                   invalid body
        500 ``{"error": "Failed to record error"}`` unexpected failure
    """
    try:
        body_size = int(self.headers.get("Content-Length") or 0)
        if body_size <= 0:
            raise ValueError("missing or empty request body")
        if body_size > _MAX_ERROR_REPORT_BYTES:
            raise ValueError("request body too large")
        error_data = json.loads(self.rfile.read(body_size).decode("utf-8"))
        error_message = error_data["error"]["message"]
    except (ValueError, KeyError, TypeError) as exc:
        # ValueError also covers JSONDecodeError and UnicodeDecodeError.
        # KeyError and TypeError cover a payload of the wrong shape.
        print(f"Rejected malformed frontend error report: {exc!r}")
        _reply_json(self, 400, b'{"error": "Malformed error report"}')
        return

    frontend_error = {
        "type": "frontend_error",
        "severity": error_data.get("severity", "medium"),
        "message": f"Frontend Error: {error_message}",
        "value": error_message,
        "context": error_data.get("context", {}),
        "timestamp": datetime.now().isoformat(),
        "source": "frontend",
    }

    try:
        alert_history.append(frontend_error)
        # Trim in place so the list object stays the same and no ``global``
        # rebinding is needed.
        del alert_history[:-100]
        send_console_alert(frontend_error)
    except Exception as exc:  # boundary: report the failure instead of crashing
        print(f"Error handling frontend error report: {exc}")
        _reply_json(self, 500, b'{"error": "Failed to record error"}')
        return

    _reply_json(self, 200, b'{"status": "recorded"}')
def run_server(host="", port=8080):
    """Run the enhanced monitoring server until interrupted.

    Args:
        host: Interface to bind; the default ``""`` binds all interfaces.
        port: TCP port to listen on.

    Blocks in ``serve_forever()``. Ctrl+C stops the server cleanly, and the
    listening socket is closed on every exit path.
    """
    # The uptime endpoints read the module-level ``start_time``. Define it
    # here when it is missing so the server also works when this module is
    # imported and run_server() is called directly.
    globals().setdefault("start_time", time.time())

    httpd = HTTPServer((host, port), EnhancedMonitoringHandler)
    print("🚀 Enhanced Production Monitoring Dashboard started!")
    print(f"📊 Access URL: http://localhost:{port}")
    print("📈 Historical Data: Enabled")
    print("💾 Time-Series Database: Active")
    print("🔄 Auto-refresh: Every 10 seconds")
    print("⏹️ To stop: Press Ctrl+C")
    print("")
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        # A real newline; the escaped "\\n" used previously was printed
        # literally.
        print("\n🛑 Enhanced Dashboard stopped by user")
    finally:
        httpd.server_close()


if __name__ == "__main__":
    # Reference point for the uptime figures reported by the API.
    start_time = time.time()
    run_server()