# Spaces: teoat/zenith-backend (Paused) | File size: 18,547 Bytes | 4ae946d

"""
Simple Diagnostics Service
Lightweight diagnostics for system health, performance, and security
"""

import asyncio
import logging
import os
import time
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional

import psutil

# Module-level logger named after this module; used for error and alert logging below.
logger = logging.getLogger(__name__)


class SimpleDiagnostics:
    """In-process diagnostics collector for health, performance and security.

    Every ``get_*`` coroutine returns a plain dict and also appends a sample to
    ``metrics_history``, which is capped at ``max_history`` entries. Alerts are
    kept in ``alerts`` (capped at ``MAX_ALERTS``). Timestamps are naive UTC
    ``datetime`` objects produced by ``datetime.utcnow()``.

    The security and ML sections report static placeholder values; only the
    system-health and performance sections query the host through ``psutil``.
    """

    # Upper bound on retained alerts; the oldest entries are dropped first.
    MAX_ALERTS = 200

    def __init__(self):
        """Create empty metric/alert stores; no system calls are made here."""
        self.monitoring = False
        self.start_time = datetime.utcnow()
        self.metrics_history = []
        self.alerts = []
        self.max_history = 100
        # Monotonic counter so alert ids stay unique after old alerts are trimmed.
        self._next_alert_id = 1
        # psutil.Process handle, created lazily and reused (see _get_process).
        self._process = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _get_process(self):
        """Return a cached ``psutil.Process`` for the current process.

        ``Process.cpu_percent()`` measures CPU usage since the previous call on
        the *same* object, so the handle must be reused between samples. The
        pid check rebuilds the handle if this object was inherited by a forked
        child.
        """
        pid = os.getpid()
        if self._process is None or self._process.pid != pid:
            self._process = psutil.Process(pid)
        return self._process

    def _record_metric(self, metric_type: str, data: Dict[str, Any]) -> None:
        """Append one sample to ``metrics_history`` and enforce ``max_history``."""
        self.metrics_history.append(
            {"timestamp": datetime.utcnow(), "type": metric_type, "data": data}
        )
        excess = len(self.metrics_history) - self.max_history
        if excess > 0:
            # Drop the oldest samples in place.
            del self.metrics_history[:excess]

    @staticmethod
    def _usage_status(percent: float) -> str:
        """Classify a usage percentage: <80 healthy, <95 warning, else critical."""
        if percent < 80:
            return "healthy"
        if percent < 95:
            return "warning"
        return "critical"

    @staticmethod
    def _error_result(exc: Exception) -> Dict[str, Any]:
        """Build the error payload returned when a diagnostics call fails."""
        return {"timestamp": datetime.utcnow(), "error": str(exc), "status": "error"}

    @staticmethod
    def _component_payload(result: Any) -> Any:
        """Return ``result`` unchanged, or an error dict if it is an exception."""
        if isinstance(result, Exception):
            return {"error": str(result)}
        return result

    # ------------------------------------------------------------------
    # Diagnostics
    # ------------------------------------------------------------------

    async def get_system_health(self) -> Dict[str, Any]:
        """Collect CPU, memory, disk, network and process metrics.

        Returns:
            A dict with ``timestamp``, ``cpu``, ``memory``, ``disk``,
            ``network`` and ``process`` sections, or an error dict
            (``timestamp``/``error``/``status``) if collection fails.
        """
        try:
            # cpu_percent(interval=1) sleeps for a full second; run it in the
            # default executor so the event loop keeps serving other tasks.
            loop = asyncio.get_running_loop()
            cpu_percent = await loop.run_in_executor(
                None, lambda: psutil.cpu_percent(interval=1)
            )

            memory = psutil.virtual_memory()
            disk = psutil.disk_usage("/")
            network = psutil.net_io_counters()
            process = self._get_process()
            process_memory = process.memory_info()

            gib = 1024**3
            mib = 1024**2

            health_status = {
                "timestamp": datetime.utcnow(),
                "cpu": {
                    "usage_percent": cpu_percent,
                    "core_count": psutil.cpu_count(),
                    "status": self._usage_status(cpu_percent),
                },
                "memory": {
                    "total_gb": round(memory.total / gib, 2),
                    "used_gb": round(memory.used / gib, 2),
                    "available_gb": round(memory.available / gib, 2),
                    "usage_percent": memory.percent,
                    "status": self._usage_status(memory.percent),
                },
                "disk": {
                    "total_gb": round(disk.total / gib, 2),
                    "used_gb": round(disk.used / gib, 2),
                    "free_gb": round(disk.free / gib, 2),
                    "usage_percent": disk.percent,
                    "status": self._usage_status(disk.percent),
                },
                "network": {
                    "bytes_sent": network.bytes_sent,
                    "bytes_recv": network.bytes_recv,
                    "packets_sent": network.packets_sent,
                    "packets_recv": network.packets_recv,
                },
                "process": {
                    "pid": process.pid,
                    "memory_rss_mb": round(process_memory.rss / mib, 2),
                    "memory_vms_mb": round(process_memory.vms / mib, 2),
                    # Usage since the previous sample on the cached handle;
                    # the very first sample is 0.0 by psutil's contract.
                    "cpu_percent": process.cpu_percent(),
                    "threads": process.num_threads(),
                    "status": "running",
                },
            }

            self._record_metric("system_health", health_status)
            return health_status

        except Exception as e:
            logger.exception("Error getting system health: %s", e)
            return self._error_result(e)

    async def get_performance_metrics(self) -> Dict[str, Any]:
        """Collect per-process performance counters.

        ``response_time_ms`` is the time taken to read the process CPU times
        and I/O counters, and drives ``performance_score`` and ``status``.

        Returns:
            A dict with ``timestamp``, ``response_time_ms``, ``process``,
            ``performance_score`` and ``status``, or an error dict on failure.
        """
        try:
            started = time.perf_counter()

            process = self._get_process()
            cpu_times = process.cpu_times()

            # io_counters() is not available on every platform.
            try:
                io_counters = process.io_counters()
            except (AttributeError, OSError, psutil.Error):
                io_counters = None

            response_time = time.perf_counter() - started

            # Process.connections() is deprecated in newer psutil releases in
            # favour of net_connections(); use whichever this version offers.
            list_connections = (
                getattr(process, "net_connections", None) or process.connections
            )

            if response_time < 0.1:
                status = "good"
            elif response_time < 0.5:
                status = "acceptable"
            else:
                status = "poor"

            performance_data = {
                "timestamp": datetime.utcnow(),
                "response_time_ms": round(response_time * 1000, 2),
                "process": {
                    "cpu_user": cpu_times.user,
                    "cpu_system": cpu_times.system,
                    "cpu_children_user": cpu_times.children_user,
                    "cpu_children_system": cpu_times.children_system,
                    "create_time": process.create_time(),
                    "connections": len(list_connections()),
                    "files": len(process.open_files()),
                    "threads": process.num_threads(),
                },
                "performance_score": self._calculate_performance_score(response_time),
                "status": status,
            }

            if io_counters:
                performance_data["process"]["io"] = {
                    "read_count": io_counters.read_count,
                    "write_count": io_counters.write_count,
                    "read_bytes": io_counters.read_bytes,
                    "write_bytes": io_counters.write_bytes,
                }

            self._record_metric("performance", performance_data)
            return performance_data

        except Exception as e:
            logger.exception("Error getting performance metrics: %s", e)
            return self._error_result(e)

    async def get_security_status(self) -> Dict[str, Any]:
        """Return the security status report.

        All values except the timestamps and ``security_score`` are static
        placeholders; nothing is probed on the host.
        """
        try:
            now = datetime.utcnow()
            security_data = {
                "timestamp": now,
                "authentication": {
                    "status": "enabled",
                    "last_check": now,
                    "active_sessions": 1,
                },
                "authorization": {
                    "status": "enabled",
                    "role_based_access": True,
                    "permission_checks": "active",
                },
                "input_validation": {
                    "status": "enabled",
                    "sql_injection_protection": True,
                    "xss_protection": True,
                },
                "encryption": {
                    "status": "enabled",
                    "data_in_transit": True,
                    "data_at_rest": True,
                },
                "audit_logging": {
                    "status": "enabled",
                    "log_retention_days": 30,
                    "log_level": "INFO",
                },
                "vulnerability_scan": {
                    "status": "pending",
                    "last_scan": None,
                    "vulnerabilities_found": 0,
                },
                "security_score": self._calculate_security_score(),
                "overall_status": "secure",
            }

            self._record_metric("security", security_data)
            return security_data

        except Exception as e:
            logger.exception("Error getting security status: %s", e)
            return self._error_result(e)

    async def get_ml_model_status(self) -> Dict[str, Any]:
        """Return the ML model status report.

        The model figures are simulated constants; no model is queried.
        """
        try:
            now = datetime.utcnow()
            ml_status = {
                "timestamp": now,
                "models": {
                    "fraud_detection": {
                        "status": "loaded",
                        "accuracy": 0.92,
                        "precision": 0.89,
                        "recall": 0.87,
                        "f1_score": 0.88,
                        "last_trained": now - timedelta(days=7),
                        "prediction_count": 1250,
                        "drift_detected": False,
                    },
                    "risk_assessment": {
                        "status": "loaded",
                        "accuracy": 0.88,
                        "precision": 0.85,
                        "recall": 0.90,
                        "f1_score": 0.87,
                        "last_trained": now - timedelta(days=3),
                        "prediction_count": 890,
                        "drift_detected": False,
                    },
                },
                "overall_status": "healthy",
                "total_predictions": 2140,
                "avg_accuracy": 0.90,
            }

            self._record_metric("ml_models", ml_status)
            return ml_status

        except Exception as e:
            logger.exception("Error getting ML model status: %s", e)
            return self._error_result(e)

    async def get_comprehensive_status(self) -> Dict[str, Any]:
        """Run every diagnostic concurrently and merge the results.

        Returns:
            A dict with the overall score and status, uptime, one entry per
            component under ``components``, the five most recent alerts from
            the last 24 hours, the history size and the monitoring flag; or an
            error dict on failure.
        """
        try:
            system_health, performance, security, ml_status = await asyncio.gather(
                self.get_system_health(),
                self.get_performance_metrics(),
                self.get_security_status(),
                self.get_ml_model_status(),
                return_exceptions=True,
            )

            overall_score = self._calculate_overall_score(
                system_health, performance, security, ml_status
            )

            now = datetime.utcnow()
            return {
                "timestamp": now,
                "uptime_seconds": (now - self.start_time).total_seconds(),
                "overall_score": overall_score,
                "overall_status": self._get_status_from_score(overall_score),
                "components": {
                    "system_health": self._component_payload(system_health),
                    "performance": self._component_payload(performance),
                    "security": self._component_payload(security),
                    "ml_models": self._component_payload(ml_status),
                },
                "alerts": self.get_recent_alerts(limit=5),
                "metrics_collected": len(self.metrics_history),
                "monitoring_active": self.monitoring,
            }

        except Exception as e:
            logger.exception("Error getting comprehensive status: %s", e)
            return self._error_result(e)

    # ------------------------------------------------------------------
    # Alerts and history
    # ------------------------------------------------------------------

    def get_alerts(self, limit: int = 50) -> List[Dict[str, Any]]:
        """Return the last ``limit`` alerts, or all alerts when ``limit <= 0``.

        The returned list is always a copy of the internal store.
        """
        return self.alerts[-limit:] if limit > 0 else list(self.alerts)

    def get_recent_alerts(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Return up to ``limit`` alerts raised within the last 24 hours.

        ``limit <= 0`` returns every alert from that window.
        """
        cutoff = datetime.utcnow() - timedelta(hours=24)
        recent_alerts = [
            alert
            for alert in self.alerts
            if alert.get("timestamp", datetime.min) > cutoff
        ]
        return recent_alerts[-limit:] if limit > 0 else recent_alerts

    def add_alert(
        self,
        alert_type: str,
        message: str,
        severity: str = "warning",
        component: str = "general",
    ) -> None:
        """Record an alert and log it at WARNING level.

        Args:
            alert_type: Category stored under the alert's ``type`` key.
            message: Human-readable description.
            severity: One of info, warning, error, critical.
            component: Name of the subsystem the alert belongs to.

        Adding an ``info`` alert also marks unresolved ``info`` alerts older
        than one hour as resolved.
        """
        now = datetime.utcnow()
        alert = {
            # A counter is used instead of len(self.alerts) + 1, which repeats
            # the same id once the list has been trimmed to MAX_ALERTS.
            "id": self._next_alert_id,
            "timestamp": now,
            "type": alert_type,
            "component": component,
            "message": message,
            "severity": severity,  # info, warning, error, critical
            "resolved": False,
        }
        self._next_alert_id += 1
        self.alerts.append(alert)

        # Keep only the newest MAX_ALERTS alerts.
        excess = len(self.alerts) - self.MAX_ALERTS
        if excess > 0:
            del self.alerts[:excess]

        logger.warning("ALERT [%s] %s: %s", severity.upper(), component, message)

        # Auto-resolve old info alerts
        if severity == "info":
            for old_alert in self.alerts:
                if old_alert["severity"] != "info" or old_alert["resolved"]:
                    continue
                if (now - old_alert["timestamp"]).total_seconds() > 3600:
                    old_alert["resolved"] = True

    def get_metrics_history(
        self, metric_type: Optional[str] = None, limit: int = 50
    ) -> List[Dict[str, Any]]:
        """Return recorded samples, optionally filtered by ``metric_type``.

        Args:
            metric_type: If given, only samples whose ``type`` equals this
                value are returned.
            limit: Maximum number of (most recent) samples; ``<= 0`` returns
                all of them.

        The returned list is always a copy of the internal store.
        """
        if metric_type:
            filtered = [m for m in self.metrics_history if m.get("type") == metric_type]
        else:
            filtered = list(self.metrics_history)

        return filtered[-limit:] if limit > 0 else filtered

    def start_monitoring(self):
        """Set the monitoring flag and record an info alert."""
        self.monitoring = True
        self.add_alert("monitoring", "Diagnostics monitoring started", "info", "system")

    def stop_monitoring(self):
        """Clear the monitoring flag and record an info alert."""
        self.monitoring = False
        self.add_alert("monitoring", "Diagnostics monitoring stopped", "info", "system")

    # ------------------------------------------------------------------
    # Scoring
    # ------------------------------------------------------------------

    def _calculate_performance_score(self, response_time: float) -> float:
        """Map a response time in seconds onto a 20-100 score."""
        if response_time < 0.1:
            return 100.0
        if response_time < 0.5:
            return 80.0
        if response_time < 1.0:
            return 60.0
        if response_time < 2.0:
            return 40.0
        return 20.0

    def _calculate_security_score(self) -> float:
        """Return the security score (currently a fixed 95.0).

        This is a simplified placeholder: a flat 5-point deduction is applied
        for potential vulnerabilities; no real security posture is inspected.
        """
        score = 100.0
        score -= 5.0  # Basic deduction for potential vulnerabilities
        return max(0.0, score)

    def _calculate_overall_score(
        self, system_health, performance, security, ml_status
    ) -> float:
        """Average the available component scores into one 0-100 value.

        Components that are not dicts or lack their score keys are skipped.
        Returns 0.0 when no component contributes and 50.0 if the calculation
        itself raises.
        """
        try:
            scores = []

            # System health: mean headroom across CPU, memory and disk.
            if isinstance(system_health, dict) and "cpu" in system_health:
                headroom = [
                    100 - system_health[section]["usage_percent"]
                    for section in ("cpu", "memory", "disk")
                ]
                scores.append(sum(headroom) / 3)

            if isinstance(performance, dict) and "performance_score" in performance:
                scores.append(performance["performance_score"])

            if isinstance(security, dict) and "security_score" in security:
                scores.append(security["security_score"])

            if isinstance(ml_status, dict) and "models" in ml_status:
                scores.append(ml_status.get("avg_accuracy", 0) * 100)

            return sum(scores) / len(scores) if scores else 0.0

        except Exception:
            # Best-effort: fall back to a neutral score, but leave a trace.
            logger.warning("Overall score calculation failed", exc_info=True)
            return 50.0

    def _get_status_from_score(self, score: float) -> str:
        """Translate a 0-100 score into a status label."""
        if score >= 90:
            return "excellent"
        if score >= 75:
            return "good"
        if score >= 60:
            return "acceptable"
        if score >= 40:
            return "poor"
        return "critical"


# Global instance shared by the module-level convenience functions below;
# it is created at import time.
simple_diagnostics = SimpleDiagnostics()


# Convenience functions
async def get_system_health():
    """Return system health metrics from the shared ``simple_diagnostics`` instance."""
    health = await simple_diagnostics.get_system_health()
    return health


async def get_performance_metrics():
    """Return performance metrics from the shared ``simple_diagnostics`` instance."""
    metrics = await simple_diagnostics.get_performance_metrics()
    return metrics


async def get_security_status():
    """Return the security status from the shared ``simple_diagnostics`` instance."""
    security = await simple_diagnostics.get_security_status()
    return security


async def get_ml_model_status():
    """Return the ML model status from the shared ``simple_diagnostics`` instance."""
    models = await simple_diagnostics.get_ml_model_status()
    return models


async def get_comprehensive_status():
    """Return the combined status report from the shared ``simple_diagnostics`` instance."""
    report = await simple_diagnostics.get_comprehensive_status()
    return report


def start_diagnostics():
    """Turn monitoring on for the shared ``simple_diagnostics`` instance."""
    diagnostics = simple_diagnostics
    diagnostics.start_monitoring()


def stop_diagnostics():
    """Turn monitoring off for the shared ``simple_diagnostics`` instance."""
    diagnostics = simple_diagnostics
    diagnostics.stop_monitoring()


def get_alerts(limit: int = 50):
    """Return alerts from the shared ``simple_diagnostics`` instance, passing ``limit`` through."""
    alerts = simple_diagnostics.get_alerts(limit=limit)
    return alerts


def add_alert(
    alert_type: str, message: str, severity: str = "warning", component: str = "general"
):
    """Record an alert on the shared ``simple_diagnostics`` instance."""
    simple_diagnostics.add_alert(
        alert_type=alert_type,
        message=message,
        severity=severity,
        component=component,
    )