Spaces:

AumCoreAI
/

AumCore-AI

Sleeping

App Files Files Community

AumCoreAI commited on Dec 28, 2025

Commit

3d0d89c

verified ·

1 Parent(s): cebfe45

Update system_orchestrator.py

Browse files

Files changed (1) hide show

system_orchestrator.py +294 -4

system_orchestrator.py CHANGED Viewed

@@ -25,6 +25,7 @@ class SystemConfig:
     memory_limit_mb: int = 512
     db_path: str = "data/system_state.json"
     log_file: str = "logs/aumcore_main.log"
     enable_telemetry: bool = True
 # ==========================================
@@ -109,7 +110,7 @@ class StateManager:
         if not os.path.exists(os.path.dirname(self.path)):
             os.makedirs(os.path.dirname(self.path))
         if not os.path.exists(self.path):
-            self.save_state({"sessions": [], "metrics": {}})
     def save_state(self, data: Dict):
         with self._lock:
@@ -122,7 +123,267 @@ class StateManager:
                 return json.load(f)
 # ==========================================
-# 7. ASYNC AI TASK PROCESSOR
 # ==========================================
 class TaskProcessor(IEngine):
     def __init__(self, config: SystemConfig):
@@ -169,7 +430,7 @@ class TaskProcessor(IEngine):
         return f"PROCESSED_DATA_{data.upper()}"
 # ==========================================
-# 8. MASTER ORCHESTRATOR
 # ==========================================
 class AumCoreMaster:
     def __init__(self):
@@ -177,6 +438,8 @@ class AumCoreMaster:
         self.logger = AumLogger("Master", self.config.log_file)
         self.state = StateManager(self.config.db_path)
         self.processor = TaskProcessor(self.config)
         self.is_running = True
         # Register OS Signals for Graceful Exit
@@ -191,6 +454,13 @@ class AumCoreMaster:
         await self.processor.startup()
         self.logger.info("Master loop started. Listening for tasks...")
         try:
             while self.is_running:
                 # In a real app, this would be a web server or message queue listener
@@ -210,7 +480,27 @@ class AumCoreMaster:
             await self.processor.shutdown()
 # ==========================================
-# 9. EXECUTION ENTRY POINT
 # ==========================================
 async def bootstrap():
     """Main Entry point with proper lifecycle management"""

     memory_limit_mb: int = 512
     db_path: str = "data/system_state.json"
     log_file: str = "logs/aumcore_main.log"
+    diagnostics_log: str = "logs/diagnostics.log"
     enable_telemetry: bool = True
 # ==========================================
         if not os.path.exists(os.path.dirname(self.path)):
             os.makedirs(os.path.dirname(self.path))
         if not os.path.exists(self.path):
+            self.save_state({"sessions": [], "metrics": {}, "diagnostics_history": []})
     def save_state(self, data: Dict):
         with self._lock:
                 return json.load(f)
 # ==========================================
+# 7. SYSTEM DIAGNOSTICS ENGINE (LEVEL 2 ADDITION)
+# ==========================================
+class DiagnosticsEngine:
+    """Complete System Health Monitoring and Diagnostics"""
+    def __init__(self, config: SystemConfig):
+        self.config = config
+        self.logger = AumLogger("Diagnostics", config.diagnostics_log)
+        self.diagnostics_history = []
+    async def run_full_diagnostics(self) -> Dict:
+        """Run complete system health check with detailed report"""
+        self.logger.info("Starting comprehensive system diagnostics...")
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "system_id": f"AUMCORE-{uuid.uuid4().hex[:8]}",
+            "status": "RUNNING",
+            "health_score": 0,
+            "sections": {}
+        }
+        # 1. SYSTEM RESOURCES CHECK
+        system_health = await self._check_system_resources()
+        report["sections"]["system_resources"] = system_health
+        # 2. APPLICATION HEALTH CHECK
+        app_health = await self._check_application_health()
+        report["sections"]["application"] = app_health
+        # 3. EXTERNAL SERVICES CHECK
+        services_health = await self._check_external_services()
+        report["sections"]["external_services"] = services_health
+        # 4. LOGS ANALYSIS
+        logs_analysis = await self._analyze_logs()
+        report["sections"]["logs_analysis"] = logs_analysis
+        # 5. PERFORMANCE METRICS
+        performance = await self._collect_performance_metrics()
+        report["sections"]["performance"] = performance
+        # Calculate overall health score (0-100)
+        report["health_score"] = self._calculate_health_score(report)
+        report["status"] = "HEALTHY" if report["health_score"] >= 80 else "DEGRADED" if report["health_score"] >= 50 else "CRITICAL"
+        # Save to history
+        self._save_diagnostics_to_history(report)
+        self.logger.info(f"Diagnostics completed. Health Score: {report['health_score']}/100")
+        return report
+    async def _check_system_resources(self) -> Dict:
+        """Check CPU, Memory, Disk, Network"""
+        try:
+            cpu_usage = psutil.cpu_percent(interval=1)
+            memory = psutil.virtual_memory()
+            disk = psutil.disk_usage('/')
+            net_io = psutil.net_io_counters()
+            return {
+                "cpu": {
+                    "usage_percent": cpu_usage,
+                    "cores": psutil.cpu_count(),
+                    "load_avg": os.getloadavg() if hasattr(os, 'getloadavg') else "N/A"
+                },
+                "memory": {
+                    "total_gb": round(memory.total / (1024**3), 2),
+                    "available_gb": round(memory.available / (1024**3), 2),
+                    "used_percent": memory.percent,
+                    "free_percent": 100 - memory.percent
+                },
+                "disk": {
+                    "total_gb": round(disk.total / (1024**3), 2),
+                    "used_gb": round(disk.used / (1024**3), 2),
+                    "free_gb": round(disk.free / (1024**3), 2),
+                    "used_percent": disk.percent
+                },
+                "network": {
+                    "bytes_sent": net_io.bytes_sent,
+                    "bytes_recv": net_io.bytes_recv
+                },
+                "processes": {
+                    "total": len(psutil.pids()),
+                    "aumcore_processes": len([p for p in psutil.process_iter(['name']) if 'python' in p.info['name'].lower()])
+                }
+            }
+        except Exception as e:
+            self.logger.error(f"System resources check failed: {e}")
+            return {"error": str(e)}
+    async def _check_application_health(self) -> Dict:
+        """Check application specific health"""
+        try:
+            # Check if app.py is running
+            app_running = any('app.py' in p.info().get('cmdline', []) for p in psutil.process_iter(['cmdline']))
+            # Check logs directory
+            logs_exist = os.path.exists('logs')
+            data_dir_exist = os.path.exists('data')
+            # Check recent errors in main log
+            error_count = 0
+            if os.path.exists(self.config.log_file):
+                with open(self.config.log_file, 'r') as f:
+                    lines = f.readlines()[-100:]  # Last 100 lines
+                    error_count = sum(1 for line in lines if 'ERROR' in line.upper())
+            return {
+                "application_running": app_running,
+                "directories": {
+                    "logs": logs_exist,
+                    "data": data_dir_exist
+                },
+                "recent_errors": error_count,
+                "uptime_estimate": "N/A"  # Can be enhanced with process start time
+            }
+        except Exception as e:
+            self.logger.error(f"Application health check failed: {e}")
+            return {"error": str(e)}
+    async def _check_external_services(self) -> Dict:
+        """Check Groq API, TiDB, and other external services"""
+        services = {
+            "groq_api": {"status": "UNKNOWN", "latency_ms": 0},
+            "tidb_database": {"status": "UNKNOWN", "connected": False}
+        }
+        # Check Groq API
+        try:
+            import os
+            from groq import Groq
+            start = time.time()
+            client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
+            test = client.chat.completions.create(
+                model="llama-3.3-70b-versatile",
+                messages=[{"role": "user", "content": "ping"}],
+                max_tokens=1,
+                timeout=10
+            )
+            services["groq_api"] = {
+                "status": "HEALTHY",
+                "latency_ms": round((time.time() - start) * 1000, 2),
+                "model_available": True
+            }
+        except Exception as e:
+            services["groq_api"] = {
+                "status": "UNHEALTHY",
+                "error": str(e),
+                "latency_ms": 0
+            }
+        # Check TiDB (if memory_db exists)
+        try:
+            from memory_db import tidb_memory
+            services["tidb_database"] = {
+                "status": "HEALTHY",
+                "connected": True,
+                "type": "TiDB Cloud"
+            }
+        except ImportError:
+            services["tidb_database"] = {
+                "status": "NOT_CONFIGURED",
+                "connected": False,
+                "note": "memory_db module not found"
+            }
+        except Exception as e:
+            services["tidb_database"] = {
+                "status": "UNHEALTHY",
+                "connected": False,
+                "error": str(e)
+            }
+        return services
+    async def _analyze_logs(self) -> Dict:
+        """Analyze application logs for patterns and errors"""
+        try:
+            if not os.path.exists(self.config.log_file):
+                return {"error": "Log file not found", "file": self.config.log_file}
+            with open(self.config.log_file, 'r') as f:
+                lines = f.readlines()[-500:]  # Last 500 lines
+            analysis = {
+                "total_lines_analyzed": len(lines),
+                "error_count": sum(1 for line in lines if 'ERROR' in line.upper()),
+                "warning_count": sum(1 for line in lines if 'WARNING' in line.upper()),
+                "info_count": sum(1 for line in lines if 'INFO' in line.upper()),
+                "recent_errors": [],
+                "log_file_size_mb": round(os.path.getsize(self.config.log_file) / (1024*1024), 3)
+            }
+            # Extract recent errors
+            for line in lines[-20:]:  # Last 20 lines
+                if 'ERROR' in line.upper():
+                    analysis["recent_errors"].append(line.strip())
+                    if len(analysis["recent_errors"]) >= 5:
+                        break
+            return analysis
+        except Exception as e:
+            return {"error": f"Log analysis failed: {str(e)}"}
+    async def _collect_performance_metrics(self) -> Dict:
+        """Collect performance metrics and statistics"""
+        try:
+            state = self.state.load_state() if hasattr(self, 'state') else {}
+            return {
+                "total_sessions": len(state.get("sessions", [])),
+                "total_tasks_processed": len(state.get("metrics", {}).get("tasks", [])),
+                "average_response_time": "N/A",  # Can be calculated from actual data
+                "peak_memory_usage_mb": "N/A",
+                "system_uptime": "N/A",
+                "last_diagnostics": state.get("diagnostics_history", [])[-1]["timestamp"] if state.get("diagnostics_history") else "N/A"
+            }
+        except Exception as e:
+            return {"error": str(e)}
+    def _calculate_health_score(self, report: Dict) -> int:
+        """Calculate overall health score 0-100"""
+        score = 100
+        # Deduct points for issues
+        if report["sections"]["system_resources"].get("memory", {}).get("used_percent", 0) > 90:
+            score -= 20
+        if report["sections"]["system_resources"].get("disk", {}).get("used_percent", 0) > 90:
+            score -= 15
+        if report["sections"]["application"].get("recent_errors", 0) > 5:
+            score -= 10 * min(report["sections"]["application"]["recent_errors"], 10)
+        if report["sections"]["external_services"].get("groq_api", {}).get("status") != "HEALTHY":
+            score -= 25
+        if report["sections"]["external_services"].get("tidb_database", {}).get("status") != "HEALTHY":
+            score -= 15
+        return max(0, min(100, score))
+    def _save_diagnostics_to_history(self, report: Dict):
+        """Save diagnostics report to history"""
+        try:
+            state = self.state.load_state() if hasattr(self, 'state') else {"diagnostics_history": []}
+            history = state.get("diagnostics_history", [])
+            # Keep only last 10 reports
+            history.append({
+                "timestamp": report["timestamp"],
+                "health_score": report["health_score"],
+                "status": report["status"],
+                "summary": f"Health: {report['health_score']}/100, Status: {report['status']}"
+            })
+            if len(history) > 10:
+                history = history[-10:]
+            state["diagnostics_history"] = history
+            self.state.save_state(state)
+        except Exception as e:
+            self.logger.error(f"Failed to save diagnostics history: {e}")
+# ==========================================
+# 8. ASYNC AI TASK PROCESSOR
 # ==========================================
 class TaskProcessor(IEngine):
     def __init__(self, config: SystemConfig):
         return f"PROCESSED_DATA_{data.upper()}"
 # ==========================================
+# 9. MASTER ORCHESTRATOR (UPDATED WITH DIAGNOSTICS)
 # ==========================================
 class AumCoreMaster:
     def __init__(self):
         self.logger = AumLogger("Master", self.config.log_file)
         self.state = StateManager(self.config.db_path)
         self.processor = TaskProcessor(self.config)
+        self.diagnostics = DiagnosticsEngine(self.config)  # NEW: Diagnostics Engine
+        self.diagnostics.state = self.state  # Share state manager
         self.is_running = True
         # Register OS Signals for Graceful Exit
         await self.processor.startup()
         self.logger.info("Master loop started. Listening for tasks...")
+        # Run initial diagnostics
+        try:
+            initial_report = await self.diagnostics.run_full_diagnostics()
+            self.logger.info(f"Initial diagnostics: Health Score {initial_report['health_score']}/100")
+        except Exception as e:
+            self.logger.error(f"Initial diagnostics failed: {e}")
         try:
             while self.is_running:
                 # In a real app, this would be a web server or message queue listener
             await self.processor.shutdown()
 # ==========================================
+# 10. DIAGNOSTICS API ENDPOINT FUNCTION
+# ==========================================
+async def get_system_diagnostics() -> Dict:
+    """Public function to get system diagnostics (for app.py integration)"""
+    try:
+        master = AumCoreMaster()
+        report = await master.diagnostics.run_full_diagnostics()
+        return {
+            "success": True,
+            "diagnostics": report,
+            "timestamp": datetime.now().isoformat()
+        }
+    except Exception as e:
+        return {
+            "success": False,
+            "error": str(e),
+            "timestamp": datetime.now().isoformat()
+        }
+# ==========================================
+# 11. EXECUTION ENTRY POINT
 # ==========================================
 async def bootstrap():
     """Main Entry point with proper lifecycle management"""