import logging
import os
import sys
import threading
import time
import traceback
from datetime import datetime
from typing import Optional

import psutil
# Configure the root logger at import time (INFO level, timestamped format).
# logging.basicConfig() is a no-op when the root logger already has handlers.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger used by the monitor for status and error messages.
logger = logging.getLogger(__name__)
class EmergencyMonitor:
    """Monitor for detecting training hangs and generating emergency reports.

    A daemon thread periodically compares the current time with the time of
    the last :meth:`update_activity` call. When the gap exceeds
    ``check_interval`` seconds, a text report (system stats, stack traces of
    all threads, GPU memory) is written to ``report_dir``.
    """

    # Divisor used to express byte counts in GB (1024**3 bytes) in reports.
    _BYTES_PER_GB = 1024 ** 3

    def __init__(self, check_interval: int = 300, report_dir: Optional[str] = None):
        """
        Initialize the emergency monitor.

        Args:
            check_interval (int): Inactivity threshold in seconds before a
                report is generated (default: 300s). The monitor thread polls
                every ``check_interval / 10`` seconds.
            report_dir (str): Directory to save emergency reports (default:
                ``emergency_reports`` in the same directory as this file).

        Raises:
            ValueError: If ``check_interval`` is not positive. A zero or
                negative interval would make the loop emit reports
                continuously.
        """
        if check_interval <= 0:
            raise ValueError(
                f"check_interval must be a positive number of seconds, got {check_interval!r}"
            )

        self.check_interval = check_interval
        self.stop_event = threading.Event()
        self.last_activity_time = time.time()
        self.monitor_thread: Optional[threading.Thread] = None

        if report_dir:
            self.report_dir = os.path.abspath(report_dir)
        else:
            current_dir = os.path.dirname(os.path.abspath(__file__))
            self.report_dir = os.path.abspath(os.path.join(current_dir, "emergency_reports"))

        # Create the directory up front so a bad path surfaces at construction.
        os.makedirs(self.report_dir, exist_ok=True)

        logger.info(f"Current working directory: {os.getcwd()}")
        logger.info(f"Emergency reports will be saved to absolute path: {self.report_dir}")

    def start_monitoring(self):
        """Start the monitoring thread.

        Does nothing (apart from logging a warning) when a monitor thread is
        already running, so repeated calls cannot spawn duplicate threads.
        """
        running = self.monitor_thread is not None and self.monitor_thread.is_alive()
        if running and not self.stop_event.is_set():
            logger.warning("Emergency monitoring is already running; ignoring start request")
            return

        self.stop_event.clear()
        # Measure inactivity from the moment monitoring starts; otherwise a
        # monitor constructed long before start would report a hang at once.
        self.last_activity_time = time.time()
        self.monitor_thread = threading.Thread(
            target=self._monitor_loop, name="EmergencyMonitor", daemon=True
        )
        self.monitor_thread.start()
        logger.info(f"Emergency monitoring started with interval {self.check_interval}s")

    def stop_monitoring(self):
        """Stop the monitoring thread, waiting up to 5 seconds for it to exit."""
        if self.monitor_thread:
            self.stop_event.set()
            self.monitor_thread.join(timeout=5)
            if self.monitor_thread.is_alive():
                # The thread may be in the middle of writing a report; it will
                # exit on its next stop_event check.
                logger.warning("Emergency monitor thread did not exit within 5s")
            logger.info("Emergency monitoring stopped")

    def _monitor_loop(self):
        """Main monitoring loop; runs until ``stop_event`` is set."""
        while not self.stop_event.is_set():
            current_time = time.time()
            if current_time - self.last_activity_time > self.check_interval:
                self._create_emergency_report()
                # Restart the inactivity window so reports are spaced by at
                # least check_interval. Use max() so an update_activity() call
                # that arrived while the report was being written is kept.
                self.last_activity_time = max(self.last_activity_time, current_time)

            # Event.wait doubles as an interruptible sleep: it returns early
            # as soon as stop_monitoring() sets the event.
            self.stop_event.wait(self.check_interval / 10)

    def update_activity(self):
        """Update the last activity time to now."""
        self.last_activity_time = time.time()

    def _create_emergency_report(self):
        """Create an emergency report when training appears to be hanging.

        Writes ``emergency_report_<timestamp>.txt`` into ``self.report_dir``.
        If that fails, a minimal fallback report is written next to this
        file. Errors are logged and never propagated to the caller. The two
        1-second CPU samples make this call block for about two seconds.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_path = os.path.join(
            os.path.abspath(self.report_dir), f"emergency_report_{timestamp}.txt"
        )
        logger.info(f"Creating emergency report at: {report_path}")

        try:
            # The directory may have been removed since __init__ created it.
            os.makedirs(os.path.dirname(report_path), exist_ok=True)

            with open(report_path, "w", encoding="utf-8") as f:
                f.write(f"Emergency report generated at {timestamp}\n")
                f.write(f"System has been inactive for {self.check_interval} seconds\n\n")
                self._write_system_info(f)
                self._write_thread_info(f)
                self._write_gpu_info(f)

            logger.warning(f"Created emergency report at {report_path}")
        except Exception as e:
            logger.error(f"Failed to create emergency report: {str(e)}")
            self._write_fallback_report(timestamp, report_path, e)

    def _write_system_info(self, f):
        """Write host and process memory/CPU statistics to the open report file ``f``."""
        f.write("=== SYSTEM INFORMATION ===\n")
        gb = self._BYTES_PER_GB
        try:
            mem = psutil.virtual_memory()
            f.write(f"Memory: {mem.percent}% used ({mem.used / gb:.2f} GB / {mem.total / gb:.2f} GB)\n")
            f.write(f"CPU usage: {psutil.cpu_percent(interval=1)}%\n")

            process = psutil.Process()
            f.write(f"Process memory: {process.memory_info().rss / gb:.2f} GB\n")
            f.write(f"Process CPU: {process.cpu_percent(interval=1)}%\n")
            created = datetime.fromtimestamp(process.create_time()).strftime('%Y-%m-%d %H:%M:%S')
            f.write(f"Process creation time: {created}\n\n")
        except Exception as e:
            # Keep going: the thread stacks written next are still useful
            # even when system statistics cannot be collected.
            f.write(f"Error getting system information: {str(e)}\n\n")

    @staticmethod
    def _write_thread_info(f):
        """Write the current stack trace of every thread to the open report file ``f``."""
        # Map thread identifiers to names so stacks are easier to attribute.
        names = {t.ident: t.name for t in threading.enumerate()}

        f.write("=== THREAD INFORMATION ===\n")
        for thread_id, frame in sys._current_frames().items():
            f.write(f"\nThread ID: {thread_id}\n")
            f.write(f"Thread name: {names.get(thread_id, 'unknown')}\n")
            f.write(''.join(traceback.format_stack(frame)))

    def _write_gpu_info(self, f):
        """Write per-device CUDA memory usage to ``f`` when PyTorch is available."""
        f.write("\n=== GPU INFORMATION ===\n")
        gb = self._BYTES_PER_GB
        try:
            # Imported lazily so the monitor works without PyTorch installed.
            import torch

            if torch.cuda.is_available():
                device_count = torch.cuda.device_count()
                f.write(f"CUDA devices: {device_count}\n")
                for i in range(device_count):
                    f.write(f"GPU {i}: {torch.cuda.get_device_name(i)}\n")
                    f.write(f" Memory allocated: {torch.cuda.memory_allocated(i) / gb:.2f} GB\n")
                    f.write(f" Memory reserved: {torch.cuda.memory_reserved(i) / gb:.2f} GB\n")
            else:
                f.write("CUDA not available\n")
        except ImportError:
            f.write("PyTorch not available for GPU information\n")
        except Exception as e:
            f.write(f"Error getting GPU information: {str(e)}\n")

    @staticmethod
    def _write_fallback_report(timestamp, report_path, error):
        """Write a minimal report next to this file after the main report failed."""
        try:
            script_dir = os.path.dirname(os.path.abspath(__file__))
            fallback_path = os.path.join(script_dir, f"emergency_report_{timestamp}.txt")
            with open(fallback_path, "w", encoding="utf-8") as f:
                f.write(f"FALLBACK Emergency report generated at {timestamp}\n")
                f.write(f"Original path failed: {report_path}\n")
                f.write(f"Error: {str(error)}\n")
            logger.warning(f"Created fallback emergency report at {fallback_path}")
        except Exception as fallback_error:
            logger.error(f"Failed to create fallback report: {str(fallback_error)}")


# Alias exposing the monitor under a second name.
TrainingMonitor = EmergencyMonitor