|
|
import os
|
|
|
import sys
|
|
|
import time
|
|
|
import psutil
|
|
|
import traceback
|
|
|
import logging
|
|
|
import threading
|
|
|
from typing import Dict, Any, Optional, List
|
|
|
|
|
|
# Module-level logger named after this module, per the stdlib logging convention.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class DebugHelper:
    """
    Helper class for debugging hanging issues in STDP training.

    This provides tools for:
    1. Process monitoring and status reporting
    2. Timeout management
    3. Recovery mechanisms for hanging processes
    4. Detailed diagnostics
    """

    @staticmethod
    def get_process_info(pid: Optional[int] = None) -> Dict[str, Any]:
        """Get detailed information about a process.

        Args:
            pid: Process id to inspect; defaults to the current process.

        Returns:
            A dict of process metrics (name, status, CPU/memory usage,
            runtime, thread count, open files, connections) plus the
            current Python call stack under 'current_stack'.  On any
            failure a dict with a single 'error' key is returned instead
            of raising, so diagnostic callers never crash.
        """
        pid = pid or os.getpid()

        try:
            process = psutil.Process(pid)

            # oneshot() caches the underlying process data so the many
            # metric reads below cost a single system round-trip.
            with process.oneshot():
                info = {
                    'pid': pid,
                    'name': process.name(),
                    'status': process.status(),
                    'cpu_percent': process.cpu_percent(),
                    'memory_percent': process.memory_percent(),
                    # memory_info() returns a namedtuple; flatten to a
                    # plain dict so it serializes/prints cleanly.
                    'memory_info': dict(process.memory_info()._asdict()),
                    'create_time': process.create_time(),
                    'runtime': time.time() - process.create_time(),
                    'num_threads': process.num_threads(),
                    'open_files': len(process.open_files()),
                    'connections': len(process.connections()),
                }

            # Best-effort thread listing.  Narrowed from a bare ``except:``
            # so KeyboardInterrupt/SystemExit are not swallowed; threading
            # is already imported at module level, so no local import.
            try:
                info['active_threads'] = [t.name for t in threading.enumerate()]
            except Exception:
                info['active_threads'] = "Could not retrieve thread information"

            # Stack of the *calling* thread at the moment of inspection --
            # useful for seeing where a hang was detected from.
            info['current_stack'] = traceback.format_stack()

            return info

        except Exception as e:
            # Diagnostics must never crash the program being diagnosed.
            logger.error(f"Error getting process info: {e}")
            return {'error': str(e)}

    @staticmethod
    def check_resource_leaks() -> Dict[str, Any]:
        """Check for potential resource leaks.

        Returns:
            A dict with garbage-collector statistics ('gc_counts',
            'gc_objects') and, when torch with CUDA is available, GPU
            memory counters.
        """
        import gc

        leaks = {
            'gc_counts': gc.get_count(),
            'gc_objects': len(gc.get_objects()),
        }

        # torch is an optional dependency; silently skip GPU stats when
        # it is not installed.
        try:
            import torch
            if torch.cuda.is_available():
                leaks['torch_memory_allocated'] = torch.cuda.memory_allocated()
                leaks['torch_memory_reserved'] = torch.cuda.memory_reserved()
                leaks['torch_max_memory_allocated'] = torch.cuda.max_memory_allocated()
        except ImportError:
            pass

        return leaks

    @staticmethod
    def register_timeout(seconds: int, callback=None) -> threading.Thread:
        """Register a timeout that calls the callback after specified seconds.

        The watchdog runs on a daemon thread, so it cannot keep the
        interpreter alive; note it also cannot be cancelled once started.

        Args:
            seconds: Delay before the watchdog fires.
            callback: Optional callable invoked on timeout.  When omitted,
                a timeout message, process diagnostics and a stack trace
                are printed instead.

        Returns:
            The started daemon thread (e.g. for ``join()`` in tests).
        """
        def _timeout_handler():
            time.sleep(seconds)
            if callback:
                callback()
            else:
                print(f"TIMEOUT: Operation took longer than {seconds} seconds")
                info = DebugHelper.get_process_info()
                print(f"Process info: {info}")
                traceback.print_stack()

        # daemon=True at construction (idiomatic) instead of mutating the
        # attribute after the fact; behavior is identical.
        thread = threading.Thread(target=_timeout_handler, daemon=True)
        thread.start()
        return thread

    @staticmethod
    def dump_debug_info(filename: str) -> None:
        """Dump debug information to a file.

        Writes process info, resource-leak stats, environment variables
        and the current stack trace as plain text sections.

        Args:
            filename: Path of the report file (overwritten if it exists).
        """
        process_info = DebugHelper.get_process_info()
        leak_info = DebugHelper.check_resource_leaks()

        with open(filename, 'w') as f:
            f.write("===== PROCESS INFORMATION =====\n")
            for key, value in process_info.items():
                f.write(f"{key}: {value}\n")

            f.write("\n===== RESOURCE LEAK INFORMATION =====\n")
            for key, value in leak_info.items():
                f.write(f"{key}: {value}\n")

            f.write("\n===== ENVIRONMENT VARIABLES =====\n")
            for key, value in os.environ.items():
                f.write(f"{key}: {value}\n")

            f.write("\n===== STACK TRACE =====\n")
            f.write(''.join(traceback.format_stack()))

        # Bug fix: the original f-string never interpolated the path.
        logger.info(f"Debug info dumped to {filename}")
|
|
|
|