File size: 4,456 Bytes
0861a59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import os
import sys
import time
import psutil
import traceback
import logging
import threading
from typing import Dict, Any, Optional, List

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

class DebugHelper:
    """Helper class for debugging hanging issues in STDP training.

    This provides tools for:

    1. Process monitoring and status reporting
    2. Timeout management
    3. Recovery mechanisms for hanging processes
    4. Detailed diagnostics

    Every method is a ``@staticmethod``; the class is used as a namespace
    and holds no instance state.
    """

    @staticmethod
    def get_process_info(pid: Optional[int] = None) -> Dict[str, Any]:
        """Collect a diagnostic snapshot of a process.

        Args:
            pid: Process id to inspect. ``None`` (the default) selects the
                current process.

        Returns:
            A dict with the psutil-derived keys ``pid``, ``name``,
            ``status``, ``cpu_percent``, ``memory_percent``,
            ``memory_info``, ``create_time``, ``runtime``, ``num_threads``,
            ``open_files`` and ``connections``, plus ``active_threads`` and
            ``current_stack``. The last two always describe the *calling*
            interpreter and thread, even when ``pid`` names another
            process. If anything fails, the error is logged and a dict
            with the single key ``error`` is returned instead.
        """
        # Explicit None check so that a caller-supplied pid of 0 is not
        # silently replaced by the current pid.
        if pid is None:
            pid = os.getpid()

        try:
            process = psutil.Process(pid)

            # psutil 6.0 renamed Process.connections() to net_connections()
            # and deprecated the old name; use whichever is available.
            list_connections = (
                getattr(process, 'net_connections', None) or process.connections
            )
            created = process.create_time()

            # Get basic process info
            info = {
                'pid': pid,
                'name': process.name(),
                'status': process.status(),
                # NOTE: called without an interval, so the first reading
                # for a fresh Process object is 0.0 (see psutil docs).
                'cpu_percent': process.cpu_percent(),
                'memory_percent': process.memory_percent(),
                'memory_info': dict(process.memory_info()._asdict()),
                'create_time': created,
                'runtime': time.time() - created,
                'num_threads': process.num_threads(),
                'open_files': len(process.open_files()),
                'connections': len(list_connections()),
            }

            # Thread names of this interpreter (best effort).
            try:
                info['active_threads'] = [t.name for t in threading.enumerate()]
            except Exception:
                info['active_threads'] = "Could not retrieve thread information"

            # Stack of the thread that called this method.
            info['current_stack'] = traceback.format_stack()

            return info

        except Exception as e:
            logger.error("Error getting process info: %s", e)
            return {'error': str(e)}

    @staticmethod
    def check_resource_leaks() -> Dict[str, Any]:
        """Gather counters that help spot resource leaks.

        Returns:
            A dict with ``gc_counts`` (``gc.get_count()``) and
            ``gc_objects`` (number of objects tracked by the collector).
            When torch can be imported and CUDA is available, the keys
            ``torch_memory_allocated``, ``torch_memory_reserved`` and
            ``torch_max_memory_allocated`` are added.
        """
        import gc

        leaks = {
            'gc_counts': gc.get_count(),
            'gc_objects': len(gc.get_objects()),
        }

        # Check for torch memory usage if available
        try:
            import torch
            if torch.cuda.is_available():
                leaks['torch_memory_allocated'] = torch.cuda.memory_allocated()
                leaks['torch_memory_reserved'] = torch.cuda.memory_reserved()
                leaks['torch_max_memory_allocated'] = torch.cuda.max_memory_allocated()
        except ImportError:
            # torch is optional; without it only the gc counters are reported.
            pass

        return leaks

    @staticmethod
    def _print_all_thread_stacks() -> None:
        """Print the current stack of every Python thread to stderr."""
        names = {t.ident: t.name for t in threading.enumerate()}
        for ident, frame in sys._current_frames().items():
            print(
                f"\n--- Thread {names.get(ident, 'unknown')} ({ident}) ---",
                file=sys.stderr,
            )
            traceback.print_stack(frame)

    @staticmethod
    def register_timeout(seconds: float, callback=None) -> threading.Timer:
        """Start a daemon timer that fires once after ``seconds``.

        Args:
            seconds: Delay before the timeout action runs.
            callback: Zero-argument callable to invoke on timeout. When
                omitted (or falsy), a default report is printed: a TIMEOUT
                message, the output of ``get_process_info()`` and the
                stacks of all threads.

        Returns:
            The already-started ``threading.Timer`` (a ``Thread``
            subclass). Call ``.cancel()`` on it once the guarded operation
            has finished so the timeout does not fire spuriously.
        """
        def _report_timeout():
            print(f"TIMEOUT: Operation took longer than {seconds} seconds")
            info = DebugHelper.get_process_info()
            print(f"Process info: {info}")
            # This runs on the timer thread, whose own stack is
            # uninteresting; show every thread so the hung one is visible.
            DebugHelper._print_all_thread_stacks()

        timer = threading.Timer(seconds, callback or _report_timeout)
        # Daemon, so a pending timeout never keeps the interpreter alive.
        timer.daemon = True
        timer.start()
        return timer

    @staticmethod
    def dump_debug_info(filename: str) -> None:
        """Write process, leak, environment and stack details to a file.

        The file is overwritten. It is written as UTF-8 with
        ``backslashreplace`` so that environment values containing
        undecodable bytes cannot abort the dump.

        Args:
            filename: Path of the file to write.
        """
        process_info = DebugHelper.get_process_info()
        leak_info = DebugHelper.check_resource_leaks()

        lines = ["===== PROCESS INFORMATION =====\n"]
        lines.extend(f"{key}: {value}\n" for key, value in process_info.items())

        lines.append("\n===== RESOURCE LEAK INFORMATION =====\n")
        lines.extend(f"{key}: {value}\n" for key, value in leak_info.items())

        # NOTE(review): this writes every environment variable verbatim,
        # which may include credentials; treat the dump file as sensitive.
        lines.append("\n===== ENVIRONMENT VARIABLES =====\n")
        lines.extend(f"{key}: {value}\n" for key, value in os.environ.items())

        lines.append("\n===== STACK TRACE =====\n")
        lines.extend(traceback.format_stack())

        with open(filename, 'w', encoding='utf-8', errors='backslashreplace') as f:
            f.writelines(lines)

        logger.info("Debug info dumped to %s", filename)