MogensR committed on
Commit
59aded7
·
1 Parent(s): f7961f3

Update utils/hardware/device_manager.py

Browse files
Files changed (1) hide show
  1. utils/hardware/device_manager.py +293 -376
utils/hardware/device_manager.py CHANGED
@@ -1,432 +1,349 @@
1
  """
2
- Device Management Module
3
- Handles hardware detection, optimization, and device switching
4
  """
5
 
6
- import torch
7
- import logging
8
  import platform
9
  import subprocess
10
- import os
11
- from typing import Optional, Dict, Any, List
12
- from core.exceptions import DeviceError # Updated import path
 
13
 
14
- # Fix OpenMP threads early with validation
15
- if 'OMP_NUM_THREADS' not in os.environ:
16
- os.environ['OMP_NUM_THREADS'] = '4'
17
- if 'MKL_NUM_THREADS' not in os.environ:
18
- os.environ['MKL_NUM_THREADS'] = '4'
19
 
20
  logger = logging.getLogger(__name__)
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  class DeviceManager:
23
- """
24
- Manages device detection, validation, and optimization for video processing
25
- """
26
 
27
  def __init__(self):
28
- self._optimal_device = None
29
- self._device_info = {}
30
- self._cuda_tested = False
31
- self._mps_tested = False
32
- self._initialize_device_info()
 
 
 
 
 
33
 
34
- def _initialize_device_info(self):
35
- """Initialize comprehensive device information"""
36
- self._device_info = {
37
- 'platform': platform.system(),
38
- 'python_version': platform.python_version(),
39
- 'pytorch_version': torch.__version__,
40
- 'cuda_available': torch.cuda.is_available(),
41
- 'cuda_version': torch.version.cuda if torch.cuda.is_available() else None,
42
- 'mps_available': self._check_mps_availability(),
43
- 'cpu_count': torch.get_num_threads(),
44
- }
45
 
46
- if self._device_info['cuda_available']:
47
- self._device_info.update(self._get_cuda_info())
 
 
 
 
 
 
 
 
 
 
48
 
49
- if self._device_info['mps_available']:
50
- self._device_info.update(self._get_mps_info())
 
 
 
 
 
 
 
 
51
 
52
- logger.debug(f"Device info initialized: {self._device_info}")
53
-
54
- def _check_mps_availability(self) -> bool:
55
- """Check if Metal Performance Shaders (MPS) is available on macOS"""
56
  try:
57
- if platform.system() == 'Darwin': # macOS
58
- return hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
59
- except Exception:
60
- pass
61
- return False
62
-
63
- def _get_cuda_info(self) -> Dict[str, Any]:
64
- """Get detailed CUDA information"""
65
- cuda_info = {}
66
- try:
67
- if torch.cuda.is_available():
68
- cuda_info.update({
69
- 'cuda_device_count': torch.cuda.device_count(),
70
- 'cuda_current_device': torch.cuda.current_device(),
71
- 'cuda_devices': []
72
- })
73
-
74
- for i in range(torch.cuda.device_count()):
75
- device_props = torch.cuda.get_device_properties(i)
76
- device_info = {
77
- 'index': i,
78
- 'name': device_props.name,
79
- 'memory_total_gb': device_props.total_memory / (1024**3),
80
- 'memory_total_mb': device_props.total_memory / (1024**2),
81
- 'multiprocessor_count': device_props.multiprocessor_count,
82
- 'compute_capability': f"{device_props.major}.{device_props.minor}"
83
- }
84
-
85
- # Get current memory usage
86
- try:
87
- memory_allocated = torch.cuda.memory_allocated(i) / (1024**3)
88
- memory_reserved = torch.cuda.memory_reserved(i) / (1024**3)
89
- device_info.update({
90
- 'memory_allocated_gb': memory_allocated,
91
- 'memory_reserved_gb': memory_reserved,
92
- 'memory_free_gb': device_info['memory_total_gb'] - memory_reserved
93
- })
94
- except Exception as e:
95
- logger.warning(f"Could not get memory info for CUDA device {i}: {e}")
96
-
97
- cuda_info['cuda_devices'].append(device_info)
98
-
99
- except Exception as e:
100
- logger.error(f"Error getting CUDA info: {e}")
101
 
102
- return cuda_info
 
 
 
 
 
 
103
 
104
- def _get_mps_info(self) -> Dict[str, Any]:
105
- """Get Metal Performance Shaders information"""
106
- mps_info = {}
107
  try:
108
- if self._device_info['mps_available']:
109
- # Get system memory as MPS uses unified memory
110
- try:
111
- result = subprocess.run(['sysctl', 'hw.memsize'],
112
- capture_output=True, text=True, timeout=5)
113
- if result.returncode == 0:
114
- memory_bytes = int(result.stdout.split(':')[1].strip())
115
- mps_info['mps_system_memory_gb'] = memory_bytes / (1024**3)
116
- except Exception as e:
117
- logger.warning(f"Could not get system memory info: {e}")
118
-
119
- mps_info['mps_device'] = 'Apple Silicon GPU'
120
-
121
- except Exception as e:
122
- logger.error(f"Error getting MPS info: {e}")
123
 
124
- return mps_info
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
- def get_optimal_device(self) -> torch.device:
127
- """
128
- Get the optimal device for video processing with comprehensive testing
129
- """
130
- if self._optimal_device is not None:
131
- return self._optimal_device
132
-
133
- logger.info("Determining optimal device for video processing...")
134
-
135
- # Try CUDA first (most common for AI workloads)
136
- if self._device_info['cuda_available'] and not self._cuda_tested:
137
- cuda_device = self._test_cuda_device()
138
- if cuda_device is not None:
139
- self._optimal_device = cuda_device
140
- logger.info(f"Selected CUDA device: {self._get_device_name(cuda_device)}")
141
- return self._optimal_device
142
 
143
- # Try MPS on Apple Silicon
144
- if self._device_info['mps_available'] and not self._mps_tested:
145
- mps_device = self._test_mps_device()
146
- if mps_device is not None:
147
- self._optimal_device = mps_device
148
- logger.info(f"Selected MPS device: {self._get_device_name(mps_device)}")
149
- return self._optimal_device
150
 
151
- # Fallback to CPU
152
- self._optimal_device = torch.device("cpu")
153
- logger.info("Using CPU device (no suitable GPU found or GPU tests failed)")
154
- return self._optimal_device
155
 
156
- def _test_cuda_device(self) -> Optional[torch.device]:
157
- """Test CUDA device functionality"""
158
- self._cuda_tested = True
 
 
 
 
 
 
 
 
 
 
 
159
 
160
  try:
161
- # Find best CUDA device (highest memory)
162
- best_device_idx = 0
163
- best_memory = 0
164
-
165
- for device_info in self._device_info.get('cuda_devices', []):
166
- if device_info['memory_free_gb'] > best_memory:
167
- best_memory = device_info['memory_free_gb']
168
- best_device_idx = device_info['index']
169
-
170
- device = torch.device(f"cuda:{best_device_idx}")
171
-
172
- # Test basic functionality
173
- test_tensor = torch.tensor([1.0], device=device)
174
- result = test_tensor * 2
175
-
176
- # Test memory operations
177
- large_tensor = torch.randn(1000, 1000, device=device)
178
- del large_tensor, test_tensor, result
179
- torch.cuda.empty_cache()
180
- torch.cuda.synchronize()
181
-
182
- logger.info(f"CUDA device {best_device_idx} passed functionality tests")
183
- return device
184
 
 
 
 
 
 
 
 
 
 
 
185
  except Exception as e:
186
- logger.warning(f"CUDA device test failed: {e}")
187
- return None
188
 
189
- def _test_mps_device(self) -> Optional[torch.device]:
190
- """Test MPS device functionality"""
191
- self._mps_tested = True
192
-
193
  try:
194
- device = torch.device("mps")
 
 
 
195
 
196
- # Test basic functionality
197
- test_tensor = torch.tensor([1.0], device=device)
198
- result = test_tensor * 2
199
 
200
- # Test memory operations
201
- large_tensor = torch.randn(1000, 1000, device=device)
202
- del large_tensor, test_tensor, result
 
 
203
 
204
- # MPS doesn't have explicit cache clearing like CUDA
205
- logger.info("MPS device passed functionality tests")
206
- return device
207
 
208
- except Exception as e:
209
- logger.warning(f"MPS device test failed: {e}")
210
- return None
211
-
212
- def _get_device_name(self, device: torch.device) -> str:
213
- """Get human-readable device name"""
214
- if device.type == 'cuda':
215
- if self._device_info.get('cuda_devices'):
216
- device_idx = device.index or 0
217
- for cuda_device in self._device_info['cuda_devices']:
218
- if cuda_device['index'] == device_idx:
219
- return cuda_device['name']
220
- return f"CUDA Device {device.index or 0}"
221
- elif device.type == 'mps':
222
- return "Apple Silicon GPU (MPS)"
223
- else:
224
- return "CPU"
225
-
226
- def get_device_capabilities(self, device: Optional[torch.device] = None) -> Dict[str, Any]:
227
- """Get capabilities of the specified device"""
228
- if device is None:
229
- device = self.get_optimal_device()
230
-
231
- capabilities = {
232
- 'device_type': device.type,
233
- 'device_name': self._get_device_name(device),
234
- 'supports_mixed_precision': False,
235
- 'recommended_batch_size': 1,
236
- 'memory_efficiency': 'medium'
237
- }
238
-
239
- if device.type == 'cuda':
240
- device_idx = device.index or 0
241
- for cuda_device in self._device_info.get('cuda_devices', []):
242
- if cuda_device['index'] == device_idx:
243
- # Check compute capability for mixed precision
244
- compute_version = float(cuda_device.get('compute_capability', '0.0'))
245
- capabilities['supports_mixed_precision'] = compute_version >= 7.0
246
-
247
- # Estimate batch size based on memory
248
- memory_gb = cuda_device.get('memory_free_gb', 0)
249
- if memory_gb >= 24:
250
- capabilities['recommended_batch_size'] = 4
251
- capabilities['memory_efficiency'] = 'high'
252
- elif memory_gb >= 12:
253
- capabilities['recommended_batch_size'] = 2
254
- capabilities['memory_efficiency'] = 'high'
255
- elif memory_gb >= 6:
256
- capabilities['recommended_batch_size'] = 1
257
- capabilities['memory_efficiency'] = 'medium'
258
- else:
259
- capabilities['memory_efficiency'] = 'low'
260
-
261
- capabilities['memory_available_gb'] = memory_gb
262
- break
263
-
264
- elif device.type == 'mps':
265
- capabilities['supports_mixed_precision'] = True # MPS supports fp16
266
- capabilities['memory_efficiency'] = 'high' # Unified memory
267
- system_memory = self._device_info.get('mps_system_memory_gb', 8)
268
- if system_memory >= 16:
269
- capabilities['recommended_batch_size'] = 2
270
- capabilities['memory_available_gb'] = system_memory * 0.7 # Rough estimate
271
-
272
- else: # CPU
273
- capabilities['memory_efficiency'] = 'low'
274
- capabilities['supports_mixed_precision'] = False
275
 
276
- return capabilities
277
-
278
- def switch_device(self, device_type: str) -> torch.device:
279
- """
280
- Switch to a specific device type
281
-
282
- Args:
283
- device_type: 'cuda', 'mps', or 'cpu'
284
- """
285
- try:
286
- if device_type.lower() == 'cuda':
287
- if not self._device_info['cuda_available']:
288
- raise DeviceError('cuda', 'CUDA not available on this system')
289
-
290
- device = self._test_cuda_device()
291
- if device is None:
292
- raise DeviceError('cuda', 'CUDA device failed functionality tests')
293
-
294
- elif device_type.lower() == 'mps':
295
- if not self._device_info['mps_available']:
296
- raise DeviceError('mps', 'MPS not available on this system')
297
-
298
- device = self._test_mps_device()
299
- if device is None:
300
- raise DeviceError('mps', 'MPS device failed functionality tests')
301
-
302
- elif device_type.lower() == 'cpu':
303
- device = torch.device('cpu')
304
-
305
- else:
306
- raise DeviceError('unknown', f'Unknown device type: {device_type}')
307
 
308
- self._optimal_device = device
309
- logger.info(f"Switched to device: {self._get_device_name(device)}")
310
- return device
311
 
312
- except DeviceError:
313
- raise
314
  except Exception as e:
315
- raise DeviceError(device_type, f"Failed to switch to {device_type}: {str(e)}")
 
 
 
316
 
317
- def get_available_devices(self) -> List[str]:
318
- """Get list of available device types"""
319
- devices = ['cpu'] # CPU always available
320
-
321
- if self._device_info['cuda_available']:
322
- devices.append('cuda')
323
-
324
- if self._device_info['mps_available']:
325
- devices.append('mps')
326
-
327
- return devices
328
-
329
- def get_device_status(self) -> Dict[str, Any]:
330
- """Get comprehensive device status"""
331
- current_device = self.get_optimal_device()
332
-
333
- status = {
334
- 'current_device': str(current_device),
335
- 'current_device_name': self._get_device_name(current_device),
336
- 'available_devices': self.get_available_devices(),
337
- 'device_info': self._device_info.copy(),
338
- 'capabilities': self.get_device_capabilities(current_device)
339
  }
340
 
341
- # Add current memory usage if on GPU
342
- if current_device.type == 'cuda':
343
- try:
344
- device_idx = current_device.index or 0
345
- status['current_memory_usage'] = {
346
- 'allocated_gb': torch.cuda.memory_allocated(device_idx) / (1024**3),
347
- 'reserved_gb': torch.cuda.memory_reserved(device_idx) / (1024**3),
348
- 'max_allocated_gb': torch.cuda.max_memory_allocated(device_idx) / (1024**3),
349
- 'max_reserved_gb': torch.cuda.max_memory_reserved(device_idx) / (1024**3)
350
- }
351
- except Exception as e:
352
- logger.warning(f"Could not get current memory usage: {e}")
353
 
354
- return status
355
-
356
- def optimize_for_processing(self) -> Dict[str, Any]:
357
- """Optimize device settings for video processing"""
358
- device = self.get_optimal_device()
359
- optimizations = {
360
- 'device': str(device),
361
- 'optimizations_applied': []
362
- }
363
 
364
- try:
365
- if device.type == 'cuda':
366
- # Enable cuDNN benchmarking for consistent input sizes
367
- torch.backends.cudnn.benchmark = True
368
- optimizations['optimizations_applied'].append('cudnn_benchmark')
369
-
370
- # Enable cuDNN deterministic mode if needed for reproducibility
371
- # torch.backends.cudnn.deterministic = True
372
-
373
- # Set memory allocation strategy
374
- # os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
375
- optimizations['optimizations_applied'].append('cuda_memory_strategy')
376
-
377
- elif device.type == 'mps':
378
- # MPS-specific optimizations would go here
379
- optimizations['optimizations_applied'].append('mps_optimized')
380
-
381
- else: # CPU
382
- # Set optimal number of threads for CPU processing
383
- torch.set_num_threads(min(torch.get_num_threads(), 8))
384
- optimizations['optimizations_applied'].append('cpu_thread_optimization')
385
-
386
- logger.info(f"Applied optimizations for {device}: {optimizations['optimizations_applied']}")
387
-
388
- except Exception as e:
389
- logger.warning(f"Some optimizations failed: {e}")
390
- optimizations['optimization_errors'] = str(e)
391
 
392
- return optimizations
 
 
 
 
 
 
 
 
 
393
 
394
- def cleanup_device_memory(self):
395
- """Clean up device memory"""
396
- device = self.get_optimal_device()
397
 
398
- if device.type == 'cuda':
399
- try:
400
- torch.cuda.empty_cache()
401
- torch.cuda.synchronize()
402
- logger.debug("CUDA memory cache cleared")
403
- except Exception as e:
404
- logger.warning(f"CUDA memory cleanup failed: {e}")
405
 
406
- elif device.type == 'mps':
407
- try:
408
- # MPS uses unified memory, less explicit cleanup needed
409
- # But we can still run garbage collection
410
- import gc
411
- gc.collect()
412
- logger.debug("MPS memory cleanup completed")
413
- except Exception as e:
414
- logger.warning(f"MPS memory cleanup failed: {e}")
415
 
416
- else: # CPU
417
- try:
418
- import gc
419
- gc.collect()
420
- logger.debug("CPU memory cleanup completed")
421
- except Exception as e:
422
- logger.warning(f"CPU memory cleanup failed: {e}")
423
 
424
- # Global instance for singleton pattern
425
  _device_manager_instance = None
426
 
 
427
  def get_device_manager() -> DeviceManager:
428
- """Get or create a singleton DeviceManager instance"""
429
  global _device_manager_instance
430
  if _device_manager_instance is None:
431
  _device_manager_instance = DeviceManager()
432
- return _device_manager_instance
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Device Manager for BackgroundFX Pro
3
+ Handles device detection, optimization, and hardware compatibility
4
  """
5
 
6
+ import os
7
+ import sys
8
  import platform
9
  import subprocess
10
+ import logging
11
+ from typing import Dict, Any, Optional, Tuple
12
+ from dataclasses import dataclass
13
+ from enum import Enum
14
 
15
+ import torch
16
+ import psutil
17
+ import cpuinfo
 
 
18
 
19
  logger = logging.getLogger(__name__)
20
 
21
+
22
class DeviceType(Enum):
    """Compute backends the device manager knows how to report.

    Each member's value is the backend prefix used when building a PyTorch
    device string (``"cuda"``, ``"mps"`` or ``"cpu"``).
    """

    CUDA = "cuda"  # NVIDIA GPU
    MPS = "mps"    # Apple Silicon GPU (Metal Performance Shaders)
    CPU = "cpu"    # Host processor
27
+
28
+
29
@dataclass
class DeviceInfo:
    """Snapshot describing a single compute device.

    Memory figures are raw byte counts; the diagnostics code converts them
    to gigabytes by dividing by ``1024**3``.
    """

    # Backend this device belongs to.
    type: DeviceType
    # Ordinal of the device within its backend (0 for MPS and CPU).
    index: int
    # Human-readable device name.
    name: str
    # Total memory of the device, in bytes.
    memory_total: int
    # Memory considered free at detection time, in bytes.
    memory_available: int
    # (major, minor) compute capability; only populated for CUDA devices.
    compute_capability: Optional[Tuple[int, int]] = None
38
+
39
+
40
class DeviceManager:
    """Detect compute devices and apply process-wide tuning for them.

    On construction the manager enumerates CUDA, MPS and CPU devices,
    records host information and picks a preferred device
    (CUDA first, then MPS, then CPU).

    Attributes:
        devices: ``DeviceInfo`` records for every detected device. The CPU
            entry is always appended last.
        optimal_device: The ``DeviceInfo`` selected by
            ``_determine_optimal_device``.
        cpu_info: Result of ``cpuinfo.get_cpu_info()``, or ``{}`` when the
            query failed.
        system_info: Platform, CPU-count, RAM and version details.
    """

    _instance = None

    def __init__(self):
        """Detect devices, gather host information and pick a device."""
        self.devices = []
        self.optimal_device = None
        self.cpu_info = None
        self.system_info = {}

        self._detect_devices()
        self._gather_system_info()
        self._determine_optimal_device()

    def _load_cpu_info(self) -> Dict[str, Any]:
        """Return py-cpuinfo data, caching the first successful query.

        ``cpuinfo.get_cpu_info()`` is slow, so the result is stored on
        ``self.cpu_info`` and reused. A failed query stores ``{}``.
        """
        if not getattr(self, 'cpu_info', None):
            try:
                self.cpu_info = cpuinfo.get_cpu_info()
            except Exception as e:
                logger.debug("cpuinfo query failed: %s", e)
                self.cpu_info = {}
        return self.cpu_info

    @staticmethod
    def _cuda_free_memory(index: int, total_memory: int) -> int:
        """Return the free memory, in bytes, of CUDA device ``index``.

        Prefers the driver-level figure from ``torch.cuda.mem_get_info``,
        which accounts for memory held by other processes. Falls back to
        ``total_memory`` minus this process's own allocations when that
        call is unavailable or fails.
        """
        try:
            free_bytes, _total_bytes = torch.cuda.mem_get_info(index)
            return free_bytes
        except (AttributeError, RuntimeError) as e:
            logger.debug("mem_get_info unavailable for cuda:%s: %s", index, e)
            return total_memory - torch.cuda.memory_allocated(index)

    def _detect_devices(self):
        """Populate ``self.devices`` with CUDA, MPS and CPU entries."""
        self.devices = []

        # CUDA devices, one entry per visible GPU.
        if torch.cuda.is_available():
            for i in range(torch.cuda.device_count()):
                props = torch.cuda.get_device_properties(i)
                self.devices.append(DeviceInfo(
                    type=DeviceType.CUDA,
                    index=i,
                    name=props.name,
                    memory_total=props.total_memory,
                    memory_available=self._cuda_free_memory(i, props.total_memory),
                    compute_capability=(props.major, props.minor),
                ))

        # Read host memory once so total/available come from one snapshot.
        host_memory = psutil.virtual_memory()

        # MPS (Apple Silicon) exposes no per-device details, so host memory
        # figures are reported for it.
        if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
            self.devices.append(DeviceInfo(
                type=DeviceType.MPS,
                index=0,
                name="Apple Silicon GPU",
                memory_total=host_memory.total,
                memory_available=host_memory.available,
            ))

        # CPU is always available.
        cpu_info = self._load_cpu_info()
        if cpu_info:
            cpu_name = cpu_info.get('brand_raw', 'Unknown CPU')
        else:
            cpu_name = platform.processor() or "Unknown CPU"

        self.devices.append(DeviceInfo(
            type=DeviceType.CPU,
            index=0,
            name=cpu_name,
            memory_total=host_memory.total,
            memory_available=host_memory.available,
        ))

    def _gather_system_info(self):
        """Fill ``self.cpu_info`` and ``self.system_info`` with host details."""
        self._load_cpu_info()
        host_memory = psutil.virtual_memory()

        self.system_info = {
            'platform': platform.system(),
            'platform_release': platform.release(),
            'platform_version': platform.version(),
            'architecture': platform.machine(),
            'processor': platform.processor(),
            'cpu_count': psutil.cpu_count(logical=False),
            'cpu_count_logical': psutil.cpu_count(logical=True),
            'ram_total': host_memory.total,
            'ram_available': host_memory.available,
            'python_version': sys.version,
            'torch_version': torch.__version__,
        }

    def _determine_optimal_device(self):
        """Select ``self.optimal_device`` with priority CUDA > MPS > CPU."""
        cuda_devices = [d for d in self.devices if d.type == DeviceType.CUDA]
        mps_devices = [d for d in self.devices if d.type == DeviceType.MPS]
        cpu_devices = [d for d in self.devices if d.type == DeviceType.CPU]

        if cuda_devices:
            # Among several GPUs, prefer the one with the most free memory.
            self.optimal_device = max(cuda_devices, key=lambda d: d.memory_available)
        elif mps_devices:
            self.optimal_device = mps_devices[0]
        else:
            self.optimal_device = cpu_devices[0]

        logger.info(
            "Optimal device: %s (%s)",
            self.optimal_device.name,
            self.optimal_device.type.value,
        )

    def get_optimal_device(self) -> str:
        """Return the PyTorch device string for the selected device.

        Returns:
            ``"cuda:<index>"``, ``"mps"`` or ``"cpu"``. ``"cpu"`` is also
            returned when no device has been selected yet.
        """
        selected = self.optimal_device
        if selected is None:
            return "cpu"
        if selected.type == DeviceType.CUDA:
            return f"cuda:{selected.index}"
        if selected.type == DeviceType.MPS:
            return "mps"
        return "cpu"

    def fix_cuda_compatibility(self):
        """Apply CUDA-related environment and backend settings.

        Does nothing except log when CUDA is unavailable. Failures are
        logged as warnings and never raised.
        """
        if not torch.cuda.is_available():
            logger.info("CUDA not available, skipping compatibility fixes")
            return

        try:
            # NOTE(review): CUDA_LAUNCH_BLOCKING=1 makes kernel launches
            # synchronous. That helps debugging but costs throughput, and
            # it is normally read when CUDA initialises; confirm it is
            # wanted in production.
            os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

            # Allow TF32 math for matmul and cuDNN. TF32 only has an effect
            # on Ampere (compute capability 8.0) and newer GPUs.
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True

            # Cap the size of blocks the caching allocator may split, to
            # limit fragmentation. A value set by the user is respected.
            if 'PYTORCH_CUDA_ALLOC_CONF' not in os.environ:
                os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'

            logger.info("CUDA compatibility settings applied")
        except Exception as e:
            logger.warning(f"Error applying CUDA compatibility fixes: {e}")

    def _is_intel_cpu(self) -> bool:
        """Return True when any known CPU description mentions Intel."""
        cpu_info = getattr(self, 'cpu_info', None) or {}
        descriptions = (
            self.system_info.get('processor', ''),
            cpu_info.get('brand_raw', ''),
            cpu_info.get('vendor_id_raw', ''),
        )
        return any('intel' in str(text or '').lower() for text in descriptions)

    def setup_optimal_threading(self):
        """Set OMP/MKL environment variables and PyTorch thread counts.

        The thread count is the number of physical cores capped at 8, with
        4 used when the core count is unknown. Any existing
        ``OMP_NUM_THREADS`` value is overwritten. On unexpected failure
        both ``OMP_NUM_THREADS`` and ``MKL_NUM_THREADS`` are set to ``'4'``.
        """
        try:
            # psutil returns None when the physical core count is unknown.
            physical_cores = psutil.cpu_count(logical=False) or 4
            num_threads = max(1, min(physical_cores, 8))  # Cap at 8 threads
            os.environ['OMP_NUM_THREADS'] = str(num_threads)

            # MKL is only relevant on Intel processors.
            if self._is_intel_cpu():
                os.environ['MKL_NUM_THREADS'] = str(num_threads)

            torch.set_num_threads(num_threads)

            if torch.cuda.is_available():
                # The inter-op pool can be sized only once, before any
                # parallel work has started. A repeated call raises
                # RuntimeError, which must not trigger the fallback below.
                try:
                    torch.set_num_interop_threads(2)
                except RuntimeError as e:
                    logger.debug("Inter-op thread count left unchanged: %s", e)

            logger.info(
                "Threading configured: OMP_NUM_THREADS=%s",
                os.environ.get('OMP_NUM_THREADS'),
            )
        except Exception as e:
            logger.warning(f"Error setting up threading: {e}")
            # Set safe defaults
            os.environ['OMP_NUM_THREADS'] = '4'
            os.environ['MKL_NUM_THREADS'] = '4'

    def get_system_diagnostics(self) -> Dict[str, Any]:
        """Return a dictionary describing the host, devices and threading.

        Returns:
            A dict with the keys ``system``, ``devices``, ``optimal_device``,
            ``threading``, ``cuda`` and ``mps``. Device memory is reported
            in gigabytes.
        """
        diagnostics = {
            'system': self.system_info.copy(),
            'devices': [],
            'optimal_device': None,
            'threading': {
                'omp_num_threads': os.environ.get('OMP_NUM_THREADS', 'not set'),
                'mkl_num_threads': os.environ.get('MKL_NUM_THREADS', 'not set'),
                'torch_num_threads': torch.get_num_threads(),
            },
        }

        for device in self.devices:
            device_info = {
                'type': device.type.value,
                'index': device.index,
                'name': device.name,
                'memory_total_gb': device.memory_total / (1024**3),
                'memory_available_gb': device.memory_available / (1024**3),
            }
            if device.compute_capability:
                major, minor = device.compute_capability
                device_info['compute_capability'] = f"{major}.{minor}"
            diagnostics['devices'].append(device_info)

        if self.optimal_device:
            diagnostics['optimal_device'] = {
                'type': self.optimal_device.type.value,
                'name': self.optimal_device.name,
                'pytorch_device': self.get_optimal_device(),
            }

        if torch.cuda.is_available():
            diagnostics['cuda'] = {
                'available': True,
                'version': torch.version.cuda,
                'device_count': torch.cuda.device_count(),
                # Only ask for the current device once CUDA is initialised.
                'current_device': torch.cuda.current_device() if torch.cuda.is_initialized() else None,
            }
        else:
            diagnostics['cuda'] = {'available': False}

        if hasattr(torch.backends, 'mps'):
            diagnostics['mps'] = {
                'available': torch.backends.mps.is_available(),
                'built': torch.backends.mps.is_built(),
            }
        else:
            diagnostics['mps'] = {'available': False}

        return diagnostics

    def get_device_for_model(self, model_size_gb: float = 2.0) -> str:
        """Return a device string with enough free memory for a model.

        Args:
            model_size_gb: Model size in gigabytes. The requirement is
                1.5 times this value, to leave room for overhead.

        Returns:
            The first CUDA device whose available memory exceeds the
            requirement, otherwise ``"mps"`` if it qualifies, otherwise
            ``"cpu"``.
        """
        required_memory = model_size_gb * 1024**3 * 1.5  # 1.5x for overhead

        for device in self.devices:
            if device.type == DeviceType.CUDA and device.memory_available > required_memory:
                return f"cuda:{device.index}"

        mps_devices = [d for d in self.devices if d.type == DeviceType.MPS]
        if mps_devices and mps_devices[0].memory_available > required_memory:
            return "mps"

        return "cpu"
280
+
 
 
 
 
281
 
282
# Module-wide DeviceManager, created lazily by get_device_manager().
_device_manager_instance = None


def get_device_manager() -> DeviceManager:
    """Return the shared DeviceManager, building it on first use."""
    global _device_manager_instance
    if _device_manager_instance is not None:
        return _device_manager_instance
    _device_manager_instance = DeviceManager()
    return _device_manager_instance
292
+
293
+
294
def get_optimal_device() -> str:
    """Return the preferred PyTorch device string.

    Delegates to the shared ``DeviceManager``.

    Returns:
        A device string such as ``'cuda:0'``, ``'mps'`` or ``'cpu'``.
    """
    return get_device_manager().get_optimal_device()
303
+
304
+
305
def fix_cuda_compatibility():
    """Apply the shared manager's CUDA compatibility settings.

    Delegates to ``DeviceManager.fix_cuda_compatibility``, which adjusts
    environment variables and PyTorch backend flags.
    """
    get_device_manager().fix_cuda_compatibility()
312
+
313
+
314
def setup_optimal_threading():
    """Configure thread counts for the current machine.

    Delegates to ``DeviceManager.setup_optimal_threading``, which sets
    ``OMP_NUM_THREADS``, ``MKL_NUM_THREADS`` and PyTorch's thread counts.
    """
    get_device_manager().setup_optimal_threading()
321
+
322
+
323
def get_system_diagnostics() -> Dict[str, Any]:
    """Return the shared manager's system diagnostics.

    Returns:
        A dictionary with system information, per-device details and the
        current threading configuration.
    """
    return get_device_manager().get_system_diagnostics()
332
+
333
+
334
# Import-time setup: when this file is loaded as a module (not run as a
# script), configure threading straight away to avoid the libgomp error.
if __name__ != "__main__":
    try:
        # Provide a value for OMP_NUM_THREADS right away if none exists yet.
        os.environ.setdefault('OMP_NUM_THREADS', '4')

        # Build the shared manager and let it choose the real thread counts.
        manager = get_device_manager()
        manager.setup_optimal_threading()
    except Exception as e:
        logger.warning(f"Error during device manager initialization: {e}")
        # Initialisation failed: force a known-good thread count.
        os.environ['OMP_NUM_THREADS'] = '4'