Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Files Community

MogensR committed on Aug 25, 2025

Commit

59aded7

1 Parent(s): f7961f3

Update utils/hardware/device_manager.py

Browse files

Files changed (1) hide show

utils/hardware/device_manager.py +293 -376

utils/hardware/device_manager.py CHANGED Viewed

@@ -1,432 +1,349 @@
 """
-Device Management Module
-Handles hardware detection, optimization, and device switching
 """
-import torch
-import logging
 import platform
 import subprocess
-import os
-from typing import Optional, Dict, Any, List
-from core.exceptions import DeviceError  # Updated import path
-# Fix OpenMP threads early with validation
-if 'OMP_NUM_THREADS' not in os.environ:
-    os.environ['OMP_NUM_THREADS'] = '4'
-if 'MKL_NUM_THREADS' not in os.environ:
-    os.environ['MKL_NUM_THREADS'] = '4'
 logger = logging.getLogger(__name__)
 class DeviceManager:
-    """
-    Manages device detection, validation, and optimization for video processing
-    """
     def __init__(self):
-        self._optimal_device = None
-        self._device_info = {}
-        self._cuda_tested = False
-        self._mps_tested = False
-        self._initialize_device_info()
-    def _initialize_device_info(self):
-        """Initialize comprehensive device information"""
-        self._device_info = {
-            'platform': platform.system(),
-            'python_version': platform.python_version(),
-            'pytorch_version': torch.__version__,
-            'cuda_available': torch.cuda.is_available(),
-            'cuda_version': torch.version.cuda if torch.cuda.is_available() else None,
-            'mps_available': self._check_mps_availability(),
-            'cpu_count': torch.get_num_threads(),
-        }
-        if self._device_info['cuda_available']:
-            self._device_info.update(self._get_cuda_info())
-        if self._device_info['mps_available']:
-            self._device_info.update(self._get_mps_info())
-        logger.debug(f"Device info initialized: {self._device_info}")
-    def _check_mps_availability(self) -> bool:
-        """Check if Metal Performance Shaders (MPS) is available on macOS"""
         try:
-            if platform.system() == 'Darwin':  # macOS
-                return hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
-        except Exception:
-            pass
-        return False
-    def _get_cuda_info(self) -> Dict[str, Any]:
-        """Get detailed CUDA information"""
-        cuda_info = {}
-        try:
-            if torch.cuda.is_available():
-                cuda_info.update({
-                    'cuda_device_count': torch.cuda.device_count(),
-                    'cuda_current_device': torch.cuda.current_device(),
-                    'cuda_devices': []
-                })
-                for i in range(torch.cuda.device_count()):
-                    device_props = torch.cuda.get_device_properties(i)
-                    device_info = {
-                        'index': i,
-                        'name': device_props.name,
-                        'memory_total_gb': device_props.total_memory / (1024**3),
-                        'memory_total_mb': device_props.total_memory / (1024**2),
-                        'multiprocessor_count': device_props.multiprocessor_count,
-                        'compute_capability': f"{device_props.major}.{device_props.minor}"
-                    }
-                    # Get current memory usage
-                    try:
-                        memory_allocated = torch.cuda.memory_allocated(i) / (1024**3)
-                        memory_reserved = torch.cuda.memory_reserved(i) / (1024**3)
-                        device_info.update({
-                            'memory_allocated_gb': memory_allocated,
-                            'memory_reserved_gb': memory_reserved,
-                            'memory_free_gb': device_info['memory_total_gb'] - memory_reserved
-                        })
-                    except Exception as e:
-                        logger.warning(f"Could not get memory info for CUDA device {i}: {e}")
-                    cuda_info['cuda_devices'].append(device_info)
-        except Exception as e:
-            logger.error(f"Error getting CUDA info: {e}")
-        return cuda_info
-    def _get_mps_info(self) -> Dict[str, Any]:
-        """Get Metal Performance Shaders information"""
-        mps_info = {}
         try:
-            if self._device_info['mps_available']:
-                # Get system memory as MPS uses unified memory
-                try:
-                    result = subprocess.run(['sysctl', 'hw.memsize'],
-                                          capture_output=True, text=True, timeout=5)
-                    if result.returncode == 0:
-                        memory_bytes = int(result.stdout.split(':')[1].strip())
-                        mps_info['mps_system_memory_gb'] = memory_bytes / (1024**3)
-                except Exception as e:
-                    logger.warning(f"Could not get system memory info: {e}")
-                mps_info['mps_device'] = 'Apple Silicon GPU'
-        except Exception as e:
-            logger.error(f"Error getting MPS info: {e}")
-        return mps_info
-    def get_optimal_device(self) -> torch.device:
-        """
-        Get the optimal device for video processing with comprehensive testing
-        """
-        if self._optimal_device is not None:
-            return self._optimal_device
-        logger.info("Determining optimal device for video processing...")
-        # Try CUDA first (most common for AI workloads)
-        if self._device_info['cuda_available'] and not self._cuda_tested:
-            cuda_device = self._test_cuda_device()
-            if cuda_device is not None:
-                self._optimal_device = cuda_device
-                logger.info(f"Selected CUDA device: {self._get_device_name(cuda_device)}")
-                return self._optimal_device
-        # Try MPS on Apple Silicon
-        if self._device_info['mps_available'] and not self._mps_tested:
-            mps_device = self._test_mps_device()
-            if mps_device is not None:
-                self._optimal_device = mps_device
-                logger.info(f"Selected MPS device: {self._get_device_name(mps_device)}")
-                return self._optimal_device
-        # Fallback to CPU
-        self._optimal_device = torch.device("cpu")
-        logger.info("Using CPU device (no suitable GPU found or GPU tests failed)")
-        return self._optimal_device
-    def _test_cuda_device(self) -> Optional[torch.device]:
-        """Test CUDA device functionality"""
-        self._cuda_tested = True
         try:
-            # Find best CUDA device (highest memory)
-            best_device_idx = 0
-            best_memory = 0
-            for device_info in self._device_info.get('cuda_devices', []):
-                if device_info['memory_free_gb'] > best_memory:
-                    best_memory = device_info['memory_free_gb']
-                    best_device_idx = device_info['index']
-            device = torch.device(f"cuda:{best_device_idx}")
-            # Test basic functionality
-            test_tensor = torch.tensor([1.0], device=device)
-            result = test_tensor * 2
-            # Test memory operations
-            large_tensor = torch.randn(1000, 1000, device=device)
-            del large_tensor, test_tensor, result
-            torch.cuda.empty_cache()
-            torch.cuda.synchronize()
-            logger.info(f"CUDA device {best_device_idx} passed functionality tests")
-            return device
         except Exception as e:
-            logger.warning(f"CUDA device test failed: {e}")
-            return None
-    def _test_mps_device(self) -> Optional[torch.device]:
-        """Test MPS device functionality"""
-        self._mps_tested = True
         try:
-            device = torch.device("mps")
-            # Test basic functionality
-            test_tensor = torch.tensor([1.0], device=device)
-            result = test_tensor * 2
-            # Test memory operations
-            large_tensor = torch.randn(1000, 1000, device=device)
-            del large_tensor, test_tensor, result
-            # MPS doesn't have explicit cache clearing like CUDA
-            logger.info("MPS device passed functionality tests")
-            return device
-        except Exception as e:
-            logger.warning(f"MPS device test failed: {e}")
-            return None
-    def _get_device_name(self, device: torch.device) -> str:
-        """Get human-readable device name"""
-        if device.type == 'cuda':
-            if self._device_info.get('cuda_devices'):
-                device_idx = device.index or 0
-                for cuda_device in self._device_info['cuda_devices']:
-                    if cuda_device['index'] == device_idx:
-                        return cuda_device['name']
-            return f"CUDA Device {device.index or 0}"
-        elif device.type == 'mps':
-            return "Apple Silicon GPU (MPS)"
-        else:
-            return "CPU"
-    def get_device_capabilities(self, device: Optional[torch.device] = None) -> Dict[str, Any]:
-        """Get capabilities of the specified device"""
-        if device is None:
-            device = self.get_optimal_device()
-        capabilities = {
-            'device_type': device.type,
-            'device_name': self._get_device_name(device),
-            'supports_mixed_precision': False,
-            'recommended_batch_size': 1,
-            'memory_efficiency': 'medium'
-        }
-        if device.type == 'cuda':
-            device_idx = device.index or 0
-            for cuda_device in self._device_info.get('cuda_devices', []):
-                if cuda_device['index'] == device_idx:
-                    # Check compute capability for mixed precision
-                    compute_version = float(cuda_device.get('compute_capability', '0.0'))
-                    capabilities['supports_mixed_precision'] = compute_version >= 7.0
-                    # Estimate batch size based on memory
-                    memory_gb = cuda_device.get('memory_free_gb', 0)
-                    if memory_gb >= 24:
-                        capabilities['recommended_batch_size'] = 4
-                        capabilities['memory_efficiency'] = 'high'
-                    elif memory_gb >= 12:
-                        capabilities['recommended_batch_size'] = 2
-                        capabilities['memory_efficiency'] = 'high'
-                    elif memory_gb >= 6:
-                        capabilities['recommended_batch_size'] = 1
-                        capabilities['memory_efficiency'] = 'medium'
-                    else:
-                        capabilities['memory_efficiency'] = 'low'
-                    capabilities['memory_available_gb'] = memory_gb
-                    break
-        elif device.type == 'mps':
-            capabilities['supports_mixed_precision'] = True  # MPS supports fp16
-            capabilities['memory_efficiency'] = 'high'  # Unified memory
-            system_memory = self._device_info.get('mps_system_memory_gb', 8)
-            if system_memory >= 16:
-                capabilities['recommended_batch_size'] = 2
-            capabilities['memory_available_gb'] = system_memory * 0.7  # Rough estimate
-        else:  # CPU
-            capabilities['memory_efficiency'] = 'low'
-            capabilities['supports_mixed_precision'] = False
-        return capabilities
-    def switch_device(self, device_type: str) -> torch.device:
-        """
-        Switch to a specific device type
-        Args:
-            device_type: 'cuda', 'mps', or 'cpu'
-        """
-        try:
-            if device_type.lower() == 'cuda':
-                if not self._device_info['cuda_available']:
-                    raise DeviceError('cuda', 'CUDA not available on this system')
-                device = self._test_cuda_device()
-                if device is None:
-                    raise DeviceError('cuda', 'CUDA device failed functionality tests')
-            elif device_type.lower() == 'mps':
-                if not self._device_info['mps_available']:
-                    raise DeviceError('mps', 'MPS not available on this system')
-                device = self._test_mps_device()
-                if device is None:
-                    raise DeviceError('mps', 'MPS device failed functionality tests')
-            elif device_type.lower() == 'cpu':
-                device = torch.device('cpu')
-            else:
-                raise DeviceError('unknown', f'Unknown device type: {device_type}')
-            self._optimal_device = device
-            logger.info(f"Switched to device: {self._get_device_name(device)}")
-            return device
-        except DeviceError:
-            raise
         except Exception as e:
-            raise DeviceError(device_type, f"Failed to switch to {device_type}: {str(e)}")
-    def get_available_devices(self) -> List[str]:
-        """Get list of available device types"""
-        devices = ['cpu']  # CPU always available
-        if self._device_info['cuda_available']:
-            devices.append('cuda')
-        if self._device_info['mps_available']:
-            devices.append('mps')
-        return devices
-    def get_device_status(self) -> Dict[str, Any]:
-        """Get comprehensive device status"""
-        current_device = self.get_optimal_device()
-        status = {
-            'current_device': str(current_device),
-            'current_device_name': self._get_device_name(current_device),
-            'available_devices': self.get_available_devices(),
-            'device_info': self._device_info.copy(),
-            'capabilities': self.get_device_capabilities(current_device)
         }
-        # Add current memory usage if on GPU
-        if current_device.type == 'cuda':
-            try:
-                device_idx = current_device.index or 0
-                status['current_memory_usage'] = {
-                    'allocated_gb': torch.cuda.memory_allocated(device_idx) / (1024**3),
-                    'reserved_gb': torch.cuda.memory_reserved(device_idx) / (1024**3),
-                    'max_allocated_gb': torch.cuda.max_memory_allocated(device_idx) / (1024**3),
-                    'max_reserved_gb': torch.cuda.max_memory_reserved(device_idx) / (1024**3)
-                }
-            except Exception as e:
-                logger.warning(f"Could not get current memory usage: {e}")
-        return status
-    def optimize_for_processing(self) -> Dict[str, Any]:
-        """Optimize device settings for video processing"""
-        device = self.get_optimal_device()
-        optimizations = {
-            'device': str(device),
-            'optimizations_applied': []
-        }
-        try:
-            if device.type == 'cuda':
-                # Enable cuDNN benchmarking for consistent input sizes
-                torch.backends.cudnn.benchmark = True
-                optimizations['optimizations_applied'].append('cudnn_benchmark')
-                # Enable cuDNN deterministic mode if needed for reproducibility
-                # torch.backends.cudnn.deterministic = True
-                # Set memory allocation strategy
-                # os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
-                optimizations['optimizations_applied'].append('cuda_memory_strategy')
-            elif device.type == 'mps':
-                # MPS-specific optimizations would go here
-                optimizations['optimizations_applied'].append('mps_optimized')
-            else:  # CPU
-                # Set optimal number of threads for CPU processing
-                torch.set_num_threads(min(torch.get_num_threads(), 8))
-                optimizations['optimizations_applied'].append('cpu_thread_optimization')
-            logger.info(f"Applied optimizations for {device}: {optimizations['optimizations_applied']}")
-        except Exception as e:
-            logger.warning(f"Some optimizations failed: {e}")
-            optimizations['optimization_errors'] = str(e)
-        return optimizations
-    def cleanup_device_memory(self):
-        """Clean up device memory"""
-        device = self.get_optimal_device()
-        if device.type == 'cuda':
-            try:
-                torch.cuda.empty_cache()
-                torch.cuda.synchronize()
-                logger.debug("CUDA memory cache cleared")
-            except Exception as e:
-                logger.warning(f"CUDA memory cleanup failed: {e}")
-        elif device.type == 'mps':
-            try:
-                # MPS uses unified memory, less explicit cleanup needed
-                # But we can still run garbage collection
-                import gc
-                gc.collect()
-                logger.debug("MPS memory cleanup completed")
-            except Exception as e:
-                logger.warning(f"MPS memory cleanup failed: {e}")
-        else:  # CPU
-            try:
-                import gc
-                gc.collect()
-                logger.debug("CPU memory cleanup completed")
-            except Exception as e:
-                logger.warning(f"CPU memory cleanup failed: {e}")
-# Global instance for singleton pattern
 _device_manager_instance = None
 def get_device_manager() -> DeviceManager:
-    """Get or create a singleton DeviceManager instance"""
     global _device_manager_instance
     if _device_manager_instance is None:
         _device_manager_instance = DeviceManager()
-    return _device_manager_instance

 """
+Device Manager for BackgroundFX Pro
+Handles device detection, optimization, and hardware compatibility
 """
+import os
+import sys
 import platform
 import subprocess
+import logging
+from typing import Dict, Any, Optional, Tuple
+from dataclasses import dataclass
+from enum import Enum
+import torch
+import psutil
+import cpuinfo
 logger = logging.getLogger(__name__)
+class DeviceType(Enum):
+    """Enumeration of supported device types"""
+    CUDA = "cuda"
+    MPS = "mps"
+    CPU = "cpu"
+@dataclass
+class DeviceInfo:
+    """Information about a compute device"""
+    type: DeviceType
+    index: int
+    name: str
+    memory_total: int
+    memory_available: int
+    compute_capability: Optional[Tuple[int, int]] = None
 class DeviceManager:
+    """Manages compute devices and system optimization"""
+    _instance = None
     def __init__(self):
+        """Initialize device manager"""
+        self.devices = []
+        self.optimal_device = None
+        self.cpu_info = None
+        self.system_info = {}
+        # Initialize device detection
+        self._detect_devices()
+        self._gather_system_info()
+        self._determine_optimal_device()
+    def _detect_devices(self):
+        """Populate ``self.devices`` with every compute device found.
+        Appends one ``DeviceInfo`` per CUDA GPU, one for MPS when
+        ``torch.backends.mps`` reports it available, and always one for the
+        CPU (appended last).
+        """
+        self.devices = []
+        # Check for CUDA devices
+        if torch.cuda.is_available():
+            for i in range(torch.cuda.device_count()):
+                props = torch.cuda.get_device_properties(i)
+                self.devices.append(DeviceInfo(
+                    type=DeviceType.CUDA,
+                    index=i,
+                    name=props.name,
+                    memory_total=props.total_memory,
+                    # NOTE: memory_allocated() only counts tensors owned by this
+                    # process, so this is an upper bound on what is really free.
+                    memory_available=props.total_memory - torch.cuda.memory_allocated(i),
+                    compute_capability=(props.major, props.minor)
+                ))
+        # One RAM snapshot shared by the MPS and CPU entries so they agree
+        ram = psutil.virtual_memory()
+        # Check for MPS (Apple Silicon)
+        if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+            # MPS doesn't provide detailed device info like CUDA, so system
+            # RAM figures are reported for it
+            self.devices.append(DeviceInfo(
+                type=DeviceType.MPS,
+                index=0,
+                name="Apple Silicon GPU",
+                memory_total=ram.total,
+                memory_available=ram.available
+            ))
+        # CPU is always available
         try:
+            cpu_name = cpuinfo.get_cpu_info().get('brand_raw', 'Unknown CPU')
+        except Exception as e:
+            # cpuinfo is best-effort; fall back to the stdlib description
+            # instead of silently swallowing every exception type
+            logger.debug(f"cpuinfo lookup failed, using platform.processor(): {e}")
+            cpu_name = platform.processor() or "Unknown CPU"
+        self.devices.append(DeviceInfo(
+            type=DeviceType.CPU,
+            index=0,
+            name=cpu_name,
+            memory_total=ram.total,
+            memory_available=ram.available
+        ))
+    def _gather_system_info(self):
+        """Collect CPU details into ``self.cpu_info`` and OS/runtime facts
+        (platform, core counts, RAM, Python and torch versions) into
+        ``self.system_info``.
+        """
         try:
+            self.cpu_info = cpuinfo.get_cpu_info()
+        except Exception as e:
+            # Best-effort lookup: keep going with an empty mapping, but do not
+            # trap KeyboardInterrupt/SystemExit the way a bare except would
+            logger.debug(f"cpuinfo lookup failed: {e}")
+            self.cpu_info = {}
+        self.system_info = {
+            'platform': platform.system(),
+            'platform_release': platform.release(),
+            'platform_version': platform.version(),
+            'architecture': platform.machine(),
+            'processor': platform.processor(),
+            'cpu_count': psutil.cpu_count(logical=False),
+            'cpu_count_logical': psutil.cpu_count(logical=True),
+            'ram_total': psutil.virtual_memory().total,
+            'ram_available': psutil.virtual_memory().available,
+            'python_version': sys.version,
+            'torch_version': torch.__version__,
+        }
+    def _determine_optimal_device(self):
+        """Determine the optimal device for computation"""
+        # Priority: CUDA > MPS > CPU
+        cuda_devices = [d for d in self.devices if d.type == DeviceType.CUDA]
+        mps_devices = [d for d in self.devices if d.type == DeviceType.MPS]
+        cpu_devices = [d for d in self.devices if d.type == DeviceType.CPU]
+        if cuda_devices:
+            # Choose CUDA device with most available memory
+            self.optimal_device = max(cuda_devices, key=lambda d: d.memory_available)
+        elif mps_devices:
+            self.optimal_device = mps_devices[0]
+        else:
+            self.optimal_device = cpu_devices[0]
+        logger.info(f"Optimal device: {self.optimal_device.name} ({self.optimal_device.type.value})")
+    def get_optimal_device(self) -> str:
+        """Get the optimal device string for PyTorch"""
+        if self.optimal_device.type == DeviceType.CUDA:
+            return f"cuda:{self.optimal_device.index}"
+        elif self.optimal_device.type == DeviceType.MPS:
+            return "mps"
+        else:
+            return "cpu"
+    def fix_cuda_compatibility(self):
+        """Apply CUDA-related environment and backend settings.
+        Does nothing (beyond an info log) when CUDA is unavailable. Values the
+        user already exported for ``CUDA_LAUNCH_BLOCKING`` and
+        ``PYTORCH_CUDA_ALLOC_CONF`` are left untouched. Any failure is logged
+        as a warning rather than raised.
+        """
+        if not torch.cuda.is_available():
+            logger.info("CUDA not available, skipping compatibility fixes")
+            return
         try:
+            # Synchronous kernel launches make CUDA errors surface at the
+            # failing call, at a throughput cost. Only set when not already
+            # chosen by the user.
+            # NOTE(review): presumably only effective if set before CUDA is
+            # initialised in this process - confirm call order.
+            os.environ.setdefault('CUDA_LAUNCH_BLOCKING', '1')
+            # TF32 matmul/conv paths exist on Ampere and newer GPUs; the
+            # flags are simply ignored on older hardware.
+            torch.backends.cuda.matmul.allow_tf32 = True
+            torch.backends.cudnn.allow_tf32 = True
+            # Limit how large a cached block the allocator may split, which
+            # reduces fragmentation (this is not a memory-fraction cap)
+            os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF', 'max_split_size_mb:512')
+            logger.info("CUDA compatibility settings applied")
         except Exception as e:
+            logger.warning(f"Error applying CUDA compatibility fixes: {e}")
+    def setup_optimal_threading(self):
+        """Configure OpenMP, MKL and PyTorch thread counts for this machine.
+        Uses the physical core count capped at 8 (4 when it cannot be
+        determined). ``MKL_NUM_THREADS`` is only set when the processor string
+        in ``self.system_info`` mentions Intel. Safe to call repeatedly:
+        the one-shot inter-op setting is attempted separately so a refusal
+        there does not undo the values already applied.
+        """
         try:
+            # psutil returns None when the physical core count is unknown
+            physical_cores = psutil.cpu_count(logical=False) or 4
+            num_threads = min(physical_cores, 8)  # Cap at 8 threads
+            os.environ['OMP_NUM_THREADS'] = str(num_threads)
+            # Set MKL threads for Intel processors
+            if 'intel' in self.system_info.get('processor', '').lower():
+                os.environ['MKL_NUM_THREADS'] = os.environ['OMP_NUM_THREADS']
+            # Set PyTorch intra-op threads
+            torch.set_num_threads(num_threads)
+            # With CUDA present, keep CPU-side inter-op parallelism small
+            if torch.cuda.is_available():
+                try:
+                    torch.set_num_interop_threads(2)
+                except RuntimeError as e:
+                    # PyTorch allows this only once and only before parallel
+                    # work starts; a later call must not fail the whole setup
+                    logger.debug(f"Inter-op thread count left unchanged: {e}")
+            logger.info(f"Threading configured: OMP_NUM_THREADS={os.environ.get('OMP_NUM_THREADS')}")
         except Exception as e:
+            logger.warning(f"Error setting up threading: {e}")
+            # Safe defaults, without clobbering values that were set successfully
+            os.environ.setdefault('OMP_NUM_THREADS', '4')
+            os.environ.setdefault('MKL_NUM_THREADS', '4')
+    def get_system_diagnostics(self) -> Dict[str, Any]:
+        """Get comprehensive system diagnostics"""
+        diagnostics = {
+            'system': self.system_info.copy(),
+            'devices': [],
+            'optimal_device': None,
+            'threading': {
+                'omp_num_threads': os.environ.get('OMP_NUM_THREADS', 'not set'),
+                'mkl_num_threads': os.environ.get('MKL_NUM_THREADS', 'not set'),
+                'torch_num_threads': torch.get_num_threads(),
+            }
         }
+        # Add device information
+        for device in self.devices:
+            device_info = {
+                'type': device.type.value,
+                'index': device.index,
+                'name': device.name,
+                'memory_total_gb': device.memory_total / (1024**3),
+                'memory_available_gb': device.memory_available / (1024**3),
+            }
+            if device.compute_capability:
+                device_info['compute_capability'] = f"{device.compute_capability[0]}.{device.compute_capability[1]}"
+            diagnostics['devices'].append(device_info)
+        # Add optimal device
+        if self.optimal_device:
+            diagnostics['optimal_device'] = {
+                'type': self.optimal_device.type.value,
+                'name': self.optimal_device.name,
+                'pytorch_device': self.get_optimal_device()
+            }
+        # Add CUDA-specific diagnostics
+        if torch.cuda.is_available():
+            diagnostics['cuda'] = {
+                'available': True,
+                'version': torch.version.cuda,
+                'device_count': torch.cuda.device_count(),
+                'current_device': torch.cuda.current_device() if torch.cuda.is_initialized() else None,
+            }
+        else:
+            diagnostics['cuda'] = {'available': False}
+        # Add MPS-specific diagnostics
+        if hasattr(torch.backends, 'mps'):
+            diagnostics['mps'] = {
+                'available': torch.backends.mps.is_available(),
+                'built': torch.backends.mps.is_built()
+            }
+        else:
+            diagnostics['mps'] = {'available': False}
+        return diagnostics
+    def get_device_for_model(self, model_size_gb: float = 2.0) -> str:
+        """Get appropriate device based on model size requirements"""
+        required_memory = model_size_gb * 1024**3 * 1.5  # 1.5x for overhead
+        # Check CUDA devices first
+        cuda_devices = [d for d in self.devices if d.type == DeviceType.CUDA]
+        for device in cuda_devices:
+            if device.memory_available > required_memory:
+                return f"cuda:{device.index}"
+        # Check MPS
+        mps_devices = [d for d in self.devices if d.type == DeviceType.MPS]
+        if mps_devices and mps_devices[0].memory_available > required_memory:
+            return "mps"
+        # Fallback to CPU
+        return "cpu"
+# Singleton instance holder
 _device_manager_instance = None
 def get_device_manager() -> DeviceManager:
+    """Get or create the singleton DeviceManager instance"""
     global _device_manager_instance
     if _device_manager_instance is None:
         _device_manager_instance = DeviceManager()
+    return _device_manager_instance
+def get_optimal_device() -> str:
+    """
+    Get the optimal device string for PyTorch operations.
+    Returns:
+        str: Device string like 'cuda:0', 'mps', or 'cpu'
+    """
+    manager = get_device_manager()
+    return manager.get_optimal_device()
+def fix_cuda_compatibility():
+    """
+    Apply CUDA compatibility settings for stable operation.
+    Sets environment variables and PyTorch settings for CUDA compatibility.
+    """
+    manager = get_device_manager()
+    manager.fix_cuda_compatibility()
+def setup_optimal_threading():
+    """
+    Configure optimal threading settings for the current system.
+    Sets OMP_NUM_THREADS, MKL_NUM_THREADS, and PyTorch thread counts.
+    """
+    manager = get_device_manager()
+    manager.setup_optimal_threading()
+def get_system_diagnostics() -> Dict[str, Any]:
+    """
+    Get comprehensive system diagnostics information.
+    Returns:
+        Dict containing system info, device info, and configuration details
+    """
+    manager = get_device_manager()
+    return manager.get_system_diagnostics()
+# Initialize and configure on module import (skipped when run as a script)
+if __name__ != "__main__":
+    # When imported, automatically set up threading to avoid the libgomp error
+    try:
+        # Ensure OMP_NUM_THREADS has a value before any OpenMP operations,
+        # keeping whatever the user already exported
+        os.environ.setdefault('OMP_NUM_THREADS', '4')
+        # Get the manager instance and configure threading properly
+        manager = get_device_manager()
+        manager.setup_optimal_threading()
+    except Exception as e:
+        logger.warning(f"Error during device manager initialization: {e}")
+        # Guarantee a value exists without overwriting one that is already valid
+        os.environ.setdefault('OMP_NUM_THREADS', '4')