"""ZeroGPU-compatible startup script.

Prepares the directory used for NVMe offloading and exports a few PyTorch
allocator environment variables.  When the preferred NVMe location cannot be
used, a writable fallback directory is selected and published through the
``ZEROGPU_OFFLOAD_DIR`` environment variable; if no location is writable,
``ZEROGPU_DISABLE_OFFLOAD`` is set to ``'1'`` instead.
"""

import os
import shutil
import subprocess
import sys
import tempfile
from typing import Iterable, Optional

# Preferred offload location.
DEFAULT_OFFLOAD_DIR = '/data-nvme/zerogpu-offload'

# Locations tried, in order, when the preferred directory is unusable.
FALLBACK_OFFLOAD_DIRS = (
    '/tmp/zerogpu-offload',
    '/home/user/zerogpu-offload',
    './zerogpu-offload',
)


def _ensure_writable_dir(directory: str) -> None:
    """Create *directory* if needed and prove that a file can be written to it.

    A uniquely named temporary file is used for the probe so that concurrent
    processes probing the same directory cannot clobber each other, and so the
    probe file is removed even when the write itself fails.

    Raises:
        OSError: if the directory cannot be created (including the case where
            the path exists but is not a directory) or is not writable.
    """
    os.makedirs(directory, exist_ok=True)
    with tempfile.NamedTemporaryFile(
        mode='w', dir=directory, prefix='test_write_', suffix='.tmp'
    ) as probe:
        probe.write('test')
        probe.flush()


def setup_zerogpu_environment(offload_dir: str = DEFAULT_OFFLOAD_DIR) -> None:
    """Make sure a usable offload directory is available.

    The directory is created when missing and is always probed for write
    access, even when it already exists: a path that exists but is read-only
    (or is not a directory at all) is just as unusable as a missing one.  When
    the probe fails, :func:`setup_fallback_offload` is invoked.

    Args:
        offload_dir: Preferred offload directory.  Defaults to the NVMe
            location ``/data-nvme/zerogpu-offload``.
    """
    print("Setting up ZeroGPU environment...")

    # Remember whether the directory was already there, purely for reporting.
    already_present = os.path.isdir(offload_dir)

    try:
        _ensure_writable_dir(offload_dir)
    except OSError as e:
        print(f"⚠️ Cannot use NVMe directory ({e}), setting up fallback...")
        setup_fallback_offload()
        return

    if already_present:
        print(f"✅ ZeroGPU offload directory exists: {offload_dir}")
    else:
        print(f"✅ Created ZeroGPU offload directory: {offload_dir}")


def setup_fallback_offload(
    fallback_options: Optional[Iterable[str]] = None,
) -> None:
    """Select the first writable fallback directory and export it.

    On success ``ZEROGPU_OFFLOAD_DIR`` is set to the absolute path of the
    chosen directory.  If every candidate fails, ``ZEROGPU_DISABLE_OFFLOAD``
    is set to ``'1'``.

    Args:
        fallback_options: Candidate directories, tried in order.  Defaults to
            ``FALLBACK_OFFLOAD_DIRS``.
    """
    if fallback_options is None:
        fallback_options = FALLBACK_OFFLOAD_DIRS

    for fallback_dir in fallback_options:
        try:
            _ensure_writable_dir(fallback_dir)
        except OSError as e:
            print(f"❌ Failed to setup {fallback_dir}: {e}")
            continue

        # Export an absolute path: a relative candidate such as
        # './zerogpu-offload' would silently point somewhere else as soon as
        # the working directory changes.
        chosen_dir = os.path.abspath(fallback_dir)
        os.environ['ZEROGPU_OFFLOAD_DIR'] = chosen_dir
        print(f"✅ Using fallback offload directory: {chosen_dir}")
        return

    # No candidate was usable.
    print("⚠️ All offload directories failed, disabling ZeroGPU offloading")
    os.environ['ZEROGPU_DISABLE_OFFLOAD'] = '1'


def setup_memory_optimization() -> None:
    """Export PyTorch CUDA allocator settings into ``os.environ``.

    Existing values of these variables are overwritten.  Each assignment is
    echoed to stdout.
    """
    memory_config = {
        'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True',
        'CUDA_LAUNCH_BLOCKING': '0',  # Allow async operations
        # Intended to keep memory caching enabled.
        # NOTE(review): some PyTorch releases appear to treat the mere
        # presence of this variable as "caching disabled", whatever its
        # value - confirm against the deployed PyTorch version.
        'PYTORCH_NO_CUDA_MEMORY_CACHING': '0',
    }

    for key, value in memory_config.items():
        os.environ[key] = value
        print(f"Set {key}={value}")


def check_disk_space(directory: str, required_gb: float = 50) -> bool:
    """Report whether *directory* has at least *required_gb* GiB free.

    Args:
        directory: Path whose filesystem is inspected.
        required_gb: Minimum free space, in units of 1024**3 bytes.

    Returns:
        ``True`` when enough space is free.  Also ``True`` when the free space
        cannot be determined: the check is advisory, so a failure to measure
        must not block startup.
    """
    try:
        free_gb = shutil.disk_usage(directory).free / (1024 ** 3)
    except Exception as e:  # deliberately best-effort, see docstring
        print(f"Could not check disk space: {e}")
        return True

    print(f"Available disk space in {directory}: {free_gb:.1f}GB")
    return free_gb >= required_gb


def main() -> None:
    """Run the full startup sequence, never letting a failure escape.

    Any exception is reported and swallowed so that the surrounding
    application can continue with its default configuration.
    """
    try:
        setup_zerogpu_environment()
        setup_memory_optimization()

        # Check if we have enough disk space in whichever directory is in use.
        offload_dir = os.environ.get('ZEROGPU_OFFLOAD_DIR', DEFAULT_OFFLOAD_DIR)
        if os.path.exists(offload_dir) and not check_disk_space(offload_dir):
            print("⚠️ Low disk space, consider cleaning up or using smaller models")

        print("🚀 ZeroGPU environment setup complete!")
    except Exception as e:
        print(f"❌ Error setting up ZeroGPU environment: {e}")
        print("Continuing with default configuration...")


if __name__ == "__main__":
    main()