Spaces:
Running
on
Zero
Running
on
Zero
File size: 4,049 Bytes
e1d0067 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
"""
ZeroGPU-compatible startup script to handle NVMe offloading directory setup
"""
import os
import sys
import subprocess
def setup_zerogpu_environment(zerogpu_offload_dir='/data-nvme/zerogpu-offload'):
    """
    Setup ZeroGPU environment with proper offload directory handling.

    Ensures the offload directory exists and is writable by creating it if
    needed and then writing and deleting a small probe file. The probe runs
    even when the directory already exists, so a pre-existing but read-only
    directory (or a regular file sitting at that path) is detected instead of
    being reported as usable.

    Args:
        zerogpu_offload_dir: Directory to prepare. Defaults to the primary
            NVMe offload location used by this script.

    Returns:
        None. When the directory cannot be created or written,
        ``setup_fallback_offload()`` is invoked to pick another location.
    """
    print("Setting up ZeroGPU environment...")

    # Remember whether we created it, only to choose the success message.
    already_present = os.path.exists(zerogpu_offload_dir)
    test_file = os.path.join(zerogpu_offload_dir, 'test_write.tmp')

    try:
        os.makedirs(zerogpu_offload_dir, exist_ok=True)
        # Test write permissions regardless of who created the directory.
        with open(test_file, 'w') as f:
            f.write('test')
        os.remove(test_file)
    except OSError as e:  # PermissionError is a subclass of OSError
        print(f"⚠️ Cannot create NVMe directory ({e}), setting up fallback...")
        setup_fallback_offload()
        return

    if already_present:
        print(f"✅ ZeroGPU offload directory exists: {zerogpu_offload_dir}")
    else:
        print(f"✅ Created ZeroGPU offload directory: {zerogpu_offload_dir}")
def setup_fallback_offload(fallback_options=None):
    """
    Setup fallback offload directory when NVMe is not available.

    Tries each candidate in order: creates the directory, writes and removes
    a probe file, and on the first success exports the directory through the
    ``ZEROGPU_OFFLOAD_DIR`` environment variable. If every candidate fails,
    ``ZEROGPU_DISABLE_OFFLOAD`` is set to ``'1'`` instead.

    Args:
        fallback_options: Optional iterable of directory paths to try, in
            priority order. Defaults to the script's built-in list.

    Returns:
        None. The outcome is communicated via environment variables.
    """
    if fallback_options is None:
        fallback_options = [
            '/tmp/zerogpu-offload',
            '/home/user/zerogpu-offload',
            './zerogpu-offload',
        ]

    for candidate in fallback_options:
        # Export an absolute path: a relative value would point somewhere
        # else as soon as the working directory changes.
        fallback_dir = os.path.abspath(candidate)
        try:
            os.makedirs(fallback_dir, exist_ok=True)
            # Test write permissions
            test_file = os.path.join(fallback_dir, 'test_write.tmp')
            with open(test_file, 'w') as f:
                f.write('test')
            os.remove(test_file)
        except OSError as e:
            print(f"❌ Failed to setup {fallback_dir}: {e}")
            continue

        # Set environment variables for ZeroGPU to use this directory
        os.environ['ZEROGPU_OFFLOAD_DIR'] = fallback_dir
        print(f"✅ Using fallback offload directory: {fallback_dir}")
        return

    # If all fallbacks fail, disable offloading
    print("⚠️ All offload directories failed, disabling ZeroGPU offloading")
    os.environ['ZEROGPU_DISABLE_OFFLOAD'] = '1'
def setup_memory_optimization():
    """
    Export the PyTorch/CUDA memory-related environment variables.

    Each variable is written into ``os.environ`` unconditionally (any value
    already present is overwritten) and echoed to stdout.
    """
    settings = (
        ('PYTORCH_CUDA_ALLOC_CONF', 'expandable_segments:True'),
        # '0' is meant to leave kernel launches asynchronous.
        ('CUDA_LAUNCH_BLOCKING', '0'),
        # '0' is meant to keep the caching allocator enabled.
        # NOTE(review): some PyTorch releases may treat the mere presence of
        # this variable as "caching off" - confirm for the deployed version.
        ('PYTORCH_NO_CUDA_MEMORY_CACHING', '0'),
    )
    for name, setting in settings:
        os.environ[name] = setting
        print(f"Set {name}={setting}")
def check_disk_space(directory, required_gb=50):
    """
    Report the free space under ``directory`` and compare it to a threshold.

    Args:
        directory: Path whose filesystem is inspected.
        required_gb: Minimum free space wanted, in units of 1024**3 bytes.

    Returns:
        True when the free space is at least ``required_gb``; also True when
        the check itself fails, so a failed probe never blocks startup.
        False only when the space was measured and found insufficient.
    """
    try:
        import shutil
        usage = shutil.disk_usage(directory)
        available_gb = usage.free / (1024 ** 3)
        print(f"Available disk space in {directory}: {available_gb:.1f}GB")
        enough = available_gb >= required_gb
    except Exception as err:
        print(f"Could not check disk space: {err}")
        enough = True  # Assume it's okay if we can't check
    return enough
if __name__ == "__main__":
    # Script entry point: prepare the offload directory and the memory
    # settings, then warn if the chosen directory is short on space. Any
    # failure is reported but deliberately not re-raised.
    try:
        setup_zerogpu_environment()
        setup_memory_optimization()

        # Prefer the fallback location if one was exported, else the
        # primary NVMe path.
        active_dir = os.getenv('ZEROGPU_OFFLOAD_DIR', '/data-nvme/zerogpu-offload')
        if os.path.exists(active_dir) and not check_disk_space(active_dir):
            print("⚠️ Low disk space, consider cleaning up or using smaller models")

        print("🚀 ZeroGPU environment setup complete!")
    except Exception as exc:
        print(f"❌ Error setting up ZeroGPU environment: {exc}")
        print("Continuing with default configuration...")