File size: 4,049 Bytes
e1d0067
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""

ZeroGPU-compatible startup script to handle NVMe offloading directory setup

"""

import os
import sys
import subprocess

def setup_zerogpu_environment(offload_dir='/data-nvme/zerogpu-offload'):
    """
    Prepare the primary ZeroGPU offload directory, falling back if unusable.

    The directory is created when missing and is then probed with a
    throw-away temporary file. The probe also runs for a directory that
    already exists, so a present-but-unwritable location is detected instead
    of being reported as ready. If creation or the probe fails,
    ``setup_fallback_offload()`` is called to choose another location.

    This function does not export ``ZEROGPU_OFFLOAD_DIR`` on success; a
    caller passing a non-default ``offload_dir`` must export it itself if
    other code is expected to read it from the environment.

    Args:
        offload_dir: Directory to prepare. Defaults to the NVMe path
            ``/data-nvme/zerogpu-offload``.

    Returns:
        None. Progress is reported through ``print``.
    """
    import tempfile

    print("Setting up ZeroGPU environment...")

    # Remember the initial state so the success message stays accurate.
    already_present = os.path.exists(offload_dir)

    try:
        os.makedirs(offload_dir, exist_ok=True)
        # A uniquely named probe cannot clobber a real file in an existing
        # directory and is removed automatically when the block exits.
        with tempfile.NamedTemporaryFile(
            mode='w', dir=offload_dir, suffix='.tmp'
        ) as probe:
            probe.write('test')
    except OSError as e:  # PermissionError is a subclass of OSError
        print(f"⚠️ Cannot create NVMe directory ({e}), setting up fallback...")
        setup_fallback_offload()
        return

    if already_present:
        print(f"✅ ZeroGPU offload directory exists: {offload_dir}")
    else:
        print(f"✅ Created ZeroGPU offload directory: {offload_dir}")

def setup_fallback_offload(fallback_options=None):
    """
    Pick the first writable fallback offload directory and export it.

    Each candidate is resolved to an absolute path, created if missing, and
    probed with a throw-away temporary file. The first candidate that passes
    is stored in the ``ZEROGPU_OFFLOAD_DIR`` environment variable. If every
    candidate fails, ``ZEROGPU_DISABLE_OFFLOAD`` is set to ``'1'`` instead.

    Args:
        fallback_options: Optional iterable of candidate directory paths,
            tried in order. When ``None``, the built-in list
            (``/tmp``, ``/home/user`` and the current directory) is used.

    Returns:
        None. Results are reported through ``print`` and ``os.environ``.
    """
    import tempfile

    if fallback_options is None:
        fallback_options = [
            '/tmp/zerogpu-offload',
            '/home/user/zerogpu-offload',
            './zerogpu-offload',
        ]

    for candidate in fallback_options:
        # Resolve relative candidates now: the exported value must keep
        # pointing at the same directory even if the working directory
        # changes later.
        fallback_dir = os.path.abspath(candidate)
        try:
            os.makedirs(fallback_dir, exist_ok=True)
            # Unique probe name: never overwrites an existing file, and the
            # file is deleted automatically even if the write fails.
            with tempfile.NamedTemporaryFile(
                mode='w', dir=fallback_dir, suffix='.tmp'
            ) as probe:
                probe.write('test')
        except OSError as e:
            print(f"❌ Failed to setup {fallback_dir}: {e}")
            continue

        # Set environment variable for ZeroGPU to use this directory
        os.environ['ZEROGPU_OFFLOAD_DIR'] = fallback_dir
        print(f"✅ Using fallback offload directory: {fallback_dir}")
        return

    # If all fallbacks fail, disable offloading
    print("⚠️ All offload directories failed, disabling ZeroGPU offloading")
    os.environ['ZEROGPU_DISABLE_OFFLOAD'] = '1'

def setup_memory_optimization():
    """
    Export PyTorch/CUDA memory-related environment variables for ZeroGPU.

    Each setting is written to ``os.environ`` (overwriting any existing
    value) and echoed with ``print``. The values only influence code that
    reads them afterwards in this process or in child processes.

    Returns:
        None.
    """
    memory_config = {
        'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True',
        'CUDA_LAUNCH_BLOCKING': '0',  # Allow async operations
    }
    # NOTE(review): PYTORCH_NO_CUDA_MEMORY_CACHING is intentionally NOT
    # exported. Caching is the default when the variable is unset, and some
    # PyTorch releases appear to test only whether the variable is present,
    # so exporting '0' could switch caching off rather than keep it on --
    # confirm against the PyTorch version in use.

    for key, value in memory_config.items():
        os.environ[key] = value
        print(f"Set {key}={value}")

def check_disk_space(directory, required_gb=50):
    """
    Report whether ``directory`` has at least ``required_gb`` of free space.

    The free space is printed with one decimal place. Sizes are computed
    with 1024**3 bytes per unit.

    Args:
        directory: Path whose containing filesystem is inspected.
        required_gb: Minimum free space required. Defaults to 50.

    Returns:
        True when the free space meets the requirement, False when it does
        not. If the check itself fails for any reason, the failure is
        printed and True is returned (best-effort: assume it is fine).
    """
    bytes_per_unit = 1024 ** 3
    try:
        from shutil import disk_usage

        usage = disk_usage(directory)
        available = usage.free / bytes_per_unit
        print(f"Available disk space in {directory}: {available:.1f}GB")
        enough = available >= required_gb
    except Exception as err:
        print(f"Could not check disk space: {err}")
        enough = True  # Unable to measure, so do not block the caller
    return enough

if __name__ == "__main__":
    # Script entry point: prepare the offload directory, export the memory
    # settings, then warn if the chosen directory is short on disk space.
    # Any failure is reported and swallowed so startup can continue.
    try:
        setup_zerogpu_environment()
        setup_memory_optimization()

        # A fallback exports ZEROGPU_OFFLOAD_DIR; otherwise the primary
        # NVMe path is the one to inspect.
        target_dir = os.environ.get('ZEROGPU_OFFLOAD_DIR', '/data-nvme/zerogpu-offload')
        if os.path.exists(target_dir) and not check_disk_space(target_dir):
            print("⚠️ Low disk space, consider cleaning up or using smaller models")

        print("🚀 ZeroGPU environment setup complete!")

    except Exception as err:
        print(f"❌ Error setting up ZeroGPU environment: {err}")
        print("Continuing with default configuration...")