EdBanshee committed on
Commit
e1d0067
·
1 Parent(s): 5523755

This will need to be reverted

Browse files
Files changed (3) hide show
  1. ZEROGPU_SETUP_GUIDE.md +77 -0
  2. app.py +30 -0
  3. setup_zerogpu.py +112 -0
ZEROGPU_SETUP_GUIDE.md ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Additional Environment Configuration for ZeroGPU
2
+
3
+ Add this to your Hugging Face Space's **Settings** → **Variables**:
4
+
5
+ ## Environment Variables
6
+
7
+ ### Required:
8
+ ```
9
+ ZEROGPU_OFFLOAD_DIR=/tmp/zerogpu-offload
10
+ ```
11
+
12
+ ### Recommended:
13
+ ```
14
+ PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
15
+ CUDA_LAUNCH_BLOCKING=0
16
+ HF_HUB_ENABLE_HF_TRANSFER=1
17
+ ```
18
+
19
+ ## Alternative: Direct Folder Creation
20
+
21
+ If the above doesn't work, you can also try creating a startup script in your Space.
22
+
23
+ ## Space Configuration File
24
+
25
+ Create or modify your Space's `README.md` to include:
26
+
27
+ ```yaml
28
+ ---
29
+ title: Wan2.2-Fast-I2I
30
+ emoji: 💻
31
+ colorFrom: purple
32
+ colorTo: gray
33
+ sdk: gradio
34
+ sdk_version: 5.44.1
35
+ app_file: app.py
36
+ pinned: false
37
+ hardware: a10g-large
38
+ ```
39
+
40
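+ Note: `a10g-large` is a dedicated A10G GPU tier rather than ZeroGPU. Hardware is normally chosen in the Space's **Settings**, so verify that a `hardware:` key in `README.md` is actually honored (the documented README key is `suggested_hardware`) before relying on it to provide sufficient memory.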
41
+
42
+ ## Dockerfile Alternative
43
+
44
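+ If you need more control, create a `Dockerfile` (note: ZeroGPU is only available to Gradio SDK Spaces, so a Docker Space needs dedicated GPU hardware instead):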
45
+
46
+ ```dockerfile
47
+ FROM python:3.10
48
+
49
+ # Create offload directory
50
+ RUN mkdir -p /data-nvme/zerogpu-offload && chmod 755 /data-nvme/zerogpu-offload
51
+
52
+ # Set environment variables
53
+ ENV ZEROGPU_OFFLOAD_DIR=/data-nvme/zerogpu-offload
54
+ ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
55
+
56
+ # Install your requirements
57
+ COPY requirements.txt .
58
+ RUN pip install -r requirements.txt
59
+
60
+ # Copy your app
61
+ COPY . /app
62
+ WORKDIR /app
63
+
64
+ CMD ["python", "app.py"]
65
+ ```
66
+
67
+ ## Testing the Fix
68
+
69
+ The modifications I made to `app.py` should handle:
70
+
71
+ 1. ✅ **Automatic directory creation** - Creates `/data-nvme/zerogpu-offload` or falls back to `/tmp/zerogpu-offload`
72
+ 2. ✅ **Permission handling** - Gracefully handles cases where NVMe isn't writable
73
+ 3. ✅ **Environment variables** - Sets proper PyTorch memory configuration
74
+ 4. ✅ **ZeroGPU decorators restored** - Keeps `@spaces.GPU()` for proper GPU allocation
75
+ 5. ✅ **Memory optimization** - Added garbage collection and CUDA cache clearing
76
+
77
+ The error should be resolved and your Space should run on ZeroGPU infrastructure properly.
app.py CHANGED
@@ -2,6 +2,27 @@ import os
2
  # PyTorch 2.8 (temporary hack)
3
  os.system('pip install --upgrade --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu126 "torch<2.9" spaces')
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  # --- 1. Model Download and Setup (Diffusers Backend) ---
6
  import spaces
7
  import torch
@@ -211,6 +232,11 @@ def generate_video(
211
 
212
  progress(0.2, desc=f"Generating {num_frames} frames at {target_width}x{target_height} (seed: {current_seed})...")
213
 
 
 
 
 
 
214
  output_frames_list = pipe(
215
  image=processed_start_image,
216
  last_image=processed_end_image,
@@ -232,6 +258,10 @@ def generate_video(
232
 
233
  export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
234
 
 
 
 
 
235
  progress(1.0, desc="Done!")
236
  return video_path, current_seed
237
 
 
2
  # PyTorch 2.8 (temporary hack)
3
  os.system('pip install --upgrade --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu126 "torch<2.9" spaces')
4
 
5
+ # Setup ZeroGPU environment to handle NVMe offloading
6
+ try:
7
+ from setup_zerogpu import setup_zerogpu_environment, setup_memory_optimization
8
+ setup_zerogpu_environment()
9
+ setup_memory_optimization()
10
+ except ImportError:
11
+ # Fallback if setup_zerogpu.py is not available
12
+ print("Using fallback ZeroGPU setup...")
13
+ zerogpu_offload_dir = '/data-nvme/zerogpu-offload'
14
+ if not os.path.exists(zerogpu_offload_dir):
15
+ try:
16
+ os.makedirs(zerogpu_offload_dir, exist_ok=True)
17
+ print("Created ZeroGPU offload directory: " + zerogpu_offload_dir)
18
+ except (OSError, PermissionError):
19
+ fallback_dir = '/tmp/zerogpu-offload'
20
+ os.makedirs(fallback_dir, exist_ok=True)
21
+ os.environ['ZEROGPU_OFFLOAD_DIR'] = fallback_dir
22
+ print("Using fallback offload directory: " + fallback_dir)
23
+
24
+ os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
25
+
26
  # --- 1. Model Download and Setup (Diffusers Backend) ---
27
  import spaces
28
  import torch
 
232
 
233
  progress(0.2, desc=f"Generating {num_frames} frames at {target_width}x{target_height} (seed: {current_seed})...")
234
 
235
+ # Add memory cleanup before generation to avoid OOM
236
+ import gc
237
+ gc.collect()
238
+ torch.cuda.empty_cache()
239
+
240
  output_frames_list = pipe(
241
  image=processed_start_image,
242
  last_image=processed_end_image,
 
258
 
259
  export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
260
 
261
+ # Clean up memory after generation
262
+ gc.collect()
263
+ torch.cuda.empty_cache()
264
+
265
  progress(1.0, desc="Done!")
266
  return video_path, current_seed
267
 
setup_zerogpu.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ZeroGPU-compatible startup script to handle NVMe offloading directory setup
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ import subprocess
8
+
9
def setup_zerogpu_environment(offload_dir='/data-nvme/zerogpu-offload'):
    """
    Prepare the ZeroGPU offload directory, falling back when it is unusable.

    The directory is created if it is missing and is then probed with a small
    write/delete cycle. The probe also runs when the directory already exists,
    so a present-but-read-only location triggers the fallback instead of being
    reported as usable.

    Args:
        offload_dir: Directory to prepare. Defaults to the NVMe location this
            module has always used.

    Returns:
        None. On any OSError, setup_fallback_offload() is called to pick an
        alternative location.
    """
    print("Setting up ZeroGPU environment...")

    # Remember the prior state only to choose the right status message below.
    already_present = os.path.exists(offload_dir)
    probe_path = os.path.join(offload_dir, 'test_write.tmp')

    try:
        # exist_ok=True makes a separate existence check unnecessary.
        os.makedirs(offload_dir, exist_ok=True)
        # Test write permissions
        with open(probe_path, 'w') as probe:
            probe.write('test')
        os.remove(probe_path)
    except OSError as e:
        # PermissionError is a subclass of OSError, so one clause covers both.
        print(f"⚠️ Cannot create NVMe directory ({e}), setting up fallback...")
        setup_fallback_offload()
        return

    if already_present:
        print(f"✅ ZeroGPU offload directory exists: {offload_dir}")
    else:
        print(f"✅ Created ZeroGPU offload directory: {offload_dir}")
35
+
36
def setup_fallback_offload(fallback_options=None):
    """
    Pick the first writable fallback offload directory and export it.

    Each candidate is created if needed and probed with a small write/delete
    cycle. The first one that works is stored, as an absolute path, in the
    ZEROGPU_OFFLOAD_DIR environment variable. If none works,
    ZEROGPU_DISABLE_OFFLOAD is set to '1'.

    Args:
        fallback_options: Optional iterable of candidate directories, tried in
            order. Defaults to the built-in list of locations.

    Returns:
        None. The outcome is communicated through os.environ.
    """
    if fallback_options is None:
        # Try different fallback locations
        fallback_options = [
            '/tmp/zerogpu-offload',
            '/home/user/zerogpu-offload',
            './zerogpu-offload',
        ]

    for candidate in fallback_options:
        # Resolve now so the exported path stays valid if the working
        # directory changes later (matters for the relative candidate).
        fallback_dir = os.path.abspath(candidate)
        test_file = os.path.join(fallback_dir, 'test_write.tmp')
        try:
            os.makedirs(fallback_dir, exist_ok=True)
            # Test write permissions
            with open(test_file, 'w') as f:
                f.write('test')
            os.remove(test_file)
        except OSError as e:
            # Only filesystem failures mean "try the next candidate".
            print(f"❌ Failed to setup {fallback_dir}: {e}")
            continue

        # Set environment variables for ZeroGPU to use this directory
        os.environ['ZEROGPU_OFFLOAD_DIR'] = fallback_dir
        print(f"✅ Using fallback offload directory: {fallback_dir}")
        return

    # If all fallbacks fail, disable offloading
    # NOTE(review): nothing in this module reads ZEROGPU_DISABLE_OFFLOAD;
    # confirm that the `spaces` runtime actually honors it.
    print("⚠️ All offload directories failed, disabling ZeroGPU offloading")
    os.environ['ZEROGPU_DISABLE_OFFLOAD'] = '1'
68
+
69
def setup_memory_optimization():
    """
    Export default PyTorch/CUDA memory settings for ZeroGPU.

    Values that are already present in the environment (for example set in the
    Space's variables) are left untouched, so explicit operator configuration
    wins over these defaults.

    PYTORCH_NO_CUDA_MEMORY_CACHING is deliberately not exported: leaving it
    unset keeps the caching allocator enabled.
    NOTE(review): some PyTorch builds reportedly only test whether that
    variable is present, so exporting '0' could disable caching - confirm
    against the PyTorch version in use.

    Returns:
        None. The settings are written to os.environ.
    """
    memory_config = {
        'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True',
        'CUDA_LAUNCH_BLOCKING': '0',  # Allow async operations
    }

    for key, value in memory_config.items():
        if key in os.environ:
            # Do not clobber a value the operator configured on purpose.
            print(f"Keeping existing {key}={os.environ[key]}")
        else:
            os.environ[key] = value
            print(f"Set {key}={value}")
82
+
83
def check_disk_space(directory, required_gb=50):
    """
    Check if there's enough disk space for offloading.

    Args:
        directory: Path whose filesystem is inspected.
        required_gb: Minimum free space, in GiB, considered sufficient.

    Returns:
        True when at least required_gb GiB are free, or when the filesystem
        could not be queried (best effort); False otherwise.
    """
    import shutil

    try:
        # Keep the try body to the single call that can fail on the
        # filesystem; programming errors are no longer swallowed.
        free_bytes = shutil.disk_usage(directory).free
    except OSError as e:
        print(f"Could not check disk space: {e}")
        return True  # Assume it's okay if we can't check

    free_gb = free_bytes / (1024**3)
    print(f"Available disk space in {directory}: {free_gb:.1f}GB")
    return free_gb >= required_gb
96
+
97
if __name__ == "__main__":
    # Script entry point: prepare the offload directory and memory settings,
    # then warn if the chosen offload location is short on disk space.
    try:
        setup_zerogpu_environment()
        setup_memory_optimization()

        # Check if we have enough disk space
        default_dir = '/data-nvme/zerogpu-offload'
        offload_dir = os.environ.get('ZEROGPU_OFFLOAD_DIR', default_dir)
        low_on_space = os.path.exists(offload_dir) and not check_disk_space(offload_dir)
        if low_on_space:
            print("⚠️ Low disk space, consider cleaning up or using smaller models")

        print("🚀 ZeroGPU environment setup complete!")

    except Exception as err:
        # Top-level boundary: report the problem and carry on with defaults.
        print(f"❌ Error setting up ZeroGPU environment: {err}")
        print("Continuing with default configuration...")