[Admin maintenance] Support new ZeroGPU hardware

#15
by multimodalart HF Staff - opened
Files changed (2) hide show
  1. app.py +154 -5
  2. requirements.txt +8 -16
app.py CHANGED
@@ -1,15 +1,164 @@
1
- import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import spaces
3
- from gradio_litmodel3d import LitModel3D
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
- import os
6
  import shutil
7
- os.environ['SPCONV_ALGO'] = 'native'
8
  from typing import *
9
- import torch
10
  import numpy as np
11
  import imageio
12
  from PIL import Image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  from trellis.pipelines import TrellisImageTo3DPipeline
14
  from trellis.utils import render_utils
15
  import trimesh
 
1
+ import os
2
# Backend selection for the new ZeroGPU (Blackwell) stack. These defaults have
# to be in the environment BEFORE any trellis / dinov2 import happens. Values
# that are already present in the environment are left untouched.
for _env_name, _env_default in (
    # Trellis's dense attention has a native SDPA path, so select it.
    ("ATTN_BACKEND", "sdpa"),
    # Sparse attention only understands 'xformers' or 'flash_attn'. Stay on
    # 'xformers'; xformers.ops.memory_efficient_attention is monkey-patched to
    # SDPA further down, because none of the prebuilt xformers ops support
    # sm_120 / Blackwell.
    ("SPARSE_ATTN_BACKEND", "xformers"),
    ("SPCONV_ALGO", "native"),
    # Make dinov2 (loaded via torch.hub for image conditioning) take its pure
    # torch.nn.functional.scaled_dot_product_attention path instead of
    # importing xformers.ops.memory_efficient_attention (raises on sm_120).
    ("XFORMERS_DISABLED", "1"),
):
    os.environ.setdefault(_env_name, _env_default)
del _env_name, _env_default
15
+
16
+ import sys
17
+ import subprocess
18
+ import tempfile
19
+ import ctypes
20
+
21
  import spaces
22
+ import torch
23
+ import gradio as gr
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # xformers -> SDPA shim for Blackwell (sm_120).
27
+ # The prebuilt xformers wheel ships FA3, FA2 and CutlassF ops that all assert
28
+ # device capability <= (9, 0); none load on sm_120, so any call to
29
+ # memory_efficient_attention raises NotImplementedError. dinov2 (image
30
+ # conditioning model in trellis) and trellis's own sparse paths both call it.
31
+ # Replace memory_efficient_attention with an SDPA-backed implementation that
32
+ # also handles xformers.fmha.BlockDiagonalMask (used by sparse attention).
33
+ # ---------------------------------------------------------------------------
34
+ try:
35
+ import xformers # noqa: F401
36
+ import xformers.ops as _xops
37
+ from torch.nn.functional import scaled_dot_product_attention as _sdpa
38
+
39
+ try:
40
+ _BlockDiagonalMask = _xops.fmha.BlockDiagonalMask
41
+ except Exception:
42
+ _BlockDiagonalMask = None
43
+
44
+ def _mea_sdpa(q, k, v, attn_bias=None, p=0.0, scale=None, *args, **kwargs):
45
+ # q, k, v: [B, N, H, C] (xformers layout). SDPA expects [B, H, N, C].
46
+ if attn_bias is None:
47
+ qh = q.transpose(1, 2)
48
+ kh = k.transpose(1, 2)
49
+ vh = v.transpose(1, 2)
50
+ out = _sdpa(qh, kh, vh, dropout_p=p, scale=scale)
51
+ return out.transpose(1, 2).contiguous()
52
+
53
+ if _BlockDiagonalMask is not None and isinstance(attn_bias, _BlockDiagonalMask):
54
+ # BlockDiagonal: q, k, v come as [1, T, H, C] where T is the
55
+ # concatenation of variable-length blocks. Split, apply SDPA per
56
+ # block, concatenate. q and kv can have different seqlens.
57
+ q_info = attn_bias.q_seqinfo
58
+ kv_info = attn_bias.k_seqinfo
59
+ q_starts = q_info.seqstart_py
60
+ kv_starts = kv_info.seqstart_py
61
+ outs = []
62
+ for i in range(len(q_starts) - 1):
63
+ qs, qe = q_starts[i], q_starts[i + 1]
64
+ ks, ke = kv_starts[i], kv_starts[i + 1]
65
+ qi = q[:, qs:qe].transpose(1, 2)
66
+ ki = k[:, ks:ke].transpose(1, 2)
67
+ vi = v[:, ks:ke].transpose(1, 2)
68
+ oi = _sdpa(qi, ki, vi, dropout_p=p, scale=scale)
69
+ outs.append(oi.transpose(1, 2))
70
+ return torch.cat(outs, dim=1).contiguous()
71
+
72
+ # Fallback: dense additive bias.
73
+ qh = q.transpose(1, 2)
74
+ kh = k.transpose(1, 2)
75
+ vh = v.transpose(1, 2)
76
+ out = _sdpa(qh, kh, vh, attn_mask=attn_bias, dropout_p=p, scale=scale)
77
+ return out.transpose(1, 2).contiguous()
78
+
79
+ _xops.memory_efficient_attention = _mea_sdpa
80
+ print("[xformers-shim] Replaced memory_efficient_attention with SDPA backend (Blackwell sm_120 fallback).")
81
+ except Exception as _e:
82
+ print(f"[xformers-shim] Skipped: {_e}")
83
 
 
84
  import shutil
 
85
  from typing import *
 
86
  import numpy as np
87
  import imageio
88
  from PIL import Image
89
+
90
# Build nvdiffrast and diff_gaussian_rasterization from source on first GPU call.
CUDA_HOME = "/cuda-image/usr/local/cuda-13.0"
CUDA_LIBDIR = os.path.join(CUDA_HOME, "lib64")
_NVDIFFRAST_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "extensions", "nvdiffrast")

# Written as sitecustomize.py into a directory placed on PYTHONPATH for the
# build subprocesses: it replaces torch.utils.cpp_extension._check_cuda_version
# with a no-op there, and does nothing if torch cannot be imported.
_SITECUSTOMIZE_SRC = (
    "try:\n"
    "    import torch.utils.cpp_extension as _c\n"
    "    _c._check_cuda_version = lambda *a, **k: None\n"
    "except Exception:\n"
    "    pass\n"
)


def _prepend_path(entry, existing):
    """Return ``entry`` placed in front of the ``os.pathsep`` list ``existing``.

    When ``existing`` is empty only ``entry`` is returned, so the result never
    ends in a dangling separator (an empty component in PATH-style variables
    is commonly interpreted as the current directory).
    """
    return entry + os.pathsep + existing if existing else entry


def _missing_cuda_extensions():
    """Return the set of CUDA extension modules that fail to import."""
    missing = set()
    for modname in ("nvdiffrast", "diff_gaussian_rasterization"):
        try:
            __import__(modname)
        except ImportError:
            missing.add(modname)
    return missing


def _pip_install(args, env=None):
    """Run ``pip install <args>`` with the current interpreter.

    Raises subprocess.CalledProcessError when pip exits non-zero.
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", *args], env=env)


@spaces.GPU(duration=600)
def _first_gpu_setup():
    """Build and install whichever CUDA extensions cannot be imported yet.

    Checks nvdiffrast and diff_gaussian_rasterization; returns immediately
    when both import. Otherwise installs the build tooling and compiles the
    missing packages with pip, using an environment that points at CUDA_HOME
    and targets compute capability 12.0. Temporary directories created along
    the way are removed again, also when a step fails.

    Raises:
        subprocess.CalledProcessError: if git or pip exits with an error.
    """
    missing = _missing_cuda_extensions()
    if not missing:
        return

    # The patch directory has to outlive every build subprocess (they load
    # sitecustomize.py from it through PYTHONPATH), so all installs run
    # inside this context.
    with tempfile.TemporaryDirectory(prefix="torch_cuda_patch_") as patch_dir:
        with open(os.path.join(patch_dir, "sitecustomize.py"), "w", encoding="utf-8") as f:
            f.write(_SITECUSTOMIZE_SRC)

        env = os.environ.copy()
        env["CUDA_HOME"] = CUDA_HOME
        env["CUDA_PATH"] = CUDA_HOME
        env["PATH"] = _prepend_path(os.path.join(CUDA_HOME, "bin"), env.get("PATH", ""))
        env["PYTHONPATH"] = _prepend_path(patch_dir, env.get("PYTHONPATH", ""))
        env["TORCH_CUDA_ARCH_LIST"] = "12.0"  # Blackwell sm_120

        # Build tooling is needed up front because the extension installs
        # below run with --no-build-isolation.
        _pip_install(["--no-deps", "setuptools", "wheel", "ninja", "packaging"])

        if "nvdiffrast" in missing:
            _pip_install(["--no-build-isolation", "--no-deps", _NVDIFFRAST_DIR], env=env)

        if "diff_gaussian_rasterization" in missing:
            # Hi3DGen actually uses the mip-splatting submodule fork, not the
            # original graphdeco-inria release on PyPI.
            with tempfile.TemporaryDirectory(prefix="mip_") as mip:
                subprocess.check_call(
                    ["git", "clone", "--recursive", "--depth=1",
                     "https://github.com/autonomousvision/mip-splatting.git", mip],
                )
                _pip_install(
                    ["--no-build-isolation", "--no-deps",
                     os.path.join(mip, "submodules", "diff-gaussian-rasterization")],
                    env=env,
                )
153
+
154
+
155
_first_gpu_setup()

# Best-effort preload of the CUDA 13 runtime with global symbol visibility,
# done before the trellis imports that follow. On success CUDA_LIBDIR is also
# put at the front of LD_LIBRARY_PATH in this process's environment, which
# subprocesses started later inherit.
try:
    ctypes.CDLL(os.path.join(CUDA_LIBDIR, "libcudart.so.13"), mode=ctypes.RTLD_GLOBAL)
except OSError as _e:
    # Still non-fatal, but no longer silent: a missing runtime is the first
    # thing to check when the CUDA extensions fail to load afterwards.
    print(f"[cuda-preload] Skipped: {_e}")
else:
    _prev_ld_path = os.environ.get("LD_LIBRARY_PATH", "")
    # Only add the separator when something follows it; an empty component
    # in LD_LIBRARY_PATH is treated as the current working directory.
    os.environ["LD_LIBRARY_PATH"] = (
        CUDA_LIBDIR + os.pathsep + _prev_ld_path if _prev_ld_path else CUDA_LIBDIR
    )
161
+
162
  from trellis.pipelines import TrellisImageTo3DPipeline
163
  from trellis.utils import render_utils
164
  import trimesh
requirements.txt CHANGED
@@ -1,11 +1,10 @@
1
- --extra-index-url https://download.pytorch.org/whl/cu121
2
- huggingface-hub==0.36.0
3
  diffusers==0.35.0
4
  accelerate==1.2.1
5
  kornia==0.8.0
6
- timm==0.6.7
7
- torch==2.4.0
8
- torchvision==0.19.0
9
  pillow==10.4.0
10
  imageio==2.36.1
11
  imageio-ffmpeg==0.5.1
@@ -21,15 +20,8 @@ pyvista==0.44.2
21
  pymeshfix==0.17.0
22
  igraph==0.11.8
23
  git+https://github.com/EasternJournalist/utils3d.git@9a4eb15e4021b67b12c460c7057d642626897ec8
24
- xformers==0.0.27.post2
25
- spconv-cu120==2.3.6
26
  transformers==4.46.3
27
- gradio_litmodel3d==0.0.1
28
- triton==3.0.0
29
- nvidia-cudnn-cu12==9.1.0.70
30
- nvidia-nccl-cu12==2.20.5
31
- tokenizers==0.20.3
32
- spaces==0.42.1
33
- https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
34
- https://huggingface.co/spaces/JeffreyXiang/TRELLIS/resolve/main/wheels/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl?download=true
35
- https://huggingface.co/spaces/JeffreyXiang/TRELLIS/resolve/main/wheels/nvdiffrast-0.3.3-cp310-cp310-linux_x86_64.whl?download=true
 
1
+ huggingface-hub
 
2
  diffusers==0.35.0
3
  accelerate==1.2.1
4
  kornia==0.8.0
5
+ timm
6
+ torch==2.10.0
7
+ torchvision==0.25.0
8
  pillow==10.4.0
9
  imageio==2.36.1
10
  imageio-ffmpeg==0.5.1
 
20
  pymeshfix==0.17.0
21
  igraph==0.11.8
22
  git+https://github.com/EasternJournalist/utils3d.git@9a4eb15e4021b67b12c460c7057d642626897ec8
23
+ xformers
24
+ spconv-cu126==2.3.8
25
  transformers==4.46.3
26
+ einops
27
+ spaces