[Admin maintenance] Support new ZeroGPU hardware

#6
by multimodalart HF Staff - opened
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: ⚡
4
  colorFrom: gray
5
  colorTo: green
6
  sdk: gradio
7
- sdk_version: 4.20.1
8
  python_version: 3.10.13
9
  app_file: app.py
10
  pinned: false
 
4
  colorFrom: gray
5
  colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.49.1
8
  python_version: 3.10.13
9
  app_file: app.py
10
  pinned: false
app.py CHANGED
@@ -1,18 +1,88 @@
1
  import os, subprocess, shlex, sys, gc
2
  import time
 
 
3
  import torch
4
  import numpy as np
5
  import shutil
6
  import argparse
7
  import gradio as gr
8
  import uuid
9
- import spaces
10
-
11
- subprocess.run(shlex.split("pip install wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl"))
12
- subprocess.run(shlex.split("pip install wheel/simple_knn-0.0.0-cp310-cp310-linux_x86_64.whl"))
13
- subprocess.run(shlex.split("pip install wheel/curope-0.0.0-cp310-cp310-linux_x86_64.whl"))
14
 
15
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  os.sys.path.append(os.path.abspath(os.path.join(BASE_DIR, "submodules", "dust3r")))
17
  # os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
18
  from dust3r.inference import inference
@@ -253,7 +323,7 @@ with block:
253
  inputs=[input_path],
254
  outputs=[output_video, output_file, output_model],
255
  fn=lambda x: process(inputfiles=None, input_path=x),
256
- cache_examples=True,
257
  label='Sparse-view Examples'
258
  )
259
  block.launch(server_name="0.0.0.0", share=False)
 
1
  import os, subprocess, shlex, sys, gc
2
  import time
3
+ import ctypes
4
+ import spaces
5
  import torch
6
  import numpy as np
7
  import shutil
8
  import argparse
9
  import gradio as gr
10
  import uuid
 
 
 
 
 
11
 
12
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
13
+ CUDA_HOME = "/cuda-image/usr/local/cuda-13.0"
14
+ CUDA_LIBDIR = os.path.join(CUDA_HOME, "lib64")
15
+
16
+
17
@spaces.GPU(duration=600)
def _first_gpu_setup():
    """Build and install whichever CUDA extension modules cannot be imported.

    Tries to import ``diff_gaussian_rasterization``, ``simple_knn`` and
    ``curope``.  Each one that raises ``ImportError`` is installed with pip
    from its source directory under ``BASE_DIR``, using ``CUDA_HOME`` as the
    toolkit location.  Returns immediately when all three import cleanly.

    The build subprocesses get a temporary ``sitecustomize.py`` on their
    ``PYTHONPATH`` that replaces
    ``torch.utils.cpp_extension._check_cuda_version`` with a no-op
    (presumably so a toolkit/torch CUDA version mismatch does not abort the
    build -- confirm against the deployed torch build).

    Raises:
        subprocess.CalledProcessError: if any ``pip install`` command exits
            with a non-zero status.
    """
    import tempfile

    # Importable module name -> path components (below BASE_DIR) of its source.
    # Insertion order is the build order.
    extension_sources = {
        "diff_gaussian_rasterization": ("submodules", "diff-gaussian-rasterization"),
        "simple_knn": ("submodules", "simple-knn"),
        "curope": ("submodules", "dust3r", "croco", "models", "curope"),
    }

    missing = []
    for name in extension_sources:
        try:
            __import__(name)
        except ImportError:
            missing.append(name)
    if not missing:
        return

    def _prepend(entry, existing):
        # Avoid a trailing separator when `existing` is empty: an empty
        # search-path entry is interpreted as the current directory.
        return entry + os.pathsep + existing if existing else entry

    # Build tooling required because the extension builds below run with
    # --no-build-isolation.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "--no-deps",
         "setuptools", "wheel", "ninja", "packaging"],
    )

    # The patch directory is only needed while the build subprocesses run, so
    # it is removed afterwards (also on failure) instead of being leaked.
    with tempfile.TemporaryDirectory(prefix="torch_cuda_patch_") as patch_dir:
        with open(os.path.join(patch_dir, "sitecustomize.py"), "w") as f:
            f.write(
                "try:\n"
                "    import torch.utils.cpp_extension as _c\n"
                "    _c._check_cuda_version = lambda *a, **k: None\n"
                "except Exception:\n"
                "    pass\n"
            )

        env = os.environ.copy()
        env["CUDA_HOME"] = CUDA_HOME
        env["CUDA_PATH"] = CUDA_HOME
        env["PATH"] = _prepend(os.path.join(CUDA_HOME, "bin"), env.get("PATH", ""))
        env["PYTHONPATH"] = _prepend(patch_dir, env.get("PYTHONPATH", ""))
        # NOTE(review): hard-coded target architecture -- confirm it matches
        # the GPU generation this Space is scheduled on.
        env["TORCH_CUDA_ARCH_LIST"] = "12.0"

        for name in missing:
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install",
                 "--no-build-isolation", "--no-deps",
                 os.path.join(BASE_DIR, *extension_sources[name])],
                env=env,
            )
73
+
74
+
75
# Build/install any missing CUDA extensions before the imports further down
# that depend on them.
_first_gpu_setup()

# Preload the CUDA 13 runtime with RTLD_GLOBAL so its symbols are visible to
# libraries loaded later (presumably the freshly built extensions -- confirm).
# This stays best-effort, but a failure is reported instead of being silently
# swallowed, because it otherwise surfaces later as an obscure import error.
try:
    ctypes.CDLL(os.path.join(CUDA_LIBDIR, "libcudart.so.13"), mode=ctypes.RTLD_GLOBAL)
except OSError as exc:
    print(
        f"[setup] could not preload libcudart.so.13 from {CUDA_LIBDIR}: {exc}",
        file=sys.stderr,
    )
else:
    # Only prepend when the runtime was actually found.  Skip the separator
    # when the variable was empty/unset so no empty (current-directory) entry
    # is added to the search path.
    _ld_path = os.environ.get("LD_LIBRARY_PATH", "")
    os.environ["LD_LIBRARY_PATH"] = (
        CUDA_LIBDIR + os.pathsep + _ld_path if _ld_path else CUDA_LIBDIR
    )

# torch 2.6+ flipped weights_only default — restore old behaviour for trusted checkpoints
_orig_torch_load = torch.load


def _torch_load_trusted(*args, **kwargs):
    """Call the original ``torch.load`` with ``weights_only=False`` by default.

    An explicit ``weights_only`` passed by the caller is forwarded unchanged.
    Only use with checkpoints from trusted sources: ``weights_only=False``
    allows arbitrary pickled objects to be loaded.
    """
    kwargs.setdefault("weights_only", False)
    return _orig_torch_load(*args, **kwargs)


torch.load = _torch_load_trusted
85
+
86
  os.sys.path.append(os.path.abspath(os.path.join(BASE_DIR, "submodules", "dust3r")))
87
  # os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
88
  from dust3r.inference import inference
 
323
  inputs=[input_path],
324
  outputs=[output_video, output_file, output_model],
325
  fn=lambda x: process(inputfiles=None, input_path=x),
326
+ cache_examples=False,
327
  label='Sparse-view Examples'
328
  )
329
  block.launch(server_name="0.0.0.0", share=False)
requirements.txt CHANGED
@@ -1,8 +1,8 @@
1
- torch==2.2.0
2
  torchvision
3
  roma
4
  evo
5
- gradio==5.0.1
6
  matplotlib
7
  tqdm
8
  opencv-python
@@ -14,4 +14,4 @@ pyglet<2
14
  huggingface-hub[torch]>=0.22
15
  plyfile
16
  imageio[ffmpeg]
17
- spaces
 
1
+ torch
2
  torchvision
3
  roma
4
  evo
5
+ gradio
6
  matplotlib
7
  tqdm
8
  opencv-python
 
14
  huggingface-hub[torch]>=0.22
15
  plyfile
16
  imageio[ffmpeg]
17
+ spaces
submodules/dust3r/croco/models/curope/kernels.cu CHANGED
@@ -98,7 +98,7 @@ void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float ba
98
  const int N_BLOCKS = B * N; // each block takes care of H*D values
99
  const int SHARED_MEM = sizeof(float) * (D + D/4);
100
 
101
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.type(), "rope_2d_cuda", ([&] {
102
  rope_2d_cuda_kernel<scalar_t> <<<N_BLOCKS, THREADS_PER_BLOCK, SHARED_MEM>>> (
103
  //tokens.data_ptr<scalar_t>(),
104
  tokens.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(),
 
98
  const int N_BLOCKS = B * N; // each block takes care of H*D values
99
  const int SHARED_MEM = sizeof(float) * (D + D/4);
100
 
101
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.scalar_type(), "rope_2d_cuda", ([&] {
102
  rope_2d_cuda_kernel<scalar_t> <<<N_BLOCKS, THREADS_PER_BLOCK, SHARED_MEM>>> (
103
  //tokens.data_ptr<scalar_t>(),
104
  tokens.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(),