H-Liu1997 committed on
Commit
c3deedc
·
1 Parent(s): d7b5638

Migrate to ZeroGPU: Python 3.10, remove pytorch3d dependency

Browse files

- Switch to Python 3.10 + PyTorch 2.1.2 for ZeroGPU compatibility
- Replace pytorch3d.PerspectiveCameras with pure PyTorch implementation
- Add @spaces.GPU decorator with lazy model loading
- Pin pyglet==1.5.28 for headless server compatibility
- Update hardware config to zero-a10g

Files changed (5) hide show
  1. README.md +2 -1
  2. app.py +40 -18
  3. emage_utils/npz2pose.py +42 -4
  4. pre-requirements.txt +1 -1
  5. requirements.txt +2 -4
README.md CHANGED
@@ -5,11 +5,12 @@ colorFrom: green
5
  colorTo: gray
6
  sdk: gradio
7
  sdk_version: 4.44.1
8
- python_version: 3.9.20
9
  app_file: app.py
10
  pinned: false
11
  license: apache-2.0
12
  short_description: Co-Speech 3D Gesture Generation
 
13
  ---
14
 
15
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
5
  colorTo: gray
6
  sdk: gradio
7
  sdk_version: 4.44.1
8
+ python_version: 3.10
9
  app_file: app.py
10
  pinned: false
11
  license: apache-2.0
12
  short_description: Co-Speech 3D Gesture Generation
13
+ hardware: zero-a10g
14
  ---
15
 
16
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -19,36 +19,53 @@ from models.disco_audio import DiscoAudioModel
19
  from models.emage_audio import EmageAudioModel, EmageVQVAEConv, EmageVAEConv, EmageVQModel
20
  import torch.nn.functional as F
21
 
22
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23
  save_folder = "./gradio_results"
24
  os.makedirs(save_folder, exist_ok=True)
25
- print(device)
26
 
27
  if not os.path.exists("./emage_evaltools/smplx_models"):
28
  import subprocess
29
  subprocess.run(["git", "clone", "https://huggingface.co/H-Liu1997/emage_evaltools"])
30
 
31
- model_camn = CamnAudioModel.from_pretrained("H-Liu1997/camn_audio").to(device).eval()
32
- model_disco = DiscoAudioModel.from_pretrained("H-Liu1997/disco_audio").to(device).eval()
 
 
 
33
 
34
- face_motion_vq = EmageVQVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/face").to(device).eval()
35
- upper_motion_vq = EmageVQVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/upper").to(device).eval()
36
- lower_motion_vq = EmageVQVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/lower").to(device).eval()
37
- hands_motion_vq = EmageVQVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/hands").to(device).eval()
38
- global_motion_ae = EmageVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/global").to(device).eval()
39
 
40
- emage_vq_model = EmageVQModel(
41
- face_model=face_motion_vq,
42
- upper_model=upper_motion_vq,
43
- lower_model=lower_motion_vq,
44
- hands_model=hands_motion_vq,
45
- global_model=global_motion_ae
46
- ).to(device).eval()
 
 
 
47
 
48
- model_emage = EmageAudioModel.from_pretrained("H-Liu1997/emage_audio").to(device).eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
 
51
  def inference_camn(audio_path, sr_model, pose_fps, seed_frames):
 
52
  audio_loaded, _ = librosa.load(audio_path, sr=sr_model)
53
  audio_t = torch.from_numpy(audio_loaded).float().unsqueeze(0).to(device)
54
  sid = torch.zeros(1, 1).long().to(device)
@@ -61,6 +78,7 @@ def inference_camn(audio_path, sr_model, pose_fps, seed_frames):
61
  return npz_path
62
 
63
  def inference_disco(audio_path, sr_model, pose_fps, seed_frames):
 
64
  audio_loaded, _ = librosa.load(audio_path, sr=sr_model)
65
  audio_t = torch.from_numpy(audio_loaded).float().unsqueeze(0).to(device)
66
  sid = torch.zeros(1, 1).long().to(device)
@@ -73,6 +91,7 @@ def inference_disco(audio_path, sr_model, pose_fps, seed_frames):
73
  return npz_path
74
 
75
  def inference_emage(audio_path, sr_model, pose_fps):
 
76
  audio_loaded, _ = librosa.load(audio_path, sr=sr_model)
77
  audio_t = torch.from_numpy(audio_loaded).float().unsqueeze(0).to(device)
78
  sid = torch.zeros(1, 1).long().to(device)
@@ -112,10 +131,13 @@ def inference_emage(audio_path, sr_model, pose_fps):
112
  return npz_path
113
 
114
 
 
115
  def inference_app(audio, model_type, render_mesh=False, render_face=False, render_mesh_face=False):
116
  if audio is None:
117
  return [None, None, None, None, None]
118
 
 
 
119
  sr_in, audio_data = audio
120
  # --- TRUNCATE to 60 seconds if longer ---
121
  max_len = int(60 * sr_in)
@@ -228,7 +250,7 @@ with gr.Blocks() as demo:
228
  inputs=[input_audio, model_type, render_mesh, render_face, render_mesh_face],
229
  outputs=[vid_body, vid_mesh, vid_face, vid_meshface, file_npz],
230
  fn=inference_app,
231
- cache_examples=True
232
  )
233
 
234
  if __name__ == "__main__":
 
19
  from models.emage_audio import EmageAudioModel, EmageVQVAEConv, EmageVAEConv, EmageVQModel
20
  import torch.nn.functional as F
21
 
 
22
  save_folder = "./gradio_results"
23
  os.makedirs(save_folder, exist_ok=True)
 
24
 
25
  if not os.path.exists("./emage_evaltools/smplx_models"):
26
  import subprocess
27
  subprocess.run(["git", "clone", "https://huggingface.co/H-Liu1997/emage_evaltools"])
28
 
29
+ model_camn = None
30
+ model_disco = None
31
+ model_emage = None
32
+ emage_vq_model = None
33
+ _models_loaded = False
34
 
 
 
 
 
 
35
 
36
+ def load_models():
37
+ global model_camn, model_disco, model_emage, emage_vq_model, _models_loaded
38
+ if _models_loaded:
39
+ return
40
+
41
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
42
+ print(f"Loading models to {device}")
43
+
44
+ model_camn = CamnAudioModel.from_pretrained("H-Liu1997/camn_audio").to(device).eval()
45
+ model_disco = DiscoAudioModel.from_pretrained("H-Liu1997/disco_audio").to(device).eval()
46
 
47
+ face_motion_vq = EmageVQVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/face").to(device).eval()
48
+ upper_motion_vq = EmageVQVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/upper").to(device).eval()
49
+ lower_motion_vq = EmageVQVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/lower").to(device).eval()
50
+ hands_motion_vq = EmageVQVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/hands").to(device).eval()
51
+ global_motion_ae = EmageVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/global").to(device).eval()
52
+
53
+ emage_vq_model = EmageVQModel(
54
+ face_model=face_motion_vq,
55
+ upper_model=upper_motion_vq,
56
+ lower_model=lower_motion_vq,
57
+ hands_model=hands_motion_vq,
58
+ global_model=global_motion_ae
59
+ ).to(device).eval()
60
+
61
+ model_emage = EmageAudioModel.from_pretrained("H-Liu1997/emage_audio").to(device).eval()
62
+
63
+ _models_loaded = True
64
+ print("Models loaded successfully")
65
 
66
 
67
  def inference_camn(audio_path, sr_model, pose_fps, seed_frames):
68
+ device = next(model_camn.parameters()).device
69
  audio_loaded, _ = librosa.load(audio_path, sr=sr_model)
70
  audio_t = torch.from_numpy(audio_loaded).float().unsqueeze(0).to(device)
71
  sid = torch.zeros(1, 1).long().to(device)
 
78
  return npz_path
79
 
80
  def inference_disco(audio_path, sr_model, pose_fps, seed_frames):
81
+ device = next(model_disco.parameters()).device
82
  audio_loaded, _ = librosa.load(audio_path, sr=sr_model)
83
  audio_t = torch.from_numpy(audio_loaded).float().unsqueeze(0).to(device)
84
  sid = torch.zeros(1, 1).long().to(device)
 
91
  return npz_path
92
 
93
  def inference_emage(audio_path, sr_model, pose_fps):
94
+ device = next(model_emage.parameters()).device
95
  audio_loaded, _ = librosa.load(audio_path, sr=sr_model)
96
  audio_t = torch.from_numpy(audio_loaded).float().unsqueeze(0).to(device)
97
  sid = torch.zeros(1, 1).long().to(device)
 
131
  return npz_path
132
 
133
 
134
+ @spaces.GPU(duration=120)
135
  def inference_app(audio, model_type, render_mesh=False, render_face=False, render_mesh_face=False):
136
  if audio is None:
137
  return [None, None, None, None, None]
138
 
139
+ load_models()
140
+
141
  sr_in, audio_data = audio
142
  # --- TRUNCATE to 60 seconds if longer ---
143
  max_len = int(60 * sr_in)
 
250
  inputs=[input_audio, model_type, render_mesh, render_face, render_mesh_face],
251
  outputs=[vid_body, vid_mesh, vid_face, vid_meshface, file_npz],
252
  fn=inference_app,
253
+ cache_examples=False
254
  )
255
 
256
  if __name__ == "__main__":
emage_utils/npz2pose.py CHANGED
@@ -8,10 +8,49 @@ import cv2
8
  import numpy as np
9
  import torch
10
  import smplx
11
- from pytorch3d.renderer import PerspectiveCameras
12
  from torchvision.io import write_video
13
  from torchvision.transforms.functional import convert_image_dtype
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  SMPLX_BODY_JOINT_EDGES = [
16
  {"indices": [12, 17], "color": [255, 0, 0]},
17
  {"indices": [12, 16], "color": [255, 85, 0]},
@@ -258,13 +297,12 @@ def _get_cameras(
258
  device=device, dtype=torch.float32
259
  )
260
  t = torch.tensor(camera_transl, device=device, dtype=torch.float32)
261
- cameras = PerspectiveCameras(
262
  focal_length=focal_length,
263
  principal_point=((width / 2, height / 2),),
264
- in_ndc=False,
265
  R=r.expand(batch_size, -1, -1),
266
  T=t.expand(batch_size, -1),
267
- image_size=((height, width),),
268
  device=device,
269
  )
270
  return cameras
 
8
  import numpy as np
9
  import torch
10
  import smplx
 
11
  from torchvision.io import write_video
12
  from torchvision.transforms.functional import convert_image_dtype
13
 
14
+
15
+ class SimplePerspectiveCamera:
16
+ """Pure PyTorch implementation of perspective camera projection."""
17
+
18
+ def __init__(self, focal_length, principal_point, image_size, R, T, device):
19
+ self.focal_length = focal_length
20
+ self.principal_point = principal_point
21
+ self.image_size = image_size
22
+ self.R = R
23
+ self.T = T
24
+ self.device = device
25
+
26
+ def transform_points_screen(self, points):
27
+ """
28
+ Transform 3D points to 2D screen coordinates.
29
+
30
+ Args:
31
+ points: (N, num_points, 3) tensor of 3D points
32
+
33
+ Returns:
34
+ (N, num_points, 2) tensor of 2D screen coordinates
35
+ """
36
+ batch_size = points.shape[0]
37
+
38
+ points_cam = torch.bmm(points, self.R.transpose(1, 2)) + self.T.unsqueeze(1)
39
+
40
+ x = points_cam[..., 0]
41
+ y = points_cam[..., 1]
42
+ z = points_cam[..., 2].clamp(min=1e-8)
43
+
44
+ fx = self.focal_length if isinstance(self.focal_length, float) else self.focal_length[0]
45
+ fy = fx
46
+
47
+ cx, cy = self.principal_point[0]
48
+
49
+ x_screen = fx * x / z + cx
50
+ y_screen = fy * y / z + cy
51
+
52
+ return torch.stack([x_screen, y_screen], dim=-1)
53
+
54
  SMPLX_BODY_JOINT_EDGES = [
55
  {"indices": [12, 17], "color": [255, 0, 0]},
56
  {"indices": [12, 16], "color": [255, 85, 0]},
 
297
  device=device, dtype=torch.float32
298
  )
299
  t = torch.tensor(camera_transl, device=device, dtype=torch.float32)
300
+ cameras = SimplePerspectiveCamera(
301
  focal_length=focal_length,
302
  principal_point=((width / 2, height / 2),),
303
+ image_size=((height, width),),
304
  R=r.expand(batch_size, -1, -1),
305
  T=t.expand(batch_size, -1),
 
306
  device=device,
307
  )
308
  return cameras
pre-requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
  numpy==1.23
2
- torch==2.0.0
3
  torchvision
4
  torchaudio
 
1
  numpy==1.23
2
+ torch==2.1.2
3
  torchvision
4
  torchaudio
requirements.txt CHANGED
@@ -1,5 +1,3 @@
1
- -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py39_cu118_pyt200/download.html
2
- pytorch3d
3
  scikit-image==0.21.0
4
  scikit-learn==1.3.2
5
  scipy==1.11.4
@@ -10,7 +8,7 @@ opencv-python==4.8.1.78
10
  easydict
11
  timm
12
  wget
13
- av==11.0.0
14
  ffmpeg-python
15
  imageio-ffmpeg==0.4.9
16
  omegaconf==2.2.3
@@ -34,6 +32,6 @@ tqdm==4.66.1
34
  transformers==4.35.2
35
  trimesh==3.23.5
36
  wandb==0.16.0
37
- pyglet
38
  smplx
39
  pyrender
 
 
 
1
  scikit-image==0.21.0
2
  scikit-learn==1.3.2
3
  scipy==1.11.4
 
8
  easydict
9
  timm
10
  wget
11
+ av>=11.0.0
12
  ffmpeg-python
13
  imageio-ffmpeg==0.4.9
14
  omegaconf==2.2.3
 
32
  transformers==4.35.2
33
  trimesh==3.23.5
34
  wandb==0.16.0
35
+ pyglet==1.5.28
36
  smplx
37
  pyrender