Migrate to ZeroGPU: Python 3.10, remove pytorch3d dependency
- Switch to Python 3.10 + PyTorch 2.1.2 for ZeroGPU compatibility
- Replace pytorch3d.PerspectiveCameras with pure PyTorch implementation
- Add @spaces.GPU decorator with lazy model loading
- Pin pyglet==1.5.28 for headless server compatibility
- Update hardware config to zero-a10g
- README.md +2 -1
- app.py +40 -18
- emage_utils/npz2pose.py +42 -4
- pre-requirements.txt +1 -1
- requirements.txt +2 -4
README.md CHANGED
@@ -5,11 +5,12 @@ colorFrom: green
 colorTo: gray
 sdk: gradio
 sdk_version: 4.44.1
-python_version: 3.9
+python_version: 3.10
 app_file: app.py
 pinned: false
 license: apache-2.0
 short_description: Co-Speech 3D Gesture Generation
+hardware: zero-a10g
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -19,36 +19,53 @@ from models.disco_audio import DiscoAudioModel
 from models.emage_audio import EmageAudioModel, EmageVQVAEConv, EmageVAEConv, EmageVQModel
 import torch.nn.functional as F
 
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 save_folder = "./gradio_results"
 os.makedirs(save_folder, exist_ok=True)
-print(device)
 
 if not os.path.exists("./emage_evaltools/smplx_models"):
     import subprocess
     subprocess.run(["git", "clone", "https://huggingface.co/H-Liu1997/emage_evaltools"])
 
-model_camn = CamnAudioModel.from_pretrained("H-Liu1997/camn_audio").to(device).eval()
-model_disco = DiscoAudioModel.from_pretrained("H-Liu1997/disco_audio").to(device).eval()
-
-face_motion_vq = EmageVQVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/face").to(device).eval()
-upper_motion_vq = EmageVQVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/upper").to(device).eval()
-lower_motion_vq = EmageVQVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/lower").to(device).eval()
-hands_motion_vq = EmageVQVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/hands").to(device).eval()
-global_motion_ae = EmageVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/global").to(device).eval()
-
-emage_vq_model = EmageVQModel(
-    face_model=face_motion_vq,
-    upper_model=upper_motion_vq,
-    lower_model=lower_motion_vq,
-    hands_model=hands_motion_vq,
-    global_model=global_motion_ae
-).to(device).eval()
-
-model_emage = EmageAudioModel.from_pretrained("H-Liu1997/emage_audio").to(device).eval()
+model_camn = None
+model_disco = None
+model_emage = None
+emage_vq_model = None
+_models_loaded = False
+
+def load_models():
+    global model_camn, model_disco, model_emage, emage_vq_model, _models_loaded
+    if _models_loaded:
+        return
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"Loading models to {device}")
+
+    model_camn = CamnAudioModel.from_pretrained("H-Liu1997/camn_audio").to(device).eval()
+    model_disco = DiscoAudioModel.from_pretrained("H-Liu1997/disco_audio").to(device).eval()
+
+    face_motion_vq = EmageVQVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/face").to(device).eval()
+    upper_motion_vq = EmageVQVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/upper").to(device).eval()
+    lower_motion_vq = EmageVQVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/lower").to(device).eval()
+    hands_motion_vq = EmageVQVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/hands").to(device).eval()
+    global_motion_ae = EmageVAEConv.from_pretrained("H-Liu1997/emage_audio", subfolder="emage_vq/global").to(device).eval()
+
+    emage_vq_model = EmageVQModel(
+        face_model=face_motion_vq,
+        upper_model=upper_motion_vq,
+        lower_model=lower_motion_vq,
+        hands_model=hands_motion_vq,
+        global_model=global_motion_ae
+    ).to(device).eval()
+
+    model_emage = EmageAudioModel.from_pretrained("H-Liu1997/emage_audio").to(device).eval()
+
+    _models_loaded = True
+    print("Models loaded successfully")
 
 
 def inference_camn(audio_path, sr_model, pose_fps, seed_frames):
+    device = next(model_camn.parameters()).device
     audio_loaded, _ = librosa.load(audio_path, sr=sr_model)
     audio_t = torch.from_numpy(audio_loaded).float().unsqueeze(0).to(device)
     sid = torch.zeros(1, 1).long().to(device)
@@ -61,6 +78,7 @@ def inference_camn(audio_path, sr_model, pose_fps, seed_frames):
     return npz_path
 
 def inference_disco(audio_path, sr_model, pose_fps, seed_frames):
+    device = next(model_disco.parameters()).device
     audio_loaded, _ = librosa.load(audio_path, sr=sr_model)
     audio_t = torch.from_numpy(audio_loaded).float().unsqueeze(0).to(device)
     sid = torch.zeros(1, 1).long().to(device)
@@ -73,6 +91,7 @@
     return npz_path
 
 def inference_emage(audio_path, sr_model, pose_fps):
+    device = next(model_emage.parameters()).device
     audio_loaded, _ = librosa.load(audio_path, sr=sr_model)
     audio_t = torch.from_numpy(audio_loaded).float().unsqueeze(0).to(device)
     sid = torch.zeros(1, 1).long().to(device)
@@ -112,10 +131,13 @@
     return npz_path
 
 
+@spaces.GPU(duration=120)
 def inference_app(audio, model_type, render_mesh=False, render_face=False, render_mesh_face=False):
     if audio is None:
         return [None, None, None, None, None]
 
+    load_models()
+
     sr_in, audio_data = audio
     # --- TRUNCATE to 60 seconds if longer ---
     max_len = int(60 * sr_in)
@@ -228,7 +250,7 @@ with gr.Blocks() as demo:
         inputs=[input_audio, model_type, render_mesh, render_face, render_mesh_face],
         outputs=[vid_body, vid_mesh, vid_face, vid_meshface, file_npz],
         fn=inference_app,
-        cache_examples=True,
+        cache_examples=False
     )
 
 if __name__ == "__main__":
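The app.py changes above follow the standard ZeroGPU recipe: no CUDA work at import time, a GPU attached only while a decorated function runs, and models loaded lazily on the first call. A minimal standalone sketch of the same pattern, where MyModel is a hypothetical stand-in for the real model classes loaded above:

import spaces
import torch

_model = None  # populated lazily; ZeroGPU Spaces boot without a GPU attached

def _load():
    # First call happens inside a @spaces.GPU context, so CUDA is available here.
    global _model
    if _model is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        _model = MyModel().to(device).eval()  # hypothetical model class
    return _model

@spaces.GPU(duration=120)  # request a GPU for at most 120 s per call
def predict(x):
    model = _load()
    device = next(model.parameters()).device  # same trick as inference_camn above
    with torch.no_grad():
        return model(x.to(device)).cpu()

Reading the device back from the model's parameters, rather than from a module-level constant, is what lets the inference_* helpers work whether or not a GPU was actually granted.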
emage_utils/npz2pose.py CHANGED
@@ -8,10 +8,49 @@ import cv2
 import numpy as np
 import torch
 import smplx
-from pytorch3d.renderer import PerspectiveCameras
 from torchvision.io import write_video
 from torchvision.transforms.functional import convert_image_dtype
 
+
+class SimplePerspectiveCamera:
+    """Pure PyTorch implementation of perspective camera projection."""
+
+    def __init__(self, focal_length, principal_point, image_size, R, T, device):
+        self.focal_length = focal_length
+        self.principal_point = principal_point
+        self.image_size = image_size
+        self.R = R
+        self.T = T
+        self.device = device
+
+    def transform_points_screen(self, points):
+        """
+        Transform 3D points to 2D screen coordinates.
+
+        Args:
+            points: (N, num_points, 3) tensor of 3D points
+
+        Returns:
+            (N, num_points, 2) tensor of 2D screen coordinates
+        """
+        batch_size = points.shape[0]
+
+        points_cam = torch.bmm(points, self.R.transpose(1, 2)) + self.T.unsqueeze(1)
+
+        x = points_cam[..., 0]
+        y = points_cam[..., 1]
+        z = points_cam[..., 2].clamp(min=1e-8)
+
+        fx = self.focal_length if isinstance(self.focal_length, float) else self.focal_length[0]
+        fy = fx
+
+        cx, cy = self.principal_point[0]
+
+        x_screen = fx * x / z + cx
+        y_screen = fy * y / z + cy
+
+        return torch.stack([x_screen, y_screen], dim=-1)
+
 SMPLX_BODY_JOINT_EDGES = [
     {"indices": [12, 17], "color": [255, 0, 0]},
     {"indices": [12, 16], "color": [255, 85, 0]},
@@ -258,13 +297,12 @@ def _get_cameras(
         device=device, dtype=torch.float32
     )
     t = torch.tensor(camera_transl, device=device, dtype=torch.float32)
-    cameras = PerspectiveCameras(
+    cameras = SimplePerspectiveCamera(
         focal_length=focal_length,
         principal_point=((width / 2, height / 2),),
-        in_ndc=False,
+        image_size=((height, width),),
         R=r.expand(batch_size, -1, -1),
         T=t.expand(batch_size, -1),
-        image_size=((height, width),),
         device=device,
     )
     return cameras
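SimplePerspectiveCamera reimplements only the piece of pytorch3d this renderer actually used: a rigid world-to-camera transform followed by pinhole projection, u = fx * x / z + cx and v = fy * y / z + cy. A quick sanity check, not part of the commit (it assumes emage_utils and its imports such as smplx and cv2 are installed): with identity rotation and the camera translated 2 m along z, a point at the world origin must project exactly to the principal point.

import torch
from emage_utils.npz2pose import SimplePerspectiveCamera

W, H, f = 640, 480, 500.0
cam = SimplePerspectiveCamera(
    focal_length=f,
    principal_point=((W / 2, H / 2),),
    image_size=((H, W),),
    R=torch.eye(3).unsqueeze(0),        # (1, 3, 3), identity rotation
    T=torch.tensor([[0.0, 0.0, 2.0]]),  # (1, 3), camera 2 m from the origin
    device="cpu",
)
pts = torch.tensor([[[0.0, 0.0, 0.0],    # on the optical axis -> (cx, cy)
                     [0.2, 0.0, 0.0]]])  # 0.2 m off-axis at depth 2 m
uv = cam.transform_points_screen(pts)
assert torch.allclose(uv[0, 0], torch.tensor([320.0, 240.0]))
assert torch.allclose(uv[0, 1], torch.tensor([320.0 + f * 0.2 / 2.0, 240.0]))

Note the convention: points are multiplied by R.transpose(1, 2), i.e. x_cam = R x + T applied to row-vector batches, so any caller migrating from pytorch3d should verify that its R/T pair still produces the same screen coordinates as before.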
pre-requirements.txt CHANGED
@@ -1,4 +1,4 @@
 numpy==1.23
-torch==2.0.0
+torch==2.1.2
 torchvision
 torchaudio
requirements.txt CHANGED
@@ -1,5 +1,3 @@
--f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py39_cu118_pyt200/download.html
-pytorch3d
 scikit-image==0.21.0
 scikit-learn==1.3.2
 scipy==1.11.4
@@ -10,7 +8,7 @@ opencv-python==4.8.1.78
 easydict
 timm
 wget
-av
+av>=11.0.0
 ffmpeg-python
 imageio-ffmpeg==0.4.9
 omegaconf==2.2.3
@@ -34,6 +32,6 @@ tqdm==4.66.1
 transformers==4.35.2
 trimesh==3.23.5
 wandb==0.16.0
-pyglet
+pyglet==1.5.28
 smplx
 pyrender
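The pyglet pin is about running the renderer with no display: pyrender imports pyglet for its interactive viewer, and pyglet 2.x reworked its OpenGL bindings in a way older pyrender releases are known to break against, so the 1.5 series is kept. On a headless server, pyrender additionally needs an offscreen backend selected before it is imported. A common setup, not taken from this repo ("egl" assumes an EGL-capable GPU driver; "osmesa" is the pure-software fallback):

import os
os.environ["PYOPENGL_PLATFORM"] = "egl"  # must be set before pyrender is imported

import pyrender

# Offscreen rendering works without any window system.
renderer = pyrender.OffscreenRenderer(viewport_width=640, viewport_height=480)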