Spaces:

rdz-falcon
/

SignMotionGPT

Build error

App Files Community

rdz-falcon committed on Jan 11

Commit

adaffa8

verified ·

1 Parent(s): a7a2fc6

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -45

app.py CHANGED Viewed

@@ -67,18 +67,23 @@ PARAM_NAMES = ["betas", "body_pose", "left_hand_pose", "right_hand_pose",
 AVATAR_COLOR = (0.36, 0.78, 0.36, 1.0)  # Green color as RGBA
 VIDEO_FPS = 15
 VIDEO_SLOWDOWN = 2
-FRAME_WIDTH = 384  # Must be divisible by 16 for video codec compatibility
-FRAME_HEIGHT = 512  # Smaller, more compact video size
 # =====================================================================
 # Install/Import Dependencies
 # =====================================================================
-import os
-import sys
-import gradio as gr
-import smplx
-import imageio
 # PyRender for high-quality rendering
 PYRENDER_AVAILABLE = False
@@ -88,7 +93,13 @@ try:
     from PIL import Image, ImageDraw, ImageFont
     PYRENDER_AVAILABLE = True
 except ImportError:
-    print("Warning: PyRender dependencies not available. Install trimesh, pyrender, Pillow.")
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch.nn.functional as F
@@ -498,8 +509,8 @@ def render_single_frame(
     label: str = "",
     color: tuple = AVATAR_COLOR,
     fixed_center: np.ndarray = None,
-    camera_distance: float = 4.5,
-    focal_length: float = 1200,
     frame_width: int = FRAME_WIDTH,
     frame_height: int = FRAME_HEIGHT,
     bg_color: tuple = (0.95, 0.95, 0.97, 1.0)
@@ -540,12 +551,21 @@ def render_single_frame(
         znear=0.1, zfar=20.0
     )
-    # Camera pose: positioned BEHIND the subject (at +Z), looking at -Z (toward face)
     camera_pose = np.eye(4)
-    camera_pose[0, 3] = camera_target[0]
-    camera_pose[1, 3] = camera_target[1]
-    camera_pose[2, 3] = camera_target[2] + camera_distance
-    camera_pose[:3, :3] = np.eye(3)
     scene.add(camera, pose=camera_pose)
@@ -594,8 +614,8 @@ def render_side_by_side_frame(
     faces: np.ndarray,
     labels: list,
     fixed_centers: list = None,
-    camera_distance: float = 4.5,
-    focal_length: float = 1200,
     frame_width: int = FRAME_WIDTH,
     frame_height: int = FRAME_HEIGHT,
     bg_color: tuple = (0.95, 0.95, 0.97, 1.0)
@@ -635,8 +655,8 @@ def render_video(
     label: str = "",
     fps: int = VIDEO_FPS,
     slowdown: int = VIDEO_SLOWDOWN,
-    camera_distance: float = 4.5,
-    focal_length: float = 1200,
     frame_width: int = FRAME_WIDTH,
     frame_height: int = FRAME_HEIGHT
 ) -> str:
@@ -644,13 +664,9 @@ def render_video(
     if not ensure_pyrender():
         raise RuntimeError("PyRender not available")
-    # =========================================================================
-    # FIX APPLIED: Removed the manual axis flips.
-    # The previous code was flipping Y (upside down) and Z (facing away).
-    # =========================================================================
     verts = verts.copy()
-    # verts[..., 1] *= -1  # <--- COMMENTED OUT (Fixes upside down)
-    # verts[..., 2] *= -1  # <--- COMMENTED OUT (Fixes facing away)
     # Trim last few frames to remove end-of-sequence artifacts
     T_total = verts.shape[0]
@@ -690,8 +706,8 @@ def render_comparison_video(
     label2: str = "",
     fps: int = VIDEO_FPS,
     slowdown: int = VIDEO_SLOWDOWN,
-    camera_distance: float = 4.5,
-    focal_length: float = 1200,
     frame_width: int = FRAME_WIDTH,
     frame_height: int = FRAME_HEIGHT
 ) -> str:
@@ -699,19 +715,11 @@ def render_comparison_video(
     if not ensure_pyrender():
         raise RuntimeError("PyRender not available")
-    # =========================================================================
-    # FIX APPLIED: Removed the manual axis flips.
-    # =========================================================================
     verts1 = verts1.copy()
     verts2 = verts2.copy()
-    # Fix Avatar 1 - Removed flips
-    # verts1[..., 1] *= -1
-    # verts1[..., 2] *= -1
-    # Fix Avatar 2 - Removed flips
-    # verts2[..., 1] *= -1
-    # verts2[..., 2] *= -1
     # Match lengths and trim
     T_total = min(verts1.shape[0], verts2.shape[0])
@@ -878,7 +886,7 @@ def create_gradio_interface():
                     lines=1, max_lines=1
                 )
-                generate_btn = gr.Button("Generate Motion", variant="primary")
                 gr.Markdown("---")
                 gr.Markdown("### Generated Tokens")
@@ -886,7 +894,8 @@ def create_gradio_interface():
                 tokens_output = gr.Textbox(
                     label="Motion Tokens (both variants)",
                     lines=8,
-                    interactive=False
                 )
                 if _word_pid_map:
@@ -897,7 +906,8 @@ def create_gradio_interface():
                 gr.Markdown("### Motion Comparison (Two Signer Variants)")
                 video_output = gr.Video(
                     label="Generated Motion",
-                    autoplay=True
                 )
         if example_list:
@@ -906,16 +916,17 @@ def create_gradio_interface():
             for item in example_list:
                 word, pid = item['word'], item['pid']
-                with gr.Row():
                     with gr.Column(scale=1, min_width=180):
                         gr.HTML(f'<div class="example-word-label">{word.upper()}</div>')
                         gr.HTML(f'<div class="example-variant-label">Variant: {pid}</div>')
-                        example_btn = gr.Button("Load Example", variant="secondary")
                     with gr.Column(scale=3, min_width=500):
                         example_video = gr.Video(
                             label=f"Example: {word}",
-                            autoplay=False
                         )
                     example_btn.click(
@@ -970,4 +981,4 @@ if __name__ == "__main__":
         server_name="0.0.0.0",
         server_port=7860,
         share=False
-    )

 AVATAR_COLOR = (0.36, 0.78, 0.36, 1.0)  # Green color as RGBA
 VIDEO_FPS = 15
 VIDEO_SLOWDOWN = 2
+FRAME_WIDTH = 544  # Must be divisible by 16 for video codec compatibility
+FRAME_HEIGHT = 720
 # =====================================================================
 # Install/Import Dependencies
 # =====================================================================
+try:
+    import gradio as gr
+except ImportError:
+    os.system("pip install -q gradio>=4.0.0")
+    import gradio as gr
+try:
+    import smplx
+except ImportError:
+    os.system("pip install -q smplx==0.1.28")
+    import smplx
 # PyRender for high-quality rendering
 PYRENDER_AVAILABLE = False
     from PIL import Image, ImageDraw, ImageFont
     PYRENDER_AVAILABLE = True
 except ImportError:
+    pass
+try:
+    import imageio
+except ImportError:
+    os.system("pip install -q imageio[ffmpeg]")
+    import imageio
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch.nn.functional as F
     label: str = "",
     color: tuple = AVATAR_COLOR,
     fixed_center: np.ndarray = None,
+    camera_distance: float = 3.5,
+    focal_length: float = 2000,
     frame_width: int = FRAME_WIDTH,
     frame_height: int = FRAME_HEIGHT,
     bg_color: tuple = (0.95, 0.95, 0.97, 1.0)
         znear=0.1, zfar=20.0
     )
+    # Camera pose: After 180-degree rotation around X-axis, coordinate system changes
+    # Camera should be positioned in front (negative Z) with flipped orientation
+    # This matches visualize.py and ensures proper face visibility
     camera_pose = np.eye(4)
+    camera_pose[0, 3] = camera_target[0]                    # Center X
+    camera_pose[1, 3] = camera_target[1]                    # Center Y (body center)
+    camera_pose[2, 3] = camera_target[2] - camera_distance  # In front (negative Z)
+    # Camera orientation: flip to look at subject (SOKE-style)
+    # This rotation makes camera look toward +Z (at the subject)
+    camera_pose[:3, :3] = np.array([
+        [1,  0,  0],
+        [0, -1,  0],
+        [0,  0, -1]
+    ])
     scene.add(camera, pose=camera_pose)
     faces: np.ndarray,
     labels: list,
     fixed_centers: list = None,
+    camera_distance: float = 3.5,
+    focal_length: float = 2000,
     frame_width: int = FRAME_WIDTH,
     frame_height: int = FRAME_HEIGHT,
     bg_color: tuple = (0.95, 0.95, 0.97, 1.0)
     label: str = "",
     fps: int = VIDEO_FPS,
     slowdown: int = VIDEO_SLOWDOWN,
+    camera_distance: float = 3.5,
+    focal_length: float = 2000,
     frame_width: int = FRAME_WIDTH,
     frame_height: int = FRAME_HEIGHT
 ) -> str:
     if not ensure_pyrender():
         raise RuntimeError("PyRender not available")
+    # Apply orientation fix: rotate 180 degrees around X-axis
     verts = verts.copy()
+    verts[..., 1:] *= -1
     # Trim last few frames to remove end-of-sequence artifacts
     T_total = verts.shape[0]
     label2: str = "",
     fps: int = VIDEO_FPS,
     slowdown: int = VIDEO_SLOWDOWN,
+    camera_distance: float = 3.5,
+    focal_length: float = 2000,
     frame_width: int = FRAME_WIDTH,
     frame_height: int = FRAME_HEIGHT
 ) -> str:
     if not ensure_pyrender():
         raise RuntimeError("PyRender not available")
+    # Apply orientation fix
     verts1 = verts1.copy()
     verts2 = verts2.copy()
+    verts1[..., 1:] *= -1
+    verts2[..., 1:] *= -1
     # Match lengths and trim
     T_total = min(verts1.shape[0], verts2.shape[0])
                     lines=1, max_lines=1
                 )
+                generate_btn = gr.Button("Generate Motion", variant="primary", size="lg")
                 gr.Markdown("---")
                 gr.Markdown("### Generated Tokens")
                 tokens_output = gr.Textbox(
                     label="Motion Tokens (both variants)",
                     lines=8,
+                    interactive=False,
+                    show_copy_button=True
                 )
                 if _word_pid_map:
                 gr.Markdown("### Motion Comparison (Two Signer Variants)")
                 video_output = gr.Video(
                     label="Generated Motion",
+                    autoplay=True,
+                    show_download_button=True
                 )
         if example_list:
             for item in example_list:
                 word, pid = item['word'], item['pid']
+                with gr.Row(elem_classes="example-row"):
                     with gr.Column(scale=1, min_width=180):
                         gr.HTML(f'<div class="example-word-label">{word.upper()}</div>')
                         gr.HTML(f'<div class="example-variant-label">Variant: {pid}</div>')
+                        example_btn = gr.Button("Load Example", size="sm", variant="secondary")
                     with gr.Column(scale=3, min_width=500):
                         example_video = gr.Video(
                             label=f"Example: {word}",
+                            autoplay=False,
+                            show_download_button=True
                         )
                     example_btn.click(
         server_name="0.0.0.0",
         server_port=7860,
         share=False
+    )