Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -67,18 +67,23 @@ PARAM_NAMES = ["betas", "body_pose", "left_hand_pose", "right_hand_pose",
|
|
| 67 |
AVATAR_COLOR = (0.36, 0.78, 0.36, 1.0) # Green color as RGBA
|
| 68 |
VIDEO_FPS = 15
|
| 69 |
VIDEO_SLOWDOWN = 2
|
| 70 |
-
FRAME_WIDTH =
|
| 71 |
-
FRAME_HEIGHT =
|
| 72 |
|
| 73 |
# =====================================================================
|
| 74 |
# Install/Import Dependencies
|
| 75 |
# =====================================================================
|
| 76 |
-
|
| 77 |
-
import
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
-
|
| 80 |
-
import smplx
|
| 81 |
-
|
|
|
|
|
|
|
| 82 |
|
| 83 |
# PyRender for high-quality rendering
|
| 84 |
PYRENDER_AVAILABLE = False
|
|
@@ -88,7 +93,13 @@ try:
|
|
| 88 |
from PIL import Image, ImageDraw, ImageFont
|
| 89 |
PYRENDER_AVAILABLE = True
|
| 90 |
except ImportError:
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 94 |
import torch.nn.functional as F
|
|
@@ -498,8 +509,8 @@ def render_single_frame(
|
|
| 498 |
label: str = "",
|
| 499 |
color: tuple = AVATAR_COLOR,
|
| 500 |
fixed_center: np.ndarray = None,
|
| 501 |
-
camera_distance: float =
|
| 502 |
-
focal_length: float =
|
| 503 |
frame_width: int = FRAME_WIDTH,
|
| 504 |
frame_height: int = FRAME_HEIGHT,
|
| 505 |
bg_color: tuple = (0.95, 0.95, 0.97, 1.0)
|
|
@@ -540,12 +551,21 @@ def render_single_frame(
|
|
| 540 |
znear=0.1, zfar=20.0
|
| 541 |
)
|
| 542 |
|
| 543 |
-
# Camera pose:
|
|
|
|
|
|
|
| 544 |
camera_pose = np.eye(4)
|
| 545 |
-
camera_pose[0, 3] = camera_target[0]
|
| 546 |
-
camera_pose[1, 3] = camera_target[1]
|
| 547 |
-
camera_pose[2, 3] = camera_target[2]
|
| 548 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 549 |
|
| 550 |
scene.add(camera, pose=camera_pose)
|
| 551 |
|
|
@@ -594,8 +614,8 @@ def render_side_by_side_frame(
|
|
| 594 |
faces: np.ndarray,
|
| 595 |
labels: list,
|
| 596 |
fixed_centers: list = None,
|
| 597 |
-
camera_distance: float =
|
| 598 |
-
focal_length: float =
|
| 599 |
frame_width: int = FRAME_WIDTH,
|
| 600 |
frame_height: int = FRAME_HEIGHT,
|
| 601 |
bg_color: tuple = (0.95, 0.95, 0.97, 1.0)
|
|
@@ -635,8 +655,8 @@ def render_video(
|
|
| 635 |
label: str = "",
|
| 636 |
fps: int = VIDEO_FPS,
|
| 637 |
slowdown: int = VIDEO_SLOWDOWN,
|
| 638 |
-
camera_distance: float =
|
| 639 |
-
focal_length: float =
|
| 640 |
frame_width: int = FRAME_WIDTH,
|
| 641 |
frame_height: int = FRAME_HEIGHT
|
| 642 |
) -> str:
|
|
@@ -644,13 +664,9 @@ def render_video(
|
|
| 644 |
if not ensure_pyrender():
|
| 645 |
raise RuntimeError("PyRender not available")
|
| 646 |
|
| 647 |
-
#
|
| 648 |
-
# FIX APPLIED: Removed the manual axis flips.
|
| 649 |
-
# The previous code was flipping Y (upside down) and Z (facing away).
|
| 650 |
-
# =========================================================================
|
| 651 |
verts = verts.copy()
|
| 652 |
-
|
| 653 |
-
# verts[..., 2] *= -1 # <--- COMMENTED OUT (Fixes facing away)
|
| 654 |
|
| 655 |
# Trim last few frames to remove end-of-sequence artifacts
|
| 656 |
T_total = verts.shape[0]
|
|
@@ -690,8 +706,8 @@ def render_comparison_video(
|
|
| 690 |
label2: str = "",
|
| 691 |
fps: int = VIDEO_FPS,
|
| 692 |
slowdown: int = VIDEO_SLOWDOWN,
|
| 693 |
-
camera_distance: float =
|
| 694 |
-
focal_length: float =
|
| 695 |
frame_width: int = FRAME_WIDTH,
|
| 696 |
frame_height: int = FRAME_HEIGHT
|
| 697 |
) -> str:
|
|
@@ -699,19 +715,11 @@ def render_comparison_video(
|
|
| 699 |
if not ensure_pyrender():
|
| 700 |
raise RuntimeError("PyRender not available")
|
| 701 |
|
| 702 |
-
#
|
| 703 |
-
# FIX APPLIED: Removed the manual axis flips.
|
| 704 |
-
# =========================================================================
|
| 705 |
verts1 = verts1.copy()
|
| 706 |
verts2 = verts2.copy()
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
# verts1[..., 1] *= -1
|
| 710 |
-
# verts1[..., 2] *= -1
|
| 711 |
-
|
| 712 |
-
# Fix Avatar 2 - Removed flips
|
| 713 |
-
# verts2[..., 1] *= -1
|
| 714 |
-
# verts2[..., 2] *= -1
|
| 715 |
|
| 716 |
# Match lengths and trim
|
| 717 |
T_total = min(verts1.shape[0], verts2.shape[0])
|
|
@@ -878,7 +886,7 @@ def create_gradio_interface():
|
|
| 878 |
lines=1, max_lines=1
|
| 879 |
)
|
| 880 |
|
| 881 |
-
generate_btn = gr.Button("Generate Motion", variant="primary")
|
| 882 |
|
| 883 |
gr.Markdown("---")
|
| 884 |
gr.Markdown("### Generated Tokens")
|
|
@@ -886,7 +894,8 @@ def create_gradio_interface():
|
|
| 886 |
tokens_output = gr.Textbox(
|
| 887 |
label="Motion Tokens (both variants)",
|
| 888 |
lines=8,
|
| 889 |
-
interactive=False
|
|
|
|
| 890 |
)
|
| 891 |
|
| 892 |
if _word_pid_map:
|
|
@@ -897,7 +906,8 @@ def create_gradio_interface():
|
|
| 897 |
gr.Markdown("### Motion Comparison (Two Signer Variants)")
|
| 898 |
video_output = gr.Video(
|
| 899 |
label="Generated Motion",
|
| 900 |
-
autoplay=True
|
|
|
|
| 901 |
)
|
| 902 |
|
| 903 |
if example_list:
|
|
@@ -906,16 +916,17 @@ def create_gradio_interface():
|
|
| 906 |
|
| 907 |
for item in example_list:
|
| 908 |
word, pid = item['word'], item['pid']
|
| 909 |
-
with gr.Row():
|
| 910 |
with gr.Column(scale=1, min_width=180):
|
| 911 |
gr.HTML(f'<div class="example-word-label">{word.upper()}</div>')
|
| 912 |
gr.HTML(f'<div class="example-variant-label">Variant: {pid}</div>')
|
| 913 |
-
example_btn = gr.Button("Load Example", variant="secondary")
|
| 914 |
|
| 915 |
with gr.Column(scale=3, min_width=500):
|
| 916 |
example_video = gr.Video(
|
| 917 |
label=f"Example: {word}",
|
| 918 |
-
autoplay=False
|
|
|
|
| 919 |
)
|
| 920 |
|
| 921 |
example_btn.click(
|
|
@@ -970,4 +981,4 @@ if __name__ == "__main__":
|
|
| 970 |
server_name="0.0.0.0",
|
| 971 |
server_port=7860,
|
| 972 |
share=False
|
| 973 |
-
)
|
|
|
|
| 67 |
AVATAR_COLOR = (0.36, 0.78, 0.36, 1.0) # Green color as RGBA
|
| 68 |
VIDEO_FPS = 15
|
| 69 |
VIDEO_SLOWDOWN = 2
|
| 70 |
+
FRAME_WIDTH = 544 # Must be divisible by 16 for video codec compatibility
|
| 71 |
+
FRAME_HEIGHT = 720
|
| 72 |
|
| 73 |
# =====================================================================
|
| 74 |
# Install/Import Dependencies
|
| 75 |
# =====================================================================
|
| 76 |
+
try:
|
| 77 |
+
import gradio as gr
|
| 78 |
+
except ImportError:
|
| 79 |
+
os.system("pip install -q gradio>=4.0.0")
|
| 80 |
+
import gradio as gr
|
| 81 |
|
| 82 |
+
try:
|
| 83 |
+
import smplx
|
| 84 |
+
except ImportError:
|
| 85 |
+
os.system("pip install -q smplx==0.1.28")
|
| 86 |
+
import smplx
|
| 87 |
|
| 88 |
# PyRender for high-quality rendering
|
| 89 |
PYRENDER_AVAILABLE = False
|
|
|
|
| 93 |
from PIL import Image, ImageDraw, ImageFont
|
| 94 |
PYRENDER_AVAILABLE = True
|
| 95 |
except ImportError:
|
| 96 |
+
pass
|
| 97 |
+
|
| 98 |
+
try:
|
| 99 |
+
import imageio
|
| 100 |
+
except ImportError:
|
| 101 |
+
os.system("pip install -q imageio[ffmpeg]")
|
| 102 |
+
import imageio
|
| 103 |
|
| 104 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 105 |
import torch.nn.functional as F
|
|
|
|
| 509 |
label: str = "",
|
| 510 |
color: tuple = AVATAR_COLOR,
|
| 511 |
fixed_center: np.ndarray = None,
|
| 512 |
+
camera_distance: float = 3.5,
|
| 513 |
+
focal_length: float = 2000,
|
| 514 |
frame_width: int = FRAME_WIDTH,
|
| 515 |
frame_height: int = FRAME_HEIGHT,
|
| 516 |
bg_color: tuple = (0.95, 0.95, 0.97, 1.0)
|
|
|
|
| 551 |
znear=0.1, zfar=20.0
|
| 552 |
)
|
| 553 |
|
| 554 |
+
# Camera pose: After 180-degree rotation around X-axis, coordinate system changes
|
| 555 |
+
# Camera should be positioned in front (negative Z) with flipped orientation
|
| 556 |
+
# This matches visualize.py and ensures proper face visibility
|
| 557 |
camera_pose = np.eye(4)
|
| 558 |
+
camera_pose[0, 3] = camera_target[0] # Center X
|
| 559 |
+
camera_pose[1, 3] = camera_target[1] # Center Y (body center)
|
| 560 |
+
camera_pose[2, 3] = camera_target[2] - camera_distance # In front (negative Z)
|
| 561 |
+
|
| 562 |
+
# Camera orientation: flip to look at subject (SOKE-style)
|
| 563 |
+
# This rotation makes camera look toward +Z (at the subject)
|
| 564 |
+
camera_pose[:3, :3] = np.array([
|
| 565 |
+
[1, 0, 0],
|
| 566 |
+
[0, -1, 0],
|
| 567 |
+
[0, 0, -1]
|
| 568 |
+
])
|
| 569 |
|
| 570 |
scene.add(camera, pose=camera_pose)
|
| 571 |
|
|
|
|
| 614 |
faces: np.ndarray,
|
| 615 |
labels: list,
|
| 616 |
fixed_centers: list = None,
|
| 617 |
+
camera_distance: float = 3.5,
|
| 618 |
+
focal_length: float = 2000,
|
| 619 |
frame_width: int = FRAME_WIDTH,
|
| 620 |
frame_height: int = FRAME_HEIGHT,
|
| 621 |
bg_color: tuple = (0.95, 0.95, 0.97, 1.0)
|
|
|
|
| 655 |
label: str = "",
|
| 656 |
fps: int = VIDEO_FPS,
|
| 657 |
slowdown: int = VIDEO_SLOWDOWN,
|
| 658 |
+
camera_distance: float = 3.5,
|
| 659 |
+
focal_length: float = 2000,
|
| 660 |
frame_width: int = FRAME_WIDTH,
|
| 661 |
frame_height: int = FRAME_HEIGHT
|
| 662 |
) -> str:
|
|
|
|
| 664 |
if not ensure_pyrender():
|
| 665 |
raise RuntimeError("PyRender not available")
|
| 666 |
|
| 667 |
+
# Apply orientation fix: rotate 180 degrees around X-axis
|
|
|
|
|
|
|
|
|
|
| 668 |
verts = verts.copy()
|
| 669 |
+
verts[..., 1:] *= -1
|
|
|
|
| 670 |
|
| 671 |
# Trim last few frames to remove end-of-sequence artifacts
|
| 672 |
T_total = verts.shape[0]
|
|
|
|
| 706 |
label2: str = "",
|
| 707 |
fps: int = VIDEO_FPS,
|
| 708 |
slowdown: int = VIDEO_SLOWDOWN,
|
| 709 |
+
camera_distance: float = 3.5,
|
| 710 |
+
focal_length: float = 2000,
|
| 711 |
frame_width: int = FRAME_WIDTH,
|
| 712 |
frame_height: int = FRAME_HEIGHT
|
| 713 |
) -> str:
|
|
|
|
| 715 |
if not ensure_pyrender():
|
| 716 |
raise RuntimeError("PyRender not available")
|
| 717 |
|
| 718 |
+
# Apply orientation fix
|
|
|
|
|
|
|
| 719 |
verts1 = verts1.copy()
|
| 720 |
verts2 = verts2.copy()
|
| 721 |
+
verts1[..., 1:] *= -1
|
| 722 |
+
verts2[..., 1:] *= -1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 723 |
|
| 724 |
# Match lengths and trim
|
| 725 |
T_total = min(verts1.shape[0], verts2.shape[0])
|
|
|
|
| 886 |
lines=1, max_lines=1
|
| 887 |
)
|
| 888 |
|
| 889 |
+
generate_btn = gr.Button("Generate Motion", variant="primary", size="lg")
|
| 890 |
|
| 891 |
gr.Markdown("---")
|
| 892 |
gr.Markdown("### Generated Tokens")
|
|
|
|
| 894 |
tokens_output = gr.Textbox(
|
| 895 |
label="Motion Tokens (both variants)",
|
| 896 |
lines=8,
|
| 897 |
+
interactive=False,
|
| 898 |
+
show_copy_button=True
|
| 899 |
)
|
| 900 |
|
| 901 |
if _word_pid_map:
|
|
|
|
| 906 |
gr.Markdown("### Motion Comparison (Two Signer Variants)")
|
| 907 |
video_output = gr.Video(
|
| 908 |
label="Generated Motion",
|
| 909 |
+
autoplay=True,
|
| 910 |
+
show_download_button=True
|
| 911 |
)
|
| 912 |
|
| 913 |
if example_list:
|
|
|
|
| 916 |
|
| 917 |
for item in example_list:
|
| 918 |
word, pid = item['word'], item['pid']
|
| 919 |
+
with gr.Row(elem_classes="example-row"):
|
| 920 |
with gr.Column(scale=1, min_width=180):
|
| 921 |
gr.HTML(f'<div class="example-word-label">{word.upper()}</div>')
|
| 922 |
gr.HTML(f'<div class="example-variant-label">Variant: {pid}</div>')
|
| 923 |
+
example_btn = gr.Button("Load Example", size="sm", variant="secondary")
|
| 924 |
|
| 925 |
with gr.Column(scale=3, min_width=500):
|
| 926 |
example_video = gr.Video(
|
| 927 |
label=f"Example: {word}",
|
| 928 |
+
autoplay=False,
|
| 929 |
+
show_download_button=True
|
| 930 |
)
|
| 931 |
|
| 932 |
example_btn.click(
|
|
|
|
| 981 |
server_name="0.0.0.0",
|
| 982 |
server_port=7860,
|
| 983 |
share=False
|
| 984 |
+
)
|