Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,7 +6,6 @@ Uses PyRender for high-quality avatar visualization
|
|
| 6 |
# IMPORTANT: Set OpenGL platform BEFORE any OpenGL imports (for headless rendering)
|
| 7 |
import os
|
| 8 |
os.environ["PYOPENGL_PLATFORM"] = "egl"
|
| 9 |
-
|
| 10 |
import sys
|
| 11 |
import re
|
| 12 |
import json
|
|
@@ -15,12 +14,9 @@ import warnings
|
|
| 15 |
import tempfile
|
| 16 |
import uuid
|
| 17 |
from pathlib import Path
|
| 18 |
-
|
| 19 |
import torch
|
| 20 |
import numpy as np
|
| 21 |
-
|
| 22 |
warnings.filterwarnings("ignore")
|
| 23 |
-
|
| 24 |
# =====================================================================
|
| 25 |
# Configuration for HuggingFace Spaces
|
| 26 |
# =====================================================================
|
|
@@ -29,19 +25,15 @@ DATA_DIR = os.path.join(WORK_DIR, "data")
|
|
| 29 |
OUTPUT_DIR = os.path.join(WORK_DIR, "outputs")
|
| 30 |
os.makedirs(DATA_DIR, exist_ok=True)
|
| 31 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 32 |
-
|
| 33 |
# Path definitions
|
| 34 |
DATASET_PATH = os.path.join(DATA_DIR, "motion_llm_dataset.json")
|
| 35 |
VQVAE_CHECKPOINT = os.path.join(DATA_DIR, "vqvae_model.pt")
|
| 36 |
STATS_PATH = os.path.join(DATA_DIR, "vqvae_stats.pt")
|
| 37 |
SMPLX_MODEL_DIR = os.path.join(DATA_DIR, "smplx_models")
|
| 38 |
-
|
| 39 |
# HuggingFace model config
|
| 40 |
HF_REPO_ID = os.environ.get("HF_REPO_ID", "rdz-falcon/SignMotionGPTfit-archive")
|
| 41 |
HF_SUBFOLDER = os.environ.get("HF_SUBFOLDER", "stage2_v2/epoch-030")
|
| 42 |
-
|
| 43 |
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 44 |
-
|
| 45 |
# Generation parameters
|
| 46 |
M_START = "<M_START>"
|
| 47 |
M_END = "<M_END>"
|
|
@@ -49,7 +41,6 @@ PAD_TOKEN = "<PAD>"
|
|
| 49 |
INFERENCE_TEMPERATURE = 0.7
|
| 50 |
INFERENCE_TOP_K = 50
|
| 51 |
INFERENCE_REPETITION_PENALTY = 1.2
|
| 52 |
-
|
| 53 |
# VQ-VAE parameters
|
| 54 |
SMPL_DIM = 182
|
| 55 |
CODEBOOK_SIZE = 512
|
|
@@ -58,18 +49,15 @@ VQ_ARGS = dict(
|
|
| 58 |
width=512, depth=3, down_t=2, stride_t=2,
|
| 59 |
dilation_growth_rate=3, activation='relu', norm=None, quantizer="ema_reset"
|
| 60 |
)
|
| 61 |
-
|
| 62 |
PARAM_DIMS = [10, 63, 45, 45, 3, 10, 3, 3]
|
| 63 |
-
PARAM_NAMES = ["
|
| 64 |
-
"
|
| 65 |
-
|
| 66 |
# Visualization defaults
|
| 67 |
AVATAR_COLOR = (0.36, 0.78, 0.36, 1.0) # Green color as RGBA
|
| 68 |
VIDEO_FPS = 15
|
| 69 |
VIDEO_SLOWDOWN = 2
|
| 70 |
FRAME_WIDTH = 544 # Must be divisible by 16 for video codec compatibility
|
| 71 |
FRAME_HEIGHT = 720
|
| 72 |
-
|
| 73 |
# =====================================================================
|
| 74 |
# Install/Import Dependencies
|
| 75 |
# =====================================================================
|
|
@@ -78,13 +66,11 @@ try:
|
|
| 78 |
except ImportError:
|
| 79 |
os.system("pip install -q gradio>=4.0.0")
|
| 80 |
import gradio as gr
|
| 81 |
-
|
| 82 |
try:
|
| 83 |
import smplx
|
| 84 |
except ImportError:
|
| 85 |
os.system("pip install -q smplx==0.1.28")
|
| 86 |
import smplx
|
| 87 |
-
|
| 88 |
# PyRender for high-quality rendering
|
| 89 |
PYRENDER_AVAILABLE = False
|
| 90 |
try:
|
|
@@ -94,16 +80,13 @@ try:
|
|
| 94 |
PYRENDER_AVAILABLE = True
|
| 95 |
except ImportError:
|
| 96 |
pass
|
| 97 |
-
|
| 98 |
try:
|
| 99 |
import imageio
|
| 100 |
except ImportError:
|
| 101 |
os.system("pip install -q imageio[ffmpeg]")
|
| 102 |
import imageio
|
| 103 |
-
|
| 104 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 105 |
import torch.nn.functional as F
|
| 106 |
-
|
| 107 |
# =====================================================================
|
| 108 |
# Import VQ-VAE architecture
|
| 109 |
# =====================================================================
|
|
@@ -113,13 +96,11 @@ if parent_dir not in sys.path:
|
|
| 113 |
sys.path.insert(0, parent_dir)
|
| 114 |
if current_dir not in sys.path:
|
| 115 |
sys.path.insert(0, current_dir)
|
| 116 |
-
|
| 117 |
try:
|
| 118 |
from mGPT.archs.mgpt_vq import VQVae
|
| 119 |
except ImportError as e:
|
| 120 |
print(f"Warning: Could not import VQVae: {e}")
|
| 121 |
VQVae = None
|
| 122 |
-
|
| 123 |
# =====================================================================
|
| 124 |
# Global Cache
|
| 125 |
# =====================================================================
|
|
@@ -131,10 +112,8 @@ _model_cache = {
|
|
| 131 |
"stats": (None, None),
|
| 132 |
"initialized": False
|
| 133 |
}
|
| 134 |
-
|
| 135 |
_word_pid_map = {}
|
| 136 |
_example_cache = {}
|
| 137 |
-
|
| 138 |
# =====================================================================
|
| 139 |
# PyRender Setup
|
| 140 |
# =====================================================================
|
|
@@ -143,12 +122,12 @@ def ensure_pyrender():
|
|
| 143 |
global PYRENDER_AVAILABLE, trimesh, pyrender, Image, ImageDraw, ImageFont
|
| 144 |
if PYRENDER_AVAILABLE:
|
| 145 |
return True
|
| 146 |
-
|
| 147 |
print("Installing pyrender dependencies...")
|
| 148 |
if os.path.exists("/etc/debian_version"):
|
| 149 |
os.system("apt-get update -qq && apt-get install -qq -y libegl1-mesa-dev libgles2-mesa-dev > /dev/null 2>&1")
|
| 150 |
os.system("pip install -q trimesh pyrender PyOpenGL PyOpenGL_accelerate Pillow")
|
| 151 |
-
|
| 152 |
try:
|
| 153 |
import trimesh
|
| 154 |
import pyrender
|
|
@@ -158,23 +137,22 @@ def ensure_pyrender():
|
|
| 158 |
except ImportError as e:
|
| 159 |
print(f"Could not install pyrender: {e}")
|
| 160 |
return False
|
| 161 |
-
|
| 162 |
# =====================================================================
|
| 163 |
# Dataset Loading - Word to PID mapping
|
| 164 |
# =====================================================================
|
| 165 |
def load_word_pid_mapping():
|
| 166 |
"""Load the dataset and build word -> PIDs mapping."""
|
| 167 |
global _word_pid_map
|
| 168 |
-
|
| 169 |
if not os.path.exists(DATASET_PATH):
|
| 170 |
print(f"Dataset not found: {DATASET_PATH}")
|
| 171 |
return
|
| 172 |
-
|
| 173 |
print(f"Loading dataset from: {DATASET_PATH}")
|
| 174 |
try:
|
| 175 |
with open(DATASET_PATH, 'r', encoding='utf-8') as f:
|
| 176 |
data = json.load(f)
|
| 177 |
-
|
| 178 |
for entry in data:
|
| 179 |
word = entry.get('word', '').lower()
|
| 180 |
pid = entry.get('participant_id', '')
|
|
@@ -182,21 +160,17 @@ def load_word_pid_mapping():
|
|
| 182 |
if word not in _word_pid_map:
|
| 183 |
_word_pid_map[word] = set()
|
| 184 |
_word_pid_map[word].add(pid)
|
| 185 |
-
|
| 186 |
for word in _word_pid_map:
|
| 187 |
_word_pid_map[word] = sorted(list(_word_pid_map[word]))
|
| 188 |
-
|
| 189 |
print(f"Loaded {len(_word_pid_map)} unique words from dataset")
|
| 190 |
except Exception as e:
|
| 191 |
print(f"Error loading dataset: {e}")
|
| 192 |
-
|
| 193 |
-
|
| 194 |
def get_pids_for_word(word: str) -> list:
|
| 195 |
"""Get valid PIDs for a word from the dataset."""
|
| 196 |
word = word.lower().strip()
|
| 197 |
return _word_pid_map.get(word, [])
|
| 198 |
-
|
| 199 |
-
|
| 200 |
def get_random_pids_for_word(word: str, count: int = 2) -> list:
|
| 201 |
"""Get random PIDs for a word. Returns up to 'count' PIDs."""
|
| 202 |
pids = get_pids_for_word(word)
|
|
@@ -205,29 +179,26 @@ def get_random_pids_for_word(word: str, count: int = 2) -> list:
|
|
| 205 |
if len(pids) <= count:
|
| 206 |
return pids
|
| 207 |
return random.sample(pids, count)
|
| 208 |
-
|
| 209 |
-
|
| 210 |
def get_example_words_with_pids(count: int = 3) -> list:
|
| 211 |
"""Get example words with valid PIDs from dataset."""
|
| 212 |
examples = []
|
| 213 |
preferred = ['push', 'passport', 'library', 'send', 'college', 'help', 'thank', 'hello']
|
| 214 |
-
|
| 215 |
for word in preferred:
|
| 216 |
pids = get_pids_for_word(word)
|
| 217 |
if pids:
|
| 218 |
examples.append((word, pids[0]))
|
| 219 |
if len(examples) >= count:
|
| 220 |
break
|
| 221 |
-
|
| 222 |
if len(examples) < count:
|
| 223 |
available = [w for w in _word_pid_map.keys() if w not in [e[0] for e in examples]]
|
| 224 |
random.shuffle(available)
|
| 225 |
for word in available[:count - len(examples)]:
|
| 226 |
pids = _word_pid_map[word]
|
| 227 |
examples.append((word, pids[0]))
|
| 228 |
-
|
| 229 |
-
return examples
|
| 230 |
|
|
|
|
| 231 |
# =====================================================================
|
| 232 |
# VQ-VAE Wrapper
|
| 233 |
# =====================================================================
|
|
@@ -240,14 +211,13 @@ class MotionGPT_VQVAE_Wrapper(torch.nn.Module):
|
|
| 240 |
nfeats=smpl_dim, code_num=codebook_size, code_dim=code_dim,
|
| 241 |
output_emb_width=code_dim, **kwargs
|
| 242 |
)
|
| 243 |
-
|
| 244 |
# =====================================================================
|
| 245 |
# Model Loading Functions
|
| 246 |
# =====================================================================
|
| 247 |
def load_llm_model():
|
| 248 |
print(f"Loading LLM from: {HF_REPO_ID}/{HF_SUBFOLDER}")
|
| 249 |
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
|
| 250 |
-
|
| 251 |
tokenizer = AutoTokenizer.from_pretrained(
|
| 252 |
HF_REPO_ID, subfolder=HF_SUBFOLDER, trust_remote_code=True, token=token
|
| 253 |
)
|
|
@@ -263,8 +233,6 @@ def load_llm_model():
|
|
| 263 |
model.eval()
|
| 264 |
print(f"LLM loaded (vocab size: {len(tokenizer)})")
|
| 265 |
return model, tokenizer
|
| 266 |
-
|
| 267 |
-
|
| 268 |
def load_vqvae_model():
|
| 269 |
if not os.path.exists(VQVAE_CHECKPOINT):
|
| 270 |
print(f"VQ-VAE checkpoint not found: {VQVAE_CHECKPOINT}")
|
|
@@ -277,8 +245,6 @@ def load_vqvae_model():
|
|
| 277 |
model.eval()
|
| 278 |
print(f"VQ-VAE loaded")
|
| 279 |
return model
|
| 280 |
-
|
| 281 |
-
|
| 282 |
def load_stats():
|
| 283 |
if not os.path.exists(STATS_PATH):
|
| 284 |
return None, None
|
|
@@ -287,8 +253,6 @@ def load_stats():
|
|
| 287 |
if torch.is_tensor(mean): mean = mean.cpu().numpy()
|
| 288 |
if torch.is_tensor(std): std = std.cpu().numpy()
|
| 289 |
return mean, std
|
| 290 |
-
|
| 291 |
-
|
| 292 |
def load_smplx_model():
|
| 293 |
if not os.path.exists(SMPLX_MODEL_DIR):
|
| 294 |
print(f"SMPL-X directory not found: {SMPLX_MODEL_DIR}")
|
|
@@ -302,47 +266,43 @@ def load_smplx_model():
|
|
| 302 |
).to(DEVICE)
|
| 303 |
print(f"SMPL-X loaded")
|
| 304 |
return model
|
| 305 |
-
|
| 306 |
-
|
| 307 |
def initialize_models():
|
| 308 |
global _model_cache
|
| 309 |
if _model_cache["initialized"]:
|
| 310 |
return
|
| 311 |
-
|
| 312 |
print("\n" + "="*60)
|
| 313 |
print(" Initializing SignMotionGPT Models")
|
| 314 |
print("="*60)
|
| 315 |
-
|
| 316 |
load_word_pid_mapping()
|
| 317 |
-
|
| 318 |
_model_cache["llm_model"], _model_cache["llm_tokenizer"] = load_llm_model()
|
| 319 |
-
|
| 320 |
try:
|
| 321 |
_model_cache["vqvae_model"] = load_vqvae_model()
|
| 322 |
_model_cache["stats"] = load_stats()
|
| 323 |
_model_cache["smplx_model"] = load_smplx_model()
|
| 324 |
except Exception as e:
|
| 325 |
print(f"Could not load visualization models: {e}")
|
| 326 |
-
|
| 327 |
# Ensure PyRender is available
|
| 328 |
ensure_pyrender()
|
| 329 |
-
|
| 330 |
_model_cache["initialized"] = True
|
| 331 |
print("All models initialized")
|
| 332 |
print("="*60)
|
| 333 |
-
|
| 334 |
-
|
| 335 |
def precompute_examples():
|
| 336 |
"""Pre-compute animations for example words at startup."""
|
| 337 |
global _example_cache
|
| 338 |
-
|
| 339 |
if not _model_cache["initialized"]:
|
| 340 |
return
|
| 341 |
-
|
| 342 |
examples = get_example_words_with_pids(3)
|
| 343 |
-
|
| 344 |
print(f"\nPre-computing {len(examples)} example animations...")
|
| 345 |
-
|
| 346 |
for word, pid in examples:
|
| 347 |
key = f"{word}_{pid}"
|
| 348 |
print(f" Computing: {word} ({pid})...")
|
|
@@ -353,22 +313,21 @@ def precompute_examples():
|
|
| 353 |
except Exception as e:
|
| 354 |
print(f" Failed: {word} - {e}")
|
| 355 |
_example_cache[key] = {"video_path": None, "tokens": "", "word": word, "pid": pid}
|
| 356 |
-
|
| 357 |
-
print("Example pre-computation complete\n")
|
| 358 |
|
|
|
|
| 359 |
# =====================================================================
|
| 360 |
# Motion Generation Functions
|
| 361 |
# =====================================================================
|
| 362 |
def generate_motion_tokens(word: str, variant: str) -> str:
|
| 363 |
model = _model_cache["llm_model"]
|
| 364 |
tokenizer = _model_cache["llm_tokenizer"]
|
| 365 |
-
|
| 366 |
if model is None or tokenizer is None:
|
| 367 |
raise RuntimeError("LLM model not loaded")
|
| 368 |
-
|
| 369 |
prompt = f"Instruction: Generate motion for word '{word}' with variant '{variant}'.\nMotion: "
|
| 370 |
inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
|
| 371 |
-
|
| 372 |
with torch.no_grad():
|
| 373 |
output = model.generate(
|
| 374 |
**inputs, max_new_tokens=100, do_sample=True,
|
|
@@ -378,46 +337,42 @@ def generate_motion_tokens(word: str, variant: str) -> str:
|
|
| 378 |
eos_token_id=tokenizer.convert_tokens_to_ids(M_END),
|
| 379 |
early_stopping=True
|
| 380 |
)
|
| 381 |
-
|
| 382 |
decoded = tokenizer.decode(output[0], skip_special_tokens=False)
|
| 383 |
motion_part = decoded.split("Motion: ")[-1] if "Motion: " in decoded else decoded
|
| 384 |
return motion_part.strip()
|
| 385 |
-
|
| 386 |
-
|
| 387 |
def parse_motion_tokens(token_str: str) -> list:
|
| 388 |
if isinstance(token_str, (list, tuple, np.ndarray)):
|
| 389 |
return [int(x) for x in token_str]
|
| 390 |
if not isinstance(token_str, str):
|
| 391 |
return []
|
| 392 |
-
|
| 393 |
matches = re.findall(r'<M(\d+)>', token_str)
|
| 394 |
if matches:
|
| 395 |
return [int(x) for x in matches]
|
| 396 |
-
|
| 397 |
matches = re.findall(r'<motion_(\d+)>', token_str)
|
| 398 |
if matches:
|
| 399 |
return [int(x) for x in matches]
|
| 400 |
-
|
| 401 |
-
return []
|
| 402 |
-
|
| 403 |
|
|
|
|
| 404 |
def decode_tokens_to_params(tokens: list) -> np.ndarray:
|
| 405 |
vqvae_model = _model_cache["vqvae_model"]
|
| 406 |
mean, std = _model_cache["stats"]
|
| 407 |
-
|
| 408 |
if vqvae_model is None or not tokens:
|
| 409 |
return np.zeros((0, SMPL_DIM), dtype=np.float32)
|
| 410 |
-
|
| 411 |
idx = torch.tensor(tokens, dtype=torch.long, device=DEVICE).unsqueeze(0)
|
| 412 |
T_q = idx.shape[1]
|
| 413 |
quantizer = vqvae_model.vqvae.quantizer
|
| 414 |
-
|
| 415 |
if hasattr(quantizer, "codebook"):
|
| 416 |
codebook = quantizer.codebook.to(DEVICE)
|
| 417 |
code_dim = codebook.shape[1]
|
| 418 |
else:
|
| 419 |
code_dim = CODE_DIM
|
| 420 |
-
|
| 421 |
x_quantized = None
|
| 422 |
if hasattr(quantizer, "dequantize"):
|
| 423 |
try:
|
|
@@ -431,47 +386,55 @@ def decode_tokens_to_params(tokens: list) -> np.ndarray:
|
|
| 431 |
x_quantized = dq.permute(0, 2, 1).contiguous()
|
| 432 |
except Exception:
|
| 433 |
pass
|
| 434 |
-
|
| 435 |
if x_quantized is None:
|
| 436 |
if not hasattr(quantizer, "codebook"):
|
| 437 |
return np.zeros((0, SMPL_DIM), dtype=np.float32)
|
| 438 |
with torch.no_grad():
|
| 439 |
emb = codebook[idx]
|
| 440 |
x_quantized = emb.permute(0, 2, 1).contiguous()
|
| 441 |
-
|
| 442 |
with torch.no_grad():
|
| 443 |
x_dec = vqvae_model.vqvae.decoder(x_quantized)
|
| 444 |
smpl_out = vqvae_model.vqvae.postprocess(x_dec)
|
| 445 |
params_np = smpl_out.squeeze(0).cpu().numpy()
|
| 446 |
-
|
| 447 |
if (mean is not None) and (std is not None):
|
| 448 |
params_np = (params_np * np.array(std).reshape(1, -1)) + np.array(mean).reshape(1, -1)
|
| 449 |
-
|
| 450 |
-
return params_np
|
| 451 |
-
|
| 452 |
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
|
|
|
|
|
|
| 458 |
starts = np.cumsum([0] + PARAM_DIMS[:-1])
|
| 459 |
ends = starts + np.array(PARAM_DIMS)
|
|
|
|
| 460 |
T = params_seq.shape[0]
|
| 461 |
all_verts = []
|
| 462 |
-
|
|
|
|
| 463 |
num_body_joints = getattr(smplx_model, "NUM_BODY_JOINTS", 21)
|
| 464 |
|
| 465 |
with torch.no_grad():
|
| 466 |
for s in range(0, T, batch_size):
|
| 467 |
-
batch = params_seq[s:s+batch_size]
|
| 468 |
B = batch.shape[0]
|
| 469 |
|
| 470 |
-
|
| 471 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
|
| 473 |
-
# Handle body pose
|
| 474 |
-
# root_pose is separate as global_orient (3 dims)
|
| 475 |
body_t = tensor_parts['body_pose']
|
| 476 |
L_body = body_t.shape[1]
|
| 477 |
expected_no_go = num_body_joints * 3
|
|
@@ -484,24 +447,67 @@ def params_to_vertices(params_seq: np.ndarray) -> tuple:
|
|
| 484 |
global_orient = torch.zeros((B, 3), dtype=torch.float32, device=DEVICE)
|
| 485 |
body_pose_only = body_t
|
| 486 |
else:
|
|
|
|
| 487 |
if L_body > expected_no_go:
|
| 488 |
global_orient = body_t[:, :3].contiguous()
|
| 489 |
body_pose_only = body_t[:, 3:].contiguous()
|
| 490 |
else:
|
| 491 |
-
|
|
|
|
| 492 |
global_orient = torch.zeros((B, 3), dtype=torch.float32, device=DEVICE)
|
| 493 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
out = smplx_model(
|
| 495 |
-
betas=tensor_parts['betas'],
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 500 |
)
|
| 501 |
-
|
|
|
|
|
|
|
| 502 |
|
| 503 |
-
|
| 504 |
-
|
|
|
|
|
|
|
| 505 |
# =====================================================================
|
| 506 |
# PyRender Visualization Functions
|
| 507 |
# =====================================================================
|
|
@@ -520,20 +526,15 @@ def render_single_frame(
|
|
| 520 |
"""Render a single mesh frame using PyRender."""
|
| 521 |
if not PYRENDER_AVAILABLE:
|
| 522 |
raise RuntimeError("PyRender not available")
|
| 523 |
-
|
| 524 |
# Check for invalid vertices
|
| 525 |
if not np.isfinite(verts).all():
|
| 526 |
blank = np.ones((frame_height, frame_width, 3), dtype=np.uint8) * 200
|
| 527 |
return blank
|
| 528 |
-
|
| 529 |
-
# IMPORTANT: Rotate mesh 180 degrees around X-axis (like visualize.py)
|
| 530 |
-
# This fixes the coordinate system so we view from the front
|
| 531 |
-
rot_matrix = trimesh.transformations.rotation_matrix(np.radians(180), [1, 0, 0])
|
| 532 |
-
verts_rotated = np.dot(verts, rot_matrix[:3, :3].T)
|
| 533 |
-
|
| 534 |
# Create scene
|
| 535 |
scene = pyrender.Scene(bg_color=bg_color, ambient_light=[0.4, 0.4, 0.4])
|
| 536 |
-
|
| 537 |
# Material
|
| 538 |
material = pyrender.MetallicRoughnessMaterial(
|
| 539 |
metallicFactor=0.0,
|
|
@@ -541,29 +542,31 @@ def render_single_frame(
|
|
| 541 |
alphaMode='OPAQUE',
|
| 542 |
baseColorFactor=color
|
| 543 |
)
|
| 544 |
-
|
| 545 |
-
# Create mesh
|
| 546 |
-
mesh = trimesh.Trimesh(vertices=
|
| 547 |
mesh_render = pyrender.Mesh.from_trimesh(mesh, material=material, smooth=True)
|
| 548 |
scene.add(mesh_render)
|
| 549 |
-
|
| 550 |
-
# Compute center for camera positioning
|
| 551 |
-
mesh_center =
|
| 552 |
camera_target = fixed_center if fixed_center is not None else mesh_center
|
| 553 |
-
|
| 554 |
# Camera setup
|
| 555 |
camera = pyrender.IntrinsicsCamera(
|
| 556 |
fx=focal_length, fy=focal_length,
|
| 557 |
cx=frame_width / 2, cy=frame_height / 2,
|
| 558 |
znear=0.1, zfar=20.0
|
| 559 |
)
|
| 560 |
-
|
| 561 |
-
|
|
|
|
|
|
|
| 562 |
camera_pose = np.eye(4)
|
| 563 |
camera_pose[0, 3] = camera_target[0] # Center X
|
| 564 |
camera_pose[1, 3] = camera_target[1] # Center Y (body center)
|
| 565 |
camera_pose[2, 3] = camera_target[2] - camera_distance # In front (negative Z)
|
| 566 |
-
|
| 567 |
# Camera orientation: flip to look at subject (SOKE-style)
|
| 568 |
# This rotation makes camera look toward +Z (at the subject)
|
| 569 |
camera_pose[:3, :3] = np.array([
|
|
@@ -571,49 +574,47 @@ def render_single_frame(
|
|
| 571 |
[0, -1, 0],
|
| 572 |
[0, 0, -1]
|
| 573 |
])
|
| 574 |
-
|
| 575 |
scene.add(camera, pose=camera_pose)
|
| 576 |
-
|
| 577 |
# Lighting
|
| 578 |
key_light = pyrender.DirectionalLight(color=[1.0, 1.0, 1.0], intensity=3.0)
|
| 579 |
key_pose = np.eye(4)
|
| 580 |
key_pose[:3, :3] = trimesh.transformations.euler_matrix(np.radians(-30), np.radians(-20), 0)[:3, :3]
|
| 581 |
scene.add(key_light, pose=key_pose)
|
| 582 |
-
|
| 583 |
fill_light = pyrender.DirectionalLight(color=[0.9, 0.9, 1.0], intensity=1.5)
|
| 584 |
fill_pose = np.eye(4)
|
| 585 |
fill_pose[:3, :3] = trimesh.transformations.euler_matrix(np.radians(-20), np.radians(30), 0)[:3, :3]
|
| 586 |
scene.add(fill_light, pose=fill_pose)
|
| 587 |
-
|
| 588 |
rim_light = pyrender.DirectionalLight(color=[1.0, 1.0, 1.0], intensity=2.0)
|
| 589 |
rim_pose = np.eye(4)
|
| 590 |
rim_pose[:3, :3] = trimesh.transformations.euler_matrix(np.radians(30), np.radians(180), 0)[:3, :3]
|
| 591 |
scene.add(rim_light, pose=rim_pose)
|
| 592 |
-
|
| 593 |
# Render
|
| 594 |
renderer = pyrender.OffscreenRenderer(viewport_width=frame_width, viewport_height=frame_height, point_size=1.0)
|
| 595 |
color_img, _ = renderer.render(scene)
|
| 596 |
renderer.delete()
|
| 597 |
-
|
| 598 |
# Add label
|
| 599 |
if label:
|
| 600 |
img = Image.fromarray(color_img)
|
| 601 |
draw = ImageDraw.Draw(img)
|
| 602 |
-
|
| 603 |
try:
|
| 604 |
font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 20)
|
| 605 |
except:
|
| 606 |
font = ImageFont.load_default()
|
| 607 |
-
|
| 608 |
text_width = len(label) * 10 + 20
|
| 609 |
draw.rectangle([10, 10, 10 + text_width, 35], fill=(0, 0, 0, 180))
|
| 610 |
draw.text((15, 12), label, fill=(255, 255, 255), font=font)
|
| 611 |
-
|
| 612 |
-
color_img = np.array(img)
|
| 613 |
-
|
| 614 |
-
return color_img
|
| 615 |
|
|
|
|
| 616 |
|
|
|
|
| 617 |
def render_side_by_side_frame(
|
| 618 |
verts_list: list,
|
| 619 |
faces: np.ndarray,
|
|
@@ -628,20 +629,20 @@ def render_side_by_side_frame(
|
|
| 628 |
"""Render multiple meshes side-by-side for comparison."""
|
| 629 |
if not PYRENDER_AVAILABLE:
|
| 630 |
raise RuntimeError("PyRender not available")
|
| 631 |
-
|
| 632 |
# Colors for each avatar
|
| 633 |
colors = [
|
| 634 |
(0.3, 0.8, 0.4, 1.0), # Green
|
| 635 |
(0.3, 0.6, 0.9, 1.0), # Blue
|
| 636 |
(0.9, 0.5, 0.2, 1.0), # Orange
|
| 637 |
]
|
| 638 |
-
|
| 639 |
frames = []
|
| 640 |
for i, verts in enumerate(verts_list):
|
| 641 |
fixed_center = fixed_centers[i] if fixed_centers else None
|
| 642 |
color = colors[i % len(colors)]
|
| 643 |
label = labels[i] if i < len(labels) else ""
|
| 644 |
-
|
| 645 |
frame = render_single_frame(
|
| 646 |
verts, faces, label=label, color=color,
|
| 647 |
fixed_center=fixed_center, camera_distance=camera_distance,
|
|
@@ -649,10 +650,8 @@ def render_side_by_side_frame(
|
|
| 649 |
frame_height=frame_height, bg_color=bg_color
|
| 650 |
)
|
| 651 |
frames.append(frame)
|
| 652 |
-
|
| 653 |
-
return np.concatenate(frames, axis=1)
|
| 654 |
-
|
| 655 |
|
|
|
|
| 656 |
def render_video(
|
| 657 |
verts: np.ndarray,
|
| 658 |
faces: np.ndarray,
|
|
@@ -668,17 +667,19 @@ def render_video(
|
|
| 668 |
"""Render single avatar animation to video."""
|
| 669 |
if not ensure_pyrender():
|
| 670 |
raise RuntimeError("PyRender not available")
|
| 671 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 672 |
# Trim last few frames to remove end-of-sequence artifacts
|
| 673 |
T_total = verts.shape[0]
|
| 674 |
trim_amount = min(8, int(T_total * 0.15))
|
| 675 |
T = max(5, T_total - trim_amount)
|
| 676 |
-
|
| 677 |
-
# Compute fixed camera target from first frame
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
fixed_center = verts_rotated_first.mean(axis=0)
|
| 681 |
-
|
| 682 |
frames = []
|
| 683 |
for t in range(T):
|
| 684 |
frame = render_single_frame(
|
|
@@ -689,16 +690,14 @@ def render_video(
|
|
| 689 |
)
|
| 690 |
for _ in range(slowdown):
|
| 691 |
frames.append(frame)
|
| 692 |
-
|
| 693 |
# Save video
|
| 694 |
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
| 695 |
-
|
| 696 |
if len(frames) > 0:
|
| 697 |
imageio.mimsave(output_path, frames, fps=fps, codec='libx264', quality=8)
|
| 698 |
-
|
| 699 |
-
return output_path
|
| 700 |
-
|
| 701 |
|
|
|
|
| 702 |
def render_comparison_video(
|
| 703 |
verts1: np.ndarray,
|
| 704 |
faces1: np.ndarray,
|
|
@@ -717,24 +716,27 @@ def render_comparison_video(
|
|
| 717 |
"""Render side-by-side comparison video."""
|
| 718 |
if not ensure_pyrender():
|
| 719 |
raise RuntimeError("PyRender not available")
|
| 720 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 721 |
# Match lengths and trim
|
| 722 |
T_total = min(verts1.shape[0], verts2.shape[0])
|
| 723 |
trim_amount = min(8, int(T_total * 0.15))
|
| 724 |
T = max(5, T_total - trim_amount)
|
| 725 |
-
|
| 726 |
verts1 = verts1[:T]
|
| 727 |
verts2 = verts2[:T]
|
| 728 |
-
|
| 729 |
-
# Compute fixed camera targets
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
fixed_center1 = verts1_rotated_first.mean(axis=0)
|
| 734 |
-
fixed_center2 = verts2_rotated_first.mean(axis=0)
|
| 735 |
-
|
| 736 |
labels = [label1, label2]
|
| 737 |
-
|
| 738 |
frames = []
|
| 739 |
for t in range(T):
|
| 740 |
frame = render_side_by_side_frame(
|
|
@@ -745,15 +747,14 @@ def render_comparison_video(
|
|
| 745 |
)
|
| 746 |
for _ in range(slowdown):
|
| 747 |
frames.append(frame)
|
| 748 |
-
|
| 749 |
# Save video
|
| 750 |
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
| 751 |
-
|
| 752 |
if len(frames) > 0:
|
| 753 |
imageio.mimsave(output_path, frames, fps=fps, codec='libx264', quality=8)
|
| 754 |
-
|
| 755 |
-
return output_path
|
| 756 |
|
|
|
|
| 757 |
# =====================================================================
|
| 758 |
# Main Processing Functions
|
| 759 |
# =====================================================================
|
|
@@ -761,80 +762,74 @@ def generate_verts_for_word(word: str, pid: str) -> tuple:
|
|
| 761 |
"""Generate vertices and faces for a word-PID pair."""
|
| 762 |
generated_tokens = generate_motion_tokens(word, pid)
|
| 763 |
token_ids = parse_motion_tokens(generated_tokens)
|
| 764 |
-
|
| 765 |
if not token_ids:
|
| 766 |
return None, None, generated_tokens
|
| 767 |
-
|
| 768 |
if _model_cache["vqvae_model"] is None or _model_cache["smplx_model"] is None:
|
| 769 |
return None, None, generated_tokens
|
| 770 |
-
|
| 771 |
params = decode_tokens_to_params(token_ids)
|
| 772 |
if params.shape[0] == 0:
|
| 773 |
return None, None, generated_tokens
|
| 774 |
-
|
| 775 |
verts, faces = params_to_vertices(params)
|
| 776 |
return verts, faces, generated_tokens
|
| 777 |
-
|
| 778 |
-
|
| 779 |
def generate_video_for_word(word: str, pid: str) -> tuple:
|
| 780 |
"""Generate video and tokens for a word. Returns (video_path, tokens)."""
|
| 781 |
verts, faces, tokens = generate_verts_for_word(word, pid)
|
| 782 |
-
|
| 783 |
if verts is None:
|
| 784 |
return None, tokens
|
| 785 |
-
|
| 786 |
# Generate unique filename
|
| 787 |
video_filename = f"motion_{word}_{pid}_{uuid.uuid4().hex[:8]}.mp4"
|
| 788 |
video_path = os.path.join(OUTPUT_DIR, video_filename)
|
| 789 |
-
|
| 790 |
render_video(verts, faces, video_path, label=f"{pid}")
|
| 791 |
return video_path, tokens
|
| 792 |
-
|
| 793 |
-
|
| 794 |
def process_word(word: str):
|
| 795 |
"""Main processing: generate side-by-side comparison video for two random PIDs."""
|
| 796 |
if not word or not word.strip():
|
| 797 |
return None, ""
|
| 798 |
-
|
| 799 |
word = word.strip().lower()
|
| 800 |
-
|
| 801 |
pids = get_random_pids_for_word(word, 2)
|
| 802 |
-
|
| 803 |
if not pids:
|
| 804 |
return None, f"Word '{word}' not found in dataset"
|
| 805 |
-
|
| 806 |
if len(pids) == 1:
|
| 807 |
pids = [pids[0], pids[0]]
|
| 808 |
-
|
| 809 |
try:
|
| 810 |
verts1, faces1, tokens1 = generate_verts_for_word(word, pids[0])
|
| 811 |
verts2, faces2, tokens2 = generate_verts_for_word(word, pids[1])
|
| 812 |
-
|
| 813 |
if verts1 is None and verts2 is None:
|
| 814 |
return None, tokens1 or tokens2 or "Failed to generate motion"
|
| 815 |
-
|
| 816 |
# Generate unique filename
|
| 817 |
video_filename = f"comparison_{word}_{uuid.uuid4().hex[:8]}.mp4"
|
| 818 |
video_path = os.path.join(OUTPUT_DIR, video_filename)
|
| 819 |
-
|
| 820 |
if verts1 is None:
|
| 821 |
render_video(verts2, faces2, video_path, label=pids[1])
|
| 822 |
return video_path, tokens2
|
| 823 |
if verts2 is None:
|
| 824 |
render_video(verts1, faces1, video_path, label=pids[0])
|
| 825 |
return video_path, tokens1
|
| 826 |
-
|
| 827 |
render_comparison_video(
|
| 828 |
verts1, faces1, verts2, faces2, video_path,
|
| 829 |
label1=pids[0], label2=pids[1]
|
| 830 |
)
|
| 831 |
combined_tokens = f"[{pids[0]}] {tokens1}\n\n[{pids[1]}] {tokens2}"
|
| 832 |
return video_path, combined_tokens
|
| 833 |
-
|
| 834 |
except Exception as e:
|
| 835 |
return None, f"Error: {str(e)[:100]}"
|
| 836 |
-
|
| 837 |
-
|
| 838 |
def get_example_video(word: str, pid: str):
|
| 839 |
"""Get pre-computed example video."""
|
| 840 |
key = f"{word}_{pid}"
|
|
@@ -843,65 +838,67 @@ def get_example_video(word: str, pid: str):
|
|
| 843 |
return cached.get("video_path"), cached.get("tokens", "")
|
| 844 |
video_path, tokens = generate_video_for_word(word, pid)
|
| 845 |
return video_path, tokens
|
| 846 |
-
|
| 847 |
# =====================================================================
|
| 848 |
# Gradio Interface
|
| 849 |
# =====================================================================
|
| 850 |
def create_gradio_interface():
|
| 851 |
-
|
| 852 |
custom_css = """
|
| 853 |
.gradio-container { max-width: 1400px !important; }
|
| 854 |
-
.example-row { margin-top: 15px; padding: 12px; background:
|
|
|
|
| 855 |
.example-word-label {
|
| 856 |
text-align: center;
|
| 857 |
font-size: 28px !important;
|
| 858 |
font-weight: bold !important;
|
| 859 |
-
color:
|
|
|
|
| 860 |
margin: 10px 0 !important;
|
| 861 |
padding: 10px !important;
|
| 862 |
}
|
| 863 |
.example-variant-label {
|
| 864 |
text-align: center;
|
| 865 |
font-size: 14px !important;
|
| 866 |
-
color:
|
|
|
|
| 867 |
margin-bottom: 10px !important;
|
| 868 |
}
|
| 869 |
"""
|
| 870 |
-
|
| 871 |
example_list = list(_example_cache.values()) if _example_cache else []
|
| 872 |
-
|
| 873 |
with gr.Blocks(title="SignMotionGPT", css=custom_css, theme=gr.themes.Default()) as demo:
|
| 874 |
-
|
| 875 |
gr.Markdown("# SignMotionGPT Demo")
|
| 876 |
gr.Markdown("Text-to-Sign Language Motion Generation with Variant Comparison")
|
| 877 |
gr.Markdown("*High-quality PyRender visualization with proper hand motion rendering*")
|
| 878 |
-
|
| 879 |
with gr.Row():
|
| 880 |
with gr.Column(scale=1, min_width=280):
|
| 881 |
gr.Markdown("### Input")
|
| 882 |
-
|
| 883 |
word_input = gr.Textbox(
|
| 884 |
label="Word",
|
| 885 |
placeholder="Enter a word from the dataset...",
|
| 886 |
lines=1, max_lines=1
|
| 887 |
)
|
| 888 |
-
|
| 889 |
generate_btn = gr.Button("Generate Motion", variant="primary", size="lg")
|
| 890 |
-
|
| 891 |
gr.Markdown("---")
|
| 892 |
gr.Markdown("### Generated Tokens")
|
| 893 |
-
|
| 894 |
tokens_output = gr.Textbox(
|
| 895 |
label="Motion Tokens (both variants)",
|
| 896 |
lines=8,
|
| 897 |
interactive=False,
|
| 898 |
show_copy_button=True
|
| 899 |
)
|
| 900 |
-
|
| 901 |
if _word_pid_map:
|
| 902 |
sample_words = list(_word_pid_map.keys())[:10]
|
| 903 |
gr.Markdown(f"**Available words:** {', '.join(sample_words)}, ...")
|
| 904 |
-
|
| 905 |
with gr.Column(scale=2, min_width=700):
|
| 906 |
gr.Markdown("### Motion Comparison (Two Signer Variants)")
|
| 907 |
video_output = gr.Video(
|
|
@@ -909,11 +906,11 @@ def create_gradio_interface():
|
|
| 909 |
autoplay=True,
|
| 910 |
show_download_button=True
|
| 911 |
)
|
| 912 |
-
|
| 913 |
if example_list:
|
| 914 |
gr.Markdown("---")
|
| 915 |
gr.Markdown("### Pre-computed Examples")
|
| 916 |
-
|
| 917 |
for item in example_list:
|
| 918 |
word, pid = item['word'], item['pid']
|
| 919 |
with gr.Row(elem_classes="example-row"):
|
|
@@ -921,37 +918,36 @@ def create_gradio_interface():
|
|
| 921 |
gr.HTML(f'<div class="example-word-label">{word.upper()}</div>')
|
| 922 |
gr.HTML(f'<div class="example-variant-label">Variant: {pid}</div>')
|
| 923 |
example_btn = gr.Button("Load Example", size="sm", variant="secondary")
|
| 924 |
-
|
| 925 |
with gr.Column(scale=3, min_width=500):
|
| 926 |
example_video = gr.Video(
|
| 927 |
label=f"Example: {word}",
|
| 928 |
autoplay=False,
|
| 929 |
show_download_button=True
|
| 930 |
)
|
| 931 |
-
|
| 932 |
example_btn.click(
|
| 933 |
fn=lambda w=word, p=pid: get_example_video(w, p),
|
| 934 |
inputs=[],
|
| 935 |
outputs=[example_video, tokens_output]
|
| 936 |
)
|
| 937 |
-
|
| 938 |
gr.Markdown("---")
|
| 939 |
gr.Markdown("*SignMotionGPT: LLM-based sign language motion generation with PyRender visualization*")
|
| 940 |
-
|
| 941 |
generate_btn.click(
|
| 942 |
fn=process_word,
|
| 943 |
inputs=[word_input],
|
| 944 |
outputs=[video_output, tokens_output]
|
| 945 |
)
|
| 946 |
-
|
| 947 |
word_input.submit(
|
| 948 |
fn=process_word,
|
| 949 |
inputs=[word_input],
|
| 950 |
outputs=[video_output, tokens_output]
|
| 951 |
)
|
| 952 |
-
|
| 953 |
-
return demo
|
| 954 |
|
|
|
|
| 955 |
# =====================================================================
|
| 956 |
# Main Entry Point for HuggingFace Spaces
|
| 957 |
# =====================================================================
|
|
@@ -965,20 +961,16 @@ print(f"Output Directory: {OUTPUT_DIR}")
|
|
| 965 |
print(f"Dataset: {DATASET_PATH}")
|
| 966 |
print(f"PyRender Available: {PYRENDER_AVAILABLE}")
|
| 967 |
print("="*60 + "\n")
|
| 968 |
-
|
| 969 |
# Initialize models at startup
|
| 970 |
initialize_models()
|
| 971 |
-
|
| 972 |
# Pre-compute example animations
|
| 973 |
precompute_examples()
|
| 974 |
-
|
| 975 |
# Create and launch interface
|
| 976 |
demo = create_gradio_interface()
|
| 977 |
-
|
| 978 |
if __name__ == "__main__":
|
| 979 |
# Launch with settings for HuggingFace Spaces
|
| 980 |
demo.launch(
|
| 981 |
server_name="0.0.0.0",
|
| 982 |
server_port=7860,
|
| 983 |
share=False
|
| 984 |
-
)
|
|
|
|
| 6 |
# IMPORTANT: Set OpenGL platform BEFORE any OpenGL imports (for headless rendering)
|
| 7 |
import os
|
| 8 |
os.environ["PYOPENGL_PLATFORM"] = "egl"
|
|
|
|
| 9 |
import sys
|
| 10 |
import re
|
| 11 |
import json
|
|
|
|
| 14 |
import tempfile
|
| 15 |
import uuid
|
| 16 |
from pathlib import Path
|
|
|
|
| 17 |
import torch
|
| 18 |
import numpy as np
|
|
|
|
| 19 |
warnings.filterwarnings("ignore")
|
|
|
|
| 20 |
# =====================================================================
|
| 21 |
# Configuration for HuggingFace Spaces
|
| 22 |
# =====================================================================
|
|
|
|
| 25 |
OUTPUT_DIR = os.path.join(WORK_DIR, "outputs")
|
| 26 |
os.makedirs(DATA_DIR, exist_ok=True)
|
| 27 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
|
|
| 28 |
# Path definitions
|
| 29 |
DATASET_PATH = os.path.join(DATA_DIR, "motion_llm_dataset.json")
|
| 30 |
VQVAE_CHECKPOINT = os.path.join(DATA_DIR, "vqvae_model.pt")
|
| 31 |
STATS_PATH = os.path.join(DATA_DIR, "vqvae_stats.pt")
|
| 32 |
SMPLX_MODEL_DIR = os.path.join(DATA_DIR, "smplx_models")
|
|
|
|
| 33 |
# HuggingFace model config
|
| 34 |
HF_REPO_ID = os.environ.get("HF_REPO_ID", "rdz-falcon/SignMotionGPTfit-archive")
|
| 35 |
HF_SUBFOLDER = os.environ.get("HF_SUBFOLDER", "stage2_v2/epoch-030")
|
|
|
|
| 36 |
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
| 37 |
# Generation parameters
|
| 38 |
M_START = "<M_START>"
|
| 39 |
M_END = "<M_END>"
|
|
|
|
| 41 |
INFERENCE_TEMPERATURE = 0.7
|
| 42 |
INFERENCE_TOP_K = 50
|
| 43 |
INFERENCE_REPETITION_PENALTY = 1.2
|
|
|
|
| 44 |
# VQ-VAE parameters
|
| 45 |
SMPL_DIM = 182
|
| 46 |
CODEBOOK_SIZE = 512
|
|
|
|
| 49 |
width=512, depth=3, down_t=2, stride_t=2,
|
| 50 |
dilation_growth_rate=3, activation='relu', norm=None, quantizer="ema_reset"
|
| 51 |
)
|
|
|
|
| 52 |
PARAM_DIMS = [10, 63, 45, 45, 3, 10, 3, 3]
|
| 53 |
+
PARAM_NAMES = ["betas", "body_pose", "left_hand_pose", "right_hand_pose",
|
| 54 |
+
"trans", "expression", "jaw_pose", "eye_pose"]
|
|
|
|
| 55 |
# Visualization defaults
|
| 56 |
AVATAR_COLOR = (0.36, 0.78, 0.36, 1.0) # Green color as RGBA
|
| 57 |
VIDEO_FPS = 15
|
| 58 |
VIDEO_SLOWDOWN = 2
|
| 59 |
FRAME_WIDTH = 544 # Must be divisible by 16 for video codec compatibility
|
| 60 |
FRAME_HEIGHT = 720
|
|
|
|
| 61 |
# =====================================================================
|
| 62 |
# Install/Import Dependencies
|
| 63 |
# =====================================================================
|
|
|
|
| 66 |
except ImportError:
|
| 67 |
os.system("pip install -q gradio>=4.0.0")
|
| 68 |
import gradio as gr
|
|
|
|
| 69 |
try:
|
| 70 |
import smplx
|
| 71 |
except ImportError:
|
| 72 |
os.system("pip install -q smplx==0.1.28")
|
| 73 |
import smplx
|
|
|
|
| 74 |
# PyRender for high-quality rendering
|
| 75 |
PYRENDER_AVAILABLE = False
|
| 76 |
try:
|
|
|
|
| 80 |
PYRENDER_AVAILABLE = True
|
| 81 |
except ImportError:
|
| 82 |
pass
|
|
|
|
| 83 |
try:
|
| 84 |
import imageio
|
| 85 |
except ImportError:
|
| 86 |
os.system("pip install -q imageio[ffmpeg]")
|
| 87 |
import imageio
|
|
|
|
| 88 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 89 |
import torch.nn.functional as F
|
|
|
|
| 90 |
# =====================================================================
|
| 91 |
# Import VQ-VAE architecture
|
| 92 |
# =====================================================================
|
|
|
|
| 96 |
sys.path.insert(0, parent_dir)
|
| 97 |
if current_dir not in sys.path:
|
| 98 |
sys.path.insert(0, current_dir)
|
|
|
|
| 99 |
try:
|
| 100 |
from mGPT.archs.mgpt_vq import VQVae
|
| 101 |
except ImportError as e:
|
| 102 |
print(f"Warning: Could not import VQVae: {e}")
|
| 103 |
VQVae = None
|
|
|
|
| 104 |
# =====================================================================
|
| 105 |
# Global Cache
|
| 106 |
# =====================================================================
|
|
|
|
| 112 |
"stats": (None, None),
|
| 113 |
"initialized": False
|
| 114 |
}
|
|
|
|
| 115 |
_word_pid_map = {}
|
| 116 |
_example_cache = {}
|
|
|
|
| 117 |
# =====================================================================
|
| 118 |
# PyRender Setup
|
| 119 |
# =====================================================================
|
|
|
|
| 122 |
global PYRENDER_AVAILABLE, trimesh, pyrender, Image, ImageDraw, ImageFont
|
| 123 |
if PYRENDER_AVAILABLE:
|
| 124 |
return True
|
| 125 |
+
|
| 126 |
print("Installing pyrender dependencies...")
|
| 127 |
if os.path.exists("/etc/debian_version"):
|
| 128 |
os.system("apt-get update -qq && apt-get install -qq -y libegl1-mesa-dev libgles2-mesa-dev > /dev/null 2>&1")
|
| 129 |
os.system("pip install -q trimesh pyrender PyOpenGL PyOpenGL_accelerate Pillow")
|
| 130 |
+
|
| 131 |
try:
|
| 132 |
import trimesh
|
| 133 |
import pyrender
|
|
|
|
| 137 |
except ImportError as e:
|
| 138 |
print(f"Could not install pyrender: {e}")
|
| 139 |
return False
|
|
|
|
| 140 |
# =====================================================================
|
| 141 |
# Dataset Loading - Word to PID mapping
|
| 142 |
# =====================================================================
|
| 143 |
def load_word_pid_mapping():
|
| 144 |
"""Load the dataset and build word -> PIDs mapping."""
|
| 145 |
global _word_pid_map
|
| 146 |
+
|
| 147 |
if not os.path.exists(DATASET_PATH):
|
| 148 |
print(f"Dataset not found: {DATASET_PATH}")
|
| 149 |
return
|
| 150 |
+
|
| 151 |
print(f"Loading dataset from: {DATASET_PATH}")
|
| 152 |
try:
|
| 153 |
with open(DATASET_PATH, 'r', encoding='utf-8') as f:
|
| 154 |
data = json.load(f)
|
| 155 |
+
|
| 156 |
for entry in data:
|
| 157 |
word = entry.get('word', '').lower()
|
| 158 |
pid = entry.get('participant_id', '')
|
|
|
|
| 160 |
if word not in _word_pid_map:
|
| 161 |
_word_pid_map[word] = set()
|
| 162 |
_word_pid_map[word].add(pid)
|
| 163 |
+
|
| 164 |
for word in _word_pid_map:
|
| 165 |
_word_pid_map[word] = sorted(list(_word_pid_map[word]))
|
| 166 |
+
|
| 167 |
print(f"Loaded {len(_word_pid_map)} unique words from dataset")
|
| 168 |
except Exception as e:
|
| 169 |
print(f"Error loading dataset: {e}")
|
|
|
|
|
|
|
| 170 |
def get_pids_for_word(word: str) -> list:
    """Return the participant IDs recorded for *word* (empty list if unknown)."""
    # Lookup keys in _word_pid_map are lowercase; normalize before the get.
    key = word.lower().strip()
    return _word_pid_map.get(key, [])
|
|
|
|
|
|
|
| 174 |
def get_random_pids_for_word(word: str, count: int = 2) -> list:
|
| 175 |
"""Get random PIDs for a word. Returns up to 'count' PIDs."""
|
| 176 |
pids = get_pids_for_word(word)
|
|
|
|
| 179 |
if len(pids) <= count:
|
| 180 |
return pids
|
| 181 |
return random.sample(pids, count)
|
|
|
|
|
|
|
| 182 |
def get_example_words_with_pids(count: int = 3) -> list:
    """Pick up to *count* (word, pid) example pairs for the demo UI.

    Curated words are tried first; when they do not yield enough examples,
    the remainder is topped up with randomly chosen dataset words. Each
    pair uses the word's first participant ID.
    """
    preferred = ['push', 'passport', 'library', 'send', 'college', 'help', 'thank', 'hello']
    examples = []

    # First pass: curated words that actually exist in the dataset.
    for candidate in preferred:
        pids = get_pids_for_word(candidate)
        if not pids:
            continue
        examples.append((candidate, pids[0]))
        if len(examples) >= count:
            break

    # Second pass: fill the shortfall with random, not-yet-used words.
    if len(examples) < count:
        used = {w for w, _ in examples}
        remaining = [w for w in _word_pid_map.keys() if w not in used]
        random.shuffle(remaining)
        for candidate in remaining[:count - len(examples)]:
            examples.append((candidate, _word_pid_map[candidate][0]))

    return examples
|
| 202 |
# =====================================================================
|
| 203 |
# VQ-VAE Wrapper
|
| 204 |
# =====================================================================
|
|
|
|
| 211 |
nfeats=smpl_dim, code_num=codebook_size, code_dim=code_dim,
|
| 212 |
output_emb_width=code_dim, **kwargs
|
| 213 |
)
|
|
|
|
| 214 |
# =====================================================================
|
| 215 |
# Model Loading Functions
|
| 216 |
# =====================================================================
|
| 217 |
def load_llm_model():
|
| 218 |
print(f"Loading LLM from: {HF_REPO_ID}/{HF_SUBFOLDER}")
|
| 219 |
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
|
| 220 |
+
|
| 221 |
tokenizer = AutoTokenizer.from_pretrained(
|
| 222 |
HF_REPO_ID, subfolder=HF_SUBFOLDER, trust_remote_code=True, token=token
|
| 223 |
)
|
|
|
|
| 233 |
model.eval()
|
| 234 |
print(f"LLM loaded (vocab size: {len(tokenizer)})")
|
| 235 |
return model, tokenizer
|
|
|
|
|
|
|
| 236 |
def load_vqvae_model():
|
| 237 |
if not os.path.exists(VQVAE_CHECKPOINT):
|
| 238 |
print(f"VQ-VAE checkpoint not found: {VQVAE_CHECKPOINT}")
|
|
|
|
| 245 |
model.eval()
|
| 246 |
print(f"VQ-VAE loaded")
|
| 247 |
return model
|
|
|
|
|
|
|
| 248 |
def load_stats():
|
| 249 |
if not os.path.exists(STATS_PATH):
|
| 250 |
return None, None
|
|
|
|
| 253 |
if torch.is_tensor(mean): mean = mean.cpu().numpy()
|
| 254 |
if torch.is_tensor(std): std = std.cpu().numpy()
|
| 255 |
return mean, std
|
|
|
|
|
|
|
| 256 |
def load_smplx_model():
|
| 257 |
if not os.path.exists(SMPLX_MODEL_DIR):
|
| 258 |
print(f"SMPL-X directory not found: {SMPLX_MODEL_DIR}")
|
|
|
|
| 266 |
).to(DEVICE)
|
| 267 |
print(f"SMPL-X loaded")
|
| 268 |
return model
|
|
|
|
|
|
|
| 269 |
def initialize_models():
    """Load every model into the module-level cache (idempotent).

    Loads the word->PID mapping and the LLM (both required), then the
    visualization stack (VQ-VAE, normalization stats, SMPL-X) on a
    best-effort basis, and finally verifies PyRender is importable.
    Sets ``_model_cache["initialized"]`` so repeat calls are no-ops.
    """
    global _model_cache
    # Idempotent: a previous successful call already populated the cache.
    if _model_cache["initialized"]:
        return

    print("\n" + "="*60)
    print(" Initializing SignMotionGPT Models")
    print("="*60)

    load_word_pid_mapping()

    # The LLM is mandatory; any failure here is allowed to propagate.
    _model_cache["llm_model"], _model_cache["llm_tokenizer"] = load_llm_model()

    # Visualization models are optional — token generation still works
    # without them — so failures are logged rather than raised.
    try:
        _model_cache["vqvae_model"] = load_vqvae_model()
        _model_cache["stats"] = load_stats()
        _model_cache["smplx_model"] = load_smplx_model()
    except Exception as e:
        print(f"Could not load visualization models: {e}")

    # Ensure PyRender is available
    ensure_pyrender()

    _model_cache["initialized"] = True
    print("All models initialized")
    print("="*60)
|
|
|
|
|
|
|
| 295 |
def precompute_examples():
|
| 296 |
"""Pre-compute animations for example words at startup."""
|
| 297 |
global _example_cache
|
| 298 |
+
|
| 299 |
if not _model_cache["initialized"]:
|
| 300 |
return
|
| 301 |
+
|
| 302 |
examples = get_example_words_with_pids(3)
|
| 303 |
+
|
| 304 |
print(f"\nPre-computing {len(examples)} example animations...")
|
| 305 |
+
|
| 306 |
for word, pid in examples:
|
| 307 |
key = f"{word}_{pid}"
|
| 308 |
print(f" Computing: {word} ({pid})...")
|
|
|
|
| 313 |
except Exception as e:
|
| 314 |
print(f" Failed: {word} - {e}")
|
| 315 |
_example_cache[key] = {"video_path": None, "tokens": "", "word": word, "pid": pid}
|
|
|
|
|
|
|
| 316 |
|
| 317 |
+
print("Example pre-computation complete\n")
|
| 318 |
# =====================================================================
|
| 319 |
# Motion Generation Functions
|
| 320 |
# =====================================================================
|
| 321 |
def generate_motion_tokens(word: str, variant: str) -> str:
|
| 322 |
model = _model_cache["llm_model"]
|
| 323 |
tokenizer = _model_cache["llm_tokenizer"]
|
| 324 |
+
|
| 325 |
if model is None or tokenizer is None:
|
| 326 |
raise RuntimeError("LLM model not loaded")
|
| 327 |
+
|
| 328 |
prompt = f"Instruction: Generate motion for word '{word}' with variant '{variant}'.\nMotion: "
|
| 329 |
inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
|
| 330 |
+
|
| 331 |
with torch.no_grad():
|
| 332 |
output = model.generate(
|
| 333 |
**inputs, max_new_tokens=100, do_sample=True,
|
|
|
|
| 337 |
eos_token_id=tokenizer.convert_tokens_to_ids(M_END),
|
| 338 |
early_stopping=True
|
| 339 |
)
|
| 340 |
+
|
| 341 |
decoded = tokenizer.decode(output[0], skip_special_tokens=False)
|
| 342 |
motion_part = decoded.split("Motion: ")[-1] if "Motion: " in decoded else decoded
|
| 343 |
return motion_part.strip()
|
|
|
|
|
|
|
| 344 |
def parse_motion_tokens(token_str: str) -> list:
    """Extract integer motion-token IDs from LLM output.

    Accepts either ``<M123>`` or ``<motion_123>`` style tokens (the first
    format that matches wins); an already-numeric sequence is converted to
    ints and passed through. Returns [] when nothing can be parsed.
    """
    # Already-numeric sequences pass straight through as ints.
    if isinstance(token_str, (list, tuple, np.ndarray)):
        return [int(v) for v in token_str]
    if not isinstance(token_str, str):
        return []

    # Try each known token format in order of preference.
    for pattern in (r'<M(\d+)>', r'<motion_(\d+)>'):
        found = re.findall(pattern, token_str)
        if found:
            return [int(v) for v in found]

    return []
|
| 359 |
def decode_tokens_to_params(tokens: list) -> np.ndarray:
|
| 360 |
vqvae_model = _model_cache["vqvae_model"]
|
| 361 |
mean, std = _model_cache["stats"]
|
| 362 |
+
|
| 363 |
if vqvae_model is None or not tokens:
|
| 364 |
return np.zeros((0, SMPL_DIM), dtype=np.float32)
|
| 365 |
+
|
| 366 |
idx = torch.tensor(tokens, dtype=torch.long, device=DEVICE).unsqueeze(0)
|
| 367 |
T_q = idx.shape[1]
|
| 368 |
quantizer = vqvae_model.vqvae.quantizer
|
| 369 |
+
|
| 370 |
if hasattr(quantizer, "codebook"):
|
| 371 |
codebook = quantizer.codebook.to(DEVICE)
|
| 372 |
code_dim = codebook.shape[1]
|
| 373 |
else:
|
| 374 |
code_dim = CODE_DIM
|
| 375 |
+
|
| 376 |
x_quantized = None
|
| 377 |
if hasattr(quantizer, "dequantize"):
|
| 378 |
try:
|
|
|
|
| 386 |
x_quantized = dq.permute(0, 2, 1).contiguous()
|
| 387 |
except Exception:
|
| 388 |
pass
|
| 389 |
+
|
| 390 |
if x_quantized is None:
|
| 391 |
if not hasattr(quantizer, "codebook"):
|
| 392 |
return np.zeros((0, SMPL_DIM), dtype=np.float32)
|
| 393 |
with torch.no_grad():
|
| 394 |
emb = codebook[idx]
|
| 395 |
x_quantized = emb.permute(0, 2, 1).contiguous()
|
| 396 |
+
|
| 397 |
with torch.no_grad():
|
| 398 |
x_dec = vqvae_model.vqvae.decoder(x_quantized)
|
| 399 |
smpl_out = vqvae_model.vqvae.postprocess(x_dec)
|
| 400 |
params_np = smpl_out.squeeze(0).cpu().numpy()
|
| 401 |
+
|
| 402 |
if (mean is not None) and (std is not None):
|
| 403 |
params_np = (params_np * np.array(std).reshape(1, -1)) + np.array(mean).reshape(1, -1)
|
|
|
|
|
|
|
|
|
|
| 404 |
|
| 405 |
+
return params_np
|
| 406 |
+
def params_to_vertices(params_seq: np.ndarray, smplx_model, batch_size=32) -> tuple:
|
| 407 |
+
"""
|
| 408 |
+
Convert SMPL-X parameters to 3D vertices.
|
| 409 |
+
FIXED: Properly handles jaw_pose and expression to prevent lip/mouth issues.
|
| 410 |
+
"""
|
| 411 |
+
# Compute parameter slicing indices
|
| 412 |
starts = np.cumsum([0] + PARAM_DIMS[:-1])
|
| 413 |
ends = starts + np.array(PARAM_DIMS)
|
| 414 |
+
|
| 415 |
T = params_seq.shape[0]
|
| 416 |
all_verts = []
|
| 417 |
+
|
| 418 |
+
# Infer number of body joints
|
| 419 |
num_body_joints = getattr(smplx_model, "NUM_BODY_JOINTS", 21)
|
| 420 |
|
| 421 |
with torch.no_grad():
|
| 422 |
for s in range(0, T, batch_size):
|
| 423 |
+
batch = params_seq[s:s+batch_size] # (B, SMPL_DIM)
|
| 424 |
B = batch.shape[0]
|
| 425 |
|
| 426 |
+
# Extract parameters
|
| 427 |
+
np_parts = {}
|
| 428 |
+
for name, st, ed in zip(PARAM_NAMES, starts, ends):
|
| 429 |
+
np_parts[name] = batch[:, st:ed].astype(np.float32)
|
| 430 |
+
|
| 431 |
+
# Convert to tensors
|
| 432 |
+
tensor_parts = {
|
| 433 |
+
name: torch.from_numpy(arr).to(DEVICE)
|
| 434 |
+
for name, arr in np_parts.items()
|
| 435 |
+
}
|
| 436 |
|
| 437 |
+
# Handle body pose (may or may not include global orient)
|
|
|
|
| 438 |
body_t = tensor_parts['body_pose']
|
| 439 |
L_body = body_t.shape[1]
|
| 440 |
expected_no_go = num_body_joints * 3
|
|
|
|
| 447 |
global_orient = torch.zeros((B, 3), dtype=torch.float32, device=DEVICE)
|
| 448 |
body_pose_only = body_t
|
| 449 |
else:
|
| 450 |
+
# Best-effort fallback
|
| 451 |
if L_body > expected_no_go:
|
| 452 |
global_orient = body_t[:, :3].contiguous()
|
| 453 |
body_pose_only = body_t[:, 3:].contiguous()
|
| 454 |
else:
|
| 455 |
+
pad_len = max(0, expected_no_go - L_body)
|
| 456 |
+
body_pose_only = F.pad(body_t, (0, pad_len))
|
| 457 |
global_orient = torch.zeros((B, 3), dtype=torch.float32, device=DEVICE)
|
| 458 |
|
| 459 |
+
# ✅ FIX: Ensure jaw_pose is properly shaped (should be B x 3)
|
| 460 |
+
jaw_pose = tensor_parts['jaw_pose']
|
| 461 |
+
if jaw_pose.shape[1] != 3:
|
| 462 |
+
print(f"Warning: jaw_pose has shape {jaw_pose.shape}, padding/trimming to (B, 3)")
|
| 463 |
+
if jaw_pose.shape[1] < 3:
|
| 464 |
+
jaw_pose = F.pad(jaw_pose, (0, 3 - jaw_pose.shape[1]))
|
| 465 |
+
else:
|
| 466 |
+
jaw_pose = jaw_pose[:, :3]
|
| 467 |
+
jaw_pose = jaw_pose.contiguous()
|
| 468 |
+
|
| 469 |
+
# ✅ FIX: Ensure expression is properly shaped (should be B x 10)
|
| 470 |
+
expression = tensor_parts['expression']
|
| 471 |
+
if expression.shape[1] != 10:
|
| 472 |
+
print(f"Warning: expression has shape {expression.shape}, padding/trimming to (B, 10)")
|
| 473 |
+
if expression.shape[1] < 10:
|
| 474 |
+
expression = F.pad(expression, (0, 10 - expression.shape[1]))
|
| 475 |
+
else:
|
| 476 |
+
expression = expression[:, :10]
|
| 477 |
+
expression = expression.contiguous()
|
| 478 |
+
|
| 479 |
+
# ✅ FIX: Ensure eye_pose is properly shaped (should be B x 3)
|
| 480 |
+
eye_pose = tensor_parts['eye_pose']
|
| 481 |
+
if eye_pose.shape[1] != 3:
|
| 482 |
+
print(f"Warning: eye_pose has shape {eye_pose.shape}, padding/trimming to (B, 3)")
|
| 483 |
+
if eye_pose.shape[1] < 3:
|
| 484 |
+
eye_pose = F.pad(eye_pose, (0, 3 - eye_pose.shape[1]))
|
| 485 |
+
else:
|
| 486 |
+
eye_pose = eye_pose[:, :3]
|
| 487 |
+
eye_pose = eye_pose.contiguous()
|
| 488 |
+
|
| 489 |
+
# Call SMPL-X with validated parameters
|
| 490 |
out = smplx_model(
|
| 491 |
+
betas=tensor_parts['betas'],
|
| 492 |
+
global_orient=global_orient,
|
| 493 |
+
body_pose=body_pose_only,
|
| 494 |
+
left_hand_pose=tensor_parts['left_hand_pose'],
|
| 495 |
+
right_hand_pose=tensor_parts['right_hand_pose'],
|
| 496 |
+
expression=expression, # ✅ Using validated expression
|
| 497 |
+
jaw_pose=jaw_pose, # ✅ Using validated jaw_pose
|
| 498 |
+
leye_pose=eye_pose, # ✅ Using validated eye_pose
|
| 499 |
+
reye_pose=eye_pose, # ✅ Using validated eye_pose
|
| 500 |
+
transl=tensor_parts['trans'],
|
| 501 |
+
return_verts=True
|
| 502 |
)
|
| 503 |
+
|
| 504 |
+
verts = out.vertices.detach().cpu().numpy() # (B, V, 3)
|
| 505 |
+
all_verts.append(verts)
|
| 506 |
|
| 507 |
+
verts_all = np.concatenate(all_verts, axis=0) # (T, V, 3)
|
| 508 |
+
faces = smplx_model.faces.astype(np.int32)
|
| 509 |
+
|
| 510 |
+
return verts_all, faces
|
| 511 |
# =====================================================================
|
| 512 |
# PyRender Visualization Functions
|
| 513 |
# =====================================================================
|
|
|
|
| 526 |
"""Render a single mesh frame using PyRender."""
|
| 527 |
if not PYRENDER_AVAILABLE:
|
| 528 |
raise RuntimeError("PyRender not available")
|
| 529 |
+
|
| 530 |
# Check for invalid vertices
|
| 531 |
if not np.isfinite(verts).all():
|
| 532 |
blank = np.ones((frame_height, frame_width, 3), dtype=np.uint8) * 200
|
| 533 |
return blank
|
| 534 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
# Create scene
|
| 536 |
scene = pyrender.Scene(bg_color=bg_color, ambient_light=[0.4, 0.4, 0.4])
|
| 537 |
+
|
| 538 |
# Material
|
| 539 |
material = pyrender.MetallicRoughnessMaterial(
|
| 540 |
metallicFactor=0.0,
|
|
|
|
| 542 |
alphaMode='OPAQUE',
|
| 543 |
baseColorFactor=color
|
| 544 |
)
|
| 545 |
+
|
| 546 |
+
# Create mesh
|
| 547 |
+
mesh = trimesh.Trimesh(vertices=verts, faces=faces)
|
| 548 |
mesh_render = pyrender.Mesh.from_trimesh(mesh, material=material, smooth=True)
|
| 549 |
scene.add(mesh_render)
|
| 550 |
+
|
| 551 |
+
# Compute center for camera positioning
|
| 552 |
+
mesh_center = verts.mean(axis=0)
|
| 553 |
camera_target = fixed_center if fixed_center is not None else mesh_center
|
| 554 |
+
|
| 555 |
# Camera setup
|
| 556 |
camera = pyrender.IntrinsicsCamera(
|
| 557 |
fx=focal_length, fy=focal_length,
|
| 558 |
cx=frame_width / 2, cy=frame_height / 2,
|
| 559 |
znear=0.1, zfar=20.0
|
| 560 |
)
|
| 561 |
+
|
| 562 |
+
# Camera pose: After 180-degree rotation around X-axis, coordinate system changes
|
| 563 |
+
# Camera should be positioned in front (negative Z) with flipped orientation
|
| 564 |
+
# This matches visualize.py and ensures proper face visibility
|
| 565 |
camera_pose = np.eye(4)
|
| 566 |
camera_pose[0, 3] = camera_target[0] # Center X
|
| 567 |
camera_pose[1, 3] = camera_target[1] # Center Y (body center)
|
| 568 |
camera_pose[2, 3] = camera_target[2] - camera_distance # In front (negative Z)
|
| 569 |
+
|
| 570 |
# Camera orientation: flip to look at subject (SOKE-style)
|
| 571 |
# This rotation makes camera look toward +Z (at the subject)
|
| 572 |
camera_pose[:3, :3] = np.array([
|
|
|
|
| 574 |
[0, -1, 0],
|
| 575 |
[0, 0, -1]
|
| 576 |
])
|
| 577 |
+
|
| 578 |
scene.add(camera, pose=camera_pose)
|
| 579 |
+
|
| 580 |
# Lighting
|
| 581 |
key_light = pyrender.DirectionalLight(color=[1.0, 1.0, 1.0], intensity=3.0)
|
| 582 |
key_pose = np.eye(4)
|
| 583 |
key_pose[:3, :3] = trimesh.transformations.euler_matrix(np.radians(-30), np.radians(-20), 0)[:3, :3]
|
| 584 |
scene.add(key_light, pose=key_pose)
|
| 585 |
+
|
| 586 |
fill_light = pyrender.DirectionalLight(color=[0.9, 0.9, 1.0], intensity=1.5)
|
| 587 |
fill_pose = np.eye(4)
|
| 588 |
fill_pose[:3, :3] = trimesh.transformations.euler_matrix(np.radians(-20), np.radians(30), 0)[:3, :3]
|
| 589 |
scene.add(fill_light, pose=fill_pose)
|
| 590 |
+
|
| 591 |
rim_light = pyrender.DirectionalLight(color=[1.0, 1.0, 1.0], intensity=2.0)
|
| 592 |
rim_pose = np.eye(4)
|
| 593 |
rim_pose[:3, :3] = trimesh.transformations.euler_matrix(np.radians(30), np.radians(180), 0)[:3, :3]
|
| 594 |
scene.add(rim_light, pose=rim_pose)
|
| 595 |
+
|
| 596 |
# Render
|
| 597 |
renderer = pyrender.OffscreenRenderer(viewport_width=frame_width, viewport_height=frame_height, point_size=1.0)
|
| 598 |
color_img, _ = renderer.render(scene)
|
| 599 |
renderer.delete()
|
| 600 |
+
|
| 601 |
# Add label
|
| 602 |
if label:
|
| 603 |
img = Image.fromarray(color_img)
|
| 604 |
draw = ImageDraw.Draw(img)
|
| 605 |
+
|
| 606 |
try:
|
| 607 |
font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 20)
|
| 608 |
except:
|
| 609 |
font = ImageFont.load_default()
|
| 610 |
+
|
| 611 |
text_width = len(label) * 10 + 20
|
| 612 |
draw.rectangle([10, 10, 10 + text_width, 35], fill=(0, 0, 0, 180))
|
| 613 |
draw.text((15, 12), label, fill=(255, 255, 255), font=font)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 614 |
|
| 615 |
+
color_img = np.array(img)
|
| 616 |
|
| 617 |
+
return color_img
|
| 618 |
def render_side_by_side_frame(
|
| 619 |
verts_list: list,
|
| 620 |
faces: np.ndarray,
|
|
|
|
| 629 |
"""Render multiple meshes side-by-side for comparison."""
|
| 630 |
if not PYRENDER_AVAILABLE:
|
| 631 |
raise RuntimeError("PyRender not available")
|
| 632 |
+
|
| 633 |
# Colors for each avatar
|
| 634 |
colors = [
|
| 635 |
(0.3, 0.8, 0.4, 1.0), # Green
|
| 636 |
(0.3, 0.6, 0.9, 1.0), # Blue
|
| 637 |
(0.9, 0.5, 0.2, 1.0), # Orange
|
| 638 |
]
|
| 639 |
+
|
| 640 |
frames = []
|
| 641 |
for i, verts in enumerate(verts_list):
|
| 642 |
fixed_center = fixed_centers[i] if fixed_centers else None
|
| 643 |
color = colors[i % len(colors)]
|
| 644 |
label = labels[i] if i < len(labels) else ""
|
| 645 |
+
|
| 646 |
frame = render_single_frame(
|
| 647 |
verts, faces, label=label, color=color,
|
| 648 |
fixed_center=fixed_center, camera_distance=camera_distance,
|
|
|
|
| 650 |
frame_height=frame_height, bg_color=bg_color
|
| 651 |
)
|
| 652 |
frames.append(frame)
|
|
|
|
|
|
|
|
|
|
| 653 |
|
| 654 |
+
return np.concatenate(frames, axis=1)
|
| 655 |
def render_video(
|
| 656 |
verts: np.ndarray,
|
| 657 |
faces: np.ndarray,
|
|
|
|
| 667 |
"""Render single avatar animation to video."""
|
| 668 |
if not ensure_pyrender():
|
| 669 |
raise RuntimeError("PyRender not available")
|
| 670 |
+
|
| 671 |
+
# Apply orientation fix: rotate 180 degrees around X-axis
|
| 672 |
+
verts = verts.copy()
|
| 673 |
+
verts[..., 1:] *= -1
|
| 674 |
+
|
| 675 |
# Trim last few frames to remove end-of-sequence artifacts
|
| 676 |
T_total = verts.shape[0]
|
| 677 |
trim_amount = min(8, int(T_total * 0.15))
|
| 678 |
T = max(5, T_total - trim_amount)
|
| 679 |
+
|
| 680 |
+
# Compute fixed camera target from first frame
|
| 681 |
+
fixed_center = verts[0].mean(axis=0)
|
| 682 |
+
|
|
|
|
|
|
|
| 683 |
frames = []
|
| 684 |
for t in range(T):
|
| 685 |
frame = render_single_frame(
|
|
|
|
| 690 |
)
|
| 691 |
for _ in range(slowdown):
|
| 692 |
frames.append(frame)
|
| 693 |
+
|
| 694 |
# Save video
|
| 695 |
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
| 696 |
+
|
| 697 |
if len(frames) > 0:
|
| 698 |
imageio.mimsave(output_path, frames, fps=fps, codec='libx264', quality=8)
|
|
|
|
|
|
|
|
|
|
| 699 |
|
| 700 |
+
return output_path
|
| 701 |
def render_comparison_video(
|
| 702 |
verts1: np.ndarray,
|
| 703 |
faces1: np.ndarray,
|
|
|
|
| 716 |
"""Render side-by-side comparison video."""
|
| 717 |
if not ensure_pyrender():
|
| 718 |
raise RuntimeError("PyRender not available")
|
| 719 |
+
|
| 720 |
+
# Apply orientation fix
|
| 721 |
+
verts1 = verts1.copy()
|
| 722 |
+
verts2 = verts2.copy()
|
| 723 |
+
verts1[..., 1:] *= -1
|
| 724 |
+
verts2[..., 1:] *= -1
|
| 725 |
+
|
| 726 |
# Match lengths and trim
|
| 727 |
T_total = min(verts1.shape[0], verts2.shape[0])
|
| 728 |
trim_amount = min(8, int(T_total * 0.15))
|
| 729 |
T = max(5, T_total - trim_amount)
|
| 730 |
+
|
| 731 |
verts1 = verts1[:T]
|
| 732 |
verts2 = verts2[:T]
|
| 733 |
+
|
| 734 |
+
# Compute fixed camera targets
|
| 735 |
+
fixed_center1 = verts1[0].mean(axis=0)
|
| 736 |
+
fixed_center2 = verts2[0].mean(axis=0)
|
| 737 |
+
|
|
|
|
|
|
|
|
|
|
| 738 |
labels = [label1, label2]
|
| 739 |
+
|
| 740 |
frames = []
|
| 741 |
for t in range(T):
|
| 742 |
frame = render_side_by_side_frame(
|
|
|
|
| 747 |
)
|
| 748 |
for _ in range(slowdown):
|
| 749 |
frames.append(frame)
|
| 750 |
+
|
| 751 |
# Save video
|
| 752 |
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
| 753 |
+
|
| 754 |
if len(frames) > 0:
|
| 755 |
imageio.mimsave(output_path, frames, fps=fps, codec='libx264', quality=8)
|
|
|
|
|
|
|
| 756 |
|
| 757 |
+
return output_path
|
| 758 |
# =====================================================================
|
| 759 |
# Main Processing Functions
|
| 760 |
# =====================================================================
|
|
|
|
| 762 |
"""Generate vertices and faces for a word-PID pair."""
|
| 763 |
generated_tokens = generate_motion_tokens(word, pid)
|
| 764 |
token_ids = parse_motion_tokens(generated_tokens)
|
| 765 |
+
|
| 766 |
if not token_ids:
|
| 767 |
return None, None, generated_tokens
|
| 768 |
+
|
| 769 |
if _model_cache["vqvae_model"] is None or _model_cache["smplx_model"] is None:
|
| 770 |
return None, None, generated_tokens
|
| 771 |
+
|
| 772 |
params = decode_tokens_to_params(token_ids)
|
| 773 |
if params.shape[0] == 0:
|
| 774 |
return None, None, generated_tokens
|
| 775 |
+
|
| 776 |
verts, faces = params_to_vertices(params)
|
| 777 |
return verts, faces, generated_tokens
|
|
|
|
|
|
|
| 778 |
def generate_video_for_word(word: str, pid: str) -> tuple:
    """Generate a single-avatar motion video for (word, pid).

    Returns ``(video_path, tokens)``; *video_path* is None when no mesh
    could be produced, while *tokens* still carries the raw LLM output so
    the UI can display it.
    """
    verts, faces, tokens = generate_verts_for_word(word, pid)
    if verts is None:
        return None, tokens

    # Unique name so concurrent requests never clobber each other's output.
    out_name = f"motion_{word}_{pid}_{uuid.uuid4().hex[:8]}.mp4"
    out_path = os.path.join(OUTPUT_DIR, out_name)
    render_video(verts, faces, out_path, label=f"{pid}")
    return out_path, tokens
|
|
|
|
|
|
|
| 791 |
def process_word(word: str):
    """Gradio handler: build a two-variant comparison video for *word*.

    Picks two random participant IDs for the word, generates a motion for
    each, and renders them side by side. Falls back to a single-avatar
    video when only one variant produced geometry, and to a text message
    when neither did. Returns ``(video_path_or_None, tokens_or_message)``.
    """
    if not word or not word.strip():
        return None, ""

    normalized = word.strip().lower()
    pids = get_random_pids_for_word(normalized, 2)
    if not pids:
        return None, f"Word '{normalized}' not found in dataset"

    # With a single variant available, compare the word against itself.
    if len(pids) == 1:
        pids = [pids[0], pids[0]]

    try:
        verts_a, faces_a, tokens_a = generate_verts_for_word(normalized, pids[0])
        verts_b, faces_b, tokens_b = generate_verts_for_word(normalized, pids[1])

        if verts_a is None and verts_b is None:
            return None, tokens_a or tokens_b or "Failed to generate motion"

        # Unique filename so parallel requests don't overwrite each other.
        out_path = os.path.join(
            OUTPUT_DIR, f"comparison_{normalized}_{uuid.uuid4().hex[:8]}.mp4"
        )

        # Degrade gracefully to a single-avatar render if one variant failed.
        if verts_a is None:
            render_video(verts_b, faces_b, out_path, label=pids[1])
            return out_path, tokens_b
        if verts_b is None:
            render_video(verts_a, faces_a, out_path, label=pids[0])
            return out_path, tokens_a

        render_comparison_video(
            verts_a, faces_a, verts_b, faces_b, out_path,
            label1=pids[0], label2=pids[1]
        )
        return out_path, f"[{pids[0]}] {tokens_a}\n\n[{pids[1]}] {tokens_b}"
    except Exception as e:
        # Surface a short error message in the UI instead of crashing Gradio.
        return None, f"Error: {str(e)[:100]}"
|
|
|
|
|
|
|
| 833 |
def get_example_video(word: str, pid: str):
|
| 834 |
"""Get pre-computed example video."""
|
| 835 |
key = f"{word}_{pid}"
|
|
|
|
| 838 |
return cached.get("video_path"), cached.get("tokens", "")
|
| 839 |
video_path, tokens = generate_video_for_word(word, pid)
|
| 840 |
return video_path, tokens
|
|
|
|
| 841 |
# =====================================================================
|
| 842 |
# Gradio Interface
|
| 843 |
# =====================================================================
|
| 844 |
def create_gradio_interface():
|
| 845 |
+
|
| 846 |
custom_css = """
|
| 847 |
.gradio-container { max-width: 1400px !important; }
|
| 848 |
+
.example-row { margin-top: 15px; padding: 12px; background:
|
| 849 |
+
#f8f9fa; border-radius: 6px; }
|
| 850 |
.example-word-label {
|
| 851 |
text-align: center;
|
| 852 |
font-size: 28px !important;
|
| 853 |
font-weight: bold !important;
|
| 854 |
+
color:
|
| 855 |
+
#2c3e50 !important;
|
| 856 |
margin: 10px 0 !important;
|
| 857 |
padding: 10px !important;
|
| 858 |
}
|
| 859 |
.example-variant-label {
|
| 860 |
text-align: center;
|
| 861 |
font-size: 14px !important;
|
| 862 |
+
color:
|
| 863 |
+
#7f8c8d !important;
|
| 864 |
margin-bottom: 10px !important;
|
| 865 |
}
|
| 866 |
"""
|
| 867 |
+
|
| 868 |
example_list = list(_example_cache.values()) if _example_cache else []
|
| 869 |
+
|
| 870 |
with gr.Blocks(title="SignMotionGPT", css=custom_css, theme=gr.themes.Default()) as demo:
|
| 871 |
+
|
| 872 |
gr.Markdown("# SignMotionGPT Demo")
|
| 873 |
gr.Markdown("Text-to-Sign Language Motion Generation with Variant Comparison")
|
| 874 |
gr.Markdown("*High-quality PyRender visualization with proper hand motion rendering*")
|
| 875 |
+
|
| 876 |
with gr.Row():
|
| 877 |
with gr.Column(scale=1, min_width=280):
|
| 878 |
gr.Markdown("### Input")
|
| 879 |
+
|
| 880 |
word_input = gr.Textbox(
|
| 881 |
label="Word",
|
| 882 |
placeholder="Enter a word from the dataset...",
|
| 883 |
lines=1, max_lines=1
|
| 884 |
)
|
| 885 |
+
|
| 886 |
generate_btn = gr.Button("Generate Motion", variant="primary", size="lg")
|
| 887 |
+
|
| 888 |
gr.Markdown("---")
|
| 889 |
gr.Markdown("### Generated Tokens")
|
| 890 |
+
|
| 891 |
tokens_output = gr.Textbox(
|
| 892 |
label="Motion Tokens (both variants)",
|
| 893 |
lines=8,
|
| 894 |
interactive=False,
|
| 895 |
show_copy_button=True
|
| 896 |
)
|
| 897 |
+
|
| 898 |
if _word_pid_map:
|
| 899 |
sample_words = list(_word_pid_map.keys())[:10]
|
| 900 |
gr.Markdown(f"**Available words:** {', '.join(sample_words)}, ...")
|
| 901 |
+
|
| 902 |
with gr.Column(scale=2, min_width=700):
|
| 903 |
gr.Markdown("### Motion Comparison (Two Signer Variants)")
|
| 904 |
video_output = gr.Video(
|
|
|
|
| 906 |
autoplay=True,
|
| 907 |
show_download_button=True
|
| 908 |
)
|
| 909 |
+
|
| 910 |
if example_list:
|
| 911 |
gr.Markdown("---")
|
| 912 |
gr.Markdown("### Pre-computed Examples")
|
| 913 |
+
|
| 914 |
for item in example_list:
|
| 915 |
word, pid = item['word'], item['pid']
|
| 916 |
with gr.Row(elem_classes="example-row"):
|
|
|
|
| 918 |
gr.HTML(f'<div class="example-word-label">{word.upper()}</div>')
|
| 919 |
gr.HTML(f'<div class="example-variant-label">Variant: {pid}</div>')
|
| 920 |
example_btn = gr.Button("Load Example", size="sm", variant="secondary")
|
| 921 |
+
|
| 922 |
with gr.Column(scale=3, min_width=500):
|
| 923 |
example_video = gr.Video(
|
| 924 |
label=f"Example: {word}",
|
| 925 |
autoplay=False,
|
| 926 |
show_download_button=True
|
| 927 |
)
|
| 928 |
+
|
| 929 |
example_btn.click(
|
| 930 |
fn=lambda w=word, p=pid: get_example_video(w, p),
|
| 931 |
inputs=[],
|
| 932 |
outputs=[example_video, tokens_output]
|
| 933 |
)
|
| 934 |
+
|
| 935 |
gr.Markdown("---")
|
| 936 |
gr.Markdown("*SignMotionGPT: LLM-based sign language motion generation with PyRender visualization*")
|
| 937 |
+
|
| 938 |
generate_btn.click(
|
| 939 |
fn=process_word,
|
| 940 |
inputs=[word_input],
|
| 941 |
outputs=[video_output, tokens_output]
|
| 942 |
)
|
| 943 |
+
|
| 944 |
word_input.submit(
|
| 945 |
fn=process_word,
|
| 946 |
inputs=[word_input],
|
| 947 |
outputs=[video_output, tokens_output]
|
| 948 |
)
|
|
|
|
|
|
|
| 949 |
|
| 950 |
+
return demo
|
| 951 |
# =====================================================================
|
| 952 |
# Main Entry Point for HuggingFace Spaces
|
| 953 |
# =====================================================================
|
|
|
|
| 961 |
print(f"Dataset: {DATASET_PATH}")
|
| 962 |
print(f"PyRender Available: {PYRENDER_AVAILABLE}")
|
| 963 |
print("="*60 + "\n")
|
|
|
|
| 964 |
# ---------------------------------------------------------------------
# Startup sequence — runs at import time so that HuggingFace Spaces,
# which imports this module and serves the top-level `demo` object,
# finds a fully initialized app.
# ---------------------------------------------------------------------

# Initialize models before building the UI.
initialize_models()

# Pre-compute example animations (presumably populates the example
# cache surfaced in the interface — confirm against precompute_examples).
precompute_examples()

# Must stay at module level: Spaces discovers `demo` by name.
demo = create_gradio_interface()

if __name__ == "__main__":
    # Direct/local launch; HuggingFace Spaces expects the server to bind
    # all interfaces on port 7860, with no share tunnel.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
|