""" SignMotionGPT - HuggingFace Spaces Demo Text-to-Sign Language Motion Generation Uses PyRender for high-quality avatar visualization """ # IMPORTANT: Set OpenGL platform BEFORE any OpenGL imports (for headless rendering) import os os.environ["PYOPENGL_PLATFORM"] = "egl" import sys import re import json import random import warnings import tempfile import uuid from pathlib import Path import torch import numpy as np warnings.filterwarnings("ignore") # ===================================================================== # Configuration for HuggingFace Spaces # ===================================================================== WORK_DIR = os.getcwd() DATA_DIR = os.path.join(WORK_DIR, "data") OUTPUT_DIR = os.path.join(WORK_DIR, "outputs") os.makedirs(DATA_DIR, exist_ok=True) os.makedirs(OUTPUT_DIR, exist_ok=True) # Path definitions DATASET_PATH = os.path.join(DATA_DIR, "motion_llm_dataset.json") VQVAE_CHECKPOINT = os.path.join(DATA_DIR, "vqvae_model.pt") STATS_PATH = os.path.join(DATA_DIR, "vqvae_stats.pt") SMPLX_MODEL_DIR = os.path.join(DATA_DIR, "smplx_models") # HuggingFace model config HF_REPO_ID = os.environ.get("HF_REPO_ID", "rdz-falcon/SignMotionGPTfit-archive") HF_SUBFOLDER = os.environ.get("HF_SUBFOLDER", "stage2_v2/epoch-030") DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Generation parameters M_START = "" M_END = "" PAD_TOKEN = "" INFERENCE_TEMPERATURE = 0.7 INFERENCE_TOP_K = 50 INFERENCE_REPETITION_PENALTY = 1.2 # VQ-VAE parameters SMPL_DIM = 182 CODEBOOK_SIZE = 512 CODE_DIM = 512 VQ_ARGS = dict( width=512, depth=3, down_t=2, stride_t=2, dilation_growth_rate=3, activation='relu', norm=None, quantizer="ema_reset" ) PARAM_DIMS = [10, 63, 45, 45, 3, 10, 3, 3] PARAM_NAMES = ["betas", "body_pose", "left_hand_pose", "right_hand_pose", "trans", "expression", "jaw_pose", "eye_pose"] # Visualization defaults AVATAR_COLOR = (0.36, 0.78, 0.36, 1.0) # Green color as RGBA VIDEO_FPS = 15 VIDEO_SLOWDOWN = 2 FRAME_WIDTH = 544 # Must be 
FRAME_HEIGHT = 720

# =====================================================================
# Install/Import Dependencies
# =====================================================================
try:
    import gradio as gr
except ImportError:
    os.system("pip install -q gradio>=4.0.0")
    import gradio as gr

try:
    import smplx
except ImportError:
    os.system("pip install -q smplx==0.1.28")
    import smplx

# PyRender for high-quality rendering (optional at import time; see ensure_pyrender)
PYRENDER_AVAILABLE = False
try:
    import trimesh
    import pyrender
    from PIL import Image, ImageDraw, ImageFont
    PYRENDER_AVAILABLE = True
except ImportError:
    pass

try:
    import imageio
except ImportError:
    os.system("pip install -q imageio[ffmpeg]")
    import imageio

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch.nn.functional as F

# =====================================================================
# Import VQ-VAE architecture
# =====================================================================
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)

try:
    from mGPT.archs.mgpt_vq import VQVae
except ImportError as e:
    print(f"Warning: Could not import VQVae: {e}")
    VQVae = None

# =====================================================================
# Global Cache
# =====================================================================
_model_cache = {
    "llm_model": None,
    "llm_tokenizer": None,
    "vqvae_model": None,
    "smplx_model": None,
    "stats": (None, None),
    "initialized": False,
}
_word_pid_map = {}   # word -> sorted list of participant IDs
_example_cache = {}  # "<word>_<pid>" -> {"video_path", "tokens", "word", "pid"}


# =====================================================================
# PyRender Setup
# =====================================================================
def ensure_pyrender():
    """Install pyrender dependencies if not available; return True on success."""
    global PYRENDER_AVAILABLE, trimesh, pyrender, Image, ImageDraw, ImageFont
    if PYRENDER_AVAILABLE:
        return True
    print("Installing pyrender dependencies...")
    # EGL/GLES system libraries are needed for headless rendering on Debian hosts.
    if os.path.exists("/etc/debian_version"):
        os.system("apt-get update -qq && apt-get install -qq -y libegl1-mesa-dev libgles2-mesa-dev > /dev/null 2>&1")
    os.system("pip install -q trimesh pyrender PyOpenGL PyOpenGL_accelerate Pillow")
    try:
        # With the `global` declaration above, these imports rebind the
        # module-level names used by the rendering functions.
        import trimesh
        import pyrender
        from PIL import Image, ImageDraw, ImageFont
        PYRENDER_AVAILABLE = True
        return True
    except ImportError as e:
        print(f"Could not install pyrender: {e}")
        return False


# =====================================================================
# Dataset Loading - Word to PID mapping
# =====================================================================
def load_word_pid_mapping():
    """Load the dataset and build word -> PIDs mapping."""
    global _word_pid_map
    if not os.path.exists(DATASET_PATH):
        print(f"Dataset not found: {DATASET_PATH}")
        return
    print(f"Loading dataset from: {DATASET_PATH}")
    try:
        with open(DATASET_PATH, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for entry in data:
            word = entry.get('word', '').lower()
            pid = entry.get('participant_id', '')
            if word and pid:
                if word not in _word_pid_map:
                    _word_pid_map[word] = set()
                _word_pid_map[word].add(pid)
        # Freeze each PID set into a deterministic sorted list.
        for word in _word_pid_map:
            _word_pid_map[word] = sorted(_word_pid_map[word])
        print(f"Loaded {len(_word_pid_map)} unique words from dataset")
    except Exception as e:
        print(f"Error loading dataset: {e}")


def get_pids_for_word(word: str) -> list:
    """Get valid PIDs for a word from the dataset."""
    word = word.lower().strip()
    return _word_pid_map.get(word, [])


def get_random_pids_for_word(word: str, count: int = 2) -> list:
    """Get random PIDs for a word. Returns up to 'count' PIDs."""
    pids = get_pids_for_word(word)
    if not pids:
        return []
    if len(pids) <= count:
        return pids
    return random.sample(pids, count)


def get_example_words_with_pids(count: int = 3) -> list:
    """Get example (word, pid) pairs, preferring a curated word list."""
    examples = []
    preferred = ['push', 'passport', 'library', 'send', 'college', 'help', 'thank', 'hello']
    for word in preferred:
        pids = get_pids_for_word(word)
        if pids:
            examples.append((word, pids[0]))
        if len(examples) >= count:
            break
    if len(examples) < count:
        # Fall back to random words from the dataset not already chosen.
        available = [w for w in _word_pid_map.keys() if w not in [e[0] for e in examples]]
        random.shuffle(available)
        for word in available[:count - len(examples)]:
            pids = _word_pid_map[word]
            examples.append((word, pids[0]))
    return examples


# =====================================================================
# VQ-VAE Wrapper
# =====================================================================
class MotionGPT_VQVAE_Wrapper(torch.nn.Module):
    """Thin wrapper giving the MotionGPT VQVae this demo's parameterization."""

    def __init__(self, smpl_dim=SMPL_DIM, codebook_size=CODEBOOK_SIZE, code_dim=CODE_DIM, **kwargs):
        super().__init__()
        if VQVae is None:
            raise RuntimeError("VQVae architecture not available")
        self.vqvae = VQVae(
            nfeats=smpl_dim,
            code_num=codebook_size,
            code_dim=code_dim,
            output_emb_width=code_dim,
            **kwargs
        )


# =====================================================================
# Model Loading Functions
# =====================================================================
def load_llm_model():
    """Load tokenizer and causal LM from the HF Hub. Returns (model, tokenizer)."""
    print(f"Loading LLM from: {HF_REPO_ID}/{HF_SUBFOLDER}")
    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
    tokenizer = AutoTokenizer.from_pretrained(
        HF_REPO_ID, subfolder=HF_SUBFOLDER, trust_remote_code=True, token=token
    )
    model = AutoModelForCausalLM.from_pretrained(
        HF_REPO_ID, subfolder=HF_SUBFOLDER, trust_remote_code=True, token=token,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
    )
    if tokenizer.pad_token is None:
        # Add a pad token and grow the embedding matrix to match.
        tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
        model.resize_token_embeddings(len(tokenizer))
        model.config.pad_token_id = tokenizer.pad_token_id
    model.to(DEVICE)
    model.eval()
    print(f"LLM loaded (vocab size: {len(tokenizer)})")
    return model, tokenizer


def load_vqvae_model():
    """Load the VQ-VAE checkpoint. Returns the wrapper model or None if missing."""
    if not os.path.exists(VQVAE_CHECKPOINT):
        print(f"VQ-VAE checkpoint not found: {VQVAE_CHECKPOINT}")
        return None
    print(f"Loading VQ-VAE from: {VQVAE_CHECKPOINT}")
    model = MotionGPT_VQVAE_Wrapper(
        smpl_dim=SMPL_DIM, codebook_size=CODEBOOK_SIZE, code_dim=CODE_DIM, **VQ_ARGS
    ).to(DEVICE)
    ckpt = torch.load(VQVAE_CHECKPOINT, map_location=DEVICE, weights_only=False)
    # Checkpoint may be either {'model_state_dict': ...} or a raw state dict.
    state_dict = ckpt.get('model_state_dict', ckpt)
    model.load_state_dict(state_dict, strict=False)
    model.eval()
    print("VQ-VAE loaded")
    return model


def load_stats():
    """Load normalization stats. Returns (mean, std) as numpy arrays or (None, None)."""
    if not os.path.exists(STATS_PATH):
        return None, None
    st = torch.load(STATS_PATH, map_location='cpu', weights_only=False)
    mean, std = st.get('mean', 0), st.get('std', 1)
    if torch.is_tensor(mean):
        mean = mean.cpu().numpy()
    if torch.is_tensor(std):
        std = std.cpu().numpy()
    return mean, std


def load_smplx_model():
    """Load the neutral SMPL-X body model. Returns the model or None if missing."""
    if not os.path.exists(SMPLX_MODEL_DIR):
        print(f"SMPL-X directory not found: {SMPLX_MODEL_DIR}")
        return None
    print(f"Loading SMPL-X from: {SMPLX_MODEL_DIR}")
    model = smplx.SMPLX(
        model_path=SMPLX_MODEL_DIR,
        model_type='smplx',
        gender='neutral',
        use_pca=False,  # full per-joint hand articulation, not PCA components
        create_global_orient=True,
        create_body_pose=True,
        create_betas=True,
        create_expression=True,
        create_jaw_pose=True,
        create_left_hand_pose=True,
        create_right_hand_pose=True,
        create_transl=True,
    ).to(DEVICE)
    print("SMPL-X loaded")
    return model


def initialize_models():
    """Load all models and the dataset mapping once; idempotent."""
    global _model_cache
    if _model_cache["initialized"]:
        return
    print("\n" + "="*60)
    print(" Initializing SignMotionGPT Models")
    print("="*60)
    load_word_pid_mapping()
    _model_cache["llm_model"], _model_cache["llm_tokenizer"] = load_llm_model()
    try:
        _model_cache["vqvae_model"] = load_vqvae_model()
        _model_cache["stats"] = load_stats()
        _model_cache["smplx_model"] = load_smplx_model()
    except Exception as e:
        # Visualization is best-effort: the LLM demo still works without it.
        print(f"Could not load visualization models: {e}")
    # Ensure PyRender is available
    ensure_pyrender()
    _model_cache["initialized"] = True
    print("All models initialized")
    print("="*60)


def precompute_examples():
    """Pre-compute animations for example words at startup."""
    global _example_cache
    if not _model_cache["initialized"]:
        return
    examples = get_example_words_with_pids(3)
    print(f"\nPre-computing {len(examples)} example animations...")
    for word, pid in examples:
        key = f"{word}_{pid}"
        print(f"  Computing: {word} ({pid})...")
        try:
            video_path, tokens = generate_video_for_word(word, pid)
            _example_cache[key] = {"video_path": video_path, "tokens": tokens,
                                   "word": word, "pid": pid}
            print(f"  Done: {word}")
        except Exception as e:
            print(f"  Failed: {word} - {e}")
            _example_cache[key] = {"video_path": None, "tokens": "",
                                   "word": word, "pid": pid}
    print("Example pre-computation complete\n")


# =====================================================================
# Motion Generation Functions
# =====================================================================
def generate_motion_tokens(word: str, variant: str) -> str:
    """Sample motion tokens from the LLM for (word, variant). Returns raw text."""
    model = _model_cache["llm_model"]
    tokenizer = _model_cache["llm_tokenizer"]
    if model is None or tokenizer is None:
        raise RuntimeError("LLM model not loaded")
    prompt = f"Instruction: Generate motion for word '{word}' with variant '{variant}'.\nMotion: "
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            temperature=INFERENCE_TEMPERATURE,
            top_k=INFERENCE_TOP_K,
            repetition_penalty=INFERENCE_REPETITION_PENALTY,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.convert_tokens_to_ids(M_END),
            early_stopping=True,
        )
    decoded = tokenizer.decode(output[0], skip_special_tokens=False)
    # Keep only the text after the "Motion: " marker of the prompt.
    motion_part = decoded.split("Motion: ")[-1] if "Motion: " in decoded else decoded
    return motion_part.strip()


def parse_motion_tokens(token_str: str) -> list:
    """Extract integer motion-token IDs from generated text (or pass lists through)."""
    if isinstance(token_str, (list, tuple, np.ndarray)):
        return [int(x) for x in token_str]
    if not isinstance(token_str, str):
        return []
    # NOTE(review): both regex patterns below are empty in this copy of the
    # file — they look like angle-bracket token patterns (e.g.
    # r'<motion_id_(\d+)>') stripped during export. As written, re.findall(r'')
    # matches empty strings and int('') raises ValueError; restore the original
    # patterns before use.
    matches = re.findall(r'', token_str)
    if matches:
        return [int(x) for x in matches]
    matches = re.findall(r'', token_str)
    if matches:
        return [int(x) for x in matches]
    return []


def decode_tokens_to_params(tokens: list) -> np.ndarray:
    """Decode token IDs through the VQ-VAE into a (T, SMPL_DIM) parameter array."""
    vqvae_model = _model_cache["vqvae_model"]
    mean, std = _model_cache["stats"]
    if vqvae_model is None or not tokens:
        return np.zeros((0, SMPL_DIM), dtype=np.float32)
    idx = torch.tensor(tokens, dtype=torch.long, device=DEVICE).unsqueeze(0)
    T_q = idx.shape[1]
    quantizer = vqvae_model.vqvae.quantizer
    if hasattr(quantizer, "codebook"):
        codebook = quantizer.codebook.to(DEVICE)
        code_dim = codebook.shape[1]
    else:
        code_dim = CODE_DIM
    # Preferred path: the quantizer's own dequantize(); fall back to a direct
    # codebook lookup if that is unavailable or returns an unexpected shape.
    x_quantized = None
    if hasattr(quantizer, "dequantize"):
        try:
            with torch.no_grad():
                dq = quantizer.dequantize(idx)
            if dq is not None:
                dq = dq.contiguous()
                if dq.ndim == 3 and dq.shape[1] == code_dim:
                    x_quantized = dq  # already (B, C, T)
                elif dq.ndim == 3 and dq.shape[1] == T_q:
                    x_quantized = dq.permute(0, 2, 1).contiguous()  # (B, T, C) -> (B, C, T)
        except Exception:
            pass
    if x_quantized is None:
        if not hasattr(quantizer, "codebook"):
            return np.zeros((0, SMPL_DIM), dtype=np.float32)
        with torch.no_grad():
            emb = codebook[idx]
            x_quantized = emb.permute(0, 2, 1).contiguous()
    with torch.no_grad():
        x_dec = vqvae_model.vqvae.decoder(x_quantized)
        smpl_out = vqvae_model.vqvae.postprocess(x_dec)
    params_np = smpl_out.squeeze(0).cpu().numpy()
    # De-normalize if training statistics are available.
    if (mean is not None) and (std is not None):
        params_np = (params_np * np.array(std).reshape(1, -1)) + np.array(mean).reshape(1, -1)
    return params_np


def params_to_vertices(params_seq: np.ndarray) -> tuple:
    """Run SMPL-X over a (T, SMPL_DIM) sequence. Returns (vertices, faces) or (None, None)."""
    smplx_model = _model_cache["smplx_model"]
    if smplx_model is None or params_seq.shape[0] == 0:
        return None, None
    # Slice boundaries of each named parameter group within the flat vector.
    starts = np.cumsum([0] + PARAM_DIMS[:-1])
    ends = starts + np.array(PARAM_DIMS)
    T = params_seq.shape[0]
    all_verts = []
    batch_size = 32
    num_body_joints = getattr(smplx_model, "NUM_BODY_JOINTS", 21)
    with torch.no_grad():
        for s in range(0, T, batch_size):
            batch = params_seq[s:s+batch_size]
            B = batch.shape[0]
            np_parts = {name: batch[:, st:ed].astype(np.float32)
                        for name, st, ed in zip(PARAM_NAMES, starts, ends)}
            tensor_parts = {name: torch.from_numpy(arr).to(DEVICE)
                            for name, arr in np_parts.items()}
            # =================================================================
            # FIX: Neutralize Jaw Pose
            # =================================================================
            # The generated jaw rotation can be unstable, causing the mouth
            # to rotate backwards into the neck. We force it to 0 (closed)
            # to keep the face render clean.
            tensor_parts['jaw_pose'] = torch.zeros_like(tensor_parts['jaw_pose'])
            # =================================================================
            # The 63-dim "body_pose" slice may or may not include the 3-dim
            # global orientation at its head; detect which layout we have.
            body_t = tensor_parts['body_pose']
            L_body = body_t.shape[1]
            expected_no_go = num_body_joints * 3
            expected_with_go = (num_body_joints + 1) * 3
            if L_body == expected_with_go:
                global_orient = body_t[:, :3].contiguous()
                body_pose_only = body_t[:, 3:].contiguous()
            elif L_body == expected_no_go:
                global_orient = torch.zeros((B, 3), dtype=torch.float32, device=DEVICE)
                body_pose_only = body_t
            else:
                if L_body > expected_no_go:
                    global_orient = body_t[:, :3].contiguous()
                    body_pose_only = body_t[:, 3:].contiguous()
                else:
                    # Too short: zero-pad to the expected width.
                    body_pose_only = F.pad(body_t, (0, max(0, expected_no_go - L_body)))
                    global_orient = torch.zeros((B, 3), dtype=torch.float32, device=DEVICE)
            out = smplx_model(
                betas=tensor_parts['betas'],
                global_orient=global_orient,
                body_pose=body_pose_only,
                left_hand_pose=tensor_parts['left_hand_pose'],
                right_hand_pose=tensor_parts['right_hand_pose'],
                expression=tensor_parts['expression'],
                jaw_pose=tensor_parts['jaw_pose'],
                leye_pose=tensor_parts['eye_pose'],
                reye_pose=tensor_parts['eye_pose'],
                transl=tensor_parts['trans'],
                return_verts=True,
            )
            all_verts.append(out.vertices.detach().cpu().numpy())
    return np.concatenate(all_verts, axis=0), smplx_model.faces.astype(np.int32)
# =====================================================================
# PyRender Visualization Functions
# =====================================================================
def render_single_frame(
    verts: np.ndarray,
    faces: np.ndarray,
    label: str = "",
    color: tuple = AVATAR_COLOR,
    fixed_center: np.ndarray = None,
    camera_distance: float = 3.5,
    focal_length: float = 2000,
    frame_width: int = FRAME_WIDTH,
    frame_height: int = FRAME_HEIGHT,
    bg_color: tuple = (0.95, 0.95, 0.97, 1.0),
) -> np.ndarray:
    """Render a single mesh frame using PyRender."""
    if not PYRENDER_AVAILABLE:
        raise RuntimeError("PyRender not available")
    # Check for invalid vertices; return a gray blank frame instead of crashing.
    if not np.isfinite(verts).all():
        blank = np.ones((frame_height, frame_width, 3), dtype=np.uint8) * 200
        return blank
    # Create scene
    scene = pyrender.Scene(bg_color=bg_color, ambient_light=[0.4, 0.4, 0.4])
    # Material
    material = pyrender.MetallicRoughnessMaterial(
        metallicFactor=0.0,
        roughnessFactor=0.4,
        alphaMode='OPAQUE',
        baseColorFactor=color,
    )
    # Create mesh
    mesh = trimesh.Trimesh(vertices=verts, faces=faces)
    mesh_render = pyrender.Mesh.from_trimesh(mesh, material=material, smooth=True)
    scene.add(mesh_render)
    # Compute center for camera positioning
    mesh_center = verts.mean(axis=0)
    camera_target = fixed_center if fixed_center is not None else mesh_center
    # Camera setup
    camera = pyrender.IntrinsicsCamera(
        fx=focal_length, fy=focal_length,
        cx=frame_width / 2, cy=frame_height / 2,
        znear=0.1, zfar=20.0,
    )
    # Camera pose: After 180-degree rotation around X-axis, coordinate system changes.
    # Camera should be positioned in front (negative Z) with flipped orientation.
    # This matches visualize.py and ensures proper face visibility.
    camera_pose = np.eye(4)
    camera_pose[0, 3] = camera_target[0]                    # Center X
    camera_pose[1, 3] = camera_target[1]                    # Center Y (body center)
    camera_pose[2, 3] = camera_target[2] - camera_distance  # In front (negative Z)
    # Camera orientation: flip to look at subject (SOKE-style).
    # This rotation makes camera look toward +Z (at the subject).
    camera_pose[:3, :3] = np.array([
        [1, 0, 0],
        [0, -1, 0],
        [0, 0, -1],
    ])
    scene.add(camera, pose=camera_pose)
    # Lighting: key + fill + rim, all directional.
    key_light = pyrender.DirectionalLight(color=[1.0, 1.0, 1.0], intensity=3.0)
    key_pose = np.eye(4)
    key_pose[:3, :3] = trimesh.transformations.euler_matrix(np.radians(-30), np.radians(-20), 0)[:3, :3]
    scene.add(key_light, pose=key_pose)
    fill_light = pyrender.DirectionalLight(color=[0.9, 0.9, 1.0], intensity=1.5)
    fill_pose = np.eye(4)
    fill_pose[:3, :3] = trimesh.transformations.euler_matrix(np.radians(-20), np.radians(30), 0)[:3, :3]
    scene.add(fill_light, pose=fill_pose)
    rim_light = pyrender.DirectionalLight(color=[1.0, 1.0, 1.0], intensity=2.0)
    rim_pose = np.eye(4)
    rim_pose[:3, :3] = trimesh.transformations.euler_matrix(np.radians(30), np.radians(180), 0)[:3, :3]
    scene.add(rim_light, pose=rim_pose)
    # Render. FIX: release the offscreen renderer even if render() raises,
    # so headless EGL contexts are not leaked on per-frame failures.
    renderer = pyrender.OffscreenRenderer(viewport_width=frame_width,
                                          viewport_height=frame_height,
                                          point_size=1.0)
    try:
        color_img, _ = renderer.render(scene)
    finally:
        renderer.delete()
    # Add label overlay in the top-left corner.
    if label:
        img = Image.fromarray(color_img)
        draw = ImageDraw.Draw(img)
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 20)
        except Exception:  # FIX: was a bare `except:`
            font = ImageFont.load_default()
        text_width = len(label) * 10 + 20
        draw.rectangle([10, 10, 10 + text_width, 35], fill=(0, 0, 0, 180))
        draw.text((15, 12), label, fill=(255, 255, 255), font=font)
        color_img = np.array(img)
    return color_img


def render_side_by_side_frame(
    verts_list: list,
    faces: np.ndarray,
    labels: list,
    fixed_centers: list = None,
    camera_distance: float = 3.5,
    focal_length: float = 2000,
    frame_width: int = FRAME_WIDTH,
    frame_height: int = FRAME_HEIGHT,
    bg_color: tuple = (0.95, 0.95, 0.97, 1.0),
) -> np.ndarray:
    """Render multiple meshes side-by-side for comparison."""
    if not PYRENDER_AVAILABLE:
        raise RuntimeError("PyRender not available")
    # Colors for each avatar
    colors = [
        (0.3, 0.8, 0.4, 1.0),  # Green
        (0.3, 0.6, 0.9, 1.0),  # Blue
        (0.9, 0.5, 0.2, 1.0),  # Orange
    ]
    frames = []
    for i, verts in enumerate(verts_list):
        fixed_center = fixed_centers[i] if fixed_centers else None
        color = colors[i % len(colors)]
        label = labels[i] if i < len(labels) else ""
        frame = render_single_frame(
            verts, faces,
            label=label,
            color=color,
            fixed_center=fixed_center,
            camera_distance=camera_distance,
            focal_length=focal_length,
            frame_width=frame_width,
            frame_height=frame_height,
            bg_color=bg_color,
        )
        frames.append(frame)
    # Stack panels horizontally.
    return np.concatenate(frames, axis=1)


def render_video(
    verts: np.ndarray,
    faces: np.ndarray,
    output_path: str,
    label: str = "",
    fps: int = VIDEO_FPS,
    slowdown: int = VIDEO_SLOWDOWN,
    camera_distance: float = 3.5,
    focal_length: float = 2000,
    frame_width: int = FRAME_WIDTH,
    frame_height: int = FRAME_HEIGHT,
) -> str:
    """Render single avatar animation to video. Returns output_path."""
    if not ensure_pyrender():
        raise RuntimeError("PyRender not available")
    # Apply orientation fix: rotate 180 degrees around X-axis
    verts = verts.copy()
    verts[..., 1:] *= -1
    # Trim last few frames to remove end-of-sequence artifacts
    T_total = verts.shape[0]
    trim_amount = min(8, int(T_total * 0.15))
    T = max(5, T_total - trim_amount)
    # Compute fixed camera target from first frame so the camera doesn't jitter.
    fixed_center = verts[0].mean(axis=0)
    frames = []
    for t in range(T):
        frame = render_single_frame(
            verts[t], faces,
            label=label,
            fixed_center=fixed_center,
            camera_distance=camera_distance,
            focal_length=focal_length,
            frame_width=frame_width,
            frame_height=frame_height,
        )
        # Duplicate each frame `slowdown` times to slow playback.
        for _ in range(slowdown):
            frames.append(frame)
    # Save video
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    if len(frames) > 0:
        imageio.mimsave(output_path, frames, fps=fps, codec='libx264', quality=8)
    return output_path


def render_comparison_video(
    verts1: np.ndarray,
    faces1: np.ndarray,
    verts2: np.ndarray,
    faces2: np.ndarray,
    output_path: str,
    label1: str = "",
    label2: str = "",
    fps: int = VIDEO_FPS,
    slowdown: int = VIDEO_SLOWDOWN,
    camera_distance: float = 3.5,
    focal_length: float = 2000,
    frame_width: int = FRAME_WIDTH,
    frame_height: int = FRAME_HEIGHT,
) -> str:
    """Render side-by-side comparison video. Returns output_path."""
    if not ensure_pyrender():
        raise RuntimeError("PyRender not available")
    # Apply orientation fix
    verts1 = verts1.copy()
    verts2 = verts2.copy()
    verts1[..., 1:] *= -1
    verts2[..., 1:] *= -1
    # Match lengths and trim end-of-sequence artifacts
    T_total = min(verts1.shape[0], verts2.shape[0])
    trim_amount = min(8, int(T_total * 0.15))
    T = max(5, T_total - trim_amount)
    verts1 = verts1[:T]
    verts2 = verts2[:T]
    # Compute fixed camera targets from the first frame of each sequence.
    fixed_center1 = verts1[0].mean(axis=0)
    fixed_center2 = verts2[0].mean(axis=0)
    labels = [label1, label2]
    frames = []
    for t in range(T):
        frame = render_side_by_side_frame(
            [verts1[t], verts2[t]], faces1, labels,
            fixed_centers=[fixed_center1, fixed_center2],
            camera_distance=camera_distance,
            focal_length=focal_length,
            frame_width=frame_width,
            frame_height=frame_height,
        )
        for _ in range(slowdown):
            frames.append(frame)
    # Save video
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    if len(frames) > 0:
        imageio.mimsave(output_path, frames, fps=fps, codec='libx264', quality=8)
    return output_path


# =====================================================================
# Main Processing Functions
# =====================================================================
def generate_verts_for_word(word: str, pid: str) -> tuple:
    """Generate vertices and faces for a word-PID pair.

    Returns (verts, faces, tokens); verts/faces are None when any stage fails.
    """
    generated_tokens = generate_motion_tokens(word, pid)
    token_ids = parse_motion_tokens(generated_tokens)
    if not token_ids:
        return None, None, generated_tokens
    if _model_cache["vqvae_model"] is None or _model_cache["smplx_model"] is None:
        return None, None, generated_tokens
    params = decode_tokens_to_params(token_ids)
    if params.shape[0] == 0:
        return None, None, generated_tokens
    verts, faces = params_to_vertices(params)
    return verts, faces, generated_tokens


def generate_video_for_word(word: str, pid: str) -> tuple:
    """Generate video and tokens for a word. Returns (video_path, tokens)."""
    verts, faces, tokens = generate_verts_for_word(word, pid)
    if verts is None:
        return None, tokens
    # Generate unique filename
    video_filename = f"motion_{word}_{pid}_{uuid.uuid4().hex[:8]}.mp4"
    video_path = os.path.join(OUTPUT_DIR, video_filename)
    render_video(verts, faces, video_path, label=f"{pid}")
    return video_path, tokens


def process_word(word: str):
    """Main processing: generate side-by-side comparison video for two random PIDs."""
    if not word or not word.strip():
        return None, ""
    word = word.strip().lower()
    pids = get_random_pids_for_word(word, 2)
    if not pids:
        return None, f"Word '{word}' not found in dataset"
    if len(pids) == 1:
        # Only one variant exists: compare it with itself.
        pids = [pids[0], pids[0]]
    try:
        verts1, faces1, tokens1 = generate_verts_for_word(word, pids[0])
        verts2, faces2, tokens2 = generate_verts_for_word(word, pids[1])
        if verts1 is None and verts2 is None:
            return None, tokens1 or tokens2 or "Failed to generate motion"
        # Generate unique filename
        video_filename = f"comparison_{word}_{uuid.uuid4().hex[:8]}.mp4"
        video_path = os.path.join(OUTPUT_DIR, video_filename)
        # Fall back to a single-avatar video when one variant failed.
        if verts1 is None:
            render_video(verts2, faces2, video_path, label=pids[1])
            return video_path, tokens2
        if verts2 is None:
            render_video(verts1, faces1, video_path, label=pids[0])
            return video_path, tokens1
        render_comparison_video(
            verts1, faces1, verts2, faces2, video_path,
            label1=pids[0], label2=pids[1],
        )
        combined_tokens = f"[{pids[0]}] {tokens1}\n\n[{pids[1]}] {tokens2}"
        return video_path, combined_tokens
    except Exception as e:
        return None, f"Error: {str(e)[:100]}"


def get_example_video(word: str, pid: str):
    """Get pre-computed example video, generating on demand on cache miss."""
    key = f"{word}_{pid}"
    if key in _example_cache:
        cached = _example_cache[key]
        return cached.get("video_path"), cached.get("tokens", "")
    video_path, tokens = generate_video_for_word(word, pid)
    return video_path, tokens


# =====================================================================
# Gradio Interface
# =====================================================================
def create_gradio_interface():
    """Build and return the Gradio Blocks demo UI."""
    custom_css = """
    .gradio-container { max-width: 1400px !important; }
    .example-row { margin-top: 15px; padding: 12px; background: #f8f9fa; border-radius: 6px; }
    .example-word-label { text-align: center; font-size: 28px !important; font-weight: bold !important; color: #2c3e50 !important; margin: 10px 0 !important; padding: 10px !important; }
    .example-variant-label { text-align: center; font-size: 14px !important; color: #7f8c8d !important; margin-bottom: 10px !important; }
    """
    example_list = list(_example_cache.values()) if _example_cache else []
    with gr.Blocks(title="SignMotionGPT", css=custom_css, theme=gr.themes.Default()) as demo:
        gr.Markdown("# SignMotionGPT Demo")
        gr.Markdown("Text-to-Sign Language Motion Generation with Variant Comparison")
        gr.Markdown("*High-quality PyRender visualization with proper hand motion rendering*")
        with gr.Row():
            with gr.Column(scale=1, min_width=280):
                gr.Markdown("### Input")
                word_input = gr.Textbox(
                    label="Word",
                    placeholder="Enter a word from the dataset...",
                    lines=1,
                    max_lines=1,
                )
                generate_btn = gr.Button("Generate Motion", variant="primary", size="lg")
                gr.Markdown("---")
                gr.Markdown("### Generated Tokens")
                tokens_output = gr.Textbox(
                    label="Motion Tokens (both variants)",
                    lines=8,
                    interactive=False,
                )
                if _word_pid_map:
                    sample_words = list(_word_pid_map.keys())[:10]
                    gr.Markdown(f"**Available words:** {', '.join(sample_words)}, ...")
            with gr.Column(scale=2, min_width=700):
                gr.Markdown("### Motion Comparison (Two Signer Variants)")
                video_output = gr.Video(
                    label="Generated Motion",
                    autoplay=True,
                )
        if example_list:
            gr.Markdown("---")
            gr.Markdown("### Pre-computed Examples")
            for item in example_list:
                word, pid = item['word'], item['pid']
                with gr.Row(elem_classes="example-row"):
                    with gr.Column(scale=1, min_width=180):
                        # FIX(review): the <div> markup had been stripped from these
                        # HTML strings; reconstructed from the CSS class names that
                        # are still defined in custom_css above. Confirm styling.
                        gr.HTML(f'<div class="example-word-label">{word.upper()}</div>')
                        gr.HTML(f'<div class="example-variant-label">Variant: {pid}</div>')
                        example_btn = gr.Button("Load Example", size="sm", variant="secondary")
                    with gr.Column(scale=3, min_width=500):
                        example_video = gr.Video(
                            label=f"Example: {word}",
                            autoplay=False,
                        )
                # Default args (w=word, p=pid) bind per-iteration values,
                # avoiding the late-binding-closure pitfall.
                example_btn.click(
                    fn=lambda w=word, p=pid: get_example_video(w, p),
                    inputs=[],
                    outputs=[example_video, tokens_output],
                )
        gr.Markdown("---")
        gr.Markdown("*SignMotionGPT: LLM-based sign language motion generation with PyRender visualization*")
        generate_btn.click(
            fn=process_word,
            inputs=[word_input],
            outputs=[video_output, tokens_output],
        )
        word_input.submit(
            fn=process_word,
            inputs=[word_input],
            outputs=[video_output, tokens_output],
        )
    return demo


# =====================================================================
# Main Entry Point for HuggingFace Spaces
# =====================================================================
print("\n" + "="*60)
print(" SignMotionGPT - HuggingFace Spaces (PyRender)")
print("="*60)
print(f"Device: {DEVICE}")
print(f"Model: {HF_REPO_ID}/{HF_SUBFOLDER}")
print(f"Data Directory: {DATA_DIR}")
print(f"Output Directory: {OUTPUT_DIR}")
print(f"Dataset: {DATASET_PATH}")
print(f"PyRender Available: {PYRENDER_AVAILABLE}")
print("="*60 + "\n")

# Initialize models at startup
initialize_models()

# Pre-compute example animations
precompute_examples()

# Create and launch interface (module-level so Spaces can pick up `demo`)
demo = create_gradio_interface()

if __name__ == "__main__":
    # Launch with settings for HuggingFace Spaces
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )