import os
import tempfile
from pathlib import Path
from typing import Tuple, Optional
from functools import lru_cache

import gradio as gr
import numpy as np
import torch
import soundfile as sf
import librosa
from huggingface_hub import hf_hub_download

# -----------------------------
# Config
# -----------------------------
DEFAULT_WEIGHTS_REPO = os.environ.get("WEIGHTS_REPO", "isYes/HuMoGen-X-weights")  # private model repo
WEIGHTS_FILENAME = os.environ.get("WEIGHTS_FILENAME", "train-0090.pt")  # in the private repo

# The Space may run on CPU or GPU; pick whichever is available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# -----------------------------
# Secure download + load
# -----------------------------
@lru_cache
def load_model():
    """
    Load model weights from a PRIVATE HF repo using HF_TOKEN (Space Secret).

    ``lru_cache`` ensures we download and load only once per Space runtime.

    Returns:
        The deserialized checkpoint object (moved to DEVICE and set to eval
        mode when it supports those methods).

    Raises:
        RuntimeError: if the HF_TOKEN secret is not configured.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        raise RuntimeError(
            "HF_TOKEN secret is missing. Set it in Space Settings -> Secrets."
        )

    ckpt_path = hf_hub_download(
        repo_id=DEFAULT_WEIGHTS_REPO,
        filename=WEIGHTS_FILENAME,
        token=token,
    )

    # TODO: replace this with your actual model class init + load_state_dict
    # Example patterns:
    #   model = HuMoGenX(...)
    #   state = torch.load(ckpt_path, map_location="cpu")
    #   model.load_state_dict(state["state_dict"] if "state_dict" in state else state)
    #   model.to(DEVICE).eval()
    #
    # Here we keep a placeholder "model" object.
    #
    # SECURITY NOTE(review): torch.load unpickles arbitrary code. This is
    # acceptable only because the checkpoint comes from our own private,
    # token-gated repo; never point WEIGHTS_REPO at an untrusted source.
    # Consider torch.load(..., weights_only=True) once the checkpoint is a
    # plain state_dict.
    model = torch.load(ckpt_path, map_location="cpu")
    if hasattr(model, "to"):
        model = model.to(DEVICE)
    if hasattr(model, "eval"):
        model.eval()
    return model


# -----------------------------
# Utilities
# -----------------------------
def load_audio_mono_16k(audio_path: str, target_sr: int = 16000) -> Tuple[np.ndarray, int]:
    """
    Load an audio file and convert it to mono float32 at ``target_sr``.

    Args:
        audio_path: Path to the uploaded audio file.
        target_sr: Sample rate to resample to (default 16 kHz).

    Returns:
        Tuple of (waveform as 1-D float32 array, sample rate).
    """
    y, sr = librosa.load(audio_path, sr=target_sr, mono=True)
    y = y.astype(np.float32)
    return y, target_sr


def render_motion_to_mp4(
    motion: np.ndarray,
    out_mp4_path: str,
    fps: int = 30,
    resolution: int = 512,
):
    """
    Render generated motion to an mp4 file.

    TODO: Replace this with your real renderer. This function should create
    an mp4 from the generated motion.
    - motion: (T, D) or (T, J, 3) etc.
    - out_mp4_path: path to save mp4

    Options:
      1) lightweight: matplotlib stick figure -> imageio mp4
      2) medium: pyrender / trimesh
      3) heavy: Blender (usually not recommended on a Space)

    For now, we'll create a dummy black video so the UI pipeline is complete.
    """
    import imageio.v2 as imageio

    # Fall back to 60 frames when no motion is supplied.
    T = int(motion.shape[0]) if motion is not None else 60

    # One reusable black frame; imageio copies the data on append.
    frame = np.zeros((resolution, resolution, 3), dtype=np.uint8)

    # Context manager guarantees the writer is closed (and the file is
    # finalized) even if appending a frame raises; frames are streamed
    # instead of being buffered in a list first.
    with imageio.get_writer(out_mp4_path, fps=fps) as writer:
        for _ in range(T):
            writer.append_data(frame)


# -----------------------------
# Inference stub (connect your code here)
# -----------------------------
@torch.inference_mode()
def run_inference(
    audio_path: str,
    genre: str,
    cfg_genre: float,
    cfg_music: float,
    seed: int,
    num_frames: int,
    fps: int,
) -> np.ndarray:
    """
    Run model inference and return generated motion.

    Replace the body with your HuMoGen-X sampling logic.

    Args:
        audio_path: Path to the conditioning music file.
        genre: Dance genre label chosen in the UI.
        cfg_genre: Classifier-free guidance scale for the genre condition.
        cfg_music: Classifier-free guidance scale for the music condition.
        seed: RNG seed; identical seeds must yield identical motion.
        num_frames: Number of motion frames (T) to generate.
        fps: Target frame rate (unused by the placeholder, passed for parity
             with the real sampler).

    Returns:
        Motion as a float32 numpy array of shape (num_frames, D).
    """
    # Load model (cached after the first call)
    model = load_model()

    # Prepare audio
    audio, sr = load_audio_mono_16k(audio_path, target_sr=16000)

    # Set seed for the torch sampler...
    g = torch.Generator(device=DEVICE)
    g.manual_seed(int(seed))
    # ...and for the numpy placeholder below. The original code drew from the
    # unseeded global np.random, so the UI's "Seed" input had no effect on
    # the output — that was a bug.
    rng = np.random.default_rng(int(seed))

    # -----------------------
    # TODO: your actual inference
    # Example pseudo:
    # cond = {
    #     "music": torch.tensor(audio)[None, ...].to(DEVICE),
    #     "genre": genre_to_id(genre),
    # }
    # motion = model.sample(
    #     cond=cond,
    #     guidance={"genre": cfg_genre, "music": cfg_music},
    #     num_frames=num_frames,
    #     generator=g,
    # )
    # motion_np = motion.detach().cpu().numpy()[0]
    # -----------------------

    # Placeholder motion (T, D)
    T = int(num_frames)
    D = 151  # adjust to your representation
    motion_np = rng.standard_normal((T, D)).astype(np.float32)
    return motion_np


def generate_demo(
    audio_file,
    genre: str,
    cfg_genre: float,
    cfg_music: float,
    seed: int,
    seconds: float,
    fps: int,
    resolution: int,
):
    """
    Gradio handler: takes UI inputs, runs inference, renders mp4, returns mp4 path.

    Raises:
        gr.Error: when no audio file was uploaded (surfaced in the UI).
    """
    if audio_file is None:
        raise gr.Error("음악 파일을 업로드해줘!")

    # audio_file can be a path string or an UploadedFile-like object
    audio_path = audio_file if isinstance(audio_file, str) else audio_file.name

    num_frames = int(max(1, round(seconds * fps)))

    motion = run_inference(
        audio_path=audio_path,
        genre=genre,
        cfg_genre=float(cfg_genre),
        cfg_music=float(cfg_music),
        seed=int(seed),
        num_frames=num_frames,
        fps=int(fps),
    )

    # Save output mp4 to a temp file. NOTE(review): the temp dir is never
    # cleaned up; Gradio needs the file to outlive this call, so we rely on
    # the Space's ephemeral filesystem to reclaim it.
    tmp_dir = Path(tempfile.mkdtemp())
    out_mp4 = str(tmp_dir / "humogenx_result.mp4")

    render_motion_to_mp4(
        motion=motion,
        out_mp4_path=out_mp4,
        fps=int(fps),
        resolution=int(resolution),
    )
    return out_mp4


# -----------------------------
# Gradio UI
# -----------------------------
def build_ui():
    """Build and return the Gradio Blocks demo (does not launch it)."""
    GENRES = [
        "HipHop", "Breaking", "Popping", "Locking", "House", "Waacking",
        "Shuffle", "Disco", "Jazz", "Kpop", "Ballet", "Contemporary"
    ]  # swap in your thesis genre set if needed

    with gr.Blocks(title="HuMoGen-X Demo", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
# HuMoGen-X Demo (Inference-only)
- **Upload music** → choose **dance genre** → adjust **CFG** → get **MP4**.
- Model weights are stored in a **private repo** and loaded at runtime.
            """.strip()
        )

        with gr.Row():
            with gr.Column(scale=1):
                audio = gr.Audio(label="Music Upload", type="filepath")
                genre = gr.Dropdown(choices=GENRES, value=GENRES[0], label="Dance Genre")

                gr.Markdown("### CFG (Classifier-Free Guidance)")
                cfg_genre = gr.Slider(0.0, 8.0, value=3.0, step=0.1, label="CFG: Genre")
                cfg_music = gr.Slider(0.0, 8.0, value=3.0, step=0.1, label="CFG: Music")

                with gr.Row():
                    seed = gr.Number(value=0, precision=0, label="Seed (int)")
                    seconds = gr.Slider(1.0, 12.0, value=6.0, step=0.5, label="Length (sec)")

                with gr.Row():
                    fps = gr.Dropdown(choices=[20, 24, 30, 60], value=30, label="FPS")
                    resolution = gr.Dropdown(choices=[256, 512, 720], value=512, label="Render Resolution")

                run_btn = gr.Button("Generate", variant="primary")

            with gr.Column(scale=1):
                out_video = gr.Video(label="Result (MP4)", autoplay=True)

        run_btn.click(
            fn=generate_demo,
            inputs=[audio, genre, cfg_genre, cfg_music, seed, seconds, fps, resolution],
            outputs=[out_video],
        )

        gr.Markdown(
            """
### Notes
- This Space is **inference-only**; weights are not downloadable here.
- If you want higher quality rendering, replace `render_motion_to_mp4()` with your renderer.
            """.strip()
        )

    return demo


if __name__ == "__main__":
    demo = build_ui()
    demo.queue()
    demo.launch()