# HuMoGen-X_Demo / app.py
# Author: Daye-Lee18
# Last commit: "lru cache" (efcaecd)
import os
import tempfile
from pathlib import Path
from typing import Tuple, Optional
from functools import lru_cache
import gradio as gr
import numpy as np
import torch
import soundfile as sf
import librosa
from huggingface_hub import hf_hub_download
# -----------------------------
# Config
# -----------------------------
# Private HF repo holding the model weights; both values overridable via env.
DEFAULT_WEIGHTS_REPO = os.environ.get("WEIGHTS_REPO", "isYes/HuMoGen-X-weights") # private model repo
WEIGHTS_FILENAME = os.environ.get("WEIGHTS_FILENAME", "train-0090.pt") # in the private repo
# The Space may run on either CPU or GPU; prefer CUDA when available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# -----------------------------
# Secure download + load
# -----------------------------
@lru_cache
def load_model():
    """
    Load model weights from a PRIVATE HF repo using HF_TOKEN (Space Secret).

    The ``lru_cache`` decorator ensures the checkpoint is downloaded and
    deserialized only once per Space runtime.

    Returns:
        The deserialized checkpoint object, moved to ``DEVICE`` and switched
        to eval mode when it exposes ``.to`` / ``.eval``.

    Raises:
        RuntimeError: if the HF_TOKEN secret is not configured.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        raise RuntimeError(
            "HF_TOKEN secret is missing. Set it in Space Settings -> Secrets."
        )
    ckpt_path = hf_hub_download(
        repo_id=DEFAULT_WEIGHTS_REPO,
        filename=WEIGHTS_FILENAME,
        token=token,
    )
    # TODO: replace this with your actual model class init + load_state_dict
    # Example patterns:
    #   model = HuMoGenX(...)
    #   state = torch.load(ckpt_path, map_location="cpu")
    #   model.load_state_dict(state["state_dict"] if "state_dict" in state else state)
    #   model.to(DEVICE).eval()
    #
    # NOTE(security): torch.load unpickles arbitrary objects — only load
    # checkpoints from a repo you control. weights_only=False is explicit
    # because torch >= 2.6 changed the default to True, which would break
    # loading a fully pickled model object like this placeholder expects.
    model = torch.load(ckpt_path, map_location="cpu", weights_only=False)
    if hasattr(model, "to"):
        model = model.to(DEVICE)
    if hasattr(model, "eval"):
        model.eval()
    return model
# -----------------------------
# Utilities
# -----------------------------
def load_audio_mono_16k(audio_path: str, target_sr: int = 16000) -> Tuple[np.ndarray, int]:
    """Read an audio file as a mono float32 waveform resampled to ``target_sr`` Hz.

    Args:
        audio_path: path to the audio file on disk.
        target_sr: sample rate to resample to (default 16 kHz).

    Returns:
        ``(waveform, sample_rate)`` where ``waveform`` is 1-D float32.
    """
    # librosa resamples to target_sr and downmixes to mono in one call,
    # so the returned rate is always target_sr.
    waveform, _ = librosa.load(audio_path, sr=target_sr, mono=True)
    return waveform.astype(np.float32), target_sr
def render_motion_to_mp4(
    motion: np.ndarray,
    out_mp4_path: str,
    fps: int = 30,
    resolution: int = 512,
):
    """
    Render generated motion to an mp4 file at ``out_mp4_path``.

    TODO: Replace this with your real renderer.
    - motion: (T, D) or (T, J, 3) etc.
    - out_mp4_path: path to save mp4
    Options:
      1) lightweight: matplotlib stick figure -> imageio mp4
      2) medium: pyrender / trimesh
      3) heavy: Blender (usually not recommended on Spaces)
    For now, we create a dummy black video so the UI pipeline is complete.

    Args:
        motion: generated motion array; only its first dimension (frame
            count) is used by this placeholder. ``None`` falls back to 60
            frames (2 s at the default 30 fps).
        out_mp4_path: destination mp4 path.
        fps: output frame rate.
        resolution: square frame size in pixels.
    """
    import imageio.v2 as imageio
    num_frames = int(motion.shape[0]) if motion is not None else 60
    # One reusable black frame — imageio copies the data on append, so we
    # avoid materializing a T-long list of frames (the original buffered
    # every frame in memory before writing a single byte).
    black = np.zeros((resolution, resolution, 3), dtype=np.uint8)
    writer = imageio.get_writer(out_mp4_path, fps=fps)
    try:
        for _ in range(num_frames):
            writer.append_data(black)
    finally:
        # Close even if appending fails so the file handle is not leaked.
        writer.close()
# -----------------------------
# Inference stub (connect your code here)
# -----------------------------
@torch.inference_mode()
def run_inference(
    audio_path: str,
    genre: str,
    cfg_genre: float,
    cfg_music: float,
    seed: int,
    num_frames: int,
    fps: int,
) -> np.ndarray:
    """
    Generate a motion sequence conditioned on music and genre.

    Replace the placeholder body with your HuMoGen-X sampling logic.

    Args:
        audio_path: path to the uploaded music file.
        genre: dance genre label selected in the UI.
        cfg_genre: classifier-free guidance weight for the genre condition.
        cfg_music: classifier-free guidance weight for the music condition.
        seed: RNG seed; the same seed must reproduce the same output.
        num_frames: number of motion frames to generate.
        fps: target frame rate (unused by the placeholder).

    Returns:
        Generated motion as a float32 numpy array of shape (num_frames, D).
    """
    # Load (cached) model and the conditioning audio.
    model = load_model()
    audio, sr = load_audio_mono_16k(audio_path, target_sr=16000)
    # Seed the torch generator the real sampler should consume.
    g = torch.Generator(device=DEVICE)
    g.manual_seed(int(seed))
    # -----------------------
    # TODO: your actual inference
    # Example pseudo:
    #   cond = {
    #       "music": torch.tensor(audio)[None, ...].to(DEVICE),
    #       "genre": genre_to_id(genre),
    #   }
    #   motion = model.sample(
    #       cond=cond,
    #       guidance={"genre": cfg_genre, "music": cfg_music},
    #       num_frames=num_frames,
    #       generator=g,
    #   )
    #   motion_np = motion.detach().cpu().numpy()[0]
    # -----------------------
    # Placeholder motion (T, D).
    # BUGFIX: the original drew from the global np.random state, so `seed`
    # had no effect on the output. Use a locally seeded Generator instead.
    T = int(num_frames)
    D = 151  # adjust to your motion representation
    rng = np.random.default_rng(int(seed))
    motion_np = rng.standard_normal((T, D)).astype(np.float32)
    return motion_np
def generate_demo(
    audio_file,
    genre: str,
    cfg_genre: float,
    cfg_music: float,
    seed: int,
    seconds: float,
    fps: int,
    resolution: int,
):
    """
    Gradio handler: validate inputs, run inference on the uploaded music,
    render the generated motion to an mp4, and return the mp4 path.
    """
    if audio_file is None:
        raise gr.Error("์Œ์•… ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด์ค˜!")
    # type="filepath" yields a plain path string, but fall back to the
    # tempfile-style object's .name for other component configurations.
    if isinstance(audio_file, str):
        audio_path = audio_file
    else:
        audio_path = audio_file.name
    total_frames = int(max(1, round(seconds * fps)))
    motion = run_inference(
        audio_path=audio_path,
        genre=genre,
        cfg_genre=float(cfg_genre),
        cfg_music=float(cfg_music),
        seed=int(seed),
        num_frames=total_frames,
        fps=int(fps),
    )
    # Render into a fresh temp directory so concurrent requests never collide.
    output_path = str(Path(tempfile.mkdtemp()) / "humogenx_result.mp4")
    render_motion_to_mp4(
        motion=motion,
        out_mp4_path=output_path,
        fps=int(fps),
        resolution=int(resolution),
    )
    return output_path
# -----------------------------
# Gradio UI
# -----------------------------
def build_ui():
    """Construct and return the Gradio Blocks UI (layout only; no model work)."""
    # Genre labels offered in the dropdown; the first entry is the default.
    GENRES = [
        "HipHop", "Breaking", "Popping", "Locking",
        "House", "Waacking", "Shuffle", "Disco",
        "Jazz", "Kpop", "Ballet", "Contemporary"
    ]  # feel free to swap in your thesis genre set
    with gr.Blocks(title="HuMoGen-X Demo", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
# HuMoGen-X Demo (Inference-only)
- **Upload music** โ†’ choose **dance genre** โ†’ adjust **CFG** โ†’ get **MP4**.
- Model weights are stored in a **private repo** and loaded at runtime.
""".strip()
        )
        with gr.Row():
            # Left column: all conditioning inputs and the run button.
            with gr.Column(scale=1):
                # type="filepath" makes the component hand a path string
                # to the click handler.
                audio = gr.Audio(label="Music Upload", type="filepath")
                genre = gr.Dropdown(choices=GENRES, value=GENRES[0], label="Dance Genre")
                gr.Markdown("### CFG (Classifier-Free Guidance)")
                cfg_genre = gr.Slider(0.0, 8.0, value=3.0, step=0.1, label="CFG: Genre")
                cfg_music = gr.Slider(0.0, 8.0, value=3.0, step=0.1, label="CFG: Music")
                with gr.Row():
                    seed = gr.Number(value=0, precision=0, label="Seed (int)")
                    seconds = gr.Slider(1.0, 12.0, value=6.0, step=0.5, label="Length (sec)")
                with gr.Row():
                    fps = gr.Dropdown(choices=[20, 24, 30, 60], value=30, label="FPS")
                    resolution = gr.Dropdown(choices=[256, 512, 720], value=512, label="Render Resolution")
                run_btn = gr.Button("Generate", variant="primary")
            # Right column: rendered result.
            with gr.Column(scale=1):
                out_video = gr.Video(label="Result (MP4)", autoplay=True)
        # Wire the button to the handler; input order must match the
        # generate_demo signature.
        run_btn.click(
            fn=generate_demo,
            inputs=[audio, genre, cfg_genre, cfg_music, seed, seconds, fps, resolution],
            outputs=[out_video],
        )
        gr.Markdown(
            """
### Notes
- This Space is **inference-only**; weights are not downloadable here.
- If you want higher quality rendering, replace `render_motion_to_mp4()` with your renderer.
""".strip()
        )
    return demo
if __name__ == "__main__":
    demo = build_ui()
    # Enable request queuing so long-running generations are serialized
    # instead of timing out concurrent users.
    demo.queue()
    demo.launch()