# Hugging Face Space: isYes/HuMoGen-X_Demo (status: Sleeping)
# File size: 7,686 Bytes

import os
import tempfile
from pathlib import Path
from typing import Tuple, Optional

from functools import lru_cache
import gradio as gr
import numpy as np
import torch
import soundfile as sf
import librosa

from huggingface_hub import hf_hub_download

# -----------------------------
# Config
# -----------------------------
# Private model repo that holds the checkpoint; override via the WEIGHTS_REPO env var.
DEFAULT_WEIGHTS_REPO = os.getenv("WEIGHTS_REPO", "isYes/HuMoGen-X-weights")
# Checkpoint file name inside that repo; override via the WEIGHTS_FILENAME env var.
WEIGHTS_FILENAME = os.getenv("WEIGHTS_FILENAME", "train-0090.pt")

# The Space may be running on either CPU or GPU hardware.
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"


# -----------------------------
# Secure download + load
# -----------------------------
@lru_cache
def load_model():
    """
    Download the checkpoint from the private HF repo and load it with torch.

    The Hub token is read from the HF_TOKEN environment variable (a Space
    Secret). `functools.lru_cache` memoizes the return value, so once a call
    has succeeded the same object is handed back without downloading or
    loading again.

    Returns:
        Whatever `torch.load` yields for the checkpoint, moved to DEVICE and
        switched to eval mode when the object exposes `.to` / `.eval`.

    Raises:
        RuntimeError: if HF_TOKEN is unset or empty.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise RuntimeError(
            "HF_TOKEN secret is missing. Set it in Space Settings -> Secrets."
        )

    checkpoint_file = hf_hub_download(
        repo_id=DEFAULT_WEIGHTS_REPO,
        filename=WEIGHTS_FILENAME,
        token=hf_token,
    )

    # TODO: replace this with your actual model class init + load_state_dict
    # Example patterns:
    #   model = HuMoGenX(...)
    #   state = torch.load(ckpt_path, map_location="cpu")
    #   model.load_state_dict(state["state_dict"] if "state_dict" in state else state)
    #   model.to(DEVICE).eval()
    #
    # Until then the raw loaded object stands in for the model.
    # NOTE(review): torch.load unpickles the file, so WEIGHTS_REPO must only
    # ever point at a repository you trust.
    loaded = torch.load(checkpoint_file, map_location="cpu")

    # A plain state dict has neither `.to` nor `.eval`; only real modules do.
    if hasattr(loaded, "to"):
        loaded = loaded.to(DEVICE)
    if hasattr(loaded, "eval"):
        loaded.eval()
    return loaded


# -----------------------------
# Utilities
# -----------------------------
def load_audio_mono_16k(audio_path: str, target_sr: int = 16000) -> Tuple[np.ndarray, int]:
    """
    Decode an audio file as a mono float32 waveform resampled to `target_sr`.

    Args:
        audio_path: path of the audio file handed to `librosa.load`.
        target_sr: sample rate requested from librosa (default 16 kHz).

    Returns:
        A `(waveform, target_sr)` tuple; the waveform is cast to float32.
    """
    # librosa also returns the sample rate, but it equals the one we asked for.
    waveform, _ = librosa.load(audio_path, sr=target_sr, mono=True)
    return waveform.astype(np.float32), target_sr


def render_motion_to_mp4(
    motion: np.ndarray,
    out_mp4_path: str,
    fps: int = 30,
    resolution: int = 512,
):
    """
    Write a placeholder (all-black) MP4 with one frame per motion step.

    TODO: Replace this with your real renderer.
    This function should create an mp4 from the generated motion.

    Args:
        motion: generated motion, (T, D) or (T, J, 3) etc. Only its first
            dimension (the frame count) is used here; when `motion` is None,
            60 frames are written.
        out_mp4_path: path to save the mp4.
        fps: frame rate passed to the video writer.
        resolution: width and height, in pixels, of the square frames.

    Renderer options:
    1) lightweight: matplotlib stick figure -> imageio mp4
    2) medium: pyrender / trimesh
    3) heavy: Blender (usually not recommended on a Space)

    For now, a dummy black video is produced so the UI pipeline is complete.
    """
    import imageio.v2 as imageio

    num_frames = int(motion.shape[0]) if motion is not None else 60

    # Every placeholder frame is identical, so allocate it once instead of
    # holding num_frames full-resolution copies in memory.
    black_frame = np.zeros((resolution, resolution, 3), dtype=np.uint8)

    # The context manager closes the writer even if appending a frame fails.
    with imageio.get_writer(out_mp4_path, fps=fps) as writer:
        for _ in range(num_frames):
            writer.append_data(black_frame)


# -----------------------------
# Inference stub (connect your code here)
# -----------------------------
@torch.inference_mode()
def run_inference(
    audio_path: str,
    genre: str,
    cfg_genre: float,
    cfg_music: float,
    seed: int,
    num_frames: int,
    fps: int,
) -> np.ndarray:
    """
    Return generated motion as a float32 numpy array of shape (num_frames, D).

    Replace the placeholder sampling below with your HuMoGen-X sampling logic.
    Until then the output is Gaussian noise drawn from a generator seeded with
    `seed`, so identical seeds give identical placeholder motion.

    Args:
        audio_path: path of the music file to condition on.
        genre: selected dance genre (unused by the placeholder).
        cfg_genre: guidance scale for genre (unused by the placeholder).
        cfg_music: guidance scale for music (unused by the placeholder).
        seed: seed for the torch random generator.
        num_frames: number of motion frames to produce.
        fps: target frame rate (unused by the placeholder).

    Raises:
        RuntimeError: propagated from `load_model` when HF_TOKEN is missing.
    """
    # Load model (kept even for the placeholder so that weight download and
    # loading problems surface here).
    model = load_model()

    # Prepare audio
    audio, sr = load_audio_mono_16k(audio_path, target_sr=16000)

    # Set seed
    g = torch.Generator(device=DEVICE)
    g.manual_seed(int(seed))

    # -----------------------
    # TODO: your actual inference
    # Example pseudo:
    #   cond = {
    #       "music": torch.tensor(audio)[None, ...].to(DEVICE),
    #       "genre": genre_to_id(genre),
    #   }
    #   motion = model.sample(
    #       cond=cond,
    #       guidance={"genre": cfg_genre, "music": cfg_music},
    #       num_frames=num_frames,
    #       generator=g,
    #   )
    #   motion_np = motion.detach().cpu().numpy()[0]
    # -----------------------

    # Placeholder motion (T, D), sampled through the seeded generator so the
    # UI seed actually controls the result.
    T = int(num_frames)
    D = 151  # adjust to your representation
    motion = torch.randn(T, D, generator=g, device=DEVICE, dtype=torch.float32)
    motion_np = motion.cpu().numpy()
    return motion_np


def generate_demo(
    audio_file,
    genre: str,
    cfg_genre: float,
    cfg_music: float,
    seed: int,
    seconds: float,
    fps: int,
    resolution: int,
):
    """
    Gradio click handler: run inference on the uploaded music and render it.

    Args:
        audio_file: uploaded audio, either a path string or an object whose
            `.name` attribute holds the path.
        genre: selected dance genre.
        cfg_genre: guidance scale for genre.
        cfg_music: guidance scale for music.
        seed: random seed forwarded to inference.
        seconds: requested clip length; multiplied by `fps` to get the frame
            count (at least one frame).
        fps: frame rate used for both inference and rendering.
        resolution: render resolution forwarded to the renderer.

    Returns:
        Path of the rendered MP4 inside a freshly created temp directory.

    Raises:
        gr.Error: when no audio file was provided.
    """
    if audio_file is None:
        raise gr.Error("음악 파일을 업로드해줘!")

    # Accept either a plain path or a file-like wrapper exposing `.name`.
    if isinstance(audio_file, str):
        source_path = audio_file
    else:
        source_path = audio_file.name

    frame_count = int(max(1, round(seconds * fps)))

    generated = run_inference(
        audio_path=source_path,
        genre=genre,
        cfg_genre=float(cfg_genre),
        cfg_music=float(cfg_music),
        seed=int(seed),
        num_frames=frame_count,
        fps=int(fps),
    )

    # Each request writes into its own temp directory.
    video_path = str(Path(tempfile.mkdtemp()) / "humogenx_result.mp4")

    render_motion_to_mp4(
        motion=generated,
        out_mp4_path=video_path,
        fps=int(fps),
        resolution=int(resolution),
    )

    return video_path


# -----------------------------
# Gradio UI
# -----------------------------
def build_ui():
    """
    Build the Gradio Blocks app for the demo and return it (not launched).

    The left column holds the inputs (audio upload, genre, the two CFG
    sliders, seed, length, FPS, render resolution) and the Generate button;
    the right column holds the result video. Clicking the button calls
    `generate_demo` with the inputs in the same order as its parameters.
    """
    GENRES = [
        "HipHop", "Breaking", "Popping", "Locking",
        "House", "Waacking", "Shuffle", "Disco",
        "Jazz", "Kpop", "Ballet", "Contemporary"
    ]  # feel free to swap in your thesis genre set

    with gr.Blocks(title="HuMoGen-X Demo", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
# HuMoGen-X Demo (Inference-only)
- **Upload music** → choose **dance genre** → adjust **CFG** → get **MP4**.
- Model weights are stored in a **private repo** and loaded at runtime.
            """.strip()
        )

        with gr.Row():
            # Left column: all user inputs plus the trigger button.
            with gr.Column(scale=1):
                # type="filepath" makes the handler receive a path string.
                audio = gr.Audio(label="Music Upload", type="filepath")
                genre = gr.Dropdown(choices=GENRES, value=GENRES[0], label="Dance Genre")

                gr.Markdown("### CFG (Classifier-Free Guidance)")
                cfg_genre = gr.Slider(0.0, 8.0, value=3.0, step=0.1, label="CFG: Genre")
                cfg_music = gr.Slider(0.0, 8.0, value=3.0, step=0.1, label="CFG: Music")

                with gr.Row():
                    seed = gr.Number(value=0, precision=0, label="Seed (int)")
                    seconds = gr.Slider(1.0, 12.0, value=6.0, step=0.5, label="Length (sec)")

                with gr.Row():
                    fps = gr.Dropdown(choices=[20, 24, 30, 60], value=30, label="FPS")
                    resolution = gr.Dropdown(choices=[256, 512, 720], value=512, label="Render Resolution")

                run_btn = gr.Button("Generate", variant="primary")

            # Right column: the rendered result.
            with gr.Column(scale=1):
                out_video = gr.Video(label="Result (MP4)", autoplay=True)

        # The input order must match generate_demo's positional parameters.
        run_btn.click(
            fn=generate_demo,
            inputs=[audio, genre, cfg_genre, cfg_music, seed, seconds, fps, resolution],
            outputs=[out_video],
        )

        gr.Markdown(
            """
### Notes
- This Space is **inference-only**; weights are not downloadable here.
- If you want higher quality rendering, replace `render_motion_to_mp4()` with your renderer.
            """.strip()
        )

    return demo


if __name__ == "__main__":
    # Script entry point: build the Blocks app, turn on Gradio's request
    # queue, then start the server.
    demo = build_ui()
    demo.queue()
    demo.launch()