File size: 7,686 Bytes
a30a67a
 
 
 
 
efcaecd
769235d
 
 
a30a67a
 
 
 
769235d
a30a67a
 
 
 
 
769235d
a30a67a
 
769235d
 
a30a67a
 
 
efcaecd
a30a67a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
769235d
a30a67a
 
 
 
 
 
 
 
 
 
 
 
 
 
769235d
a30a67a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
769235d
a30a67a
 
 
 
 
769235d
a30a67a
 
 
 
769235d
a30a67a
 
 
769235d
a30a67a
 
 
 
 
769235d
a30a67a
 
 
 
769235d
 
a30a67a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
769235d
a30a67a
 
769235d
a30a67a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
769235d
 
a30a67a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
769235d
a30a67a
 
769235d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import os
import tempfile
from pathlib import Path
from typing import Tuple, Optional

from functools import lru_cache
import gradio as gr
import numpy as np
import torch
import soundfile as sf
import librosa

from huggingface_hub import hf_hub_download

# -----------------------------
# Config
# -----------------------------
# Hugging Face repo that hosts the (private) model weights.
# Override with the WEIGHTS_REPO environment variable.
DEFAULT_WEIGHTS_REPO = os.getenv("WEIGHTS_REPO", "isYes/HuMoGen-X-weights")
# Checkpoint file name inside that repo.
# Override with the WEIGHTS_FILENAME environment variable.
WEIGHTS_FILENAME = os.getenv("WEIGHTS_FILENAME", "train-0090.pt")

# The Space may be running on CPU or on GPU, so the device is picked at import time.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# -----------------------------
# Secure download + load
# -----------------------------
@lru_cache
def load_model():
    """
    Download the checkpoint from the private HF repo and return the loaded object.

    The HF_TOKEN environment variable (a Space Secret) authenticates the
    download. Because the function is wrapped in ``lru_cache`` and takes no
    arguments, a successful result is reused by later calls in the same
    process instead of being downloaded and loaded again.

    Returns:
        Whatever ``torch.load`` yields for the checkpoint, moved to DEVICE
        and switched to eval mode when the object supports ``.to`` / ``.eval``.

    Raises:
        RuntimeError: if HF_TOKEN is unset or empty.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise RuntimeError(
            "HF_TOKEN secret is missing. Set it in Space Settings -> Secrets."
        )

    checkpoint_file = hf_hub_download(
        repo_id=DEFAULT_WEIGHTS_REPO,
        filename=WEIGHTS_FILENAME,
        token=hf_token,
    )

    # TODO: replace this with your actual model class init + load_state_dict
    # Example patterns:
    #   model = HuMoGenX(...)
    #   state = torch.load(ckpt_path, map_location="cpu")
    #   model.load_state_dict(state["state_dict"] if "state_dict" in state else state)
    #   model.to(DEVICE).eval()
    #
    # Until then the raw checkpoint object stands in as the "model".
    # NOTE(review): torch.load may unpickle arbitrary objects — only point
    # WEIGHTS_REPO at a repository you trust.
    loaded = torch.load(checkpoint_file, map_location="cpu")
    # A plain state dict has neither method, so both steps are conditional.
    if hasattr(loaded, "to"):
        loaded = loaded.to(DEVICE)
    if hasattr(loaded, "eval"):
        loaded.eval()
    return loaded


# -----------------------------
# Utilities
# -----------------------------
def load_audio_mono_16k(audio_path: str, target_sr: int = 16000) -> Tuple[np.ndarray, int]:
    """
    Load an audio file as a mono float32 waveform resampled to ``target_sr``.

    Args:
        audio_path: Path of the audio file to decode.
        target_sr: Sample rate requested from librosa (16 kHz by default).

    Returns:
        ``(waveform, sample_rate)``. The sample rate is the one reported by
        ``librosa.load`` rather than an echo of the argument, so the pair
        stays consistent even if ``target_sr`` is passed as ``None``.
    """
    waveform, effective_sr = librosa.load(audio_path, sr=target_sr, mono=True)
    # copy=False skips a needless copy when the data is already float32.
    return waveform.astype(np.float32, copy=False), effective_sr


def render_motion_to_mp4(
    motion: np.ndarray,
    out_mp4_path: str,
    fps: int = 30,
    resolution: int = 512,
):
    """
    Write a placeholder MP4 with one black frame per motion frame.

    TODO: Replace this with your real renderer.
    This function should create an mp4 from the generated motion.

    Args:
        motion: Generated motion, e.g. (T, D) or (T, J, 3). Only the length
            of the first axis is used here; ``None`` falls back to 60 frames.
        out_mp4_path: Path to save the mp4.
        fps: Frame rate passed to the video writer.
        resolution: Width and height, in pixels, of the square frames.

    Renderer options:
    1) lightweight: matplotlib stick figure -> imageio mp4
    2) medium: pyrender / trimesh
    3) heavy: Blender (usually not recommended on a Space)

    For now, a dummy black video is produced so the UI pipeline is complete.
    """
    import imageio.v2 as imageio

    num_frames = int(motion.shape[0]) if motion is not None else 60

    # Every frame is the same black image, so allocate it once and stream it
    # to the writer instead of buffering num_frames copies in memory.
    blank_frame = np.zeros((resolution, resolution, 3), dtype=np.uint8)

    # The context manager closes the writer even if encoding fails midway.
    with imageio.get_writer(out_mp4_path, fps=fps) as writer:
        for _ in range(num_frames):
            writer.append_data(blank_frame)


# -----------------------------
# Inference stub (connect your code here)
# -----------------------------
@torch.inference_mode()
def run_inference(
    audio_path: str,
    genre: str,
    cfg_genre: float,
    cfg_music: float,
    seed: int,
    num_frames: int,
    fps: int,
) -> np.ndarray:
    """
    Return generated motion as a numpy array (placeholder implementation).

    The model and the audio are loaded so the whole pipeline is exercised,
    but the sampling step is still a TODO: the returned motion is Gaussian
    noise drawn from the seeded generator, so the same seed reproduces the
    same output. Replace the body with your HuMoGen-X sampling logic.

    Args:
        audio_path: Path of the music file to condition on.
        genre: Selected dance genre (unused by the placeholder).
        cfg_genre: Guidance scale for the genre condition (unused by the placeholder).
        cfg_music: Guidance scale for the music condition (unused by the placeholder).
        seed: Seed for the random generator.
        num_frames: Number of motion frames to produce.
        fps: Frame rate of the motion (unused by the placeholder).

    Returns:
        float32 array of shape (num_frames, 151).
    """
    # Load model. Not consumed by the placeholder yet, but loading here makes
    # a missing token or checkpoint fail before any other work is done.
    model = load_model()

    # Prepare audio (likewise only consumed once real sampling is wired in).
    audio, sr = load_audio_mono_16k(audio_path, target_sr=16000)

    # Seeded generator on the active device; it drives all randomness below.
    g = torch.Generator(device=DEVICE)
    g.manual_seed(int(seed))

    # -----------------------
    # TODO: your actual inference
    # Example pseudo:
    #   cond = {
    #       "music": torch.tensor(audio)[None, ...].to(DEVICE),
    #       "genre": genre_to_id(genre),
    #   }
    #   motion = model.sample(
    #       cond=cond,
    #       guidance={"genre": cfg_genre, "music": cfg_music},
    #       num_frames=num_frames,
    #       generator=g,
    #   )
    #   motion_np = motion.detach().cpu().numpy()[0]
    # -----------------------

    # Placeholder motion (T, D), drawn from `g` so the seed actually takes effect.
    T = int(num_frames)
    D = 151  # adjust to your representation
    motion = torch.randn(T, D, generator=g, device=DEVICE, dtype=torch.float32)
    motion_np = motion.cpu().numpy()
    return motion_np


def generate_demo(
    audio_file,
    genre: str,
    cfg_genre: float,
    cfg_music: float,
    seed: int,
    seconds: float,
    fps: int,
    resolution: int,
):
    """
    Gradio handler: takes UI inputs, runs inference, renders mp4, returns mp4 path.

    Args:
        audio_file: Uploaded audio, either a path string or an object with a
            ``.name`` attribute holding the path.
        genre: Selected dance genre.
        cfg_genre: Guidance scale for the genre condition.
        cfg_music: Guidance scale for the music condition.
        seed: Random seed; ``None`` is treated as 0.
        seconds: Requested clip length in seconds.
        fps: Frames per second for both the motion and the video.
        resolution: Side length, in pixels, of the rendered video.

    Returns:
        Path of the rendered MP4 inside a fresh temporary directory.

    Raises:
        gr.Error: if no audio file was provided.
    """
    if audio_file is None:
        # User-facing message (Korean): "Please upload a music file!"
        raise gr.Error("음악 파일을 업로드해줘!")

    # audio_file can be a path string
    audio_path = audio_file if isinstance(audio_file, str) else audio_file.name

    # Normalize the UI values once, before any arithmetic uses them.
    fps = int(fps)
    resolution = int(resolution)
    # The seed field may come back as None when it is cleared; fall back to
    # the UI default of 0 instead of failing in int(None).
    seed = 0 if seed is None else int(seed)

    # Always render at least one frame.
    num_frames = int(max(1, round(float(seconds) * fps)))

    motion = run_inference(
        audio_path=audio_path,
        genre=genre,
        cfg_genre=float(cfg_genre),
        cfg_music=float(cfg_music),
        seed=seed,
        num_frames=num_frames,
        fps=fps,
    )

    # Save output mp4 to a temp file
    tmp_dir = Path(tempfile.mkdtemp())
    out_mp4 = str(tmp_dir / "humogenx_result.mp4")

    render_motion_to_mp4(
        motion=motion,
        out_mp4_path=out_mp4,
        fps=fps,
        resolution=resolution,
    )

    return out_mp4


# -----------------------------
# Gradio UI
# -----------------------------
def build_ui():
    """
    Build the Gradio Blocks app for the demo and return it (not yet launched).

    The layout has an input column (audio upload, genre, CFG sliders, seed,
    length, FPS, resolution, Generate button) and an output column with the
    resulting video. The button is wired to ``generate_demo``.
    """
    # Feel free to replace this with the genre set used in your thesis.
    genres = [
        "HipHop", "Breaking", "Popping", "Locking",
        "House", "Waacking", "Shuffle", "Disco",
        "Jazz", "Kpop", "Ballet", "Contemporary"
    ]

    with gr.Blocks(title="HuMoGen-X Demo", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
# HuMoGen-X Demo (Inference-only)
- **Upload music** → choose **dance genre** → adjust **CFG** → get **MP4**.
- Model weights are stored in a **private repo** and loaded at runtime.
            """.strip()
        )

        with gr.Row():
            with gr.Column(scale=1):
                audio = gr.Audio(label="Music Upload", type="filepath")
                genre = gr.Dropdown(choices=genres, value=genres[0], label="Dance Genre")

                gr.Markdown("### CFG (Classifier-Free Guidance)")
                cfg_genre = gr.Slider(0.0, 8.0, value=3.0, step=0.1, label="CFG: Genre")
                cfg_music = gr.Slider(0.0, 8.0, value=3.0, step=0.1, label="CFG: Music")

                with gr.Row():
                    seed = gr.Number(value=0, precision=0, label="Seed (int)")
                    seconds = gr.Slider(1.0, 12.0, value=6.0, step=0.5, label="Length (sec)")

                with gr.Row():
                    fps = gr.Dropdown(choices=[20, 24, 30, 60], value=30, label="FPS")
                    resolution = gr.Dropdown(choices=[256, 512, 720], value=512, label="Render Resolution")

                run_btn = gr.Button("Generate", variant="primary")

            with gr.Column(scale=1):
                out_video = gr.Video(label="Result (MP4)", autoplay=True)

        # The input order must match the parameter order of generate_demo.
        run_btn.click(
            fn=generate_demo,
            inputs=[audio, genre, cfg_genre, cfg_music, seed, seconds, fps, resolution],
            outputs=[out_video],
        )

        gr.Markdown(
            """
### Notes
- This Space is **inference-only**; weights are not downloadable here.
- If you want higher quality rendering, replace `render_motion_to_mp4()` with your renderer.
            """.strip()
        )

    return demo


# Script entry point: build the UI, enable Gradio's request queue, then start the server.
if __name__ == "__main__":
    demo = build_ui()
    # queue() is called before launch() so the queue is configured when the app starts.
    demo.queue()
    demo.launch()