# HuMoGen-X_Demo / app.py
# Author: Daye-Lee18
# Last commit: "lru cache" (efcaecd)
import os
import tempfile
from pathlib import Path
from typing import Tuple, Optional
from functools import lru_cache
import gradio as gr
import numpy as np
import torch
import soundfile as sf
import librosa
from huggingface_hub import hf_hub_download
# -----------------------------
# Config
# -----------------------------
# Private HF repo holding the model weights; both values overridable via env.
DEFAULT_WEIGHTS_REPO = os.environ.get("WEIGHTS_REPO", "isYes/HuMoGen-X-weights") # private model repo
WEIGHTS_FILENAME = os.environ.get("WEIGHTS_FILENAME", "train-0090.pt") # in the private repo
# The Space may run on either CPU or GPU; prefer CUDA when available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# -----------------------------
# Secure download + load
# -----------------------------
@lru_cache
def load_model():
    """
    Load model weights from a PRIVATE HF repo using HF_TOKEN (Space Secret).

    The ``lru_cache`` decorator ensures the checkpoint is downloaded and
    deserialized only once per Space runtime.

    Returns:
        The deserialized checkpoint object, moved to ``DEVICE`` and switched
        to eval mode when it exposes ``.to`` / ``.eval``.

    Raises:
        RuntimeError: if the HF_TOKEN secret is not configured.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        raise RuntimeError(
            "HF_TOKEN secret is missing. Set it in Space Settings -> Secrets."
        )
    ckpt_path = hf_hub_download(
        repo_id=DEFAULT_WEIGHTS_REPO,
        filename=WEIGHTS_FILENAME,
        token=token,
    )
    # TODO: replace this with your actual model class init + load_state_dict
    # Example patterns:
    #   model = HuMoGenX(...)
    #   state = torch.load(ckpt_path, map_location="cpu")
    #   model.load_state_dict(state["state_dict"] if "state_dict" in state else state)
    #   model.to(DEVICE).eval()
    #
    # NOTE(security): torch.load unpickles arbitrary objects — only load
    # checkpoints from a repo you control. weights_only=False is explicit
    # because torch >= 2.6 changed the default to True, which would break
    # loading a fully pickled model object like this placeholder expects.
    model = torch.load(ckpt_path, map_location="cpu", weights_only=False)
    if hasattr(model, "to"):
        model = model.to(DEVICE)
    if hasattr(model, "eval"):
        model.eval()
    return model
# -----------------------------
# Utilities
# -----------------------------
def load_audio_mono_16k(audio_path: str, target_sr: int = 16000) -> Tuple[np.ndarray, int]:
    """Read an audio file as a mono float32 waveform resampled to ``target_sr`` Hz.

    Args:
        audio_path: path to the audio file on disk.
        target_sr: sample rate to resample to (default 16 kHz).

    Returns:
        ``(waveform, sample_rate)`` where ``waveform`` is 1-D float32.
    """
    # librosa resamples to target_sr and downmixes to mono in one call,
    # so the returned rate is always target_sr.
    waveform, _ = librosa.load(audio_path, sr=target_sr, mono=True)
    return waveform.astype(np.float32), target_sr
def render_motion_to_mp4(
    motion: np.ndarray,
    out_mp4_path: str,
    fps: int = 30,
    resolution: int = 512,
):
    """
    Render generated motion to an mp4 file at ``out_mp4_path``.

    TODO: Replace this with your real renderer.
    - motion: (T, D) or (T, J, 3) etc.
    - out_mp4_path: path to save mp4
    Options:
      1) lightweight: matplotlib stick figure -> imageio mp4
      2) medium: pyrender / trimesh
      3) heavy: Blender (usually not recommended on Spaces)
    For now, we create a dummy black video so the UI pipeline is complete.

    Args:
        motion: generated motion array; only its first dimension (frame
            count) is used by this placeholder. ``None`` falls back to 60
            frames (2 s at the default 30 fps).
        out_mp4_path: destination mp4 path.
        fps: output frame rate.
        resolution: square frame size in pixels.
    """
    import imageio.v2 as imageio
    num_frames = int(motion.shape[0]) if motion is not None else 60
    # One reusable black frame — imageio copies the data on append, so we
    # avoid materializing a T-long list of frames (the original buffered
    # every frame in memory before writing a single byte).
    black = np.zeros((resolution, resolution, 3), dtype=np.uint8)
    writer = imageio.get_writer(out_mp4_path, fps=fps)
    try:
        for _ in range(num_frames):
            writer.append_data(black)
    finally:
        # Close even if appending fails so the file handle is not leaked.
        writer.close()
# -----------------------------
# Inference stub (connect your code here)
# -----------------------------
@torch.inference_mode()
def run_inference(
    audio_path: str,
    genre: str,
    cfg_genre: float,
    cfg_music: float,
    seed: int,
    num_frames: int,
    fps: int,
) -> np.ndarray:
    """
    Generate a motion sequence conditioned on music and genre.

    Replace the placeholder body with your HuMoGen-X sampling logic.

    Args:
        audio_path: path to the uploaded music file.
        genre: dance genre label selected in the UI.
        cfg_genre: classifier-free guidance weight for the genre condition.
        cfg_music: classifier-free guidance weight for the music condition.
        seed: RNG seed; the same seed must reproduce the same output.
        num_frames: number of motion frames to generate.
        fps: target frame rate (unused by the placeholder).

    Returns:
        Generated motion as a float32 numpy array of shape (num_frames, D).
    """
    # Load (cached) model and the conditioning audio.
    model = load_model()
    audio, sr = load_audio_mono_16k(audio_path, target_sr=16000)
    # Seed the torch generator the real sampler should consume.
    g = torch.Generator(device=DEVICE)
    g.manual_seed(int(seed))
    # -----------------------
    # TODO: your actual inference
    # Example pseudo:
    #   cond = {
    #       "music": torch.tensor(audio)[None, ...].to(DEVICE),
    #       "genre": genre_to_id(genre),
    #   }
    #   motion = model.sample(
    #       cond=cond,
    #       guidance={"genre": cfg_genre, "music": cfg_music},
    #       num_frames=num_frames,
    #       generator=g,
    #   )
    #   motion_np = motion.detach().cpu().numpy()[0]
    # -----------------------
    # Placeholder motion (T, D).
    # BUGFIX: the original drew from the global np.random state, so `seed`
    # had no effect on the output. Use a locally seeded Generator instead.
    T = int(num_frames)
    D = 151  # adjust to your motion representation
    rng = np.random.default_rng(int(seed))
    motion_np = rng.standard_normal((T, D)).astype(np.float32)
    return motion_np
def generate_demo(
    audio_file,
    genre: str,
    cfg_genre: float,
    cfg_music: float,
    seed: int,
    seconds: float,
    fps: int,
    resolution: int,
):
    """
    Gradio handler: validate inputs, run inference on the uploaded music,
    render the generated motion to an mp4, and return the mp4 path.
    """
    if audio_file is None:
        raise gr.Error("์Œ์•… ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด์ค˜!")
    # type="filepath" yields a plain path string, but fall back to the
    # tempfile-style object's .name for other component configurations.
    if isinstance(audio_file, str):
        audio_path = audio_file
    else:
        audio_path = audio_file.name
    total_frames = int(max(1, round(seconds * fps)))
    motion = run_inference(
        audio_path=audio_path,
        genre=genre,
        cfg_genre=float(cfg_genre),
        cfg_music=float(cfg_music),
        seed=int(seed),
        num_frames=total_frames,
        fps=int(fps),
    )
    # Render into a fresh temp directory so concurrent requests never collide.
    output_path = str(Path(tempfile.mkdtemp()) / "humogenx_result.mp4")
    render_motion_to_mp4(
        motion=motion,
        out_mp4_path=output_path,
        fps=int(fps),
        resolution=int(resolution),
    )
    return output_path
# -----------------------------
# Gradio UI
# -----------------------------
def build_ui():
    """Construct and return the Gradio Blocks UI (layout only; no model work)."""
    # Genre labels offered in the dropdown; the first entry is the default.
    GENRES = [
        "HipHop", "Breaking", "Popping", "Locking",
        "House", "Waacking", "Shuffle", "Disco",
        "Jazz", "Kpop", "Ballet", "Contemporary"
    ]  # feel free to swap in your thesis genre set
    with gr.Blocks(title="HuMoGen-X Demo", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
# HuMoGen-X Demo (Inference-only)
- **Upload music** โ†’ choose **dance genre** โ†’ adjust **CFG** โ†’ get **MP4**.
- Model weights are stored in a **private repo** and loaded at runtime.
""".strip()
        )
        with gr.Row():
            # Left column: all conditioning inputs and the run button.
            with gr.Column(scale=1):
                # type="filepath" makes the component hand a path string
                # to the click handler.
                audio = gr.Audio(label="Music Upload", type="filepath")
                genre = gr.Dropdown(choices=GENRES, value=GENRES[0], label="Dance Genre")
                gr.Markdown("### CFG (Classifier-Free Guidance)")
                cfg_genre = gr.Slider(0.0, 8.0, value=3.0, step=0.1, label="CFG: Genre")
                cfg_music = gr.Slider(0.0, 8.0, value=3.0, step=0.1, label="CFG: Music")
                with gr.Row():
                    seed = gr.Number(value=0, precision=0, label="Seed (int)")
                    seconds = gr.Slider(1.0, 12.0, value=6.0, step=0.5, label="Length (sec)")
                with gr.Row():
                    fps = gr.Dropdown(choices=[20, 24, 30, 60], value=30, label="FPS")
                    resolution = gr.Dropdown(choices=[256, 512, 720], value=512, label="Render Resolution")
                run_btn = gr.Button("Generate", variant="primary")
            # Right column: rendered result.
            with gr.Column(scale=1):
                out_video = gr.Video(label="Result (MP4)", autoplay=True)
        # Wire the button to the handler; input order must match the
        # generate_demo signature.
        run_btn.click(
            fn=generate_demo,
            inputs=[audio, genre, cfg_genre, cfg_music, seed, seconds, fps, resolution],
            outputs=[out_video],
        )
        gr.Markdown(
            """
### Notes
- This Space is **inference-only**; weights are not downloadable here.
- If you want higher quality rendering, replace `render_motion_to_mp4()` with your renderer.
""".strip()
        )
    return demo
if __name__ == "__main__":
    demo = build_ui()
    # Enable request queuing so long-running generations are serialized
    # instead of timing out concurrent users.
    demo.queue()
    demo.launch()