timefractal's picture
Fix: move pipe.to(cuda) inside @spaces.GPU for ZeroGPU compatibility
f0a4dff verified
import spaces
import gradio as gr
import torch
import numpy as np
import tempfile
import soundfile as sf
from diffusers import AceStepPipeline
# ─── Model Loading (CPU at module level for ZeroGPU) ─────────────────────
# The pipeline is created once at import time and is NOT moved to a device
# here; generate_music() calls pipe.to("cuda") itself.
MODEL_ID = "ACE-Step/acestep-v15-xl-turbo-diffusers"
pipe = AceStepPipeline.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
)
# Turn on tiling for the pipeline's VAE
# (presumably to limit peak memory while decoding — confirm against the VAE docs).
pipe.vae.enable_tiling()
# ─── Inference ───────────────────────────────────────────────────────────
@spaces.GPU(duration=180)
def generate_music(prompt, lyrics, duration, seed, num_steps):
    """Generate music from a text prompt and optional lyrics.

    Args:
        prompt: Text description of the music. Must contain non-whitespace text.
        lyrics: Optional lyrics. Missing or blank lyrics are passed to the
            pipeline as ``None``.
        duration: Requested clip length in seconds (converted with ``float``).
        seed: Seed for the CUDA random generator (converted with ``int``).
        num_steps: Number of inference steps (converted with ``int``).

    Returns:
        Path of a temporary ``.wav`` file containing the generated audio.
        The file is created with ``delete=False`` and is not removed here.

    Raises:
        gr.Error: If ``prompt`` is ``None``, empty, or whitespace-only.
    """
    # `not prompt` also covers None, which would otherwise crash on .strip().
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a music prompt!")

    # Move to GPU inside @spaces.GPU where CUDA is available
    pipe.to("cuda")
    generator = torch.Generator(device="cuda").manual_seed(int(seed))

    output = pipe(
        prompt=prompt,
        lyrics=lyrics if lyrics and lyrics.strip() else None,
        audio_duration=float(duration),
        num_inference_steps=int(num_steps),
        generator=generator,
    )
    audio = output.audios[0]  # (channels, samples) @ 48kHz

    # Convert to a NumPy array that soundfile can write.
    if isinstance(audio, torch.Tensor):
        audio = audio.detach()
        # The pipeline is loaded in bfloat16, which NumPy has no dtype for:
        # Tensor.numpy() raises TypeError unless we upcast first.
        if audio.is_floating_point():
            audio = audio.to(torch.float32)
        audio = audio.cpu().numpy()
    else:
        audio = np.asarray(audio)
    # soundfile does not accept half-precision arrays.
    if audio.dtype == np.float16:
        audio = audio.astype(np.float32)

    # If stereo (2, samples), transpose for soundfile
    if audio.ndim == 2:
        audio = audio.T  # (samples, channels)

    # Reserve a temp path, then close the handle before writing so the file
    # descriptor is not leaked on every request.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    sf.write(wav_path, audio, samplerate=48000)
    return wav_path
# ─── UI ──────────────────────────────────────────────────────────────────
# Markdown shown at the top of the page.
DESCRIPTION = """
# 🎵 ACE-Step Turbo — AI Music Generator
Generate music from text descriptions and optional lyrics using **ACE-Step v1.5 XL Turbo** —
a 5B-parameter flow-matching DiT model distilled for fast 8-step generation.
### What's New
This uses the **brand-new diffusers-formatted Turbo model** (released April 22, 2026) —
guidance-distilled for speed without sacrificing quality. MIT licensed.
"""

# Example rows for the demo. Each row is
# [prompt, lyrics, duration, seed, num_steps]; an empty lyrics string means
# an instrumental example.
EXAMPLES = [
    [
        "An upbeat synthwave track with driving drums and a catchy lead synth melody",
        "[Verse]\n"
        "Neon lights are calling me\n"
        "Running through the city free\n"
        "[Chorus]\n"
        "Ride the wave tonight\n"
        "Everything feels right",
        30,
        42,
        8,
    ],
    [
        "A peaceful acoustic guitar piece with gentle fingerpicking, nature ambience",
        "",
        20,
        123,
        8,
    ],
    [
        "Heavy metal with distorted guitars, double bass drums, and aggressive vocals",
        "[Verse]\n"
        "Fire in the sky\n"
        "We will never die\n"
        "[Chorus]\n"
        "Rise up, stand tall\n"
        "We won't ever fall",
        30,
        456,
        8,
    ],
    [
        "Lo-fi hip hop beats to relax/study to, warm vinyl crackle, mellow piano",
        "",
        30,
        789,
        8,
    ],
    [
        "Epic orchestral film score with soaring strings, brass fanfare, and timpani",
        "",
        30,
        321,
        8,
    ],
    [
        "Jazz trio — upright bass, piano, and brushed drums, smoky nightclub vibe",
        "",
        25,
        654,
        8,
    ],
]
with gr.Blocks(
    css="footer { display: none !important; }",
    theme=gr.themes.Soft(secondary_hue="amber", primary_hue="orange"),
    title="ACE-Step Turbo — AI Music Generator",
) as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # Text inputs (scale=2 column).
        with gr.Column(scale=2):
            prompt = gr.Textbox(
                lines=3,
                placeholder="Describe the music style, instruments, mood, tempo...",
                label="🎶 Music Description",
            )
            lyrics = gr.Textbox(
                lines=5,
                placeholder="[Verse]\nYour lyrics here...\n[Chorus]\nChorus lyrics...",
                label="📝 Lyrics (Optional)",
            )
        # Generation settings (scale=1 column).
        with gr.Column(scale=1):
            duration = gr.Slider(
                minimum=5,
                maximum=60,
                step=5,
                value=30,
                label="⏱️ Duration (seconds)",
            )
            num_steps = gr.Slider(
                minimum=4,
                maximum=16,
                step=1,
                value=8,
                label="🔄 Inference Steps",
            )
            seed = gr.Number(label="🎲 Seed", value=42, precision=0)

    btn = gr.Button("🎵 Generate Music", size="lg", variant="primary")
    audio_output = gr.Audio(type="filepath", label="Generated Music")

    # Same component order as generate_music's parameters; used by both the
    # button and the example rows.
    generation_inputs = [prompt, lyrics, duration, seed, num_steps]

    btn.click(
        fn=generate_music,
        inputs=generation_inputs,
        outputs=audio_output,
    )

    # cache_examples=False: example outputs are not cached.
    gr.Examples(
        examples=EXAMPLES,
        fn=generate_music,
        inputs=generation_inputs,
        outputs=audio_output,
        cache_examples=False,
    )

    # Footer with model facts.
    gr.Markdown(
        "\n---\n"
        "**Model:** [ACE-Step v1.5 XL Turbo](https://huggingface.co/ACE-Step/acestep-v15-xl-turbo-diffusers) | **Architecture:** 5B DiT, flow-matching, guidance-distilled | **License:** MIT | **Audio:** 48kHz stereo\n"
    )

demo.launch()