|
|
""" |
|
|
Stable Audio Open Gradio Inference App for HuggingFace Spaces |
|
|
|
|
|
This app provides a simple interface for generating high-quality instrumental music |
|
|
using Stable Audio Open with the SAO-Instrumental-Finetune model. |
|
|
|
|
|
Designed to be used as a remote computation tool for WeaveMuse. |
|
|
|
|
|
Architecture: |
|
|
- Stable Audio model is loaded OUTSIDE the GPU-decorated function |
|
|
- Only the inference itself runs on GPU (cost-efficient for HF Spaces Zero GPU) |
|
|
- Model initialization happens once at startup |
|
|
""" |
|
|
|
|
|
import torch |
|
|
import torchaudio |
|
|
from einops import rearrange |
|
|
import gradio as gr |
|
|
import spaces |
|
|
import os |
|
|
import uuid |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
from stable_audio_tools.inference.generation import generate_diffusion_cond |
|
|
import json |
|
|
from stable_audio_tools.models.factory import create_model_from_config |
|
|
from stable_audio_tools.models.utils import load_ckpt_state_dict |
|
|
|
|
|
from huggingface_hub import hf_hub_download |
|
|
|
|
|
|
|
|
def get_pretrained_model(name="santifiorino/SAO-Instrumental-Finetune"):
    """Download a model from the Hugging Face Hub and load its weights.

    Args:
        name: Hub repository id that holds ``model_config.json`` and the
            checkpoint file.

    Returns:
        A ``(model, model_config)`` tuple: the model built from the config
        with the checkpoint's state dict loaded, and the parsed config dict.
    """
    model_config_path = hf_hub_download(name, filename="model_config.json", repo_type="model")

    # JSON is UTF-8; do not depend on the platform's default text encoding.
    with open(model_config_path, encoding="utf-8") as f:
        model_config = json.load(f)

    model = create_model_from_config(model_config)

    # Prefer the safetensors weights and fall back to the .ckpt file.
    try:
        model_ckpt_path = hf_hub_download(name, filename="model.safetensors", repo_type="model")
    except Exception as err:
        # The fallback stays deliberately broad (best effort), but it is no
        # longer silent: report why the first download was abandoned.
        print(f"Could not fetch model.safetensors ({err!r}); trying SAO_Instrumental_Finetune.ckpt")
        model_ckpt_path = hf_hub_download(name, filename="SAO_Instrumental_Finetune.ckpt", repo_type="model")

    model.load_state_dict(load_ckpt_state_dict(model_ckpt_path))

    return model, model_config
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_model():
    """Download and cache the Stable Audio model outside the GPU function.

    Intended to be called once at startup.

    Returns:
        The ``(model, model_config)`` pair produced by ``get_pretrained_model``.
    """
    print("Loading model...")
    net, config = get_pretrained_model("santifiorino/SAO-Instrumental-Finetune")
    print("Model loaded successfully.")
    return net, config
|
|
|
|
|
import numpy as np |
|
|
import gradio as gr |
|
|
import spaces |
|
|
import torch |
|
|
from einops import rearrange |
|
|
|
|
|
|
|
|
# Module-level setup: load the model once at import time, move it to the
# CUDA device, and cache the two config values the inference function needs.
model, model_config = load_model()
model = model.to("cuda")
SAMPLE_RATE, SAMPLE_SIZE = model_config["sample_rate"], model_config["sample_size"]
|
|
|
|
|
|
|
|
@spaces.GPU()
def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
    """Generate audio for a text prompt and return it as raw samples.

    Args:
        prompt: Text description of the music to generate.
        seconds_total: Requested duration in seconds, passed to the model as
            conditioning.
        steps: Number of diffusion steps; coerced to ``int`` before sampling.
        cfg_scale: Guidance scale forwarded to the sampler.

    Returns:
        ``(sample_rate, waveform)`` so the API returns raw audio, not a file.
        ``waveform`` is an int16 NumPy array, transposed so the flattened
        ``(b n)`` axis comes first and the ``c`` axis last.
    """
    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": seconds_total,
    }]

    audio = generate_diffusion_cond(
        model,
        # Slider/API values can arrive as floats; the sampler presumably needs
        # a whole number of steps, so coerce here (no-op for ints).
        steps=int(steps),
        cfg_scale=cfg_scale,
        conditioning=conditioning,
        sample_size=SAMPLE_SIZE,
        sigma_min=0.3,
        sigma_max=500,
        sampler_type="dpmpp-3m-sde",
        device="cuda",
    )

    # Fold the batch into the time axis: (b, c, n) -> (c, b*n).
    audio = rearrange(audio, "b c n -> c (b n)")
    audio = audio.to(torch.float32)
    # Peak-normalise; the epsilon avoids division by zero on silent output.
    audio = audio / (audio.abs().max() + 1e-12)
    # Scale to the 16-bit range and convert to integer samples.
    audio = (audio.clamp(-1, 1) * 32767).to(torch.int16)
    audio_np = audio.cpu().numpy().T

    return SAMPLE_RATE, audio_np
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI + API definition. Inputs map positionally onto
# generate_audio(prompt, seconds_total, steps, cfg_scale).
# The Markdown strings are kept flush-left so their content can never be
# interpreted as an indented code block.
interface = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(
            label="Prompt",
            placeholder="Describe the instrumental music...",
            value="Upbeat rock guitar with drums and bass",
        ),
        gr.Slider(0, 47, value=30, label="Duration in Seconds"),
        gr.Slider(10, 150, value=100, step=10, label="Number of Diffusion Steps"),
        gr.Slider(1, 15, value=7, step=0.1, label="CFG Scale"),
    ],
    outputs=gr.Audio(type="numpy", format="wav", label="Generated Music"),
    api_name="generate",
    title="🎸 Stable Audio Instrumental Generator",
    description="""
Generate high-quality instrumental music at 44.1kHz from text prompts using the SAO-Instrumental-Finetune model.

**Features:**
- 🎹 Piano, guitar, drums, bass, and orchestral instruments
- 🎵 Various musical genres and styles
- ⚡ High-quality stereo audio
- 🎼 Perfect for music composition and production

**Tips:**
- Be specific about instruments, tempo, and mood
- Higher steps = better quality (recommended: 100-120)
- CFG Scale 7-10 works well for most prompts
""",
    # Each example row is [prompt, duration in seconds, diffusion steps, CFG scale].
    examples=[
        ["Energetic rock guitar riff with powerful drums and bass", 30, 100, 7],
        ["Smooth jazz piano trio with upright bass and brushed drums", 35, 110, 8],
        ["Epic orchestral strings and brass with cinematic percussion", 45, 120, 10],
        ["Funky electric bass groove with rhythm guitar and tight drums", 30, 100, 7],
        ["Acoustic guitar fingerpicking with soft percussion", 40, 110, 6],
        ["Electronic synthesizer pads with ambient textures and subtle beats", 35, 100, 7.5],
        ["Classical piano solo with expressive dynamics and sustain pedal", 30, 110, 8],
        ["Blues guitar solo with bending notes over a shuffle rhythm section", 30, 100, 7],
        ["Latin percussion ensemble with congas, bongos, and timbales", 30, 100, 7],
        ["Rock beat played in a treated studio, session drumming on an acoustic kit", 30, 100, 7],
    ],
    # The "Orchestral" and "Other" bullets previously started with U+FFFD
    # replacement characters (corrupted emoji); valid emoji are restored here.
    article="""
---
### About SAO-Instrumental-Finetune

This model is a fine-tuned version of **Stable Audio Open 1.0** specifically trained for instrumental music generation.

**Capabilities:**
- 🎸 **Guitar**: Acoustic, electric, classical, jazz, rock
- 🥁 **Drums**: Rock, jazz, electronic, orchestral percussion
- 🎹 **Piano**: Classical, jazz, modern, ambient
- 🎻 **Orchestral**: Strings, brass, woodwinds
- 🎶 **Other**: Bass, synthesizers, ethnic instruments

**Technical Details:**
- Model: SAO-Instrumental-Finetune (based on Stable Audio Open 1.0)
- Sample Rate: 44.1kHz (CD quality)
- Max Duration: 47 seconds
- Architecture: Latent diffusion model with conditioning

**Integration:**
This space is designed to work with **WeaveMuse** for AI-assisted music composition.
Use the API endpoint for programmatic access in your music production workflows.

---

*Powered by [Stability AI](https://stability.ai/) and [WeaveMuse](https://github.com/manoskary/weavemuse)*
""",
)
|
|
|
|
|
|
|
|
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()
|
|
|