|
|
import spaces |
|
|
import os |
|
|
import uuid |
|
|
|
|
|
# Process-wide environment configuration.  These must be in place before the
# `import torch` that follows, so the libraries pick them up on import.
#
# Every variable is written through `os.environ` rather than `os.putenv()`:
# `os.putenv()` changes the C-level environment only and does NOT update
# `os.environ`, so any library that reads its settings from Python
# (`os.environ` / `os.getenv`) would never see the value.  Assigning to
# `os.environ` updates both views.
os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'

# Options for the PyTorch CUDA caching allocator, joined into the single
# comma-separated string that PYTORCH_CUDA_ALLOC_CONF expects.
alloc_conf_parts = [
    'expandable_segments:True',
    'pinned_use_background_threads:True',
]
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = ','.join(alloc_conf_parts)

os.environ["SAFETENSORS_FAST_GPU"] = "1"

# NOTE(review): with this flag actually visible to huggingface_hub, downloads
# presumably require the `hf_transfer` package to be installed - confirm it is
# listed in the Space's requirements.
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
|
|
|
|
|
import torch |
|
|
torch.backends.cuda.matmul.allow_tf32 = False |
|
|
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False |
|
|
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False |
|
|
torch.backends.cudnn.allow_tf32 = False |
|
|
torch.backends.cudnn.deterministic = False |
|
|
torch.backends.cudnn.benchmark = False |
|
|
torch.backends.cuda.preferred_blas_library="cublas" |
|
|
torch.backends.cuda.preferred_linalg_library="cusolver" |
|
|
torch.set_float32_matmul_precision("highest") |
|
|
|
|
|
import torchaudio |
|
|
from einops import rearrange |
|
|
import gradio as gr |
|
|
|
|
|
from stable_audio_tools import get_pretrained_model |
|
|
from stable_audio_tools.inference.generation import generate_diffusion_cond |
|
|
|
|
|
# Load the pretrained model together with its configuration once, at import
# time; both are shared by every request handled below.
model, model_config = get_pretrained_model("ford442/stable-audio-open-1.0")

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Place the model on the chosen device as float32.
model.to(device, torch.float32)
|
|
|
|
|
def _to_int16_pcm(waveform):
    """Peak-normalize ``waveform`` and quantize it to int16 on the CPU.

    The signal is scaled so that its largest absolute sample maps to full
    scale (32767).  An all-zero signal comes back as zeros instead of being
    divided by a zero peak (which would produce NaNs).
    """
    waveform = waveform.to(torch.float32)
    # Floor the peak so that silent output cannot trigger a division by zero.
    peak = waveform.abs().max().clamp_min(1e-8)
    return waveform.div(peak).clamp(-1, 1).mul(32767).to(torch.int16).cpu()


@spaces.GPU(duration=60)
def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7, use_bfloat=False, use_eval=False):
    """Generate audio for a text prompt and save it as an MP3 file.

    Args:
        prompt: Text description used as conditioning.
        seconds_total: Requested duration, passed to the model as the
            ``seconds_total`` conditioning value.
        steps: Number of diffusion steps (coerced to ``int``).
        cfg_scale: Classifier-free guidance scale.
        use_bfloat: Run the shared model in bfloat16 for this request;
            otherwise it is run in float32.
        use_eval: Call ``model.eval()`` before generating.

    Returns:
        The file name (relative to the working directory) of the MP3 written.
    """
    print(f"Prompt received: {prompt}")
    print(f"Settings: Duration={seconds_total}s, Steps={steps}, CFG Scale={cfg_scale}")

    sample_rate = model_config["sample_rate"]
    sample_size = model_config["sample_size"]
    print(f"Sample rate: {sample_rate}, Sample size: {sample_size}")
    print("Model moved to device.")

    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": seconds_total,
    }]
    print(f"Conditioning: {conditioning}")
    print("Generating audio...")

    # `model` is a module-level object shared by all requests, so the dtype is
    # chosen explicitly on every call.  Casting only when the flag is set would
    # leave the model in bfloat16 for every later request as well.
    # NOTE: weights that were rounded to bfloat16 once stay rounded after the
    # cast back to float32; reload the model to recover the exact weights.
    model.to(torch.bfloat16 if use_bfloat else torch.float32)

    # NOTE(review): eval mode is sticky - once a request enables it, the shared
    # model stays in eval mode for subsequent requests.
    if use_eval:
        model.eval()

    output = generate_diffusion_cond(
        model,
        steps=int(steps),  # UI sliders may deliver a float; pass a step count as int
        cfg_scale=cfg_scale,
        conditioning=conditioning,
        sample_size=sample_size,
        sigma_min=0.3,
        sigma_max=500,
        sampler_type="dpmpp-3m-sde",
        device=device,
    )
    print("Audio generated.")

    # Fold the batch dimension into the sample axis: (b, d, n) -> (d, b*n).
    output = rearrange(output, "b d n -> d (b n)")
    output = _to_int16_pcm(output)

    unique_filename = f"output_{uuid.uuid4().hex}.mp3"
    print(f"Saving audio to file: {unique_filename}")

    # `encoding="MP3"` and `bits_per_sample=320` were dropped: neither is a
    # valid value for those parameters (bits_per_sample is a sample bit depth,
    # not a bitrate), and some torchaudio backends reject them outright.
    # NOTE(review): to pin a 320 kbps bitrate, pass the `compression` argument
    # appropriate for the installed torchaudio backend.
    torchaudio.save(
        unique_filename,
        output,
        sample_rate,
        format="mp3",
    )
    print(f"Audio saved: {unique_filename}")
    return unique_filename
|
|
|
|
|
|
|
|
# Input widgets, in the same order as the parameters of `generate_audio`.
_input_components = [
    gr.Textbox(label="Prompt", placeholder="Enter your text prompt here"),
    gr.Slider(0, 420, value=30, label="Duration in Seconds"),
    gr.Slider(10, 420, value=100, step=10, label="Number of Diffusion Steps"),
    gr.Slider(1.0, 32.0, value=7.0, step=0.1, label="CFG Scale"),
    gr.Checkbox(value=False, label="Use Brainfloat"),
    gr.Checkbox(value=False, label="Use eval()"),
]

# The handler returns a file name, so the audio widget is fed by file path.
_output_component = gr.Audio(type="filepath", label="Generated Audio")

# Each example row is [prompt, duration in seconds, diffusion steps, CFG scale];
# the two checkboxes are not part of the examples.
_example_rows = [
    ["Create a serene soundscape of a quiet beach at sunset.", 45, 100, 10.0],
    [
        "Generate an energetic and bustling city street scene with distant traffic and close conversations.",
        30,
        120,
        5.0,
    ],
    [
        "Simulate a forest ambiance with birds chirping and wind rustling through the leaves.",
        60,
        140,
        7.5,
    ],
    ["Recreate a gentle rainfall with distant thunder.", 35, 110, 8.0],
    ["Imagine a jazz cafe environment with soft music and ambient chatter.", 25, 90, 6.0],
    ["Rock beat played in a treated studio, session drumming on an acoustic kit.", 30, 100, 7.0],
]

# Assemble the Gradio UI around `generate_audio` and start serving it.
interface = gr.Interface(
    fn=generate_audio,
    inputs=_input_components,
    outputs=_output_component,
    title="Stable Audio Generator",
    description="Generate variable-length stereo audio at 44.1kHz from text prompts using Stable Audio Open 1.0.",
    examples=_example_rows,
)

interface.launch()
|
|
|