| | import random |
| | import spaces |
| | import os |
| | import uuid |
| |
|
# Process-wide settings, put in place before torch and the Hugging Face
# libraries are imported further down in this file.
#
# Values are assigned through os.environ rather than os.putenv(): putenv()
# only changes the C-level environment and leaves os.environ untouched, so
# any Python code that looks a variable up via os.environ / os.getenv would
# never see it. Assigning to os.environ updates both.
os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
alloc_conf_parts = [
    'expandable_segments:True',
    'pinned_use_background_threads:True',
]
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = ','.join(alloc_conf_parts)
os.environ["SAFETENSORS_FAST_GPU"] = "1"

# Only request hf_transfer downloads when the optional package is importable;
# now that the flag is actually visible to Python code, enabling it without
# the package installed would make hub downloads fail instead of speeding
# them up.
import importlib.util

if importlib.util.find_spec('hf_transfer') is not None:
    os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
| |
|
| | import torch |
# Numerical configuration: disable TF32 and reduced-precision reductions and
# request the highest float32 matmul precision; cuDNN autotuning
# (benchmark) and deterministic mode are both left off.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False

# preferred_blas_library / preferred_linalg_library are functions, not
# settings attributes: assigning a string to them only overwrites the function
# object and selects nothing. Call them instead. A selector that is missing
# (older PyTorch) is skipped, and a backend that this build cannot provide
# (e.g. cuSOLVER on a CPU-only build) is reported rather than aborting start-up,
# since the app falls back to CPU when CUDA is unavailable.
for _selector_name, _backend_name in (
    ("preferred_blas_library", "cublas"),
    ("preferred_linalg_library", "cusolver"),
):
    _selector = getattr(torch.backends.cuda, _selector_name, None)
    if not callable(_selector):
        continue
    try:
        _selector(_backend_name)
    except RuntimeError as _backend_error:
        print(f"Could not select {_backend_name} via {_selector_name}: {_backend_error}")

torch.set_float32_matmul_precision("highest")
| |
|
| | import torchaudio |
| | from einops import rearrange |
| | import gradio as gr |
| |
|
| | from stable_audio_tools import get_pretrained_model |
| | from stable_audio_tools.inference.generation import generate_diffusion_cond |
| |
|
# Fetch the pretrained model together with its configuration dictionary, then
# place it on the GPU when one is available (CPU otherwise) in float32.
model, model_config = get_pretrained_model("ford442/stable-audio-open-1.0")
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")
model.to(device=device, dtype=torch.float32)
| |
|
def _to_int16_pcm(audio):
    """Peak-normalize *audio* and return it as int16 samples on the CPU."""
    audio = audio.to(torch.float32)
    # A completely silent tensor has a peak of zero; dividing by it would turn
    # every sample into NaN before the integer cast. Clamp the divisor instead.
    peak = audio.abs().max().clamp(min=1e-8)
    return audio.div(peak).clamp(-1, 1).mul(32767).to(torch.int16).cpu()


@spaces.GPU(duration=60)
def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7, use_bfloat=False, use_eval=False):
    """Generate audio from a text prompt and save it as an MP3 file.

    Args:
        prompt: Text description used as conditioning.
        seconds_total: Requested duration, passed to the model as conditioning.
        steps: Number of diffusion sampling steps.
        cfg_scale: Classifier-free guidance scale.
        use_bfloat: When true, cast the shared model to bfloat16 before sampling.
        use_eval: When true, call ``model.eval()`` before sampling.

    Returns:
        The name of the MP3 file that was written (a bare file name, so it is
        created relative to the current working directory).
    """
    print(f"Prompt received: {prompt}")
    print(f"Settings: Duration={seconds_total}s, Steps={steps}, CFG Scale={cfg_scale}")

    # Draw the seed in the 32-bit range and hand it to the generator below so
    # that the value logged here is the one that actually drives sampling.
    seed = random.randint(0, 2**32 - 1)
    random.seed(seed)
    torch.manual_seed(seed)
    print(f"Using seed: {seed}")

    sample_rate = model_config["sample_rate"]
    sample_size = model_config["sample_size"]
    print(f"Sample rate: {sample_rate}, Sample size: {sample_size}")
    print("Model moved to device.")
    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": seconds_total
    }]
    print(f"Conditioning: {conditioning}")
    print("Generating audio...")

    # NOTE(review): both switches mutate the shared module-level model and are
    # never reverted here, so they presumably remain in effect for later calls
    # served by the same process -- confirm this is intended.
    if use_bfloat:
        model.to(torch.bfloat16)
    if use_eval:
        model.eval()

    output = generate_diffusion_cond(
        model,
        steps=steps,
        cfg_scale=cfg_scale,
        conditioning=conditioning,
        sample_size=sample_size,
        sigma_min=0.3,
        sigma_max=500,
        sampler_type="dpmpp-3m-sde",
        # Without an explicit seed the generator picks its own random one,
        # which would make the "Using seed" log line above meaningless.
        seed=seed,
        device=device
    )
    print("Audio generated.")

    # Fold the batch axis into the last axis so the result is 2-D
    # (presumably channels x samples -- TODO confirm against the model output).
    output = rearrange(output, "b d n -> d (b n)")
    output = _to_int16_pcm(output)

    unique_filename = f"output_{uuid.uuid4().hex}.mp3"
    print(f"Saving audio to file: {unique_filename}")
    # NOTE(review): encoding="MP3" and bits_per_sample=320 are not the usual
    # PCM encoding / bit-depth values; verify the active torchaudio backend
    # honours them (320 looks like an intended bitrate in kbps).
    torchaudio.save(
        unique_filename,
        output,
        sample_rate,
        format="mp3",
        encoding="MP3",
        bits_per_sample=320
    )
    print(f"Audio saved: {unique_filename}")
    return unique_filename
| |
|
| |
|
# Input widgets, in the same order as generate_audio's parameters:
# prompt, seconds_total, steps, cfg_scale, use_bfloat, use_eval.
_input_components = [
    gr.Textbox(label="Prompt", placeholder="Enter your text prompt here"),
    gr.Slider(0, 420, value=30, label="Duration in Seconds"),
    gr.Slider(10, 420, value=100, step=10, label="Number of Diffusion Steps"),
    gr.Slider(1.0, 32.0, value=7.0, step=0.1, label="CFG Scale"),
    gr.Checkbox(value=False, label="Use Brainfloat"),
    gr.Checkbox(value=False, label="Use eval()"),
]

# Example rows: prompt, duration in seconds, diffusion steps, CFG scale.
# The two checkboxes are deliberately not given example values.
_example_rows = [
    ["Create a serene soundscape of a quiet beach at sunset.", 45, 100, 10.0],
    ["Generate an energetic and bustling city street scene with distant traffic and close conversations.", 30, 120, 5.0],
    ["Simulate a forest ambiance with birds chirping and wind rustling through the leaves.", 60, 140, 7.5],
    ["Recreate a gentle rainfall with distant thunder.", 35, 110, 8.0],
    ["Imagine a jazz cafe environment with soft music and ambient chatter.", 25, 90, 6.0],
    ["Rock beat played in a treated studio, session drumming on an acoustic kit.", 30, 100, 7.0],
]

# Assemble the web UI around generate_audio and start serving it.
interface = gr.Interface(
    fn=generate_audio,
    inputs=_input_components,
    outputs=gr.Audio(type="filepath", label="Generated Audio"),
    title="Stable Audio Generator",
    description="Generate variable-length stereo audio at 44.1kHz from text prompts using Stable Audio Open 1.0.",
    examples=_example_rows,
)

interface.launch()
| |
|