# ambientgen / app.py
# Author: sonicase — ZeroGPU version with AudioLDM2 (commit a547e9b)
import gradio as gr
import spaces
import torch
import numpy as np
from diffusers import AudioLDM2Pipeline
# --- Model Loading ---
# The pipeline is loaded once at import time in float16. It is kept on CPU
# here and moved to CUDA inside the @spaces.GPU-decorated functions below
# (presumably because ZeroGPU allocates a GPU per call — see header note).
print("Loading AudioLDM2 model...")
pipe = AudioLDM2Pipeline.from_pretrained(
    "cvssp/audioldm2",
    torch_dtype=torch.float16,
)
print("Model loaded!")

# Output sample rate passed back to gradio Audio components alongside the
# generated waveform.
SAMPLE_RATE = 16000

# Preset label (shown in dropdowns) -> full generation prompt. Prompts follow
# the "quality modifier + sounds + context" recipe described in the UI text.
PRESETS = {
    "🌧️ Rain": "ambient soundscape of gentle rain falling on a window",
    "🌊 Ocean": "field recording of ocean waves crashing on a rocky shore with distant seagulls",
    "🔥 Campfire": "high quality recording of a campfire crackling and popping at night with crickets",
    "🌲 Forest": "ambient soundscape of a forest at night with crickets and a gentle breeze through trees",
    "⛈️ Thunderstorm": "field recording of a thunderstorm with heavy rain and distant rolling thunder",
    "☕ Coffee Shop": "ambient soundscape of a busy coffee shop with quiet chatter and cups clinking",
    "🌬️ Wind": "field recording of gentle wind blowing through pine trees on a mountain, leaves rustling",
    "🦉 Night": "ambient soundscape of a quiet night with owls hooting and distant frogs",
}
@spaces.GPU
def generate_sound(prompt, preset, seed):
    """Generate a single ambient sound layer.

    Resolves the effective prompt — the preset's canned prompt unless the
    preset is "Custom"/empty, in which case a non-blank free-text prompt is
    required — then runs AudioLDM2 with fixed quick-generate settings.

    Args:
        prompt: free-text prompt, used only when no real preset is chosen.
        preset: key into PRESETS, "Custom", or a falsy value.
        seed: integer-like seed for reproducible generation.

    Returns:
        ``(SAMPLE_RATE, waveform)`` tuple for a gradio Audio component.

    Raises:
        gr.Error: when neither a preset nor a usable prompt is given.
    """
    if preset and preset != "Custom":
        final_prompt = PRESETS[preset]
    elif prompt and prompt.strip():
        final_prompt = prompt
    else:
        raise gr.Error("Please enter a prompt or select a preset.")

    pipe.to("cuda")
    rng = torch.Generator("cuda").manual_seed(int(seed))
    result = pipe(
        final_prompt,
        negative_prompt="Low quality.",
        num_inference_steps=40,
        audio_length_in_s=8.0,
        guidance_scale=3.0,
        generator=rng,
    )
    return (SAMPLE_RATE, result.audios[0])
@spaces.GPU
def generate_custom(prompt, seed, guidance_scale, steps):
    """Generate from a custom prompt with advanced settings.

    Args:
        prompt: free-text prompt; must be non-blank.
        seed: integer-like seed for reproducible generation.
        guidance_scale: classifier-free guidance strength.
        steps: number of diffusion inference steps (coerced to int).

    Returns:
        ``(SAMPLE_RATE, waveform)`` tuple for a gradio Audio component.

    Raises:
        gr.Error: when the prompt is empty or whitespace-only.
    """
    if not (prompt and prompt.strip()):
        raise gr.Error("Please enter a prompt.")

    pipe.to("cuda")
    rng = torch.Generator("cuda").manual_seed(int(seed))
    result = pipe(
        prompt,
        negative_prompt="Low quality.",
        num_inference_steps=int(steps),
        audio_length_in_s=8.0,
        guidance_scale=guidance_scale,
        generator=rng,
    )
    return (SAMPLE_RATE, result.audios[0])
def mix_layers(audio1, audio2, audio3, vol1, vol2, vol3):
    """Mix up to 3 audio layers with volume control.

    Each present layer is peak-normalized, scaled by its volume slider,
    zero-padded at the end to the length of the longest layer, averaged,
    and the mix is renormalized to a 0.9 peak to leave headroom.

    Args:
        audio1, audio2, audio3: ``(sample_rate, np.ndarray)`` tuples from
            gradio Audio components, or ``None`` for layers not generated.
        vol1, vol2, vol3: per-layer volume multipliers.

    Returns:
        ``(sample_rate, np.ndarray)`` mixed waveform. The sample rate of the
        first present layer is used (all layers in this app share
        SAMPLE_RATE, so no resampling is needed).

    Raises:
        gr.Error: if no layer has been generated yet.
    """
    layers = []
    for audio, vol in ((audio1, vol1), (audio2, vol2), (audio3, vol3)):
        if audio is None:
            continue
        sr, data = audio
        data = data.astype(np.float32)
        # Hoist the peak computation (previously evaluated twice per layer);
        # guard the empty-array case, where np.max would raise ValueError.
        peak = np.max(np.abs(data)) if data.size else 0.0
        if peak > 0:
            data = data / peak  # peak-normalize so sliders act predictably
        layers.append((sr, data * vol))

    if not layers:
        raise gr.Error("Generate at least one layer before mixing.")

    sr = layers[0][0]
    max_len = max(len(d) for _, d in layers)
    padded = [np.pad(d, (0, max_len - len(d))) for _, d in layers]
    mixed = sum(padded) / len(padded)

    # Renormalize to 0.9 peak (headroom against clipping); skip for silence.
    mix_peak = np.max(np.abs(mixed)) if mixed.size else 0.0
    if mix_peak > 0:
        mixed = mixed / mix_peak * 0.9
    return (sr, mixed)
# --- Gradio Interface ---
# Three tabs: quick preset generation, a 3-layer generate-and-mix workspace,
# and a fully custom prompt with advanced sampler settings.
with gr.Blocks(title="AmbientGen", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🌧️ AmbientGen
        ### Generate ambient soundscapes with AI
        Create layered ambient sounds by generating individual elements and mixing them together.
        Powered by [AudioLDM2](https://huggingface.co/cvssp/audioldm2).
        **Tip:** Use presets for quick results, or write custom prompts. Start with a quality modifier
        like "ambient soundscape of" or "field recording of" for best results.
        """
    )

    # Tab 1: one-click generation from a preset prompt only.
    with gr.Tab("🎧 Quick Generate"):
        with gr.Row():
            preset_dropdown = gr.Dropdown(
                choices=list(PRESETS.keys()),
                label="Choose a preset",
                value="🌧️ Rain",
            )
            seed_quick = gr.Number(label="Seed", value=42, precision=0)
        quick_btn = gr.Button("Generate", variant="primary", size="lg")
        quick_output = gr.Audio(label="Generated Sound", type="numpy")
        # NOTE(review): the hidden Textbox exists only to feed an empty string
        # into generate_sound's `prompt` parameter (ignored when a preset is
        # selected) — a gr.State("") would likely serve the same purpose;
        # confirm before changing.
        quick_btn.click(
            fn=generate_sound,
            inputs=[gr.Textbox(visible=False, value=""), preset_dropdown, seed_quick],
            outputs=quick_output,
        )

    # Tab 2: generate up to three independent layers, then mix them with
    # per-layer volume sliders via mix_layers.
    with gr.Tab("🎛️ Layer Mixer"):
        gr.Markdown("Generate up to 3 layers and mix them into a soundscape.")
        with gr.Group():
            gr.Markdown("### Layer 1")
            with gr.Row():
                preset1 = gr.Dropdown(choices=["Custom"] + list(PRESETS.keys()), label="Preset", value="🌧️ Rain")
                prompt1 = gr.Textbox(label="Custom prompt (used when preset is 'Custom')", placeholder="ambient soundscape of...")
                seed1 = gr.Number(label="Seed", value=42, precision=0)
            btn1 = gr.Button("Generate Layer 1")
            audio1 = gr.Audio(label="Layer 1", type="numpy")
            vol1 = gr.Slider(0, 1, value=0.8, step=0.1, label="Volume")
        with gr.Group():
            gr.Markdown("### Layer 2")
            with gr.Row():
                preset2 = gr.Dropdown(choices=["Custom"] + list(PRESETS.keys()), label="Preset", value="⛈️ Thunderstorm")
                prompt2 = gr.Textbox(label="Custom prompt", placeholder="field recording of...")
                seed2 = gr.Number(label="Seed", value=7, precision=0)
            btn2 = gr.Button("Generate Layer 2")
            audio2 = gr.Audio(label="Layer 2", type="numpy")
            vol2 = gr.Slider(0, 1, value=0.5, step=0.1, label="Volume")
        with gr.Group():
            gr.Markdown("### Layer 3")
            with gr.Row():
                preset3 = gr.Dropdown(choices=["Custom"] + list(PRESETS.keys()), label="Preset", value="🔥 Campfire")
                prompt3 = gr.Textbox(label="Custom prompt", placeholder="high quality recording of...")
                seed3 = gr.Number(label="Seed", value=0, precision=0)
            btn3 = gr.Button("Generate Layer 3")
            audio3 = gr.Audio(label="Layer 3", type="numpy")
            vol3 = gr.Slider(0, 1, value=0.4, step=0.1, label="Volume")

        # Each layer's button writes into its own Audio component; the mixer
        # then reads all three Audio components (None when not generated).
        btn1.click(fn=generate_sound, inputs=[prompt1, preset1, seed1], outputs=audio1)
        btn2.click(fn=generate_sound, inputs=[prompt2, preset2, seed2], outputs=audio2)
        btn3.click(fn=generate_sound, inputs=[prompt3, preset3, seed3], outputs=audio3)
        gr.Markdown("### 🎚️ Mix")
        mix_btn = gr.Button("Mix All Layers", variant="primary", size="lg")
        mix_output = gr.Audio(label="Mixed Soundscape", type="numpy")
        mix_btn.click(fn=mix_layers, inputs=[audio1, audio2, audio3, vol1, vol2, vol3], outputs=mix_output)

    # Tab 3: free-form prompting with seed / guidance / step controls
    # (routed through generate_custom instead of generate_sound).
    with gr.Tab("✍️ Custom"):
        gr.Markdown(
            """
            Write your own prompt. For best results:
            - Start with a quality modifier: *"ambient soundscape of"*, *"field recording of"*
            - Add specific sounds: *"rain"*, *"crackling fire"*, *"flowing stream"*
            - Add spatial context: *"in a forest"*, *"on a mountain"*
            - Keep to 2-3 sound elements per generation
            """
        )
        custom_prompt = gr.Textbox(
            label="Your prompt",
            placeholder="ambient soundscape of a gentle stream flowing over rocks in a forest with birdsong",
            lines=2,
        )
        with gr.Row():
            custom_seed = gr.Number(label="Seed", value=42, precision=0)
            custom_guidance = gr.Slider(1.0, 7.0, value=3.0, step=0.5, label="Guidance Scale")
            custom_steps = gr.Slider(20, 100, value=40, step=10, label="Inference Steps")
        custom_btn = gr.Button("Generate", variant="primary", size="lg")
        custom_output = gr.Audio(label="Generated Sound", type="numpy")
        custom_btn.click(fn=generate_custom, inputs=[custom_prompt, custom_seed, custom_guidance, custom_steps], outputs=custom_output)

    # Footer links.
    gr.Markdown(
        """
        ---
        Built as a learning project exploring GenAI for audio.
        [Blog](https://my-sonicase.github.io/ambientgen/) ·
        [GitHub](https://github.com/my-sonicase/ambientgen) ·
        Powered by [AudioLDM2](https://huggingface.co/cvssp/audioldm2)
        """
    )

demo.launch()