# Hugging Face Space page chrome (status: Sleeping; file size: 8,421 bytes)
import gradio as gr
import spaces
import torch
import numpy as np
from diffusers import AudioLDM2Pipeline
# --- Model Loading ---
# Load AudioLDM2 once at import time so every request reuses the same pipeline.
# Weights are kept in float16 to halve memory; the pipeline is only moved to
# CUDA later, inside the @spaces.GPU request handlers below.
print("Loading AudioLDM2 model...")
pipe = AudioLDM2Pipeline.from_pretrained(
"cvssp/audioldm2",
torch_dtype=torch.float16,
)
print("Model loaded!")
# Sample rate (Hz) of audio produced by the AudioLDM2 pipeline.
SAMPLE_RATE = 16000

# Preset label -> generation prompt. The labels double as dropdown choices in
# the UI; each prompt leads with a quality modifier ("ambient soundscape of",
# "field recording of", ...) which steers AudioLDM2 toward cleaner output.
PRESETS = {
    "🌧️ Rain": "ambient soundscape of gentle rain falling on a window",
    "🌊 Ocean": "field recording of ocean waves crashing on a rocky shore with distant seagulls",
    "🔥 Campfire": "high quality recording of a campfire crackling and popping at night with crickets",
    "🌲 Forest": "ambient soundscape of a forest at night with crickets and a gentle breeze through trees",
    "⛈️ Thunderstorm": "field recording of a thunderstorm with heavy rain and distant rolling thunder",
    "☕ Coffee Shop": "ambient soundscape of a busy coffee shop with quiet chatter and cups clinking",
    "🌬️ Wind": "field recording of gentle wind blowing through pine trees on a mountain, leaves rustling",
    "🦉 Night": "ambient soundscape of a quiet night with owls hooting and distant frogs",
}
@spaces.GPU
def generate_sound(prompt, preset, seed):
    """Generate a single ambient sound layer.

    Resolves the text prompt from the preset table unless the preset is
    "Custom" (or unset), in which case the free-text prompt is required.
    Returns a (sample_rate, waveform) tuple for a Gradio Audio component.
    """
    if preset and preset != "Custom":
        final_prompt = PRESETS[preset]
    else:
        if not prompt or not prompt.strip():
            raise gr.Error("Please enter a prompt or select a preset.")
        final_prompt = prompt

    # Move the pipeline to the GPU inside the handler (spaces.GPU pattern)
    # and seed the generator so a given seed reproduces the same audio.
    pipe.to("cuda")
    rng = torch.Generator("cuda").manual_seed(int(seed))
    result = pipe(
        final_prompt,
        negative_prompt="Low quality.",
        num_inference_steps=40,
        audio_length_in_s=8.0,
        guidance_scale=3.0,
        generator=rng,
    )
    return (SAMPLE_RATE, result.audios[0])
@spaces.GPU
def generate_custom(prompt, seed, guidance_scale, steps):
    """Generate audio from a free-form prompt with advanced settings.

    Unlike generate_sound, this exposes the guidance scale and diffusion
    step count to the caller. Returns a (sample_rate, waveform) tuple for
    a Gradio Audio component.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt.")

    # GPU work happens inside the handler; seed for reproducibility.
    pipe.to("cuda")
    rng = torch.Generator("cuda").manual_seed(int(seed))
    waveform = pipe(
        prompt,
        negative_prompt="Low quality.",
        num_inference_steps=int(steps),
        audio_length_in_s=8.0,
        guidance_scale=guidance_scale,
        generator=rng,
    ).audios[0]
    return (SAMPLE_RATE, waveform)
def mix_layers(audio1, audio2, audio3, vol1, vol2, vol3):
    """Mix up to three generated layers into one soundscape.

    Each present layer is peak-normalized, scaled by its volume slider, and
    zero-padded to the longest layer; the layers are then averaged and the
    mix is renormalized to a 0.9 peak to leave headroom against clipping.
    Raises gr.Error when no layer has been generated yet.
    """
    prepared = []
    for clip, volume in ((audio1, vol1), (audio2, vol2), (audio3, vol3)):
        if clip is None:
            continue
        rate, samples = clip
        samples = samples.astype(np.float32)
        peak = np.max(np.abs(samples))
        if peak > 0:
            samples = samples / peak
        prepared.append((rate, samples * volume))

    if not prepared:
        raise gr.Error("Generate at least one layer before mixing.")

    # Sample rate of the first layer wins (all layers come from the same pipeline).
    out_rate = prepared[0][0]
    target_len = max(len(samples) for _, samples in prepared)
    padded = [np.pad(samples, (0, target_len - len(samples))) for _, samples in prepared]
    mixed = sum(padded) / len(padded)

    final_peak = np.max(np.abs(mixed))
    if final_peak > 0:
        mixed = mixed / final_peak * 0.9
    return (out_rate, mixed)
# --- Gradio Interface ---
with gr.Blocks(title="AmbientGen", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
# 🌧️ AmbientGen
### Generate ambient soundscapes with AI
Create layered ambient sounds by generating individual elements and mixing them together.
Powered by [AudioLDM2](https://huggingface.co/cvssp/audioldm2).
**Tip:** Use presets for quick results, or write custom prompts. Start with a quality modifier
like "ambient soundscape of" or "field recording of" for best results.
"""
    )

    # --- Tab 1: one-click generation from a preset ---
    with gr.Tab("🎧 Quick Generate"):
        with gr.Row():
            preset_dropdown = gr.Dropdown(
                choices=list(PRESETS.keys()),
                label="Choose a preset",
                value="🌧️ Rain",
            )
            seed_quick = gr.Number(label="Seed", value=42, precision=0)
        quick_btn = gr.Button("Generate", variant="primary", size="lg")
        quick_output = gr.Audio(label="Generated Sound", type="numpy")
        # generate_sound expects (prompt, preset, seed). This tab has no
        # free-text prompt, so feed a named hidden textbox rather than
        # constructing a throwaway component inline in the inputs list.
        quick_prompt = gr.Textbox(visible=False, value="")
        quick_btn.click(
            fn=generate_sound,
            inputs=[quick_prompt, preset_dropdown, seed_quick],
            outputs=quick_output,
        )

    # --- Tab 2: generate up to three layers, then mix them together ---
    with gr.Tab("🎛️ Layer Mixer"):
        gr.Markdown("Generate up to 3 layers and mix them into a soundscape.")

        with gr.Group():
            gr.Markdown("### Layer 1")
            with gr.Row():
                preset1 = gr.Dropdown(choices=["Custom"] + list(PRESETS.keys()), label="Preset", value="🌧️ Rain")
                prompt1 = gr.Textbox(label="Custom prompt (used when preset is 'Custom')", placeholder="ambient soundscape of...")
                seed1 = gr.Number(label="Seed", value=42, precision=0)
            btn1 = gr.Button("Generate Layer 1")
            audio1 = gr.Audio(label="Layer 1", type="numpy")
            vol1 = gr.Slider(0, 1, value=0.8, step=0.1, label="Volume")

        with gr.Group():
            gr.Markdown("### Layer 2")
            with gr.Row():
                preset2 = gr.Dropdown(choices=["Custom"] + list(PRESETS.keys()), label="Preset", value="⛈️ Thunderstorm")
                prompt2 = gr.Textbox(label="Custom prompt", placeholder="field recording of...")
                seed2 = gr.Number(label="Seed", value=7, precision=0)
            btn2 = gr.Button("Generate Layer 2")
            audio2 = gr.Audio(label="Layer 2", type="numpy")
            vol2 = gr.Slider(0, 1, value=0.5, step=0.1, label="Volume")

        with gr.Group():
            gr.Markdown("### Layer 3")
            with gr.Row():
                preset3 = gr.Dropdown(choices=["Custom"] + list(PRESETS.keys()), label="Preset", value="🔥 Campfire")
                prompt3 = gr.Textbox(label="Custom prompt", placeholder="high quality recording of...")
                seed3 = gr.Number(label="Seed", value=0, precision=0)
            btn3 = gr.Button("Generate Layer 3")
            audio3 = gr.Audio(label="Layer 3", type="numpy")
            vol3 = gr.Slider(0, 1, value=0.4, step=0.1, label="Volume")

        btn1.click(fn=generate_sound, inputs=[prompt1, preset1, seed1], outputs=audio1)
        btn2.click(fn=generate_sound, inputs=[prompt2, preset2, seed2], outputs=audio2)
        btn3.click(fn=generate_sound, inputs=[prompt3, preset3, seed3], outputs=audio3)

        gr.Markdown("### 🎚️ Mix")
        mix_btn = gr.Button("Mix All Layers", variant="primary", size="lg")
        mix_output = gr.Audio(label="Mixed Soundscape", type="numpy")
        mix_btn.click(fn=mix_layers, inputs=[audio1, audio2, audio3, vol1, vol2, vol3], outputs=mix_output)

    # --- Tab 3: fully custom prompt with advanced generation knobs ---
    with gr.Tab("✍️ Custom"):
        gr.Markdown(
            """
Write your own prompt. For best results:
- Start with a quality modifier: *"ambient soundscape of"*, *"field recording of"*
- Add specific sounds: *"rain"*, *"crackling fire"*, *"flowing stream"*
- Add spatial context: *"in a forest"*, *"on a mountain"*
- Keep to 2-3 sound elements per generation
"""
        )
        custom_prompt = gr.Textbox(
            label="Your prompt",
            placeholder="ambient soundscape of a gentle stream flowing over rocks in a forest with birdsong",
            lines=2,
        )
        with gr.Row():
            custom_seed = gr.Number(label="Seed", value=42, precision=0)
            custom_guidance = gr.Slider(1.0, 7.0, value=3.0, step=0.5, label="Guidance Scale")
            custom_steps = gr.Slider(20, 100, value=40, step=10, label="Inference Steps")
        custom_btn = gr.Button("Generate", variant="primary", size="lg")
        custom_output = gr.Audio(label="Generated Sound", type="numpy")
        custom_btn.click(fn=generate_custom, inputs=[custom_prompt, custom_seed, custom_guidance, custom_steps], outputs=custom_output)

    gr.Markdown(
        """
---
Built as a learning project exploring GenAI for audio.
[Blog](https://my-sonicase.github.io/ambientgen/) ·
[GitHub](https://github.com/my-sonicase/ambientgen) ·
Powered by [AudioLDM2](https://huggingface.co/cvssp/audioldm2)
"""
    )

demo.launch()