# AmbientGen — Hugging Face Space (Gradio app for AudioLDM2 ambient sound generation)
| import gradio as gr | |
| import spaces | |
| import torch | |
| import numpy as np | |
| from diffusers import AudioLDM2Pipeline | |
# --- Model Loading ---
# Loaded once at import time in fp16 to halve weight memory; the pipeline is
# moved to CUDA lazily inside the generation functions, not here.
print("Loading AudioLDM2 model...")
pipe = AudioLDM2Pipeline.from_pretrained(
    "cvssp/audioldm2",
    torch_dtype=torch.float16,
)
print("Model loaded!")

# Sample rate reported to gr.Audio alongside the generated waveform
# (assumes AudioLDM2 emits 16 kHz audio — TODO confirm against model card).
SAMPLE_RATE = 16000

# Preset label -> full text prompt. Insertion order here is the order the
# labels appear in the dropdowns below.
PRESETS = {
    "🌧️ Rain": "ambient soundscape of gentle rain falling on a window",
    "🌊 Ocean": "field recording of ocean waves crashing on a rocky shore with distant seagulls",
    "🔥 Campfire": "high quality recording of a campfire crackling and popping at night with crickets",
    "🌲 Forest": "ambient soundscape of a forest at night with crickets and a gentle breeze through trees",
    "⛈️ Thunderstorm": "field recording of a thunderstorm with heavy rain and distant rolling thunder",
    "☕ Coffee Shop": "ambient soundscape of a busy coffee shop with quiet chatter and cups clinking",
    "🌬️ Wind": "field recording of gentle wind blowing through pine trees on a mountain, leaves rustling",
    "🦉 Night": "ambient soundscape of a quiet night with owls hooting and distant frogs",
}
@spaces.GPU
def generate_sound(prompt, preset, seed):
    """Generate a single ambient sound layer.

    Args:
        prompt: Free-text description; used only when ``preset`` is empty
            or the sentinel value ``"Custom"``.
        preset: A key of ``PRESETS``, or ``"Custom"``/falsy to use ``prompt``.
        seed: Seed for reproducible generation (``None`` falls back to 0).

    Returns:
        ``(SAMPLE_RATE, waveform)`` tuple for ``gr.Audio(type="numpy")``.

    Raises:
        gr.Error: If no preset is chosen and the prompt is empty.
    """
    # Presets take priority over free text.
    if preset and preset != "Custom":
        final_prompt = PRESETS[preset]
    else:
        if not prompt or not prompt.strip():
            raise gr.Error("Please enter a prompt or select a preset.")
        final_prompt = prompt
    # @spaces.GPU is required on ZeroGPU Spaces so a GPU is actually
    # allocated before this .to("cuda") call (the file imports `spaces`
    # but the decorator was missing).
    pipe.to("cuda")
    # gr.Number can be cleared by the user, arriving as None.
    seed = 0 if seed is None else int(seed)
    generator = torch.Generator("cuda").manual_seed(seed)
    audio = pipe(
        final_prompt,
        negative_prompt="Low quality.",
        num_inference_steps=40,
        audio_length_in_s=8.0,
        guidance_scale=3.0,
        generator=generator,
    ).audios[0]
    return (SAMPLE_RATE, audio)
@spaces.GPU
def generate_custom(prompt, seed, guidance_scale, steps):
    """Generate audio from a custom prompt with advanced settings.

    Args:
        prompt: Free-text sound description (required, non-blank).
        seed: Seed for reproducible generation (``None`` falls back to 0).
        guidance_scale: Classifier-free guidance strength.
        steps: Number of diffusion inference steps.

    Returns:
        ``(SAMPLE_RATE, waveform)`` tuple for ``gr.Audio(type="numpy")``.

    Raises:
        gr.Error: If the prompt is empty or whitespace-only.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt.")
    # @spaces.GPU is required on ZeroGPU Spaces so a GPU is actually
    # allocated before this .to("cuda") call (the file imports `spaces`
    # but the decorator was missing).
    pipe.to("cuda")
    # gr.Number can be cleared by the user, arriving as None.
    seed = 0 if seed is None else int(seed)
    generator = torch.Generator("cuda").manual_seed(seed)
    audio = pipe(
        prompt,
        negative_prompt="Low quality.",
        num_inference_steps=int(steps),
        audio_length_in_s=8.0,
        guidance_scale=guidance_scale,
        generator=generator,
    ).audios[0]
    return (SAMPLE_RATE, audio)
def mix_layers(audio1, audio2, audio3, vol1, vol2, vol3):
    """Blend up to three generated layers into one soundscape.

    Each present layer is peak-normalized, scaled by its volume slider,
    zero-padded to the longest layer, averaged, and the mix is
    renormalized to a 0.9 peak to leave clipping headroom.
    """
    prepared = []
    for clip, gain in zip((audio1, audio2, audio3), (vol1, vol2, vol3)):
        if clip is None:
            continue
        rate, samples = clip
        samples = samples.astype(np.float32)
        peak = np.max(np.abs(samples))
        if peak > 0:
            samples = samples / peak
        prepared.append((rate, samples * gain))

    if not prepared:
        raise gr.Error("Generate at least one layer before mixing.")

    # All layers are produced at the same rate; report the first one's.
    out_rate = prepared[0][0]
    target_len = max(len(samples) for _, samples in prepared)
    stacked = [np.pad(samples, (0, target_len - len(samples))) for _, samples in prepared]
    mix = sum(stacked) / len(stacked)

    mix_peak = np.max(np.abs(mix))
    if mix_peak > 0:
        mix = mix / mix_peak * 0.9
    return (out_rate, mix)
# --- Gradio Interface ---
# Three tabs: preset-only quick generation, a 3-layer generator/mixer,
# and a free-form prompt tab with advanced sampling knobs.
with gr.Blocks(title="AmbientGen", theme=gr.themes.Soft()) as demo:
    # Header / intro copy.
    gr.Markdown(
        """
        # 🌧️ AmbientGen
        ### Generate ambient soundscapes with AI
        Create layered ambient sounds by generating individual elements and mixing them together.
        Powered by [AudioLDM2](https://huggingface.co/cvssp/audioldm2).
        **Tip:** Use presets for quick results, or write custom prompts. Start with a quality modifier
        like "ambient soundscape of" or "field recording of" for best results.
        """
    )

    # Tab 1: one click, preset-driven (no "Custom" choice, so the preset
    # branch of generate_sound is always taken).
    with gr.Tab("🎧 Quick Generate"):
        with gr.Row():
            preset_dropdown = gr.Dropdown(
                choices=list(PRESETS.keys()),
                label="Choose a preset",
                value="🌧️ Rain",
            )
            seed_quick = gr.Number(label="Seed", value=42, precision=0)
        quick_btn = gr.Button("Generate", variant="primary", size="lg")
        quick_output = gr.Audio(label="Generated Sound", type="numpy")
        quick_btn.click(
            fn=generate_sound,
            # Hidden empty textbox fills the unused `prompt` parameter so the
            # shared generate_sound(prompt, preset, seed) signature is satisfied.
            inputs=[gr.Textbox(visible=False, value=""), preset_dropdown, seed_quick],
            outputs=quick_output,
        )

    # Tab 2: generate up to three independent layers, then average them
    # with per-layer volume via mix_layers.
    with gr.Tab("🎛️ Layer Mixer"):
        gr.Markdown("Generate up to 3 layers and mix them into a soundscape.")
        with gr.Group():
            gr.Markdown("### Layer 1")
            with gr.Row():
                preset1 = gr.Dropdown(choices=["Custom"] + list(PRESETS.keys()), label="Preset", value="🌧️ Rain")
                prompt1 = gr.Textbox(label="Custom prompt (used when preset is 'Custom')", placeholder="ambient soundscape of...")
                seed1 = gr.Number(label="Seed", value=42, precision=0)
            btn1 = gr.Button("Generate Layer 1")
            audio1 = gr.Audio(label="Layer 1", type="numpy")
            vol1 = gr.Slider(0, 1, value=0.8, step=0.1, label="Volume")
        with gr.Group():
            gr.Markdown("### Layer 2")
            with gr.Row():
                preset2 = gr.Dropdown(choices=["Custom"] + list(PRESETS.keys()), label="Preset", value="⛈️ Thunderstorm")
                prompt2 = gr.Textbox(label="Custom prompt", placeholder="field recording of...")
                seed2 = gr.Number(label="Seed", value=7, precision=0)
            btn2 = gr.Button("Generate Layer 2")
            audio2 = gr.Audio(label="Layer 2", type="numpy")
            vol2 = gr.Slider(0, 1, value=0.5, step=0.1, label="Volume")
        with gr.Group():
            gr.Markdown("### Layer 3")
            with gr.Row():
                preset3 = gr.Dropdown(choices=["Custom"] + list(PRESETS.keys()), label="Preset", value="🔥 Campfire")
                prompt3 = gr.Textbox(label="Custom prompt", placeholder="high quality recording of...")
                seed3 = gr.Number(label="Seed", value=0, precision=0)
            btn3 = gr.Button("Generate Layer 3")
            audio3 = gr.Audio(label="Layer 3", type="numpy")
            vol3 = gr.Slider(0, 1, value=0.4, step=0.1, label="Volume")

        # Each button fills its own layer's audio component.
        btn1.click(fn=generate_sound, inputs=[prompt1, preset1, seed1], outputs=audio1)
        btn2.click(fn=generate_sound, inputs=[prompt2, preset2, seed2], outputs=audio2)
        btn3.click(fn=generate_sound, inputs=[prompt3, preset3, seed3], outputs=audio3)

        gr.Markdown("### 🎚️ Mix")
        mix_btn = gr.Button("Mix All Layers", variant="primary", size="lg")
        mix_output = gr.Audio(label="Mixed Soundscape", type="numpy")
        mix_btn.click(fn=mix_layers, inputs=[audio1, audio2, audio3, vol1, vol2, vol3], outputs=mix_output)

    # Tab 3: free-form prompt with exposed guidance scale and step count.
    with gr.Tab("✍️ Custom"):
        gr.Markdown(
            """
            Write your own prompt. For best results:
            - Start with a quality modifier: *"ambient soundscape of"*, *"field recording of"*
            - Add specific sounds: *"rain"*, *"crackling fire"*, *"flowing stream"*
            - Add spatial context: *"in a forest"*, *"on a mountain"*
            - Keep to 2-3 sound elements per generation
            """
        )
        custom_prompt = gr.Textbox(
            label="Your prompt",
            placeholder="ambient soundscape of a gentle stream flowing over rocks in a forest with birdsong",
            lines=2,
        )
        with gr.Row():
            custom_seed = gr.Number(label="Seed", value=42, precision=0)
            custom_guidance = gr.Slider(1.0, 7.0, value=3.0, step=0.5, label="Guidance Scale")
            custom_steps = gr.Slider(20, 100, value=40, step=10, label="Inference Steps")
        custom_btn = gr.Button("Generate", variant="primary", size="lg")
        custom_output = gr.Audio(label="Generated Sound", type="numpy")
        custom_btn.click(fn=generate_custom, inputs=[custom_prompt, custom_seed, custom_guidance, custom_steps], outputs=custom_output)

    # Footer links.
    gr.Markdown(
        """
        ---
        Built as a learning project exploring GenAI for audio.
        [Blog](https://my-sonicase.github.io/ambientgen/) ·
        [GitHub](https://github.com/my-sonicase/ambientgen) ·
        Powered by [AudioLDM2](https://huggingface.co/cvssp/audioldm2)
        """
    )

demo.launch()