"""AmbientGen: a Gradio app that generates and mixes ambient soundscapes
with AudioLDM2 (text-to-audio diffusion)."""

import gradio as gr
import spaces
import torch
import numpy as np
from diffusers import AudioLDM2Pipeline

# --- Model Loading ---
# The pipeline is loaded once at import time on CPU in fp16; it is moved to
# CUDA lazily inside the @spaces.GPU-decorated handlers (ZeroGPU pattern).
print("Loading AudioLDM2 model...")
pipe = AudioLDM2Pipeline.from_pretrained(
    "cvssp/audioldm2",
    torch_dtype=torch.float16,
)
print("Model loaded!")

# AudioLDM2 emits mono waveforms at 16 kHz.
SAMPLE_RATE = 16000

# Curated prompts; the keys double as dropdown labels in the UI.
PRESETS = {
    "🌧️ Rain": "ambient soundscape of gentle rain falling on a window",
    "🌊 Ocean": "field recording of ocean waves crashing on a rocky shore with distant seagulls",
    "🔥 Campfire": "high quality recording of a campfire crackling and popping at night with crickets",
    "🌲 Forest": "ambient soundscape of a forest at night with crickets and a gentle breeze through trees",
    "⛈️ Thunderstorm": "field recording of a thunderstorm with heavy rain and distant rolling thunder",
    "☕ Coffee Shop": "ambient soundscape of a busy coffee shop with quiet chatter and cups clinking",
    "🌬️ Wind": "field recording of gentle wind blowing through pine trees on a mountain, leaves rustling",
    "🦉 Night": "ambient soundscape of a quiet night with owls hooting and distant frogs",
}


def _run_pipeline(prompt, seed, guidance_scale=3.0, steps=40):
    """Run AudioLDM2 on ``prompt`` and return ``(SAMPLE_RATE, waveform)``.

    Shared by both generation handlers so diffusion parameters stay
    consistent in one place.

    Args:
        prompt: Text description of the desired sound.
        seed: Integer seed for reproducible generation.
        guidance_scale: Classifier-free guidance strength.
        steps: Number of denoising steps.

    Returns:
        Tuple of (sample rate in Hz, 1-D float waveform) — the numpy
        audio format Gradio's ``gr.Audio(type="numpy")`` expects.
    """
    pipe.to("cuda")
    # Seeded CUDA generator makes output reproducible for a given prompt/seed.
    generator = torch.Generator("cuda").manual_seed(int(seed))
    audio = pipe(
        prompt,
        negative_prompt="Low quality.",
        num_inference_steps=int(steps),
        audio_length_in_s=8.0,
        guidance_scale=guidance_scale,
        generator=generator,
    ).audios[0]
    return (SAMPLE_RATE, audio)


@spaces.GPU
def generate_sound(prompt, preset, seed):
    """Generate a single ambient sound layer.

    Uses the preset's canned prompt unless the preset is "Custom" (or
    empty), in which case the free-text ``prompt`` is used.

    Raises:
        gr.Error: If no preset is chosen and the prompt is blank.
    """
    if preset and preset != "Custom":
        final_prompt = PRESETS[preset]
    else:
        if not prompt or prompt.strip() == "":
            raise gr.Error("Please enter a prompt or select a preset.")
        final_prompt = prompt
    return _run_pipeline(final_prompt, seed)


@spaces.GPU
def generate_custom(prompt, seed, guidance_scale, steps):
    """Generate from a custom prompt with user-tunable diffusion settings.

    Raises:
        gr.Error: If the prompt is blank.
    """
    if not prompt or prompt.strip() == "":
        raise gr.Error("Please enter a prompt.")
    return _run_pipeline(prompt, seed, guidance_scale, steps)


def mix_layers(audio1, audio2, audio3, vol1, vol2, vol3):
    """Mix up to 3 audio layers with per-layer volume control.

    Each present layer is peak-normalized, scaled by its volume slider,
    zero-padded to the longest layer, then averaged. The mix is finally
    re-normalized to 0.9 full scale to leave clipping headroom.

    Args:
        audio1..audio3: ``(sample_rate, waveform)`` tuples or ``None``.
        vol1..vol3: Volume multipliers in [0, 1].

    Raises:
        gr.Error: If no layer has been generated yet.
    """
    layers = []
    for audio, vol in [(audio1, vol1), (audio2, vol2), (audio3, vol3)]:
        if audio is None:
            continue
        sr, data = audio
        data = data.astype(np.float32)
        peak = np.max(np.abs(data))
        if peak > 0:
            # Peak-normalize first so the volume sliders behave predictably
            # regardless of how loud the raw generation happened to be.
            data = data / peak
        layers.append((sr, data * vol))
    if not layers:
        raise gr.Error("Generate at least one layer before mixing.")
    # All layers come from the same pipeline, so rates match; use the first.
    sr = layers[0][0]
    max_len = max(len(l[1]) for l in layers)
    # Right-pad shorter layers with silence so they can be summed elementwise.
    padded = [np.pad(l[1], (0, max_len - len(l[1]))) for l in layers]
    mixed = sum(padded) / len(padded)
    peak = np.max(np.abs(mixed))
    if peak > 0:
        mixed = mixed / peak * 0.9  # 0.9 leaves headroom against clipping
    return (sr, mixed)


# --- Gradio Interface ---
with gr.Blocks(title="AmbientGen", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🌧️ AmbientGen
        ### Generate ambient soundscapes with AI

        Create layered ambient sounds by generating individual elements and mixing them together.
        Powered by [AudioLDM2](https://huggingface.co/cvssp/audioldm2).

        **Tip:** Use presets for quick results, or write custom prompts. Start with a quality modifier
        like "ambient soundscape of" or "field recording of" for best results.
        """
    )

    with gr.Tab("🎧 Quick Generate"):
        with gr.Row():
            preset_dropdown = gr.Dropdown(
                choices=list(PRESETS.keys()),
                label="Choose a preset",
                value="🌧️ Rain",
            )
            seed_quick = gr.Number(label="Seed", value=42, precision=0)
        quick_btn = gr.Button("Generate", variant="primary", size="lg")
        quick_output = gr.Audio(label="Generated Sound", type="numpy")
        quick_btn.click(
            fn=generate_sound,
            # gr.State carries the empty custom-prompt slot without rendering
            # an (invisible) component into the layout.
            inputs=[gr.State(""), preset_dropdown, seed_quick],
            outputs=quick_output,
        )

    with gr.Tab("🎛️ Layer Mixer"):
        gr.Markdown("Generate up to 3 layers and mix them into a soundscape.")

        with gr.Group():
            gr.Markdown("### Layer 1")
            with gr.Row():
                preset1 = gr.Dropdown(choices=["Custom"] + list(PRESETS.keys()), label="Preset", value="🌧️ Rain")
                prompt1 = gr.Textbox(label="Custom prompt (used when preset is 'Custom')", placeholder="ambient soundscape of...")
                seed1 = gr.Number(label="Seed", value=42, precision=0)
            btn1 = gr.Button("Generate Layer 1")
            audio1 = gr.Audio(label="Layer 1", type="numpy")
            vol1 = gr.Slider(0, 1, value=0.8, step=0.1, label="Volume")

        with gr.Group():
            gr.Markdown("### Layer 2")
            with gr.Row():
                preset2 = gr.Dropdown(choices=["Custom"] + list(PRESETS.keys()), label="Preset", value="⛈️ Thunderstorm")
                prompt2 = gr.Textbox(label="Custom prompt", placeholder="field recording of...")
                seed2 = gr.Number(label="Seed", value=7, precision=0)
            btn2 = gr.Button("Generate Layer 2")
            audio2 = gr.Audio(label="Layer 2", type="numpy")
            vol2 = gr.Slider(0, 1, value=0.5, step=0.1, label="Volume")

        with gr.Group():
            gr.Markdown("### Layer 3")
            with gr.Row():
                preset3 = gr.Dropdown(choices=["Custom"] + list(PRESETS.keys()), label="Preset", value="🔥 Campfire")
                prompt3 = gr.Textbox(label="Custom prompt", placeholder="high quality recording of...")
                seed3 = gr.Number(label="Seed", value=0, precision=0)
            btn3 = gr.Button("Generate Layer 3")
            audio3 = gr.Audio(label="Layer 3", type="numpy")
            vol3 = gr.Slider(0, 1, value=0.4, step=0.1, label="Volume")

        btn1.click(fn=generate_sound, inputs=[prompt1, preset1, seed1], outputs=audio1)
        btn2.click(fn=generate_sound, inputs=[prompt2, preset2, seed2], outputs=audio2)
        btn3.click(fn=generate_sound, inputs=[prompt3, preset3, seed3], outputs=audio3)

        gr.Markdown("### 🎚️ Mix")
        mix_btn = gr.Button("Mix All Layers", variant="primary", size="lg")
        mix_output = gr.Audio(label="Mixed Soundscape", type="numpy")
        mix_btn.click(
            fn=mix_layers,
            inputs=[audio1, audio2, audio3, vol1, vol2, vol3],
            outputs=mix_output,
        )

    with gr.Tab("✍️ Custom"):
        gr.Markdown(
            """
            Write your own prompt. For best results:
            - Start with a quality modifier: *"ambient soundscape of"*, *"field recording of"*
            - Add specific sounds: *"rain"*, *"crackling fire"*, *"flowing stream"*
            - Add spatial context: *"in a forest"*, *"on a mountain"*
            - Keep to 2-3 sound elements per generation
            """
        )
        custom_prompt = gr.Textbox(
            label="Your prompt",
            placeholder="ambient soundscape of a gentle stream flowing over rocks in a forest with birdsong",
            lines=2,
        )
        with gr.Row():
            custom_seed = gr.Number(label="Seed", value=42, precision=0)
            custom_guidance = gr.Slider(1.0, 7.0, value=3.0, step=0.5, label="Guidance Scale")
            custom_steps = gr.Slider(20, 100, value=40, step=10, label="Inference Steps")
        custom_btn = gr.Button("Generate", variant="primary", size="lg")
        custom_output = gr.Audio(label="Generated Sound", type="numpy")
        custom_btn.click(
            fn=generate_custom,
            inputs=[custom_prompt, custom_seed, custom_guidance, custom_steps],
            outputs=custom_output,
        )

    gr.Markdown(
        """
        ---
        Built as a learning project exploring GenAI for audio.
        [Blog](https://my-sonicase.github.io/ambientgen/) · [GitHub](https://github.com/my-sonicase/ambientgen) · Powered by [AudioLDM2](https://huggingface.co/cvssp/audioldm2)
        """
    )

demo.launch()