Spaces:

sonicase
/

ambientgen

Sleeping

File size: 8,421 Bytes

import gradio as gr
import spaces
import torch
import numpy as np
from diffusers import AudioLDM2Pipeline

# --- Model Loading ---
# The pipeline is constructed once, at import time, in half precision. The
# request handlers further down reuse this single module-level instance.
print("Loading AudioLDM2 model...")
pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2", torch_dtype=torch.float16)
print("Model loaded!")

# Sample rate (Hz) paired with every generated waveform returned to Gradio.
SAMPLE_RATE = 16_000

# Preset label -> text prompt sent to the model.
# The keys double as the dropdown choices in the UI, so the default `value=`
# strings used by the dropdowns must match these keys exactly (emoji included).
PRESETS = {
    "🌧️ Rain": "ambient soundscape of gentle rain falling on a window",
    "🌊 Ocean": "field recording of ocean waves crashing on a rocky shore with distant seagulls",
    "🔥 Campfire": "high quality recording of a campfire crackling and popping at night with crickets",
    "🌲 Forest": "ambient soundscape of a forest at night with crickets and a gentle breeze through trees",
    "⛈️ Thunderstorm": "field recording of a thunderstorm with heavy rain and distant rolling thunder",
    "☕ Coffee Shop": "ambient soundscape of a busy coffee shop with quiet chatter and cups clinking",
    "🌬️ Wind": "field recording of gentle wind blowing through pine trees on a mountain, leaves rustling",
    "🦉 Night": "ambient soundscape of a quiet night with owls hooting and distant frogs",
}


@spaces.GPU
def generate_sound(prompt, preset, seed):
    """Generate a single ambient sound layer.

    Args:
        prompt: Free-text description of the sound. Only consulted when
            ``preset`` is empty or ``"Custom"``.
        preset: Key into ``PRESETS``. A falsy value or ``"Custom"`` selects
            the free-text ``prompt`` instead.
        seed: Integer-like seed for the CUDA random generator. ``None`` (what
            a cleared ``gr.Number`` field delivers) is rejected.

    Returns:
        A ``(SAMPLE_RATE, audio)`` tuple, where ``audio`` is the first
        waveform produced by the pipeline.

    Raises:
        gr.Error: If neither a preset nor a non-blank prompt is supplied, or
            if ``seed`` is ``None``.
        KeyError: If ``preset`` is neither ``"Custom"`` nor a key of
            ``PRESETS``.
    """
    if preset and preset != "Custom":
        final_prompt = PRESETS[preset]
    elif prompt and prompt.strip():
        final_prompt = prompt
    else:
        raise gr.Error("Please enter a prompt or select a preset.")

    # Without this check int(None) would surface as an opaque TypeError
    # instead of a message the user can act on.
    if seed is None:
        raise gr.Error("Please enter a seed.")

    # All input validation happens above, before the model is touched.
    pipe.to("cuda")
    generator = torch.Generator("cuda").manual_seed(int(seed))

    # Fixed sampling settings for this entry point: 40 steps, 8 s clip,
    # guidance 3.0.
    audio = pipe(
        final_prompt,
        negative_prompt="Low quality.",
        num_inference_steps=40,
        audio_length_in_s=8.0,
        guidance_scale=3.0,
        generator=generator,
    ).audios[0]

    return (SAMPLE_RATE, audio)


@spaces.GPU
def generate_custom(prompt, seed, guidance_scale, steps):
    """Generate from custom prompt with advanced settings.

    Args:
        prompt: Free-text description of the sound; must not be blank.
        seed: Integer-like seed for the CUDA random generator. ``None`` (what
            a cleared ``gr.Number`` field delivers) is rejected.
        guidance_scale: Forwarded unchanged to the pipeline as
            ``guidance_scale``.
        steps: Number of inference steps; coerced with ``int()``.

    Returns:
        A ``(SAMPLE_RATE, audio)`` tuple, where ``audio`` is the first
        waveform produced by the pipeline.

    Raises:
        gr.Error: If ``prompt`` is empty/blank or ``seed`` is ``None``.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt.")

    # Without this check int(None) would surface as an opaque TypeError
    # instead of a message the user can act on.
    if seed is None:
        raise gr.Error("Please enter a seed.")

    # All input validation happens above, before the model is touched.
    pipe.to("cuda")
    generator = torch.Generator("cuda").manual_seed(int(seed))

    audio = pipe(
        prompt,
        negative_prompt="Low quality.",
        num_inference_steps=int(steps),
        audio_length_in_s=8.0,
        guidance_scale=guidance_scale,
        generator=generator,
    ).audios[0]

    return (SAMPLE_RATE, audio)


def _prepare_layer(data, vol):
    """Return *data* as a peak-normalised float32 (samples, channels) array scaled by *vol*."""
    track = np.asarray(data, dtype=np.float32)
    if track.ndim == 1:
        # Promote mono to a single column so every layer has the same rank.
        track = track[:, np.newaxis]
    peak = np.max(np.abs(track))
    if peak > 0:
        track = track / peak
    return track * vol


def _resample_linear(track, src_sr, dst_sr):
    """Linearly interpolate a (samples, channels) array from *src_sr* to *dst_sr*."""
    n_out = max(1, int(round(len(track) * dst_sr / src_sr)))
    src_t = np.arange(len(track)) / src_sr
    dst_t = np.arange(n_out) / dst_sr
    # np.interp is 1-D only, so interpolate each channel separately.
    columns = [np.interp(dst_t, src_t, track[:, c]) for c in range(track.shape[1])]
    return np.stack(columns, axis=1).astype(np.float32)


def mix_layers(audio1, audio2, audio3, vol1, vol2, vol3):
    """Mix up to 3 audio layers with volume control.

    Each present layer is converted to float32, peak-normalised, scaled by its
    volume, and the layers are averaged. The mix is then re-normalised to a
    peak of 0.9. For mono layers sharing one sample rate the result is the
    same as a plain zero-padded average.

    Args:
        audio1, audio2, audio3: ``(sample_rate, samples)`` tuples or ``None``.
            ``samples`` may be 1-D (mono) or 2-D ``(samples, channels)``.
            ``None`` and zero-length layers are ignored.
        vol1, vol2, vol3: Gain applied to the matching layer after
            peak-normalisation.

    Returns:
        ``(sample_rate, mixed)``. ``sample_rate`` is that of the first usable
        layer; layers with a different rate are linearly resampled to it.
        ``mixed`` is float32, 1-D when every layer is mono, otherwise
        ``(samples, channels)``.

    Raises:
        gr.Error: If no usable layer was supplied.
    """
    layers = []
    for audio, vol in ((audio1, vol1), (audio2, vol2), (audio3, vol3)):
        if audio is None:
            continue
        sr, data = audio
        # np.max raises on an empty array, so drop empty layers up front.
        if np.size(data) == 0:
            continue
        layers.append((sr, _prepare_layer(data, vol)))

    if not layers:
        raise gr.Error("Generate at least one layer before mixing.")

    target_sr = layers[0][0]
    # Summing sample-for-sample is only meaningful at a common rate.
    tracks = [
        track if sr == target_sr else _resample_linear(track, sr, target_sr)
        for sr, track in layers
    ]

    n_channels = max(track.shape[1] for track in tracks)
    max_len = max(len(track) for track in tracks)

    # Accumulating into a zero buffer pads along the time axis only;
    # np.pad with a flat (0, k) width would also pad the channel axis.
    mixed = np.zeros((max_len, n_channels), dtype=np.float32)
    for track in tracks:
        if track.shape[1] not in (1, n_channels):
            # Mismatched channel counts: fall back to a mono downmix, which
            # then broadcasts across every output channel.
            track = track.mean(axis=1, keepdims=True)
        mixed[: len(track)] += track
    mixed /= len(tracks)

    peak = np.max(np.abs(mixed))
    if peak > 0:
        # Leave a little headroom below full scale.
        mixed = mixed / peak * 0.9

    if n_channels == 1:
        # Mono in, mono out: return a 1-D array.
        mixed = mixed.reshape(-1)

    return (target_sr, mixed)


# --- Gradio Interface ---
# Three tabs: one-click presets, a three-layer mixer, and a free-form prompt
# with adjustable sampling settings.
with gr.Blocks(title="AmbientGen", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🌧️ AmbientGen
        ### Generate ambient soundscapes with AI

        Create layered ambient sounds by generating individual elements and mixing them together.
        Powered by [AudioLDM2](https://huggingface.co/cvssp/audioldm2).

        **Tip:** Use presets for quick results, or write custom prompts. Start with a quality modifier
        like "ambient soundscape of" or "field recording of" for best results.
        """
    )

    # Tab 1: choose a preset and a seed, get one clip.
    with gr.Tab("🎧 Quick Generate"):
        with gr.Row():
            preset_dropdown = gr.Dropdown(
                label="Choose a preset",
                choices=list(PRESETS),
                value="🌧️ Rain",
            )
            seed_quick = gr.Number(value=42, label="Seed", precision=0)

        quick_btn = gr.Button("Generate", size="lg", variant="primary")
        quick_output = gr.Audio(type="numpy", label="Generated Sound")

        # generate_sound takes a prompt as its first argument; this tab has no
        # prompt field, so an invisible empty textbox fills that slot.
        blank_prompt = gr.Textbox(value="", visible=False)
        quick_btn.click(
            fn=generate_sound,
            inputs=[blank_prompt, preset_dropdown, seed_quick],
            outputs=quick_output,
        )

    # Tab 2: three independently generated layers plus a mixdown.
    with gr.Tab("🎛️ Layer Mixer"):
        gr.Markdown("Generate up to 3 layers and mix them into a soundscape.")

        # One row per layer:
        # (default preset, prompt label, prompt placeholder, default seed, default volume)
        layer_defaults = (
            ("🌧️ Rain", "Custom prompt (used when preset is 'Custom')", "ambient soundscape of...", 42, 0.8),
            ("⛈️ Thunderstorm", "Custom prompt", "field recording of...", 7, 0.5),
            ("🔥 Campfire", "Custom prompt", "high quality recording of...", 0, 0.4),
        )

        layer_widgets = []
        for layer_no, layer_cfg in enumerate(layer_defaults, start=1):
            default_preset, prompt_label, prompt_hint, default_seed, default_volume = layer_cfg
            with gr.Group():
                gr.Markdown(f"### Layer {layer_no}")
                with gr.Row():
                    preset_box = gr.Dropdown(
                        choices=["Custom", *PRESETS],
                        label="Preset",
                        value=default_preset,
                    )
                    prompt_box = gr.Textbox(label=prompt_label, placeholder=prompt_hint)
                    seed_box = gr.Number(label="Seed", value=default_seed, precision=0)
                layer_btn = gr.Button(f"Generate Layer {layer_no}")
                layer_audio = gr.Audio(label=f"Layer {layer_no}", type="numpy")
                layer_volume = gr.Slider(0, 1, value=default_volume, step=0.1, label="Volume")
            layer_widgets.append(
                (layer_btn, prompt_box, preset_box, seed_box, layer_audio, layer_volume)
            )

        # Wire each layer's button once all layers exist, in layer order.
        for layer_btn, prompt_box, preset_box, seed_box, layer_audio, _ in layer_widgets:
            layer_btn.click(
                fn=generate_sound,
                inputs=[prompt_box, preset_box, seed_box],
                outputs=layer_audio,
            )

        gr.Markdown("### 🎚️ Mix")
        mix_btn = gr.Button("Mix All Layers", variant="primary", size="lg")
        mix_output = gr.Audio(label="Mixed Soundscape", type="numpy")

        # mix_layers expects the three audio values first, then the three volumes.
        layer_audios = [widgets[4] for widgets in layer_widgets]
        layer_volumes = [widgets[5] for widgets in layer_widgets]
        mix_btn.click(
            fn=mix_layers,
            inputs=[*layer_audios, *layer_volumes],
            outputs=mix_output,
        )

    # Tab 3: free-form prompt with guidance and step controls.
    with gr.Tab("✍️ Custom"):
        gr.Markdown(
            """
            Write your own prompt. For best results:
            - Start with a quality modifier: *"ambient soundscape of"*, *"field recording of"*
            - Add specific sounds: *"rain"*, *"crackling fire"*, *"flowing stream"*
            - Add spatial context: *"in a forest"*, *"on a mountain"*
            - Keep to 2-3 sound elements per generation
            """
        )
        custom_prompt = gr.Textbox(
            lines=2,
            label="Your prompt",
            placeholder="ambient soundscape of a gentle stream flowing over rocks in a forest with birdsong",
        )
        with gr.Row():
            custom_seed = gr.Number(value=42, label="Seed", precision=0)
            custom_guidance = gr.Slider(1.0, 7.0, step=0.5, value=3.0, label="Guidance Scale")
            custom_steps = gr.Slider(20, 100, step=10, value=40, label="Inference Steps")

        custom_btn = gr.Button("Generate", size="lg", variant="primary")
        custom_output = gr.Audio(type="numpy", label="Generated Sound")
        custom_btn.click(
            fn=generate_custom,
            inputs=[custom_prompt, custom_seed, custom_guidance, custom_steps],
            outputs=custom_output,
        )

    gr.Markdown(
        """
        ---
        Built as a learning project exploring GenAI for audio.
        [Blog](https://my-sonicase.github.io/ambientgen/) ·
        [GitHub](https://github.com/my-sonicase/ambientgen) ·
        Powered by [AudioLDM2](https://huggingface.co/cvssp/audioldm2)
        """
    )

demo.launch()