Spaces:

pratik-250620
/

MultiModal-Coherence-AI

Running

File size: 8,559 Bytes
"""
Audio generator with explicit backend tracking.

Phase 2: Stability over realism.
- Backend is always recorded (never ambiguous)
- Fallback ambient is the deterministic baseline
- AudioLDM 2 used only if explicitly available and stable

Upgrade note: AudioLDM 1 → AudioLDM 2 (cvssp/audioldm2)
- Better audio quality, same API surface
- unload() method added for sequential model loading within 16GB RAM
"""

from __future__ import annotations

from dataclasses import dataclass, asdict
from typing import Optional, Dict, Any

import numpy as np
import soundfile as sf
from pathlib import Path


@dataclass(frozen=True)
class AudioGenResult:
    """Result of audio generation with full metadata."""
    audio_path: str
    backend: str  # "audioldm2" or "fallback_ambient" — always explicit
    prompt_hash: int  # Deterministic hash of (prompt, seed) for reproducibility
    duration_sec: float
    sample_rate: int
    note: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)


class AudioGenerator:
    """
    Audio generator with explicit backend selection.

    Strategy (Phase 2B):
    - Default: fallback_ambient (fully deterministic, always works)
    - Optional: AudioLDM 2 (if force_audioldm=True and model is available)

    The fallback ambient generator produces prompt-seeded ambient soundscapes.
    This is acceptable for a case study testing alignment behavior, not audio realism.
    """

    def __init__(self, device: str = "cpu", force_audioldm: bool = False):
        self.device = device
        self._audioldm_pipe = None
        self._audioldm_backend_name = None
        self._torch = None
        self._audioldm_error = None

        if force_audioldm:
            try:
                from diffusers import AudioLDM2Pipeline
                import torch

                model_id = "cvssp/audioldm2"
                self._audioldm_pipe = AudioLDM2Pipeline.from_pretrained(
                    model_id,
                    torch_dtype=torch.float16 if device != "cpu" else torch.float32,
                )
                self._audioldm_pipe.to(self.device)
                self._audioldm_backend_name = f"AudioLDM2Pipeline({model_id})"
                self._torch = torch
            except Exception as exc:
                self._audioldm_error = str(exc)

    def generate(
        self,
        prompt: str,
        out_path: str,
        duration_sec: float = 6.0,
        sr: int = 48000,
        seed: Optional[int] = None,
    ) -> AudioGenResult:
        """
        Generate audio for a prompt.

        Backend selection:
        1. If AudioLDM was loaded (force_audioldm=True): try it, fallback on error
        2. Otherwise: use fallback_ambient (deterministic baseline)
        """
        if self._audioldm_pipe is not None:
            try:
                return self._generate_audioldm(prompt, out_path, duration_sec, sr, seed)
            except Exception as exc:
                return self._generate_fallback(
                    prompt, out_path, duration_sec, sr, seed,
                    note=f"AudioLDM failed at runtime: {exc}",
                )

        return self._generate_fallback(
            prompt, out_path, duration_sec, sr, seed,
            note=self._audioldm_error or "Using deterministic fallback (default)",
        )

    def unload(self) -> None:
        """Free GPU/MPS memory by deleting the pipeline. Critical for 16GB RAM constraint."""
        if self._audioldm_pipe is not None:
            del self._audioldm_pipe
            self._audioldm_pipe = None
            if self._torch is not None:
                if self._torch.cuda.is_available():
                    self._torch.cuda.empty_cache()
                elif hasattr(self._torch.backends, "mps") and self._torch.backends.mps.is_available():
                    self._torch.mps.empty_cache()
            import gc
            gc.collect()

    def _generate_audioldm(
        self, prompt: str, out_path: str, duration_sec: float, sr: int, seed: Optional[int],
    ) -> AudioGenResult:
        """Generate with AudioLDM 2."""
        generator = None
        if seed is not None and self._torch is not None:
            # MPS generator must be created on CPU then used
            gen_device = "cpu" if self.device == "mps" else self.device
            generator = self._torch.Generator(device=gen_device).manual_seed(seed)
        kwargs = {"audio_length_in_s": duration_sec}
        if generator is not None:
            kwargs["generator"] = generator
        result = self._audioldm_pipe(prompt, **kwargs)
        audio = result.audios[0]
        sf.write(out_path, audio, sr)

        prompt_hash = abs(hash((prompt, seed))) % (2**32)
        return AudioGenResult(
            audio_path=out_path,
            backend="audioldm2",
            prompt_hash=prompt_hash,
            duration_sec=duration_sec,
            sample_rate=sr,
        )

    def _generate_fallback(
        self,
        prompt: str,
        out_path: str,
        duration_sec: float,
        sr: int,
        seed: Optional[int],
        note: str = "",
    ) -> AudioGenResult:
        """
        Deterministic ambient soundscape generator.

        Produces prompt-dependent audio by seeding RNG from hash(prompt) + seed.
        Different prompts produce different spectral characteristics:
        - Drone frequency varies with prompt
        - Noise filtering varies with prompt
        - Amplitude envelope varies with prompt

        This ensures wrong_audio perturbations produce genuinely different audio.
        """
        # Deterministic seed from prompt content
        base_seed = abs(hash(prompt)) % (2**32)
        if seed is not None:
            base_seed = (base_seed + seed) % (2**32)
        rng = np.random.default_rng(base_seed)

        n = int(duration_sec * sr)
        t = np.linspace(0, duration_sec, n, endpoint=False)

        # Prompt-dependent parameters — different prompts get different sounds
        prompt_val = sum(ord(c) for c in prompt)
        drone_freq = 80.0 + (prompt_val % 200)  # 80-280 Hz range
        filter_width = 2000 + (prompt_val % 6000)  # 2000-8000 sample filter
        noise_amplitude = 0.02 + (prompt_val % 50) * 0.001  # 0.02-0.07
        drone_amplitude = 0.06 + (prompt_val % 40) * 0.001  # 0.06-0.10

        # Generate noise with prompt-dependent filtering
        noise = rng.normal(0, 1, size=n).astype(np.float32)
        kernel = np.ones(filter_width, dtype=np.float32) / filter_width
        noise = np.convolve(noise, kernel, mode="same")

        # Prompt-dependent drone
        drone = drone_amplitude * np.sin(2 * np.pi * drone_freq * t).astype(np.float32)

        # Add second harmonic for richer sound
        harmonic_freq = drone_freq * 1.5 + (prompt_val % 100)
        harmonic = (drone_amplitude * 0.3) * np.sin(2 * np.pi * harmonic_freq * t).astype(np.float32)

        audio = (noise_amplitude * noise + drone + harmonic).astype(np.float32)
        audio = np.clip(audio, -1.0, 1.0)

        sf.write(out_path, audio, sr)

        return AudioGenResult(
            audio_path=out_path,
            backend="fallback_ambient",
            prompt_hash=base_seed,
            duration_sec=duration_sec,
            sample_rate=sr,
            note=note,
        )


def generate_audio(
    prompt: str,
    out_dir: str,
    filename: str = "audio.wav",
    device: str = "cpu",
    deterministic: bool = True,
    seed: int = 42,
) -> str:
    """
    Generate audio for a prompt. Returns path to audio file.

    Uses deterministic fallback by default (stable for experiments).
    """
    out_path = Path(out_dir) / filename
    out_path.parent.mkdir(parents=True, exist_ok=True)
    generator = AudioGenerator(device=device)
    seed_value = seed if deterministic else None
    result = generator.generate(prompt=prompt, out_path=str(out_path), seed=seed_value)
    return result.audio_path


def generate_audio_with_metadata(
    prompt: str,
    out_dir: str,
    filename: str = "audio.wav",
    device: str = "cpu",
    deterministic: bool = True,
    seed: int = 42,
) -> AudioGenResult:
    """
    Generate audio and return full metadata.

    Use this in experiment pipelines where backend tracking matters.
    """
    out_path = Path(out_dir) / filename
    out_path.parent.mkdir(parents=True, exist_ok=True)
    generator = AudioGenerator(device=device)
    seed_value = seed if deterministic else None
    return generator.generate(prompt=prompt, out_path=str(out_path), seed=seed_value)