# Uploaded to the Hugging Face Hub by pratik-250620 ("Upload folder using huggingface_hub").
# Commit 6835659 (verified).
"""
Audio generator with explicit backend tracking.
Phase 2: Stability over realism.
- Backend is always recorded (never ambiguous)
- Fallback ambient is the deterministic baseline
- AudioLDM 2 used only if explicitly available and stable
Upgrade note: AudioLDM 1 → AudioLDM 2 (cvssp/audioldm2)
- Better audio quality, same API surface
- unload() method added for sequential model loading within 16GB RAM
"""
from __future__ import annotations
from dataclasses import dataclass, asdict
from typing import Optional, Dict, Any
import numpy as np
import soundfile as sf
from pathlib import Path
@dataclass(frozen=True)
class AudioGenResult:
    """Immutable record describing one generated audio file."""

    # Path the audio file was written to.
    audio_path: str
    # Name of the backend that produced the file:
    # "audioldm2" or "fallback_ambient" (always filled in).
    backend: str
    # Integer derived from the prompt and seed by the generating backend.
    prompt_hash: int
    # Requested clip length, in seconds.
    duration_sec: float
    # Sample rate of the written file, in Hz.
    sample_rate: int
    # Optional free-form remark, e.g. why the fallback backend was used.
    note: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields of this record as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping
class AudioGenerator:
    """
    Audio generator with explicit backend selection.

    Strategy (Phase 2B):
    - Default: fallback_ambient (fully deterministic, always works)
    - Optional: AudioLDM 2 (if force_audioldm=True and model is available)

    The fallback ambient generator produces prompt-seeded ambient soundscapes.
    This is acceptable for a case study testing alignment behavior, not audio realism.
    """

    # Output rate of the cvssp/audioldm2 vocoder. Only used when the loaded
    # pipeline does not expose ``vocoder.config.sampling_rate`` itself.
    _AUDIOLDM_DEFAULT_SR = 16000

    def __init__(self, device: str = "cpu", force_audioldm: bool = False):
        """
        Create a generator, optionally loading AudioLDM 2.

        Args:
            device: Torch device string the pipeline is moved to ("cpu", "cuda", "mps", ...).
            force_audioldm: When True, try to load ``cvssp/audioldm2``. Any failure
                is recorded in ``_audioldm_error`` and the fallback backend is used.
        """
        self.device = device
        self._audioldm_pipe = None
        self._audioldm_backend_name = None
        self._torch = None
        self._audioldm_error = None
        if force_audioldm:
            try:
                from diffusers import AudioLDM2Pipeline
                import torch

                model_id = "cvssp/audioldm2"
                pipe = AudioLDM2Pipeline.from_pretrained(
                    model_id,
                    torch_dtype=torch.float16 if device != "cpu" else torch.float32,
                )
                pipe.to(self.device)
                # Publish the pipeline only once it is fully set up, so a failed
                # device move does not leave a half-initialised pipeline behind.
                self._audioldm_pipe = pipe
                self._audioldm_backend_name = f"AudioLDM2Pipeline({model_id})"
                self._torch = torch
            except Exception as exc:
                # Deliberate best-effort: remember why loading failed and fall back.
                self._audioldm_error = str(exc)

    def generate(
        self,
        prompt: str,
        out_path: str,
        duration_sec: float = 6.0,
        sr: int = 48000,
        seed: Optional[int] = None,
    ) -> AudioGenResult:
        """
        Generate audio for a prompt.

        Backend selection:
        1. If AudioLDM was loaded (force_audioldm=True): try it, fallback on error
        2. Otherwise: use fallback_ambient (deterministic baseline)

        Args:
            prompt: Text description of the desired audio.
            out_path: File path the audio is written to.
            duration_sec: Requested clip length in seconds.
            sr: Sample rate for the fallback backend. AudioLDM 2 output is
                written at the model's native rate instead (see the result's
                ``sample_rate``).
            seed: Optional seed mixed into the generation.

        Returns:
            AudioGenResult naming the backend that actually produced the file.
        """
        if self._audioldm_pipe is not None:
            try:
                return self._generate_audioldm(prompt, out_path, duration_sec, sr, seed)
            except Exception as exc:
                return self._generate_fallback(
                    prompt, out_path, duration_sec, sr, seed,
                    note=f"AudioLDM failed at runtime: {exc}",
                )
        return self._generate_fallback(
            prompt, out_path, duration_sec, sr, seed,
            note=self._audioldm_error or "Using deterministic fallback (default)",
        )

    def unload(self) -> None:
        """Free GPU/MPS memory by dropping the pipeline. Critical for 16GB RAM constraint."""
        import gc

        self._audioldm_pipe = None
        # Collect before emptying the allocator cache: memory held only through
        # reference cycles is not returned to the cache until it is collected.
        gc.collect()
        torch = self._torch
        if torch is None:
            return
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        elif (
            hasattr(torch.backends, "mps")
            and torch.backends.mps.is_available()
            and hasattr(torch, "mps")
            and hasattr(torch.mps, "empty_cache")
        ):
            torch.mps.empty_cache()

    @staticmethod
    def _stable_prompt_hash(prompt: str, seed: Optional[int] = None) -> int:
        """
        Return a 32-bit value derived from ``prompt`` (and ``seed``, if given).

        The built-in ``hash()`` of a str is salted per interpreter process, so it
        cannot be used for reproducible seeding; a SHA-256 digest is used instead.
        """
        import hashlib

        digest = hashlib.sha256(prompt.encode("utf-8", "surrogatepass")).digest()
        value = int.from_bytes(digest[:4], "big")
        if seed is not None:
            value = (value + seed) % (2**32)
        return value

    def _native_sample_rate(self) -> int:
        """Return the loaded pipeline's vocoder sample rate, or the 16 kHz default."""
        vocoder = getattr(self._audioldm_pipe, "vocoder", None)
        config = getattr(vocoder, "config", None)
        rate = getattr(config, "sampling_rate", None)
        return int(rate) if rate else self._AUDIOLDM_DEFAULT_SR

    def _generate_audioldm(
        self, prompt: str, out_path: str, duration_sec: float, sr: int, seed: Optional[int],
    ) -> AudioGenResult:
        """
        Generate with AudioLDM 2.

        The waveform is written at the pipeline's native sample rate, because
        labelling it with a different rate would change its pitch and duration.
        The rate actually used is reported in the result; a note is added when
        it differs from the requested ``sr``.
        """
        generator = None
        if seed is not None and self._torch is not None:
            # MPS generator must be created on CPU then used
            gen_device = "cpu" if self.device == "mps" else self.device
            generator = self._torch.Generator(device=gen_device).manual_seed(seed)
        kwargs = {"audio_length_in_s": duration_sec}
        if generator is not None:
            kwargs["generator"] = generator
        result = self._audioldm_pipe(prompt, **kwargs)
        audio = result.audios[0]
        native_sr = self._native_sample_rate()
        sf.write(out_path, audio, native_sr)
        note = None
        if native_sr != sr:
            note = (
                f"Requested sample rate {sr} Hz not applied; "
                f"AudioLDM 2 output written at native {native_sr} Hz"
            )
        return AudioGenResult(
            audio_path=out_path,
            backend="audioldm2",
            prompt_hash=self._stable_prompt_hash(prompt, seed),
            duration_sec=duration_sec,
            sample_rate=native_sr,
            note=note,
        )

    def _synthesize_ambient(
        self, prompt: str, duration_sec: float, sr: int, seed: Optional[int],
    ):
        """
        Build the fallback waveform in memory.

        Returns:
            ``(audio, base_seed)``: a float32 mono array of ``int(duration_sec * sr)``
            samples clipped to [-1, 1], and the 32-bit seed used for the noise.

        Raises:
            ValueError: if the requested duration and sample rate yield no samples.
        """
        n = int(duration_sec * sr)
        if n <= 0:
            raise ValueError(
                f"duration_sec * sr must yield at least one sample "
                f"(got duration_sec={duration_sec}, sr={sr})"
            )
        # Process-independent seed from prompt content (plus the caller's seed)
        base_seed = self._stable_prompt_hash(prompt, seed)
        rng = np.random.default_rng(base_seed)
        t = np.linspace(0, duration_sec, n, endpoint=False)
        # Prompt-dependent parameters: different prompts get different sounds
        prompt_val = sum(ord(c) for c in prompt)
        drone_freq = 80.0 + (prompt_val % 200)  # 80-279 Hz
        # 2000-7999 sample moving average, clamped to the clip length:
        # np.convolve(mode="same") returns max(len(a), len(v)) samples, so a
        # kernel longer than the clip would break the sum below.
        filter_width = min(2000 + (prompt_val % 6000), n)
        noise_amplitude = 0.02 + (prompt_val % 50) * 0.001  # 0.02-0.069
        drone_amplitude = 0.06 + (prompt_val % 40) * 0.001  # 0.06-0.099
        # Generate noise with prompt-dependent filtering
        noise = rng.normal(0, 1, size=n).astype(np.float32)
        kernel = np.ones(filter_width, dtype=np.float32) / filter_width
        noise = np.convolve(noise, kernel, mode="same")
        # Prompt-dependent drone
        drone = drone_amplitude * np.sin(2 * np.pi * drone_freq * t).astype(np.float32)
        # Add a quieter overtone (1.5x the drone plus a prompt offset) for richer sound
        harmonic_freq = drone_freq * 1.5 + (prompt_val % 100)
        harmonic = (drone_amplitude * 0.3) * np.sin(2 * np.pi * harmonic_freq * t).astype(np.float32)
        audio = (noise_amplitude * noise + drone + harmonic).astype(np.float32)
        return np.clip(audio, -1.0, 1.0), base_seed

    def _generate_fallback(
        self,
        prompt: str,
        out_path: str,
        duration_sec: float,
        sr: int,
        seed: Optional[int],
        note: str = "",
    ) -> AudioGenResult:
        """
        Deterministic ambient soundscape generator.

        Produces prompt-dependent audio by seeding the RNG from a stable digest
        of the prompt plus ``seed``. Different prompts produce different
        spectral characteristics:
        - Drone frequency varies with prompt
        - Noise filtering varies with prompt
        - Amplitude envelope varies with prompt
        This ensures wrong_audio perturbations produce genuinely different audio.

        Raises:
            ValueError: if ``duration_sec * sr`` yields no samples.
        """
        audio, base_seed = self._synthesize_ambient(prompt, duration_sec, sr, seed)
        sf.write(out_path, audio, sr)
        return AudioGenResult(
            audio_path=out_path,
            backend="fallback_ambient",
            prompt_hash=base_seed,
            duration_sec=duration_sec,
            sample_rate=sr,
            note=note,
        )
def generate_audio(
    prompt: str,
    out_dir: str,
    filename: str = "audio.wav",
    device: str = "cpu",
    deterministic: bool = True,
    seed: int = 42,
) -> str:
    """
    Write audio for ``prompt`` into ``out_dir`` and return the file path.

    The output directory is created if needed. The generator is built without
    requesting AudioLDM, so the fallback backend is used. ``seed`` is passed on
    only when ``deterministic`` is True; otherwise no seed is supplied.
    """
    target = Path(out_dir).joinpath(filename)
    target.parent.mkdir(parents=True, exist_ok=True)

    if deterministic:
        effective_seed = seed
    else:
        effective_seed = None

    outcome = AudioGenerator(device=device).generate(
        prompt=prompt,
        out_path=str(target),
        seed=effective_seed,
    )
    return outcome.audio_path
def generate_audio_with_metadata(
    prompt: str,
    out_dir: str,
    filename: str = "audio.wav",
    device: str = "cpu",
    deterministic: bool = True,
    seed: int = 42,
) -> AudioGenResult:
    """
    Write audio for ``prompt`` into ``out_dir`` and return the full result record.

    Intended for experiment pipelines that need to know which backend produced
    the file. The output directory is created if needed; ``seed`` is passed on
    only when ``deterministic`` is True.
    """
    destination = Path(out_dir).joinpath(filename)
    destination.parent.mkdir(parents=True, exist_ok=True)

    chosen_seed = None
    if deterministic:
        chosen_seed = seed

    audio_gen = AudioGenerator(device=device)
    record = audio_gen.generate(
        prompt=prompt,
        out_path=str(destination),
        seed=chosen_seed,
    )
    return record