| | """ |
| | Audio generator with explicit backend tracking. |
| | |
| | Phase 2: Stability over realism. |
| | - Backend is always recorded (never ambiguous) |
| | - Fallback ambient is the deterministic baseline |
| | - AudioLDM 2 used only if explicitly available and stable |
| | |
| | Upgrade note: AudioLDM 1 → AudioLDM 2 (cvssp/audioldm2) |
| | - Better audio quality, same API surface |
| | - unload() method added for sequential model loading within 16GB RAM |
| | """ |
| |
|
| | from __future__ import annotations |
| |
|
| | from dataclasses import dataclass, asdict |
| | from typing import Optional, Dict, Any |
| |
|
| | import numpy as np |
| | import soundfile as sf |
| | from pathlib import Path |
| |
|
| |
|
@dataclass(frozen=True)
class AudioGenResult:
    """Immutable record describing one generated audio file.

    Instances are frozen, so a result cannot be altered after the
    generator has returned it.
    """

    audio_path: str  # path the audio file was written to
    backend: str  # identifier of the backend that produced the audio
    prompt_hash: int  # integer derived from the prompt (and seed) by the generator
    duration_sec: float  # requested clip length in seconds
    sample_rate: int  # sample rate, in Hz, recorded for the written file
    note: Optional[str] = None  # optional free-form remark (e.g. why a fallback ran)

    def to_dict(self) -> Dict[str, Any]:
        """Return every field of this record as a plain dictionary."""
        as_mapping: Dict[str, Any] = asdict(self)
        return as_mapping
| |
|
| |
|
class AudioGenerator:
    """
    Audio generator with explicit backend selection.

    Strategy (Phase 2B):
    - Default: fallback_ambient (fully deterministic, always works)
    - Optional: AudioLDM 2 (if force_audioldm=True and model is available)

    The fallback ambient generator produces prompt-seeded ambient soundscapes.
    This is acceptable for a case study testing alignment behavior, not audio realism.

    Seeds and ``prompt_hash`` values are derived from a CRC-32 of the prompt
    rather than the built-in ``hash()``: string hashing is salted per process,
    so ``hash()`` would give different audio on every run.
    """

    # AudioLDM 2 produces 16 kHz audio; used when the loaded pipeline does not
    # expose its vocoder's sampling rate.
    _AUDIOLDM_NATIVE_SR = 16000

    def __init__(self, device: str = "cpu", force_audioldm: bool = False):
        """
        Create a generator and optionally load AudioLDM 2.

        Args:
            device: Device string the pipeline is moved to (e.g. "cpu", "cuda", "mps").
            force_audioldm: When True, try to load AudioLDM 2. Any failure is
                recorded in ``_audioldm_error`` and the fallback backend is used.
        """
        self.device = device
        self._audioldm_pipe = None
        self._audioldm_backend_name = None
        self._torch = None
        self._audioldm_error = None

        if force_audioldm:
            try:
                from diffusers import AudioLDM2Pipeline
                import torch

                model_id = "cvssp/audioldm2"
                pipe = AudioLDM2Pipeline.from_pretrained(
                    model_id,
                    torch_dtype=torch.float16 if device != "cpu" else torch.float32,
                )
                pipe.to(self.device)
            except Exception as exc:
                # Deliberate best-effort: loading is optional, so record the
                # reason and leave the generator in pure-fallback mode.
                self._audioldm_error = str(exc)
            else:
                # Publish state only after every step succeeded, so a failed
                # `.to(device)` cannot leave a half-initialised pipeline behind.
                self._audioldm_pipe = pipe
                self._audioldm_backend_name = f"AudioLDM2Pipeline({model_id})"
                self._torch = torch

    def generate(
        self,
        prompt: str,
        out_path: str,
        duration_sec: float = 6.0,
        sr: int = 48000,
        seed: Optional[int] = None,
    ) -> AudioGenResult:
        """
        Generate audio for a prompt.

        Backend selection:
        1. If AudioLDM was loaded (force_audioldm=True): try it, fallback on error
        2. Otherwise: use fallback_ambient (deterministic baseline)

        Args:
            prompt: Text prompt describing the audio.
            out_path: Path of the audio file to write.
            duration_sec: Clip length in seconds.
            sr: Sample rate, in Hz, of the written file.
            seed: Optional seed mixed into the generation.

        Returns:
            An AudioGenResult naming the backend that actually produced the file.
        """
        if self._audioldm_pipe is not None:
            try:
                return self._generate_audioldm(prompt, out_path, duration_sec, sr, seed)
            except Exception as exc:
                return self._generate_fallback(
                    prompt, out_path, duration_sec, sr, seed,
                    note=f"AudioLDM failed at runtime: {exc}",
                )

        return self._generate_fallback(
            prompt, out_path, duration_sec, sr, seed,
            note=self._audioldm_error or "Using deterministic fallback (default)",
        )

    def unload(self) -> None:
        """Free GPU/MPS memory by deleting the pipeline. Critical for 16GB RAM constraint."""
        import gc

        if self._audioldm_pipe is not None:
            del self._audioldm_pipe
            self._audioldm_pipe = None
        # Collect first: the allocator cache can only hand back blocks whose
        # owning objects (including those in reference cycles) are already freed.
        gc.collect()
        if self._torch is not None:
            if self._torch.cuda.is_available():
                self._torch.cuda.empty_cache()
            elif hasattr(self._torch.backends, "mps") and self._torch.backends.mps.is_available():
                self._torch.mps.empty_cache()

    @staticmethod
    def _stable_seed(prompt: str, seed: Optional[int] = None) -> int:
        """Return a process-independent 32-bit value derived from prompt and seed."""
        import zlib

        # "surrogatepass" keeps this total over any str, as hash() was.
        base = zlib.crc32(prompt.encode("utf-8", "surrogatepass"))
        if seed is not None:
            base = (base + seed) % (2**32)
        return base

    @staticmethod
    def _resample_linear(audio: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray:
        """Linearly resample a 1-D mono signal from src_sr to dst_sr (float32)."""
        samples = np.asarray(audio, dtype=np.float32)
        if src_sr == dst_sr or samples.size == 0:
            return samples
        n_out = int(round(samples.shape[0] * dst_sr / src_sr))
        src_t = np.arange(samples.shape[0]) / src_sr
        dst_t = np.arange(n_out) / dst_sr
        return np.interp(dst_t, src_t, samples).astype(np.float32)

    @staticmethod
    def _synthesize_ambient(prompt: str, duration_sec: float, sr: int, rng_seed: int) -> np.ndarray:
        """Build the fallback soundscape (filtered noise + drone + harmonic) as float32."""
        rng = np.random.default_rng(rng_seed)

        n = int(duration_sec * sr)
        t = np.linspace(0, duration_sec, n, endpoint=False)

        # Prompt-dependent spectral parameters.
        prompt_val = sum(ord(c) for c in prompt)
        drone_freq = 80.0 + (prompt_val % 200)
        filter_width = 2000 + (prompt_val % 6000)
        noise_amplitude = 0.02 + (prompt_val % 50) * 0.001
        drone_amplitude = 0.06 + (prompt_val % 40) * 0.001

        # Moving-average filtered noise. The kernel is clamped to the clip
        # length: with mode="same" np.convolve returns max(len(a), len(v))
        # samples, so a wider kernel would break the mix below on short clips.
        noise = rng.normal(0, 1, size=n).astype(np.float32)
        if n > 0:
            width = min(filter_width, n)
            kernel = np.ones(width, dtype=np.float32) / width
            noise = np.convolve(noise, kernel, mode="same")

        drone = drone_amplitude * np.sin(2 * np.pi * drone_freq * t).astype(np.float32)

        harmonic_freq = drone_freq * 1.5 + (prompt_val % 100)
        harmonic = (drone_amplitude * 0.3) * np.sin(2 * np.pi * harmonic_freq * t).astype(np.float32)

        audio = (noise_amplitude * noise + drone + harmonic).astype(np.float32)
        return np.clip(audio, -1.0, 1.0)

    def _native_sample_rate(self) -> int:
        """Return the loaded pipeline's output rate, or 16 kHz if it is not exposed."""
        try:
            return int(self._audioldm_pipe.vocoder.config.sampling_rate)
        except (AttributeError, TypeError, ValueError):
            return self._AUDIOLDM_NATIVE_SR

    def _generate_audioldm(
        self, prompt: str, out_path: str, duration_sec: float, sr: int, seed: Optional[int],
    ) -> AudioGenResult:
        """Generate with AudioLDM 2 and write the result at the requested sample rate."""
        generator = None
        if seed is not None and self._torch is not None:
            # The torch Generator is created on CPU when the model runs on MPS.
            gen_device = "cpu" if self.device == "mps" else self.device
            generator = self._torch.Generator(device=gen_device).manual_seed(seed)
        kwargs = {"audio_length_in_s": duration_sec}
        if generator is not None:
            kwargs["generator"] = generator
        result = self._audioldm_pipe(prompt, **kwargs)
        audio = np.asarray(result.audios[0], dtype=np.float32)

        # The model emits audio at its own rate; writing those samples with a
        # different `sr` header would change pitch and duration, so resample.
        native_sr = self._native_sample_rate()
        note = None
        if native_sr != sr:
            audio = self._resample_linear(audio, native_sr, sr)
            note = f"resampled from {native_sr} Hz to {sr} Hz"
        sf.write(out_path, audio, sr)

        return AudioGenResult(
            audio_path=out_path,
            backend="audioldm2",
            prompt_hash=self._stable_seed(prompt, seed),
            duration_sec=duration_sec,
            sample_rate=sr,
            note=note,
        )

    def _generate_fallback(
        self,
        prompt: str,
        out_path: str,
        duration_sec: float,
        sr: int,
        seed: Optional[int],
        note: str = "",
    ) -> AudioGenResult:
        """
        Deterministic ambient soundscape generator.

        Produces prompt-dependent audio by seeding the RNG from a CRC-32 of the
        prompt plus the optional seed, so output is identical across processes.
        Different prompts produce different spectral characteristics:
        - Drone frequency varies with prompt
        - Noise filtering varies with prompt
        - Amplitude envelope varies with prompt

        This ensures wrong_audio perturbations produce genuinely different audio.
        """
        base_seed = self._stable_seed(prompt, seed)
        audio = self._synthesize_ambient(prompt, duration_sec, sr, base_seed)

        sf.write(out_path, audio, sr)

        return AudioGenResult(
            audio_path=out_path,
            backend="fallback_ambient",
            prompt_hash=base_seed,
            duration_sec=duration_sec,
            sample_rate=sr,
            note=note,
        )
| |
|
| |
|
def generate_audio(
    prompt: str,
    out_dir: str,
    filename: str = "audio.wav",
    device: str = "cpu",
    deterministic: bool = True,
    seed: int = 42,
) -> str:
    """
    Generate audio for a prompt. Returns path to audio file.

    Uses deterministic fallback by default (stable for experiments).

    Thin wrapper over generate_audio_with_metadata, so both entry points
    share one implementation and cannot drift apart.

    Args:
        prompt: Text prompt describing the audio.
        out_dir: Directory the file is written into (created if missing).
        filename: Name of the audio file inside out_dir.
        device: Device string passed to the generator.
        deterministic: When True, `seed` is used; when False, no seed is passed.
        seed: Seed used when deterministic is True.

    Returns:
        The path of the written audio file, as a string.
    """
    result = generate_audio_with_metadata(
        prompt=prompt,
        out_dir=out_dir,
        filename=filename,
        device=device,
        deterministic=deterministic,
        seed=seed,
    )
    return result.audio_path
| |
|
| |
|
def generate_audio_with_metadata(
    prompt: str,
    out_dir: str,
    filename: str = "audio.wav",
    device: str = "cpu",
    deterministic: bool = True,
    seed: int = 42,
) -> AudioGenResult:
    """
    Generate audio for a prompt and return the full result record.

    Intended for experiment pipelines that need to know which backend
    produced the file.

    Args:
        prompt: Text prompt describing the audio.
        out_dir: Directory the file is written into (created if missing).
        filename: Name of the audio file inside out_dir.
        device: Device string passed to the generator.
        deterministic: When True, `seed` is used; when False, no seed is passed.
        seed: Seed used when deterministic is True.

    Returns:
        The AudioGenResult returned by the generator.
    """
    target = Path(out_dir) / filename
    # Make sure the destination directory exists before anything is written.
    target.parent.mkdir(parents=True, exist_ok=True)

    effective_seed = None
    if deterministic:
        effective_seed = seed

    return AudioGenerator(device=device).generate(
        prompt=prompt,
        out_path=str(target),
        seed=effective_seed,
    )
| |
|