File size: 8,559 Bytes
6835659
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
"""
Audio generator with explicit backend tracking.

Phase 2: Stability over realism.
- Backend is always recorded (never ambiguous)
- Fallback ambient is the deterministic baseline
- AudioLDM 2 used only if explicitly available and stable

Upgrade note: AudioLDM 1 → AudioLDM 2 (cvssp/audioldm2)
- Better audio quality, same API surface
- unload() method added for sequential model loading within 16GB RAM
"""

from __future__ import annotations

from dataclasses import dataclass, asdict
from typing import Optional, Dict, Any

import numpy as np
import soundfile as sf
from pathlib import Path


@dataclass(frozen=True)
class AudioGenResult:
    """Immutable record describing a single generated audio file."""

    # Path of the audio file that was written
    audio_path: str
    # Producing backend: "audioldm2" or "fallback_ambient" — always explicit
    backend: str
    # Integer derived from (prompt, seed), kept for reproducibility bookkeeping
    prompt_hash: int
    # Requested clip length in seconds
    duration_sec: float
    # Sample rate (Hz) the file was written with
    sample_rate: int
    # Optional free-form remark (e.g. why a fallback was used)
    note: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain ``dict`` keyed by field name."""
        as_mapping: Dict[str, Any] = asdict(self)
        return as_mapping


class AudioGenerator:
    """
    Audio generator with explicit backend selection.

    Strategy (Phase 2B):
    - Default: fallback_ambient (fully deterministic, always works)
    - Optional: AudioLDM 2 (if force_audioldm=True and model is available)

    The fallback ambient generator produces prompt-seeded ambient soundscapes.
    This is acceptable for a case study testing alignment behavior, not audio realism.

    Reproducibility: seeds and ``prompt_hash`` are derived from a SHA-256
    digest of the prompt rather than the built-in ``hash()``, because
    ``hash()`` of a ``str`` is salted per interpreter process and would give
    different audio on every run.
    """

    # Output rate assumed for AudioLDM 2 when the loaded pipeline does not
    # expose ``vocoder.config.sampling_rate`` (diffusers documents 16 kHz
    # output for this model — confirm if the checkpoint is ever swapped).
    _AUDIOLDM_DEFAULT_SR = 16000

    def __init__(self, device: str = "cpu", force_audioldm: bool = False):
        """
        Create a generator and, if requested, try to load AudioLDM 2.

        Args:
            device: Device string the pipeline is moved to ("cpu", "cuda", "mps", ...).
            force_audioldm: When True, attempt to load ``cvssp/audioldm2``.
                Any failure is recorded in ``_audioldm_error`` and the
                generator silently stays on the fallback backend.
        """
        self.device = device
        self._audioldm_pipe = None
        self._audioldm_backend_name = None
        self._torch = None
        self._audioldm_error = None

        if force_audioldm:
            try:
                from diffusers import AudioLDM2Pipeline
                import torch

                model_id = "cvssp/audioldm2"
                pipe = AudioLDM2Pipeline.from_pretrained(
                    model_id,
                    torch_dtype=torch.float16 if device != "cpu" else torch.float32,
                )
                pipe.to(self.device)
            except Exception as exc:
                # Best-effort load: keep the reason so it can be surfaced in
                # the result note instead of crashing the experiment.
                self._audioldm_error = str(exc)
            else:
                # Commit state only after load AND device move succeeded, so a
                # failed ``.to()`` never leaves a half-initialised pipeline active.
                self._audioldm_pipe = pipe
                self._audioldm_backend_name = f"AudioLDM2Pipeline({model_id})"
                self._torch = torch

    def generate(
        self,
        prompt: str,
        out_path: str,
        duration_sec: float = 6.0,
        sr: int = 48000,
        seed: Optional[int] = None,
    ) -> AudioGenResult:
        """
        Generate audio for a prompt.

        Backend selection:
        1. If AudioLDM was loaded (force_audioldm=True): try it, fallback on error
        2. Otherwise: use fallback_ambient (deterministic baseline)

        Args:
            prompt: Text description of the desired audio.
            out_path: Destination file path passed to ``soundfile.write``.
            duration_sec: Requested clip length in seconds.
            sr: Sample rate (Hz) of the written file.
            seed: Optional integer mixed into the deterministic seed.

        Returns:
            AudioGenResult naming the backend that actually produced the file.
        """
        if self._audioldm_pipe is not None:
            try:
                return self._generate_audioldm(prompt, out_path, duration_sec, sr, seed)
            except Exception as exc:
                return self._generate_fallback(
                    prompt, out_path, duration_sec, sr, seed,
                    note=f"AudioLDM failed at runtime: {exc}",
                )

        return self._generate_fallback(
            prompt, out_path, duration_sec, sr, seed,
            note=self._audioldm_error or "Using deterministic fallback (default)",
        )

    def unload(self) -> None:
        """Free GPU/MPS memory by dropping the pipeline. Critical for 16GB RAM constraint."""
        if self._audioldm_pipe is None:
            return

        self._audioldm_pipe = None

        # Collect first: the allocator cache can only hand back blocks whose
        # tensors are already gone, and pipeline objects may sit in reference cycles.
        import gc
        gc.collect()

        torch = self._torch
        if torch is None:
            return
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        elif (
            hasattr(torch.backends, "mps")
            and torch.backends.mps.is_available()
            and hasattr(torch, "mps")
        ):
            torch.mps.empty_cache()

    @staticmethod
    def _stable_hash(prompt: str, seed: Optional[int]) -> int:
        """
        Return a 32-bit integer derived from (prompt, seed), stable across processes.

        The first four bytes of SHA-256(prompt) are read as a big-endian
        integer; ``seed`` (if given) is added modulo 2**32.
        """
        import hashlib

        digest = hashlib.sha256(prompt.encode("utf-8")).digest()
        value = int.from_bytes(digest[:4], "big")
        if seed is not None:
            value = (value + seed) % (2**32)
        return value

    @staticmethod
    def _resample_linear(audio, src_sr: int, dst_sr: int):
        """
        Resample a mono waveform from ``src_sr`` to ``dst_sr`` by linear interpolation.

        Returns a float32 array. Assumes a single-channel signal (any
        length-1 axes are squeezed away) — TODO confirm if a multi-channel
        model is ever used.
        """
        wave = np.squeeze(np.asarray(audio, dtype=np.float32))
        if src_sr == dst_sr or wave.size == 0:
            return wave
        n_out = int(round(wave.shape[0] * dst_sr / src_sr))
        src_t = np.arange(wave.shape[0]) / src_sr
        dst_t = np.arange(n_out) / dst_sr
        return np.interp(dst_t, src_t, wave).astype(np.float32)

    def _native_sample_rate(self) -> int:
        """Return the loaded pipeline's output rate, or the documented default."""
        vocoder = getattr(self._audioldm_pipe, "vocoder", None)
        config = getattr(vocoder, "config", None)
        rate = getattr(config, "sampling_rate", None)
        return int(rate) if rate else self._AUDIOLDM_DEFAULT_SR

    def _generate_audioldm(
        self, prompt: str, out_path: str, duration_sec: float, sr: int, seed: Optional[int],
    ) -> AudioGenResult:
        """
        Generate with AudioLDM 2 and write the result at ``sr`` Hz.

        The model's waveform is produced at its own native rate; it is
        resampled to ``sr`` before writing so the file's pitch and duration
        match the request. A note is attached when resampling happened.
        """
        generator = None
        if seed is not None and self._torch is not None:
            # MPS generator must be created on CPU then used
            gen_device = "cpu" if self.device == "mps" else self.device
            generator = self._torch.Generator(device=gen_device).manual_seed(seed)
        kwargs = {"audio_length_in_s": duration_sec}
        if generator is not None:
            kwargs["generator"] = generator
        result = self._audioldm_pipe(prompt, **kwargs)

        # Cast to float32: a float16 pipeline may hand back half-precision samples.
        audio = np.asarray(result.audios[0], dtype=np.float32)

        note = None
        native_sr = self._native_sample_rate()
        if native_sr != sr:
            audio = self._resample_linear(audio, native_sr, sr)
            note = f"Resampled from {native_sr} Hz to {sr} Hz (linear interpolation)"
        sf.write(out_path, audio, sr)

        return AudioGenResult(
            audio_path=out_path,
            backend="audioldm2",
            prompt_hash=self._stable_hash(prompt, seed),
            duration_sec=duration_sec,
            sample_rate=sr,
            note=note,
        )

    @staticmethod
    def _synthesize_ambient(prompt: str, duration_sec: float, sr: int, base_seed: int):
        """
        Build the fallback ambient waveform as a float32 array in [-1, 1].

        Raises:
            ValueError: if ``duration_sec * sr`` yields no samples.
        """
        n = int(duration_sec * sr)
        if n <= 0:
            raise ValueError(
                f"duration_sec * sr must yield at least one sample (got {n})"
            )

        rng = np.random.default_rng(base_seed)
        t = np.linspace(0, duration_sec, n, endpoint=False)

        # Prompt-dependent parameters — different prompts get different sounds
        prompt_val = sum(ord(c) for c in prompt)
        drone_freq = 80.0 + (prompt_val % 200)  # 80-279 Hz
        # 2000-7999 taps, clamped to the clip length: with mode="same" the
        # convolution output is as long as the LONGER input, so an unclamped
        # kernel would yield more than n samples for short clips.
        filter_width = min(2000 + (prompt_val % 6000), n)
        noise_amplitude = 0.02 + (prompt_val % 50) * 0.001  # 0.020-0.069
        drone_amplitude = 0.06 + (prompt_val % 40) * 0.001  # 0.060-0.099

        # Generate noise with prompt-dependent filtering (moving average)
        noise = rng.normal(0, 1, size=n).astype(np.float32)
        kernel = np.ones(filter_width, dtype=np.float32) / filter_width
        noise = np.convolve(noise, kernel, mode="same")

        # Prompt-dependent drone
        drone = drone_amplitude * np.sin(2 * np.pi * drone_freq * t).astype(np.float32)

        # Add a second partial for richer sound
        harmonic_freq = drone_freq * 1.5 + (prompt_val % 100)
        harmonic = (drone_amplitude * 0.3) * np.sin(2 * np.pi * harmonic_freq * t).astype(np.float32)

        audio = (noise_amplitude * noise + drone + harmonic).astype(np.float32)
        return np.clip(audio, -1.0, 1.0)

    def _generate_fallback(
        self,
        prompt: str,
        out_path: str,
        duration_sec: float,
        sr: int,
        seed: Optional[int],
        note: str = "",
    ) -> AudioGenResult:
        """
        Deterministic ambient soundscape generator.

        Produces prompt-dependent audio by seeding the RNG from a stable
        digest of the prompt plus ``seed``.
        Different prompts produce different spectral characteristics:
        - Drone frequency varies with prompt
        - Noise filtering varies with prompt
        - Amplitude envelope varies with prompt

        This ensures wrong_audio perturbations produce genuinely different audio.

        Raises:
            ValueError: if the requested duration yields no samples.
        """
        # Seed is stable across interpreter runs (unlike built-in hash())
        base_seed = self._stable_hash(prompt, seed)
        audio = self._synthesize_ambient(prompt, duration_sec, sr, base_seed)

        sf.write(out_path, audio, sr)

        return AudioGenResult(
            audio_path=out_path,
            backend="fallback_ambient",
            prompt_hash=base_seed,
            duration_sec=duration_sec,
            sample_rate=sr,
            note=note,
        )


def generate_audio(
    prompt: str,
    out_dir: str,
    filename: str = "audio.wav",
    device: str = "cpu",
    deterministic: bool = True,
    seed: int = 42,
) -> str:
    """
    Write an audio file for ``prompt`` and return its path.

    The output directory is created if missing. A default ``AudioGenerator``
    is used (no AudioLDM requested), so the deterministic fallback backend
    produces the audio. ``seed`` is forwarded only when ``deterministic`` is
    true; otherwise ``None`` is passed as the seed.
    """
    target = Path(out_dir, filename)
    target.parent.mkdir(parents=True, exist_ok=True)

    effective_seed = None if not deterministic else seed
    outcome = AudioGenerator(device=device).generate(
        prompt=prompt,
        out_path=str(target),
        seed=effective_seed,
    )
    return outcome.audio_path


def generate_audio_with_metadata(
    prompt: str,
    out_dir: str,
    filename: str = "audio.wav",
    device: str = "cpu",
    deterministic: bool = True,
    seed: int = 42,
) -> AudioGenResult:
    """
    Write an audio file for ``prompt`` and return the full result record.

    Intended for experiment pipelines where backend tracking matters: the
    returned ``AudioGenResult`` names the backend that produced the file.
    The output directory is created if missing; ``seed`` is forwarded only
    when ``deterministic`` is true, otherwise ``None`` is passed.
    """
    target = Path(out_dir, filename)
    target.parent.mkdir(parents=True, exist_ok=True)

    effective_seed = None if not deterministic else seed
    audio_gen = AudioGenerator(device=device)
    return audio_gen.generate(
        prompt=prompt,
        out_path=str(target),
        seed=effective_seed,
    )