File size: 4,435 Bytes
722a5d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""One-off renderer for the sampled ambience beds.

vinyl_crackle and tape_hiss are synthesized live in ambience.py; the seven
beds below only need to exist once on disk. This script fills
assets/ambience/ with text-to-audio renders from AudioLDM2.

(AudioGen would also work, but it lives in the unmaintained audiocraft
package which doesn't install on Python 3.13; AudioLDM2 ships in diffusers
and runs next to the project's torch/transformers as-is.)

Usage:
    pip install diffusers
    python scripts/make_ambience.py              # render whatever is missing
    python scripts/make_ambience.py ocean_waves --force   # redo one

Each clip is ~12 s; the runtime mixer tiles it with crossfades, so it does
not need to loop perfectly. Re-run any slug whose render sounds off —
text-to-audio is a slot machine, two pulls usually land one keeper.
"""

import argparse
import os
import sys
import wave
from pathlib import Path

# Project root: two directory levels above this script's resolved location.
ROOT = Path(__file__).resolve().parent.parent
# Directory the rendered ambience WAV files are written to.
OUT_DIR = ROOT.joinpath("assets", "ambience")

# Bed slug (also the output file stem) -> text prompt handed to the model.
PROMPTS = dict(
    soft_rain="gentle steady rain falling on leaves, calm rain ambience, no thunder",
    ocean_waves="calm ocean waves gently rolling onto a sandy beach, soft surf",
    fireplace_crackle="cozy fireplace, fire crackling and popping softly",
    birdsong="soft morning birdsong, small birds chirping in a quiet garden",
    night_crickets="crickets chirping steadily on a calm summer night",
    cafe_murmur=(
        "quiet coffee shop ambience, soft murmur of distant conversation, "
        "occasional clink of cups"
    ),
    wind_in_trees="soft wind rustling through tree leaves, gentle breeze",
)
# Negative prompt shared by every render.
NEGATIVE = (
    "music, melody, singing, speech, voice, "
    "loud, harsh, low quality, distortion"
)


def write_wav(samples, rate: int, path: Path) -> None:
    """Peak-normalize *samples* and save them as a mono 16-bit PCM WAV.

    The loudest sample is scaled to 90% of int16 full scale; an all-zero
    signal is written unchanged as silence.

    Args:
        samples: array-like of audio samples, written as a single channel.
        rate: sample rate in Hz stored in the WAV header.
        path: destination file; an existing file is overwritten.

    Raises:
        ValueError: if *samples* is empty or contains NaN/inf. The check runs
            before the file is opened, so a broken render never leaves a
            file behind that a later run would treat as already rendered.
    """
    import numpy as np

    data = np.asarray(samples)
    if data.size == 0:
        raise ValueError(f"no audio samples to write to {path}")
    # A single NaN/inf would poison the peak and turn the whole clip into
    # garbage or silence after the int16 cast, so refuse it up front.
    if not np.isfinite(data).all():
        raise ValueError(f"audio for {path} contains NaN or infinite samples")

    # `or 1.0` keeps an all-zero signal from dividing by zero.
    peak = float(np.abs(data).max()) or 1.0
    pcm = (data * (0.9 / peak) * 32767).astype("<i2")
    with wave.open(str(path), "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)
        w.setframerate(rate)
        w.writeframes(pcm.tobytes())


def main() -> int:
    """Render the requested (or missing) ambience beds into OUT_DIR.

    Parses the command line, loads the AudioLDM2 pipeline and writes one
    ``<slug>.wav`` per bed. Slugs named on the command line are always
    rendered; with no slugs given, only beds whose file is absent are
    rendered unless ``--force`` is set. If generation raises on a non-CPU
    device, the pipeline is moved to the CPU and that bed is retried there.

    Returns:
        Process exit status; 0 on every path that returns (errors propagate
        as exceptions).
    """
    cli = argparse.ArgumentParser(description=__doc__.split("\n")[0])
    slug_help = f"which beds to render (default: all missing). One of: {', '.join(PROMPTS)}"
    # NOTE(review): the extra [] choice presumably lets an empty positional
    # list pass argparse's choices validation — confirm before removing.
    cli.add_argument("slugs", nargs="*", choices=[*PROMPTS, []], metavar="slug", help=slug_help)
    cli.add_argument("--force", action="store_true", help="re-render even if the wav exists")
    cli.add_argument("--duration", type=float, default=12.0, help="clip length in seconds")
    cli.add_argument("--steps", type=int, default=200, help="diffusion steps (more = cleaner, slower)")
    cli.add_argument(
        "--candidates",
        type=int,
        default=2,
        help="waveforms per prompt; the pipeline keeps the best text match",
    )
    opts = cli.parse_args()

    # Explicitly named slugs win; otherwise collect every bed that is missing
    # on disk (or every bed at all when --force is given).
    if opts.slugs:
        pending = opts.slugs
    else:
        pending = [
            name
            for name in PROMPTS
            if opts.force or not (OUT_DIR / f"{name}.wav").exists()
        ]
    if not pending:
        print("all ambience beds already rendered — use --force to redo")
        return 0
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # Heavy imports are deferred so --help and the nothing-to-do path stay fast.
    import torch
    from diffusers import AudioLDM2Pipeline

    # An explicit LOFINITY_DEVICE overrides auto-detection.
    device = os.getenv("LOFINITY_DEVICE")
    if not device:
        device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"first run downloads ~3 GB (cvssp/audioldm2); rendering on {device}")
    pipeline = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
    pipeline.to(device)

    for slug in pending:
        target = OUT_DIR / f"{slug}.wav"
        if not (opts.force or opts.slugs) and target.exists():
            continue
        prompt = PROMPTS[slug]
        print(f"rendering {slug}: {prompt!r}")

        request = dict(
            prompt=prompt,
            negative_prompt=NEGATIVE,
            num_inference_steps=opts.steps,
            audio_length_in_s=opts.duration,
            num_waveforms_per_prompt=opts.candidates,
        )
        try:
            # audios come back ranked by text alignment, so [0] is the best match
            audio = pipeline(**request).audios[0]
        except Exception as err:  # noqa: BLE001 — mps kernels are still patchy
            if device == "cpu":
                raise
            print(f"  {device} failed ({err!r}), retrying on cpu")
            pipeline.to("cpu")
            device = "cpu"  # stay on cpu for the remaining beds
            audio = pipeline(**request).audios[0]

        # 16 kHz: presumably AudioLDM2's native output rate — confirm against
        # the pipeline's vocoder config if the model is ever swapped.
        write_wav(audio, 16000, target)
        print(f"  -> {target.relative_to(ROOT)}")
    return 0


# Script entry point: hand main()'s return value to the shell as the exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)