"""One-off renderer for the sampled ambience beds. vinyl_crackle and tape_hiss are synthesized live in ambience.py; the seven beds below only need to exist once on disk. This script fills assets/ambience/ with text-to-audio renders from AudioLDM2. (AudioGen would also work, but it lives in the unmaintained audiocraft package which doesn't install on Python 3.13; AudioLDM2 ships in diffusers and runs next to the project's torch/transformers as-is.) Usage: pip install diffusers python scripts/make_ambience.py # render whatever is missing python scripts/make_ambience.py ocean_waves --force # redo one Each clip is ~12 s; the runtime mixer tiles it with crossfades, so it does not need to loop perfectly. Re-run any slug whose render sounds off — text-to-audio is a slot machine, two pulls usually land one keeper. """ import argparse import os import sys import wave from pathlib import Path ROOT = Path(__file__).resolve().parent.parent OUT_DIR = ROOT / "assets" / "ambience" PROMPTS = { "soft_rain": "gentle steady rain falling on leaves, calm rain ambience, no thunder", "ocean_waves": "calm ocean waves gently rolling onto a sandy beach, soft surf", "fireplace_crackle": "cozy fireplace, fire crackling and popping softly", "birdsong": "soft morning birdsong, small birds chirping in a quiet garden", "night_crickets": "crickets chirping steadily on a calm summer night", "cafe_murmur": "quiet coffee shop ambience, soft murmur of distant conversation, occasional clink of cups", "wind_in_trees": "soft wind rustling through tree leaves, gentle breeze", } NEGATIVE = "music, melody, singing, speech, voice, loud, harsh, low quality, distortion" def write_wav(samples, rate: int, path: Path) -> None: import numpy as np peak = float(np.abs(samples).max() or 1.0) pcm = (samples * (0.9 / peak) * 32767).astype(" int: parser = argparse.ArgumentParser(description=__doc__.split("\n")[0]) parser.add_argument("slugs", nargs="*", choices=[*PROMPTS, []], metavar="slug", help=f"which beds to render (default: all missing). One of: {', '.join(PROMPTS)}") parser.add_argument("--force", action="store_true", help="re-render even if the wav exists") parser.add_argument("--duration", type=float, default=12.0, help="clip length in seconds") parser.add_argument("--steps", type=int, default=200, help="diffusion steps (more = cleaner, slower)") parser.add_argument("--candidates", type=int, default=2, help="waveforms per prompt; the pipeline keeps the best text match") args = parser.parse_args() todo = args.slugs or [s for s in PROMPTS if args.force or not (OUT_DIR / f"{s}.wav").exists()] if not todo: print("all ambience beds already rendered — use --force to redo") return 0 OUT_DIR.mkdir(parents=True, exist_ok=True) import torch from diffusers import AudioLDM2Pipeline device = os.getenv("LOFINITY_DEVICE") or ("mps" if torch.backends.mps.is_available() else "cpu") print(f"first run downloads ~3 GB (cvssp/audioldm2); rendering on {device}") pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2") pipe.to(device) for slug in todo: path = OUT_DIR / f"{slug}.wav" if path.exists() and not args.force and not args.slugs: continue print(f"rendering {slug}: {PROMPTS[slug]!r}") def run(): return pipe( prompt=PROMPTS[slug], negative_prompt=NEGATIVE, num_inference_steps=args.steps, audio_length_in_s=args.duration, num_waveforms_per_prompt=args.candidates, ).audios[0] # audios come back ranked by text alignment try: audio = run() except Exception as e: # noqa: BLE001 — mps kernels are still patchy if device == "cpu": raise print(f" {device} failed ({e!r}), retrying on cpu") pipe.to("cpu") device = "cpu" audio = run() write_wav(audio, 16000, path) print(f" -> {path.relative_to(ROOT)}") return 0 if __name__ == "__main__": sys.exit(main())