Spaces:
Running on Zero
Running on Zero
File size: 4,435 Bytes
"""One-off renderer for the sampled ambience beds.
vinyl_crackle and tape_hiss are synthesized live in ambience.py; the seven
beds below only need to exist once on disk. This script fills
assets/ambience/ with text-to-audio renders from AudioLDM2.
(AudioGen would also work, but it lives in the unmaintained audiocraft
package which doesn't install on Python 3.13; AudioLDM2 ships in diffusers
and runs next to the project's torch/transformers as-is.)
Usage:
pip install diffusers
python scripts/make_ambience.py # render whatever is missing
python scripts/make_ambience.py ocean_waves --force # redo one
Each clip is ~12 s; the runtime mixer tiles it with crossfades, so it does
not need to loop perfectly. Re-run any slug whose render sounds off —
text-to-audio is a slot machine, two pulls usually land one keeper.
"""
import argparse
import os
import sys
import wave
from pathlib import Path
# Repository root: two levels up from this file (the script is run as
# scripts/make_ambience.py).
ROOT = Path(__file__).resolve().parent.parent

# Folder that receives one <slug>.wav per rendered bed.
OUT_DIR = ROOT.joinpath("assets", "ambience")

# slug -> text-to-audio prompt. The slug is also the stem of the output file
# and the name accepted on the command line.
PROMPTS = dict(
    soft_rain="gentle steady rain falling on leaves, calm rain ambience, no thunder",
    ocean_waves="calm ocean waves gently rolling onto a sandy beach, soft surf",
    fireplace_crackle="cozy fireplace, fire crackling and popping softly",
    birdsong="soft morning birdsong, small birds chirping in a quiet garden",
    night_crickets="crickets chirping steadily on a calm summer night",
    cafe_murmur="quiet coffee shop ambience, soft murmur of distant conversation, occasional clink of cups",
    wind_in_trees="soft wind rustling through tree leaves, gentle breeze",
)

# Negative prompt shared by every render.
NEGATIVE = "music, melody, singing, speech, voice, loud, harsh, low quality, distortion"
def write_wav(samples, rate: int, path: Path) -> None:
    """Write ``samples`` to ``path`` as a mono, 16-bit little-endian PCM WAV.

    The signal is peak-normalised so its loudest sample sits at 0.9 of full
    scale; an all-zero signal is written as silence.

    Args:
        samples: Audio samples as a numpy array or any array-like (for
            example a plain list of floats).
        rate: Sample rate in Hz, stored in the WAV header.
        path: Destination file; overwritten if it already exists.

    Raises:
        ValueError: If ``samples`` is empty. Raised before the file is opened,
            so no partial file is left behind.
    """
    import numpy as np

    data = np.asarray(samples)
    if data.size == 0:
        raise ValueError("write_wav: no samples to write")
    if not np.issubdtype(data.dtype, np.floating):
        # Integer/bool input: promote so the scaling below is done in floats.
        data = data.astype(np.float64)

    # NaN/inf would poison the peak and make the int16 cast undefined, so
    # non-finite samples are written as silence instead.
    data = np.where(np.isfinite(data), data, 0.0)

    # `or 1.0` keeps an all-zero signal from dividing by zero.
    peak = float(np.abs(data).max() or 1.0)
    pcm = (data * (0.9 / peak) * 32767).astype("<i2")

    with wave.open(str(path), "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)
        w.setframerate(rate)
        w.writeframes(pcm.tobytes())
def main(argv: "list[str] | None" = None) -> int:
    """Render the requested (or all missing) ambience beds into OUT_DIR.

    Args:
        argv: Command-line arguments without the program name. ``None`` (the
            default) lets argparse read ``sys.argv[1:]``.

    Returns:
        0 when every selected bed was rendered or nothing needed rendering.

    Raises:
        SystemExit: From argparse on invalid arguments (status 2).
        Exception: Whatever the pipeline raises when a render fails on the
            cpu, where there is no further device to fall back to.
    """
    # __doc__ is None under `python -OO`; fall back to an empty description
    # instead of crashing on None.split.
    parser = argparse.ArgumentParser(description=(__doc__ or "").split("\n")[0])
    # NOTE(review): the extra [] in choices looks like the usual workaround
    # for argparse validating the empty default of a nargs="*" positional
    # against choices — confirm before removing.
    parser.add_argument("slugs", nargs="*", choices=[*PROMPTS, []], metavar="slug",
                        help=f"which beds to render (default: all missing). One of: {', '.join(PROMPTS)}")
    parser.add_argument("--force", action="store_true", help="re-render even if the wav exists")
    parser.add_argument("--duration", type=float, default=12.0, help="clip length in seconds")
    parser.add_argument("--steps", type=int, default=200, help="diffusion steps (more = cleaner, slower)")
    parser.add_argument("--candidates", type=int, default=2,
                        help="waveforms per prompt; the pipeline keeps the best text match")
    args = parser.parse_args(argv)

    # Reject unusable values up front, before the ~3 GB model is loaded.
    # `not x > 0` also rejects NaN.
    if not args.duration > 0:
        parser.error("--duration must be a positive number of seconds")
    if args.steps < 1:
        parser.error("--steps must be at least 1")
    if args.candidates < 1:
        parser.error("--candidates must be at least 1")

    # Naming a slug twice should not cost a second render; keep first-seen order.
    requested = list(dict.fromkeys(args.slugs))
    todo = requested or [s for s in PROMPTS if args.force or not (OUT_DIR / f"{s}.wav").exists()]
    if not todo:
        print("all ambience beds already rendered — use --force to redo")
        return 0
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # Heavy imports stay local so --help and the nothing-to-do path do not
    # pay for them.
    import torch
    from diffusers import AudioLDM2Pipeline

    device = os.getenv("LOFINITY_DEVICE") or ("mps" if torch.backends.mps.is_available() else "cpu")
    print(f"first run downloads ~3 GB (cvssp/audioldm2); rendering on {device}")
    pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
    pipe.to(device)

    def render(slug):
        """Run the pipeline for one slug and return its top-ranked waveform."""
        return pipe(
            prompt=PROMPTS[slug],
            negative_prompt=NEGATIVE,
            num_inference_steps=args.steps,
            audio_length_in_s=args.duration,
            num_waveforms_per_prompt=args.candidates,
        ).audios[0]  # audios come back ranked by text alignment

    for slug in todo:
        path = OUT_DIR / f"{slug}.wav"
        # Only reachable if the file appeared after `todo` was computed;
        # explicitly named slugs are always rendered.
        if path.exists() and not args.force and not requested:
            continue
        print(f"rendering {slug}: {PROMPTS[slug]!r}")
        try:
            audio = render(slug)
        except Exception as e:  # noqa: BLE001 — mps kernels are still patchy
            if device == "cpu":
                raise
            print(f" {device} failed ({e!r}), retrying on cpu")
            # Stay on the cpu for the remaining slugs as well.
            pipe.to("cpu")
            device = "cpu"
            audio = render(slug)
        # The script writes every render at a fixed 16 kHz sample rate.
        write_wav(audio, 16000, path)
        print(f" -> {path.relative_to(ROOT)}")
    return 0
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|