Spaces:
Running on Zero
Running on Zero
File size: 7,160 Bytes
"""Ambience beds for LoFinity tapes.

MusicGen ignores texture words ("vinyl crackle", "ocean waves"), so the
background layer is mixed in here instead: a bed is rendered at song length
and summed a few dB under the music. Lofi ambience loops through the whole
track anyway, so nothing needs to be generated per song.

vinyl_crackle and tape_hiss are synthesized procedurally (cheap, and they
never sound repeated); the other seven are loops in assets/ambience/<slug>.wav,
rendered once by scripts/make_ambience.py and tiled with crossfades. A
missing asset falls back to vinyl crackle so every tape still has texture.
"""
import wave
from pathlib import Path
import numpy as np
# Directory holding the sampled loops (<slug>.wav), resolved relative to
# this module's own location.
ASSETS = Path(__file__).parent / "assets" / "ambience"
# Bed RMS relative to the music RMS, in dB. Starting points — tune by ear:
# spiky textures (crackle, fire) read louder than their RMS suggests.
# The keys double as the set of known slugs: normalize_slug() accepts a
# cleaned-up input directly when it is one of these keys.
GAIN_DB = {
    "vinyl_crackle": -14.0,
    "tape_hiss": -18.0,
    "soft_rain": -14.0,
    "ocean_waves": -12.0,
    "fireplace_crackle": -14.0,
    "birdsong": -16.0,
    "night_crickets": -16.0,
    "cafe_murmur": -16.0,
    "wind_in_trees": -14.0,
}
# Slug returned when a description matches nothing, and substituted by mix()
# when the requested sampled asset is not on disk.
DEFAULT = "vinyl_crackle"
# Checked in order; first hit wins ("fireplace crackle" must match fire
# before crackle can claim it for vinyl).
# Each entry is (keyword, slug). normalize_slug() tests the keyword with a
# plain substring check on the lowercased text, so a keyword also fires
# inside a longer word (e.g. "rain" is found in "train").
_KEYWORDS = (
    ("fire", "fireplace_crackle"),
    ("rain", "soft_rain"),
    ("wave", "ocean_waves"),
    ("ocean", "ocean_waves"),
    ("sea", "ocean_waves"),
    ("bird", "birdsong"),
    ("cricket", "night_crickets"),
    ("cafe", "cafe_murmur"),
    ("coffee", "cafe_murmur"),
    ("murmur", "cafe_murmur"),
    ("chatter", "cafe_murmur"),
    ("wind", "wind_in_trees"),
    ("tree", "wind_in_trees"),
    ("leaves", "wind_in_trees"),
    ("vinyl", "vinyl_crackle"),
    ("crackle", "vinyl_crackle"),
    ("static", "vinyl_crackle"),
    ("record", "vinyl_crackle"),
    ("hiss", "tape_hiss"),
    ("tape", "tape_hiss"),
    ("noise", "tape_hiss"),
)
def normalize_slug(value) -> str:
    """Coerce free-form ambience text from the LLM into a known slug.

    The input is stringified (falsy values become ""), trimmed and
    lowercased. If swapping spaces and hyphens for underscores yields a key
    of GAIN_DB, that key is returned. Otherwise the first _KEYWORDS entry
    whose keyword occurs anywhere in the cleaned text decides the slug
    ("Ocean waves!" -> ocean_waves). Anything unrecognizable maps to DEFAULT.
    """
    cleaned = str(value or "").strip().lower()
    candidate = cleaned.replace(" ", "_").replace("-", "_")
    if candidate in GAIN_DB:
        return candidate
    # Substring scan in table order; the generator stops at the first hit.
    return next(
        (bed for keyword, bed in _KEYWORDS if keyword in cleaned),
        DEFAULT,
    )
# --- procedural beds ----------------------------------------------------------
def _lowpassed_noise(n: int, rate: int, cutoff: float, rng) -> np.ndarray:
    """Return n samples of dull noise without running a real filter.

    Gaussian noise is drawn at a coarse rate of about 2*cutoff (never below
    200 Hz) and stretched to `rate` by linear interpolation; the
    interpolation itself does the smoothing.

    `rng` must provide `standard_normal(size)`, as numpy's Generator does.
    """
    coarse_rate = max(int(cutoff * 2), 200)
    # Two spare knots keep every interpolation position inside the table.
    knot_count = max(int(n * coarse_rate / rate) + 2, 2)
    knots = rng.standard_normal(knot_count)
    positions = np.arange(n) * (coarse_rate / rate)
    return np.interp(positions, np.arange(knot_count), knots)
def _vinyl_crackle(n: int, rate: int, rng) -> np.ndarray:
    """Synthesize vinyl surface noise: a quiet dull floor plus sparse pops.

    The floor is _lowpassed_noise at a 2500 Hz cutoff scaled by 0.06. About
    nine pops per second of audio (at least one) land at random sample
    positions. Each pop is a decaying exponential lasting 1-4 ms with a
    random sign; its amplitude is a squared uniform draw, which keeps most
    pops small.
    """
    bed = _lowpassed_noise(n, rate, 2500, rng) * 0.06
    pop_count = max(int(n / rate * 9), 1)
    for start in rng.integers(0, n, pop_count):
        # Draw order (duration, strength, polarity) fixes the random stream.
        duration = int(rate * rng.uniform(0.001, 0.004))
        strength = rng.uniform(0.15, 1.0) ** 2
        polarity = np.sign(rng.standard_normal())
        amp = strength * polarity
        pop = amp * np.exp(-np.arange(duration) / (duration / 5))
        # Pops that would run past the end of the bed are truncated.
        stop = min(start + duration, n)
        bed[start:stop] += pop[: stop - start]
    return bed
def _tape_hiss(n: int, rate: int, rng) -> np.ndarray:
    """Synthesize n samples of tape hiss at `rate`.

    White noise is blended with its own first difference to brighten it,
    then modulated by a shallow 0.3 Hz sine (8 % depth, random start phase).

    `rng` must provide `standard_normal(size)` and `uniform(low, high)`.
    """
    white = rng.standard_normal(n)
    # A first difference tilts the spectrum toward the highs, where hiss
    # lives; the first sample has no predecessor and stays at zero.
    brightened = np.zeros_like(white)
    brightened[1:] = white[1:] - white[:-1]
    hiss = 0.35 * white + 0.65 * brightened
    # Slow wobble so it breathes like a real transport.
    wobble_hz = 0.3
    phase = rng.uniform(0, 2 * np.pi)
    angle = 2 * np.pi * wobble_hz * np.arange(n) / rate + phase
    envelope = 1.0 + 0.08 * np.sin(angle)
    return hiss * envelope
_PROCEDURAL = {"vinyl_crackle": _vinyl_crackle, "tape_hiss": _tape_hiss}
# --- sampled beds ---------------------------------------------------------------
def _read_wav(path: Path) -> tuple[np.ndarray, int]:
    """Load a 16-bit PCM wav as mono float64 samples plus its sample rate.

    Samples are scaled by 1/32768; multi-channel files are averaged across
    channels to mono.

    Raises:
        ValueError: if the file's sample width is not 2 bytes.
    """
    with wave.open(str(path), "rb") as reader:
        params = reader.getparams()
        raw = reader.readframes(params.nframes)
    if params.sampwidth != 2:
        raise ValueError(
            f"{path.name}: expected 16-bit wav, got {params.sampwidth * 8}-bit"
        )
    # wav PCM is little-endian; frames are interleaved channel by channel.
    samples = np.frombuffer(raw, dtype="<i2").astype(np.float64) / 32768.0
    if params.nchannels > 1:
        samples = samples.reshape(-1, params.nchannels).mean(axis=1)
    return samples, params.framerate
def _resample(data: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
    """Convert `data` from src_rate to dst_rate by linear interpolation.

    When the rates already match the input array itself is returned (no
    copy). Otherwise the result has int(len(data) * dst_rate / src_rate)
    samples; no anti-alias filtering is applied.
    """
    if src_rate != dst_rate:
        out_len = int(len(data) * dst_rate / src_rate)
        # Where each output sample falls on the source sample grid.
        src_positions = np.arange(out_len) * (src_rate / dst_rate)
        data = np.interp(src_positions, np.arange(len(data)), data)
    return data
def _tile(loop: np.ndarray, n: int, rate: int) -> np.ndarray:
    """Repeat the loop out to n samples, crossfading each seam so it
    doesn't click. The loop does not need to be seamless.

    A loop already at least n samples long is simply truncated (as a copy).
    An empty loop yields n samples of silence. The crossfade lasts half a
    second, capped at a quarter of the loop; loops too short to fade at all
    (fewer than four samples) are repeated back to back.

    The fade uses equal-power (sqrt) ramps, not linear: the tail and head
    being blended are uncorrelated audio, so linear ramps would sum to about
    3 dB below the surrounding level at the crossfade midpoint (an audible
    dip every loop). With sqrt ramps gain_out**2 + gain_in**2 == 1, holding
    power steady.

    The input array is never modified.
    """
    if len(loop) >= n:
        return loop[:n].copy()
    if len(loop) == 0:
        # Nothing to repeat; without this guard the tiling below would
        # divide by len(loop) and raise ZeroDivisionError.
        return np.zeros(n)
    fade = min(int(rate * 0.5), len(loop) // 4)
    if fade == 0:
        return np.tile(loop, n // len(loop) + 1)[:n]
    ramp = np.sqrt(np.linspace(0.0, 1.0, fade))
    # Both shaped copies are the same on every pass, so build them once.
    # The very first repeat keeps its head intact; all later ones fade in.
    opening = loop.copy()
    opening[-fade:] *= ramp[::-1]
    repeat = opening.copy()
    repeat[:fade] *= ramp
    # Extra room so the last repeat can overhang n before being cut off.
    out = np.zeros(n + len(loop))
    out[: len(loop)] += opening
    # Successive repeats overlap by `fade` samples.
    hop = len(loop) - fade
    for pos in range(hop, n, hop):
        out[pos : pos + len(loop)] += repeat
    return out[:n]
# --- public API -----------------------------------------------------------------
def render(slug: str, n: int, rate: int) -> np.ndarray:
    """A peak-normalized bed of n samples at `rate`; the caller sets the level.

    Slugs listed in _PROCEDURAL are synthesized with a fresh, unseeded
    random generator, so two renders never match. Any other slug is loaded
    from ASSETS/<slug>.wav, resampled to `rate` and tiled out to n samples.
    An all-zero bed is returned unscaled rather than divided by zero.

    A non-positive n yields an empty array.

    Errors from reading the asset (for example a non-16-bit wav) propagate
    to the caller; this function does no fallback of its own.
    """
    if n <= 0:
        # Nothing to render. Without this guard the crackle generator's
        # rng.integers(0, 0, ...) and max() of an empty array both raise.
        return np.zeros(0)
    synth = _PROCEDURAL.get(slug)
    if synth is not None:
        bed = synth(n, rate, np.random.default_rng())
    else:
        loop, loop_rate = _read_wav(ASSETS / f"{slug}.wav")
        bed = _tile(_resample(loop, loop_rate, rate), n, rate)
    peak = float(np.abs(bed).max())
    return bed / (peak or 1.0)
def mix(music, rate: int, slug: str) -> np.ndarray:
    """Sum the ambience bed under the music at its slug's relative RMS level.

    `slug` is passed through normalize_slug first. If the resulting bed is
    a sampled one whose wav is not on disk, a notice is printed and DEFAULT
    is used instead. The bed is scaled so its RMS sits GAIN_DB[slug] dB
    relative to the music's RMS, then faded in and out over up to 0.75 s
    (capped at a quarter of the track) before being added.

    Empty or effectively silent music (RMS below 1e-6) is returned as
    float64 with no bed added.

    NOTE(review): len(music) is used as the sample count, so `music` is
    presumably a 1-D mono signal — confirm against callers.
    """
    slug = normalize_slug(slug)
    if slug not in _PROCEDURAL and not (ASSETS / f"{slug}.wav").exists():
        print(
            f"[lofinity] no ambience asset for {slug!r} "
            "(run scripts/make_ambience.py), using vinyl crackle"
        )
        slug = DEFAULT
    music = np.asarray(music, dtype=np.float64)
    if music.size == 0:
        # The mean of an empty array is NaN, which would slip past the
        # silence check below and then crash inside render().
        return music
    music_rms = float(np.sqrt(np.mean(music**2)))
    if music_rms < 1e-6:  # silence in, silence out
        return music
    bed = render(slug, len(music), rate)
    # An all-zero bed has zero RMS; fall back to 1.0 to avoid dividing by 0.
    bed_rms = float(np.sqrt(np.mean(bed**2))) or 1.0
    bed *= music_rms * 10 ** (GAIN_DB[slug] / 20) / bed_rms
    # Ease the bed in and out so it does not start or stop abruptly.
    edge = min(int(rate * 0.75), len(bed) // 4)
    if edge:
        ramp = np.linspace(0.0, 1.0, edge)
        bed[:edge] *= ramp
        bed[-edge:] *= ramp[::-1]
    return music + bed
|