Spaces:

build-small-hackathon
/

LoFinity

Running on Zero

File size: 7,160 Bytes

722a5d8

"""Ambience beds for LoFinity tapes.

MusicGen ignores texture words ("vinyl crackle", "ocean waves"), so the
background layer is mixed in here instead: a bed is rendered at song length
and summed a few dB under the music. Lofi ambience loops through the whole
track anyway, so nothing needs to be generated per song.

vinyl_crackle and tape_hiss are synthesized procedurally (cheap, and never
sound repeated); the other seven are loops in assets/ambience/<slug>.wav,
rendered once by scripts/make_ambience.py and tiled with crossfades. A
missing asset falls back to vinyl crackle so every tape still has texture.
"""

import wave
from pathlib import Path

import numpy as np

# Directory of the sampled loops, resolved relative to this module; each
# sampled bed is looked up here as <slug>.wav.
ASSETS = Path(__file__).parent / "assets" / "ambience"

# Bed RMS relative to the music RMS, in dB (negative = quieter than the
# music). Starting points — tune by ear: spiky textures (crackle, fire) read
# louder than their RMS suggests.
# The keys double as the set of valid slugs: normalize_slug accepts a slug
# only if it appears here.
GAIN_DB: dict[str, float] = {
    "vinyl_crackle": -14.0,
    "tape_hiss": -18.0,
    "soft_rain": -14.0,
    "ocean_waves": -12.0,
    "fireplace_crackle": -14.0,
    "birdsong": -16.0,
    "night_crickets": -16.0,
    "cafe_murmur": -16.0,
    "wind_in_trees": -14.0,
}
# Fallback slug, used when the requested ambience is unrecognizable or its
# asset file is missing.
DEFAULT = "vinyl_crackle"

# (keyword, slug) pairs used when the text is not already an exact slug.
# Checked in order; first hit wins ("fireplace crackle" must match fire
# before crackle can claim it for vinyl).
# Matching is plain substring containment on the lowercased text, so a short
# keyword also fires inside a longer word (e.g. "rain" inside "train").
_KEYWORDS: tuple[tuple[str, str], ...] = (
    ("fire", "fireplace_crackle"),
    ("rain", "soft_rain"),
    ("wave", "ocean_waves"),
    ("ocean", "ocean_waves"),
    ("sea", "ocean_waves"),
    ("bird", "birdsong"),
    ("cricket", "night_crickets"),
    ("cafe", "cafe_murmur"),
    ("coffee", "cafe_murmur"),
    ("murmur", "cafe_murmur"),
    ("chatter", "cafe_murmur"),
    ("wind", "wind_in_trees"),
    ("tree", "wind_in_trees"),
    ("leaves", "wind_in_trees"),
    ("vinyl", "vinyl_crackle"),
    ("crackle", "vinyl_crackle"),
    ("static", "vinyl_crackle"),
    ("record", "vinyl_crackle"),
    ("hiss", "tape_hiss"),
    ("tape", "tape_hiss"),
    ("noise", "tape_hiss"),
)


def normalize_slug(value) -> str:
    """Coerce free-form LLM output into one of the known ambience slugs.

    The text is lowercased and stripped. If it is already a slug once spaces
    and hyphens become underscores, that slug is returned. Otherwise the
    keyword table is scanned in order and the first keyword contained in the
    text decides ("Ocean waves!" -> ocean_waves). Anything that matches
    nothing, including None and other falsy values, becomes the default
    crackle.
    """
    text = (str(value) if value else "").strip().lower()
    candidate = text.translate(str.maketrans(" -", "__"))
    if candidate in GAIN_DB:
        return candidate
    return next((slug for keyword, slug in _KEYWORDS if keyword in text), DEFAULT)


# --- procedural beds ----------------------------------------------------------


def _lowpassed_noise(n: int, rate: int, cutoff: float, rng) -> np.ndarray:
    """Return n samples of dull noise without running a real filter.

    White noise is drawn on a coarse grid of about twice the cutoff (never
    below 200 Hz) and stretched to `rate` by linear interpolation; the
    interpolation itself acts as the lowpass.
    """
    coarse_rate = max(int(cutoff * 2), 200)
    # +2 keeps every interpolation position inside the coarse grid
    coarse_len = max(int(n * coarse_rate / rate) + 2, 2)
    knots = rng.standard_normal(coarse_len)
    positions = np.arange(n) * (coarse_rate / rate)
    return np.interp(positions, np.arange(coarse_len), knots)


def _vinyl_crackle(n: int, rate: int, rng) -> np.ndarray:
    """Synthesize n samples of vinyl surface noise: quiet dust plus sparse pops.

    The dust layer is lowpassed noise (2.5 kHz cutoff) at a low level. On top
    sit about nine pops per second (always at least one), each a 1-4 ms
    exponentially decaying click with a random sign. Pop amplitudes are
    squared uniform draws, which skews them small: mostly ticks, with the
    occasional louder pop. The result is not normalized.

    Returns an empty array when n <= 0: there is nowhere to place a pop, and
    drawing pop positions from an empty range would raise ValueError.
    """
    if n <= 0:
        return np.zeros(0)
    out = _lowpassed_noise(n, rate, 2500, rng) * 0.06
    pop_count = max(int(n / rate * 9), 1)
    for pos in rng.integers(0, n, pop_count):
        length = int(rate * rng.uniform(0.001, 0.004))
        amp = rng.uniform(0.15, 1.0) ** 2 * np.sign(rng.standard_normal())
        # decay constant is a fifth of the pop, so it has died out by the end
        pop = amp * np.exp(-np.arange(length) / (length / 5))
        # a pop that starts near the end of the bed is simply cut short
        end = min(pos + length, n)
        out[pos:end] += pop[: end - pos]
    return out


def _tape_hiss(n: int, rate: int, rng) -> np.ndarray:
    """Synthesize n samples of tape hiss: bright noise with a slow level wobble.

    White noise is blended with its own first difference (which boosts the
    highs), then amplitude-modulated by +/-8% at 0.3 Hz from a random
    starting phase so the hiss breathes instead of sitting perfectly still.
    """
    wobble_hz = 0.3
    white = rng.standard_normal(n)
    # first difference tilts the spectrum toward the highs; sample 0 has no
    # predecessor, so its difference stays at zero
    emphasis = np.zeros(n)
    emphasis[1:] = white[1:] - white[:-1]
    hiss = 0.35 * white + 0.65 * emphasis
    start_phase = rng.uniform(0, 2 * np.pi)
    angle = 2 * np.pi * wobble_hz * np.arange(n) / rate + start_phase
    return hiss * (1.0 + 0.08 * np.sin(angle))


# Slugs that are synthesized in code instead of being read from a wav asset.
# Each value is called as generator(n, rate, rng) and returns n samples.
_PROCEDURAL = {"vinyl_crackle": _vinyl_crackle, "tape_hiss": _tape_hiss}


# --- sampled beds ---------------------------------------------------------------


def _read_wav(path: Path) -> tuple[np.ndarray, int]:
    """Load a 16-bit PCM wav as mono float samples.

    Returns (samples, sample_rate). Samples are scaled by 1/32768 into
    [-1, 1); files with more than one channel are averaged down to mono.

    Raises ValueError when the file is not 16-bit.
    """
    with wave.open(str(path), "rb") as reader:
        params = reader.getparams()
        frames = reader.readframes(params.nframes)
    if params.sampwidth != 2:
        bits = params.sampwidth * 8
        raise ValueError(f"{path.name}: expected 16-bit wav, got {bits}-bit")
    samples = np.frombuffer(frames, dtype="<i2").astype(np.float64) / 32768.0
    if params.nchannels > 1:
        # frames are interleaved: one row per frame, one column per channel
        samples = samples.reshape(-1, params.nchannels).mean(axis=1)
    return samples, params.framerate


def _resample(data: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
    """Convert `data` from src_rate to dst_rate by linear interpolation.

    When the rates already match, the input array itself is returned (not a
    copy). No anti-alias filtering is applied.
    """
    if src_rate != dst_rate:
        out_len = int(len(data) * dst_rate / src_rate)
        # where each output sample falls on the source sample grid
        src_positions = np.arange(out_len) * (src_rate / dst_rate)
        return np.interp(src_positions, np.arange(len(data)), data)
    return data


def _tile(loop: np.ndarray, n: int, rate: int) -> np.ndarray:
    """Stretch `loop` to n samples by repeating it with crossfaded seams.

    A loop that is already long enough is just truncated (as a copy).
    Otherwise copies are laid down so that each one overlaps the previous by
    up to half a second (at most a quarter of the loop), fading the old tail
    out while the new head fades in, so the loop does not need to be
    seamless. Loops too short to fade at all are repeated back to back.

    The ramps are equal-power (square root of a linear ramp): the tail and
    head being blended are uncorrelated audio, so linear ramps would dip by
    about 3 dB at the middle of every seam. With sqrt ramps the two squared
    gains always add up to 1, which holds the power steady.
    """
    size = len(loop)
    if size >= n:
        return loop[:n].copy()
    overlap = min(int(rate * 0.5), size // 4)
    if not overlap:
        repeats = n // size + 1
        return np.tile(loop, repeats)[:n]
    fade_in = np.sqrt(np.linspace(0.0, 1.0, overlap))
    fade_out = fade_in[::-1]
    # one spare loop of headroom so the last copy can be written whole
    canvas = np.zeros(n + size)
    hop = size - overlap
    for start in range(0, n, hop):
        piece = loop.copy()
        if start > 0:  # the very first copy starts at full level
            piece[:overlap] *= fade_in
        piece[-overlap:] *= fade_out
        canvas[start : start + size] += piece
    return canvas[:n]


# --- public API -----------------------------------------------------------------


def render(slug: str, n: int, rate: int) -> np.ndarray:
    """Return a peak-normalized bed of n samples at `rate`; the caller sets the level.

    Procedural slugs are synthesized with a fresh, unseeded generator, so two
    renders of the same slug differ. Any other slug is read from
    <slug>.wav in the ambience asset directory, resampled to `rate`, and
    tiled out to n samples; nothing here falls back when that file is
    missing, so callers wanting a fallback must check before calling.

    A non-positive n yields an empty array: there is nothing to render, and
    the peak of a zero-length bed is undefined. An all-zero bed is returned
    as is rather than divided by zero.
    """
    if n <= 0:
        return np.zeros(0)
    if slug in _PROCEDURAL:
        bed = _PROCEDURAL[slug](n, rate, np.random.default_rng())
    else:
        loop, loop_rate = _read_wav(ASSETS / f"{slug}.wav")
        bed = _tile(_resample(loop, loop_rate, rate), n, rate)
    peak = float(np.abs(bed).max())
    return bed / (peak or 1.0)


def mix(music, rate: int, slug: str) -> np.ndarray:
    """Sum the ambience bed under the music at its slug's relative RMS level.

    `music` is converted to float64. `slug` may be free-form text; it is
    normalized to a known slug, and a sampled slug whose wav asset is missing
    falls back to the default crackle with a printed notice. The bed is
    rendered at the music's length, scaled so its RMS sits GAIN_DB[slug] dB
    relative to the music's RMS, faded in and out over up to 0.75 s (at most
    a quarter of the track each), and added to the music.

    Empty or effectively silent input is returned unchanged, before any slug
    handling: an empty track has no RMS to level against (its mean is NaN,
    which would slip past the silence threshold) and nothing to render.
    """
    # NOTE(review): the code below treats music as a 1-D mono signal
    # (len(music) samples); confirm callers never pass (channels, n) or
    # (n, channels) arrays, which would broadcast against the 1-D bed.
    music = np.asarray(music, dtype=np.float64)
    if music.size == 0:
        return music
    music_rms = float(np.sqrt(np.mean(music**2)))
    if music_rms < 1e-6:  # silence in, silence out
        return music
    slug = normalize_slug(slug)
    if slug not in _PROCEDURAL and not (ASSETS / f"{slug}.wav").exists():
        print(
            f"[lofinity] no ambience asset for {slug!r} "
            "(run scripts/make_ambience.py), using vinyl crackle"
        )
        slug = DEFAULT
    bed = render(slug, len(music), rate)
    bed_rms = float(np.sqrt(np.mean(bed**2))) or 1.0
    # dB offset -> linear amplitude ratio, applied relative to the music level
    bed *= music_rms * 10 ** (GAIN_DB[slug] / 20) / bed_rms
    # ease the bed in and out so it does not start or stop abruptly
    edge = min(int(rate * 0.75), len(bed) // 4)
    if edge:
        ramp = np.linspace(0.0, 1.0, edge)
        bed[:edge] *= ramp
        bed[-edge:] *= ramp[::-1]
    return music + bed