"""LoFinity — a vending machine that dispenses endless chill beats.

Gradio Server backend: serves the Three.js frontend and exposes the
generation API.

Pipeline: user vibe -> a small LLM enriches it into a MusicGen prompt +
cassette title + ambience pick -> MusicGen renders the music -> ambience.py
loops a background bed (waves, crackle, rain…) underneath. MusicGen ignores
texture words in prompts, hence the separate bed.

The enrichment LLM is MiniCPM (on cuda) on a ZeroGPU Space, or a local Ollama
daemon in dev. On a ZeroGPU Space it runs musicgen-medium and allows tapes up
to 90s (chunked); without a GPU it falls back to musicgen-small and a single
30s shot (no chunking).

Env knobs:
  LOFINITY_ENGINE      musicgen (default) | stub
  LOFINITY_DURATION    clip length in seconds (default 30, the single-shot max)
  LOFINITY_DEVICE      cuda | mps | cpu (default: cuda on ZeroGPU, else mps if
                       available)
  LOFINITY_MUSICGEN    model id (default: musicgen-medium on ZeroGPU, else
                       musicgen-small)
  LOFINITY_OVERLAP_S   continuation seed length, seconds (default 2)
  LOFINITY_MAX_GEN_S   cap on a continuation's total output, seconds (default 28)
  LOFINITY_ENRICHER    MiniCPM model id for ZeroGPU enrichment (default
                       MiniCPM5-1B)
  OLLAMA_URL           default http://localhost:11434 (local enrichment)
  OLLAMA_MODEL         default llama3.2:3b (local enrichment)
"""

import base64
import io
import json
import os
import threading
import time
import wave
from pathlib import Path

import httpx
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from gradio.server import Server

# Directory holding this module, and the static frontend served from it.
ROOT = Path(__file__).parent
FRONTEND = ROOT / "frontend"

# ZeroGPU: on a Hugging Face ZeroGPU Space a GPU is attached only for the
# duration of a function wrapped in @spaces.GPU, then released. The `spaces`
# package exists only in that runtime; locally we shim @spaces.GPU to a no-op so
# the exact same code runs on mps/cpu untouched.
#
# Detect ZeroGPU via the spaces library's OWN config rather than parsing the env
# var ourselves: Config.zero_gpu is the very flag that gates @spaces.GPU (see
# spaces/zero/decorator.py), so IS_ZEROGPU ends up True precisely when the
# decorator will really allocate a GPU. (Our first attempt compared
# SPACES_ZERO_GPU to the literal "true", but the runtime sets it to a value the
# library parses loosely — "1"/"t"/"true" — so the exact match failed and the
# GPU path never fired: generation silently fell back to CPU.)
IS_ZEROGPU = False
try:
    import spaces  # provided by the ZeroGPU Space runtime
    from spaces.config import Config as _ZeroGPUConfig

    IS_ZEROGPU = bool(_ZeroGPUConfig.zero_gpu)
except ImportError:  # local dev / non-ZeroGPU — make the decorator harmless

    class _SpacesShim:
        """Local stand-in for the `spaces` package: its GPU decorator is a no-op."""

        @staticmethod
        def GPU(*args, **kwargs):
            """Return the decorated function unchanged.

            Supports both spellings: bare `@spaces.GPU` (the function arrives
            as the first positional argument) and parameterised
            `@spaces.GPU(duration=...)` (a decorator must be returned).
            """
            target = args[0] if args else None
            if callable(target):
                return target

            def _passthrough(fn):
                return fn

            return _passthrough

    spaces = _SpacesShim()

# One startup line so the Space logs show which hardware path was chosen.
_raw_flag = os.environ.get('SPACES_ZERO_GPU')
print(
    f"[lofinity] startup: IS_ZEROGPU={IS_ZEROGPU} "
    f"(SPACES_ZERO_GPU={_raw_flag!r})"
)
del _raw_flag

ENGINE = os.environ.get("LOFINITY_ENGINE", "musicgen")

# Model + tape length scale with the hardware: a ZeroGPU Space gets the bigger,
# cleaner-continuing musicgen-medium and full chunked tapes (up to 90s); without a
# GPU we fall back to the smaller, faster musicgen-small and a single 30s shot
# (medium + chunking on CPU would take minutes). The env var still overrides.
if IS_ZEROGPU:
    MUSICGEN_MODEL = os.environ.get("LOFINITY_MUSICGEN", "facebook/musicgen-medium")
else:
    MUSICGEN_MODEL = os.environ.get("LOFINITY_MUSICGEN", "facebook/musicgen-small")

# 30s is musicgen-small's single-shot max (1500 tokens). Longer tapes are
# stitched from 30s chunks: each one re-seeds the model with the last OVERLAP_S
# of the track so it keeps playing from there. musicgen-small's context is 2048
# tokens (~41s at 50 tok/s), so a 2s seed + 30s of new audio (~1600 tokens) fits.
CHUNK_S = 30 # length of each musicgen single-shot OVERLAP_S = float(os.getenv("LOFINITY_OVERLAP_S", "2")) # seconds of tail fed back as the continuation seed; shorter leans more on the text prompt # musicgen is trained on 30s clips, so a single shot longer than ~30s degrades # its tail into noise. A continuation prepends the seed THEN generates, so cap its # total output (seed + new) at MAX_GEN_S to stay inside that window. Env-tunable. MAX_GEN_S = float(os.getenv("LOFINITY_MAX_GEN_S", "28")) SEAM_S = 0.4 # equal-power crossfade at each stitch, to hide the join # the tape lengths the API allows (it snaps any request to the nearest). Only a # GPU gets the longer, chunked tapes; a CPU-only fallback is capped to one 30s shot. ALLOWED_SECONDS = (30, 60, 90) if IS_ZEROGPU else (30,) DEFAULT_SECONDS = int(os.getenv("LOFINITY_DURATION", "30")) OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434") OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3.2:3b") # A ZeroGPU brew renders in a separate GPU worker process, so /api/progress can't # read real per-chunk progress from it; it reports a time-based estimate from this # rough per-length budget instead (tunable; only affects the cosmetic brew bar). GPU_WARMUP_S = 5.0 # enrichment + cold-start/queue allowance before audio flows GPU_SECS_PER_CHUNK = 25.0 # rough GPU render time per 30s chunk (musicgen-medium) app = Server(title="LoFinity") # How far along the current brew is, in whole 30s chunks. generate_song runs one # at a time (concurrency_limit=1), so a single shared counter is enough; the # frontend polls /api/progress to fill its brewing bar. _PROGRESS = {"done": 0, "total": 1} # A ZeroGPU brew's wall-clock start + estimated total, so /api/progress can report # a smooth time-based estimate while the GPU worker is busy (see progress()). 
_BREW = {"active": False, "start": 0.0, "est": 1.0, "total": 1}


# --- prompt enrichment --------------------------------------------------------

# System prompt for the enrichment LLM. The template line names its slots
# (<instruments>, <mood>) so they line up with the two bullets that follow.
# NOTE(review): the slot names were restored after an extraction pass stripped
# them, leaving "lofi chill, , , , , slow tempo" — confirm against the
# canonical prompt.
ENRICH_SYSTEM = """\
You are the creative brain of LoFinity, a magical vending machine that sells
lofi cassette tapes. The user gives you a vibe. Reply ONLY with JSON with
exactly these three keys:
{"music_prompt": "...", "title": "...", "ambience": "..."}

Build music_prompt from this template, in this order:
"lofi chill, <instruments>, <mood>, slow tempo, 75 bpm, instrumental"
- instruments: 2-3 picked to EVOKE the user's vibe, never a default set
  (island -> ukulele, kalimba, steel pan; rainy city -> rhodes piano, soft guitar;
  winter -> felt piano, soft strings; desert -> slide guitar, hand drums)
- mood: one or two calm words; never energetic, no vocals

ambience: the background sound layered under the music. Exactly one of:
vinyl_crackle, tape_hiss, soft_rain, ocean_waves, fireplace_crackle, birdsong,
night_crickets, cafe_murmur, wind_in_trees. Match it to the vibe.

title: a cozy cassette tape title inspired by the vibe, max 5 words, Title Case,
no quotes or emoji.

Examples:
user: island summer
{"music_prompt": "lofi chill, ukulele, kalimba, steel pan, breezy and warm, slow tempo, 75 bpm, instrumental", "title": "Coconut Daydream", "ambience": "ocean_waves"}
user: studying at midnight
{"music_prompt": "lofi chill, rhodes piano, muted guitar, soft bass, focused and calm, slow tempo, 75 bpm, instrumental", "title": "Midnight Study Session", "ambience": "vinyl_crackle"}"""

# MiniCPM enrichment LLM (ZeroGPU only — a Space has no Ollama daemon).
# MiniCPM5-1B is a standard LlamaForCausalLM (no trust_remote_code, fast
# tokenizer) with a switchable thinking mode we keep OFF so the reply is direct
# JSON. Needs transformers>=5.6 (the Space's latest satisfies it); no extra deps.
ENRICHER_MODEL = os.getenv("LOFINITY_ENRICHER", "openbmb/MiniCPM5-1B")
_enricher = None  # cached (tokenizer, model) pair, filled by load_enricher()
_enricher_lock = threading.Lock()
_enricher_disabled = False  # set if the model can't load; forces the fallback

# Closing marker of a leaked reasoning section in an LLM reply; everything up to
# and including the last one is discarded before the JSON is searched for.
# NOTE(review): the literal was lost in extraction (the code read `"" in text`,
# which is always true and makes rsplit raise "empty separator"); "</think>" is
# the presumed original — confirm against the model's chat template.
_THINK_END = "</think>"


def load_enricher():
    """Lazy-load the MiniCPM enrichment LLM on cuda (ZeroGPU).

    Like MusicGen it is placed on cuda at module level; standard Llama arch, so
    no remote code. The load happens once under `_enricher_lock`; later calls
    return the cached pair.

    Returns:
        The `(tokenizer, model)` tuple.
    """
    global _enricher
    with _enricher_lock:
        if _enricher is None:
            import torch  # noqa: F401 — needed so the .to('cuda') below resolves
            from transformers import AutoModelForCausalLM, AutoTokenizer

            print(f"[lofinity] loading enricher {ENRICHER_MODEL} on cuda…")
            tok = AutoTokenizer.from_pretrained(ENRICHER_MODEL)
            model = AutoModelForCausalLM.from_pretrained(ENRICHER_MODEL, torch_dtype="auto")
            model.to("cuda")
            model.eval()
            _enricher = (tok, model)
            print("[lofinity] enricher ready")
    return _enricher


def _parse_enrich_json(text: str) -> dict:
    """Pull the first JSON object out of an LLM reply.

    The reply may wrap the JSON in prose or ```json fences, or leak a thinking
    block before it. Each `{` is tried in turn with the JSON decoder, so braces
    in surrounding prose cannot spoil an otherwise valid object.

    Returns:
        The first object that decodes to a dict, or `{}` if nothing parses.
    """
    # belt-and-suspenders if thinking ever leaks through: keep only what
    # follows the last closing marker
    if _THINK_END in text:
        text = text.rsplit(_THINK_END, 1)[1]
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(text, start)
        except (ValueError, RecursionError):
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text.find("{", start + 1)
    return {}


def _finalize_enrichment(data: dict):
    """Shared post-processing for any backend.

    Validates the reply, forces the genre to lead the prompt, and snaps the
    ambience to a renderable bed.

    Returns:
        `(music_prompt, title, ambience_slug)`, or None if the reply is
        unusable (not a dict, or missing the prompt or the title).
    """
    import ambience

    if not isinstance(data, dict):  # e.g. the LLM answered with a JSON list
        return None
    music_prompt = str(data.get("music_prompt") or "").strip()
    title = str(data.get("title") or "").strip()[:48]
    if not (music_prompt and title):
        return None
    # belt and suspenders: the genre must lead even if the LLM drifts
    if "lofi" not in music_prompt.lower():
        music_prompt = f"lofi chill, {music_prompt}"
    # whatever the LLM picked, snap it to a bed we can actually render
    return music_prompt, title, ambience.normalize_slug(data.get("ambience"))


def _enrich_minicpm(prompt: str):
    """Enrich via MiniCPM on cuda. MUST run inside @spaces.GPU.

    Thinking mode is off so the reply is direct JSON.

    Returns:
        `(music_prompt, title, ambience_slug)`, or None (caller falls back) when
        the enricher is disabled or its reply is unusable.
    """
    if _enricher_disabled:
        return None
    import torch

    tok, model = load_enricher()
    messages = [
        {"role": "system", "content": ENRICH_SYSTEM},
        {"role": "user", "content": prompt},
    ]
    inputs = tok.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        enable_thinking=False,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs, max_new_tokens=220, do_sample=True, temperature=0.7, top_p=0.95
        )
    # decode only the newly generated tokens, not the echoed prompt
    reply = tok.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
    return _finalize_enrichment(_parse_enrich_json(reply))


def _enrich_ollama(prompt: str):
    """Enrich via a local Ollama daemon.

    Returns:
        `(music_prompt, title, ambience_slug)`, or None if the reply is unusable.

    Raises:
        Whatever the HTTP call or JSON decoding raises; `enrich_prompt` catches
        it and uses the plain fallback.
    """
    r = httpx.post(
        f"{OLLAMA_URL}/api/chat",
        json={
            "model": OLLAMA_MODEL,
            "messages": [
                {"role": "system", "content": ENRICH_SYSTEM},
                {"role": "user", "content": prompt},
            ],
            "format": "json",
            "stream": False,
            "options": {"temperature": 0.8, "num_predict": 220},
        },
        timeout=45,
    )
    r.raise_for_status()
    return _finalize_enrichment(json.loads(r.json()["message"]["content"]))


def _enrich_fallback(prompt: str) -> tuple[str, str, str]:
    """Plain, LLM-free enrichment — used whenever the chosen backend fails.

    The vibe is stripped before use, and left out of the music prompt entirely
    when it is empty, so the prompt never contains a blank ", ," field.
    """
    import ambience

    vibe = (prompt or "").strip()
    title = f"{vibe[:28].strip().title()} Tape" if vibe else "Untitled Tape"
    fields = ["lofi chill"]
    if vibe:
        fields.append(vibe)
    fields.append("mellow and warm, soft drums, slow tempo, instrumental")
    return ", ".join(fields), title, ambience.DEFAULT


def enrich_prompt(prompt: str) -> tuple[str, str, str]:
    """Vibe -> (music_prompt, cassette title, ambience slug).

    Backend is chosen by environment: MiniCPM on ZeroGPU, Ollama locally; a
    plain fallback covers any failure. On ZeroGPU this MUST be called inside
    @spaces.GPU (MiniCPM is cuda).
    """
    backend = _enrich_minicpm if IS_ZEROGPU else _enrich_ollama
    try:
        result = backend(prompt)
        if result:
            return result
        print("[lofinity] enrichment returned junk, using fallback")
    except Exception as e:  # noqa: BLE001 — any failure means "use fallback"
        print(f"[lofinity] enrichment failed ({e!r}), using fallback")
    return _enrich_fallback(prompt)


# --- audio engines ------------------------------------------------------------

_musicgen = None  # cached (processor, model, device) triple
_musicgen_lock = threading.Lock()


def load_musicgen():
    """Lazy-load MusicGen once.

    The first call downloads the model from the Hub (musicgen-medium is
    ~3.5 GB). Device: LOFINITY_DEVICE if set, else cuda on ZeroGPU, else mps
    when available, else cpu.

    Returns:
        The cached `(processor, model, device)` tuple.
    """
    global _musicgen
    with _musicgen_lock:
        if _musicgen is None:
            os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
            import torch
            from transformers import AutoProcessor, MusicgenForConditionalGeneration

            requested = os.getenv("LOFINITY_DEVICE")
            if requested:
                device = requested
            elif IS_ZEROGPU:
                device = "cuda"
            elif torch.backends.mps.is_available():
                device = "mps"
            else:
                device = "cpu"
            print(f"[lofinity] loading {MUSICGEN_MODEL} on {device}…")
            processor = AutoProcessor.from_pretrained(MUSICGEN_MODEL)
            model = MusicgenForConditionalGeneration.from_pretrained(MUSICGEN_MODEL)
            model.to(device)
            model.eval()
            _musicgen = (processor, model, device)
            print("[lofinity] musicgen ready")
    return _musicgen


# ZeroGPU wants models resident on cuda at startup (module import time), not
# lazily inside the @spaces.GPU call — outside the decorated function a CUDA
# emulation layer lets this .to('cuda') succeed without a real GPU attached, and
# placements done at startup are far more efficient than per-call transfers.
if IS_ZEROGPU and ENGINE != "stub":
    load_musicgen()
    try:
        load_enricher()
    except Exception as e:  # noqa: BLE001 — a bad enricher must not kill the app
        _enricher_disabled = True
        print(f"[lofinity] enricher load failed ({e!r}); vends use the plain fallback")


def encode_wav(samples, rate: int) -> str:
    """Encode mono float samples as a base64 WAV data URI, entirely in memory.

    Nothing is written to disk: on a shared HF Space a songs directory is
    visible to every visitor and grows without bound. Returning the tape inline
    keeps it private to the one request that asked for it.

    Args:
        samples: mono float samples, nominally in [-1, 1].
        rate: sample rate in Hz.

    Returns:
        A `data:audio/wav;base64,...` URI holding 16-bit PCM.
    """
    import numpy as np

    samples = np.asarray(samples, dtype=np.float32)
    # MusicGen can exceed [-1, 1]; normalize instead of hard-clipping
    # (an empty clip has no peak to measure, so skip straight to encoding)
    peak = float(np.abs(samples).max()) if samples.size else 0.0
    if peak > 0.95:
        samples = samples * (0.95 / peak)
    pcm = (samples * 32767).astype("<i2")
    # NOTE(review): everything from here to the return was reconstructed — the
    # original text was lost after `.astype("`. It follows the docstring and
    # the file's base64/io/wave imports; confirm against the canonical source.
    buf = io.BytesIO()
    with wave.open(buf, "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)  # 16-bit samples
        w.setframerate(int(rate))
        w.writeframes(pcm.tobytes())
    return "data:audio/wav;base64," + base64.b64encode(buf.getvalue()).decode("ascii")


def _rms(x) -> float:
    """Root-mean-square level of a sample array; 0.0 for an empty one."""
    import numpy as np

    return float(np.sqrt(np.mean(np.square(x)))) if len(x) else 0.0


def _match_rms(chunk, target: float):
    """Scale a fresh continuation to the first chunk's loudness.

    MusicGen continuations drift quieter each round; re-leveling keeps a long
    tape even. A near-silent chunk or target is returned untouched.
    """
    r = _rms(chunk)
    if r < 1e-6 or target < 1e-6:
        return chunk
    return chunk * min(4.0, target / r)  # cap the boost so a quiet tail can't blow up


def _stitch(base, tail, rate: int):
    """Append `tail` to `base` with a short equal-power crossfade at the seam.

    Falls back to plain concatenation when either piece is shorter than the
    SEAM_S crossfade window.
    """
    import numpy as np

    seam = int(SEAM_S * rate)
    if seam <= 0 or len(base) < seam or len(tail) < seam:
        return np.concatenate([base, tail])
    t = np.linspace(0.0, 1.0, seam)
    fade_out, fade_in = np.cos(t * np.pi / 2), np.sin(t * np.pi / 2)
    blended = base[-seam:] * fade_out + tail[:seam] * fade_in
    return np.concatenate([base[:-seam], blended, tail[seam:]])


def musicgen_engine(music_prompt: str, seconds: int = CHUNK_S, progress_cb=None) -> tuple:
    """Render a tape with MusicGen.

    Tapes longer than one 30s shot are built by re-seeding the model with the
    last OVERLAP_S and stitching the chunks.

    Args:
        music_prompt: the enriched text prompt.
        seconds: requested tape length; rounded to whole CHUNK_S chunks.
        progress_cb: optional `progress_cb(done, total)`, called after each
            chunk finishes.

    Returns:
        `(samples, sample_rate)`.

    Raises:
        Any generation error, except on mps, where one retry on cpu is made.
    """
    global _musicgen
    import torch

    processor, model, device = load_musicgen()
    rate = model.config.audio_encoder.sampling_rate
    # clamp at 0: a zero/negative OVERLAP_S must not turn `track[-overlap:]`
    # into "the whole track"
    overlap = max(0, int(OVERLAP_S * rate))
    # a continuation prepends the OVERLAP_S seed, so it may generate only
    # MAX_GEN_S - OVERLAP_S NEW seconds to keep the whole shot inside musicgen's
    # ~30s training window — generating past it is what turns the tail to noise
    cont_new_s = max(1.0, MAX_GEN_S - OVERLAP_S)
    rounds = max(0, round(seconds / CHUNK_S) - 1)  # 30->0, 60->1, 90->2
    total = rounds + 1

    def shot(dev, seed=None, new_s=CHUNK_S):
        """One generate() call: text-only, or continuing from `seed` audio."""
        if seed is None:
            inputs = processor(text=[music_prompt], padding=True, return_tensors="pt")
        else:
            inputs = processor(
                audio=seed,
                sampling_rate=rate,
                text=[music_prompt],
                padding=True,
                return_tensors="pt",
            )
        inputs = inputs.to(dev)
        with torch.no_grad():
            audio = model.generate(
                **inputs,
                do_sample=True,
                guidance_scale=3.0,
                max_new_tokens=int(new_s * 50),  # ~50 tokens per second
            )
        return audio[0, 0].cpu().float().numpy()

    def build(dev):
        """Render the first shot, then stitch on `rounds` continuations."""
        track = shot(dev)  # first 30s straight from the prompt
        if progress_cb:
            progress_cb(1, total)
        base_rms = _rms(track)
        for i in range(rounds):
            # with no overlap configured there is nothing to seed with, so the
            # continuation is a fresh text-only shot
            seed = track[-overlap:] if overlap > 0 else None
            out = shot(dev, seed=seed, new_s=cont_new_s)  # capped continuation
            fresh = _match_rms(out[overlap:], base_rms)  # drop the re-encoded seed
            track = _stitch(track, fresh, rate)
            if progress_cb:
                progress_cb(i + 2, total)
            print(f"[lofinity] stitched chunk {i + 2}/{total}")
        return track

    try:
        samples = build(device)
    except Exception as e:  # noqa: BLE001
        if device != "mps":
            raise
        print(f"[lofinity] mps generation failed ({e!r}), retrying on cpu")
        model.to("cpu")
        # remember the move: otherwise the cache keeps saying "mps" while the
        # model sits on cpu, and every later call fails once before retrying
        with _musicgen_lock:
            _musicgen = (processor, model, "cpu")
        samples = build("cpu")
    return samples, rate


def _gpu_budget(prompt: str, seconds: int = CHUNK_S) -> int:
    """GPU seconds to request from ZeroGPU for a brew of this length.

    Covers MiniCPM enrichment + per-chunk MusicGen render plus headroom.
    Tighter budgets earn better queue priority; the signature must mirror
    gpu_brew so ZeroGPU can pass it the same args.
    """
    chunks = max(1, round(int(seconds) / CHUNK_S))
    # musicgen-medium renders slower than -small, so budget generously — a brew
    # that overruns its @spaces.GPU duration is KILLED mid-render; over-budgeting
    # only costs quota, under-budgeting loses the tape.
    return 40 + 40 * chunks  # 30s->80, 60s->120, 90s->160


@spaces.GPU(duration=_gpu_budget)
def gpu_brew(prompt: str, seconds: int = CHUNK_S) -> tuple:
    """ZeroGPU entry point — enrichment (MiniCPM) AND MusicGen on the real GPU
    in a single acquisition.

    Takes the raw vibe and returns (music_prompt, title, bed, samples, rate).
    It runs in a separate GPU worker process and can't push per-chunk progress
    back to the web process, so /api/progress reports a time-based estimate for
    the bar. This path is Space-only.
    """
    music_prompt, title, bed = enrich_prompt(prompt)
    samples, rate = musicgen_engine(music_prompt, seconds)
    return music_prompt, title, bed, samples, rate


def stub_engine(_music_prompt: str, seconds: int = CHUNK_S, progress_cb=None) -> tuple:
    """A short audible tone — handy when developing without the heavy model.

    Honors `seconds` and fakes per-chunk timing so the length slider and the
    brewing progress bar can be exercised without MusicGen.

    Returns:
        `(samples, sample_rate)` — a faded 220 Hz sine at 22050 Hz.
    """
    import time

    import numpy as np

    rate = 22050
    total = max(1, round(seconds / CHUNK_S))
    for i in range(total):
        time.sleep(0.8)  # pretend each 30s chunk takes a moment to render
        if progress_cb:
            progress_cb(i + 1, total)
    secs = float(seconds)
    t = np.arange(int(rate * secs)) / rate
    fade = np.minimum(1.0, np.minimum(t * 4, (secs - t) * 4))
    # a slow wobble so a longer stub is audibly (and visibly) longer
    return 0.25 * fade * np.sin(2 * np.pi * 220 * t) * (0.8 + 0.2 * np.sin(t)), rate


# --- API -----------------------------------------------------------------------


@app.api(name="generate_song", concurrency_limit=1)
def generate_song(prompt: str, seconds: int = DEFAULT_SECONDS) -> dict:
    """Brew one tape: enrich the vibe, render the music, mix in the ambience.

    Args:
        prompt: the user's raw vibe.
        seconds: requested length; snapped to the nearest ALLOWED_SECONDS.

    Returns:
        `{"title": ..., "audio": ...}` where audio is a base64 WAV data URI.
    """
    import ambience

    # snap whatever the slider sends to a length we can actually build
    seconds = min(ALLOWED_SECONDS, key=lambda s: abs(s - int(seconds)))
    # reset progress up front, BEFORE the (sometimes slow) enrich step, so a poll
    # arriving early sees this brew at 0% rather than the last one at 100%
    chunks = max(1, round(seconds / CHUNK_S))
    _PROGRESS.update(done=0, total=chunks)

    if IS_ZEROGPU and ENGINE != "stub":
        # On ZeroGPU enrichment (MiniCPM) and MusicGen share ONE @spaces.GPU
        # acquisition in a separate worker process, which can't push real progress
        # back — so /api/progress reports a smooth time-based ESTIMATE driven by
        # this brew's start + budget (capped <100% until the tape actually lands).
        est = GPU_WARMUP_S + GPU_SECS_PER_CHUNK * chunks
        _BREW.update(active=True, start=time.monotonic(), est=est, total=chunks)
        print(f"[lofinity] brewing on GPU :: {prompt!r} ({seconds}s, ~{est:.0f}s est)")
        try:
            music_prompt, title, bed, samples, rate = gpu_brew(prompt, seconds)
        finally:
            # top the bar off BEFORE clearing active, so a poll landing in between
            # reads 100% (from _PROGRESS), never the 0% this brew started at
            _PROGRESS.update(done=chunks, total=chunks)
            _BREW.update(active=False)
        print(f"[lofinity] brewed {title!r} :: {music_prompt} [+ {bed}]")
    else:
        # Local / stub: enrich in-process (Ollama or fallback), then render with
        # live per-chunk progress for the brewing garden.
        music_prompt, title, bed = enrich_prompt(prompt)
        print(f"[lofinity] brewing {title!r} ({seconds}s) :: {music_prompt} [+ {bed}]")
        engine = stub_engine if ENGINE == "stub" else musicgen_engine
        samples, rate = engine(
            music_prompt,
            seconds,
            progress_cb=lambda d, t: _PROGRESS.update(done=d, total=t),
        )
        _PROGRESS.update(done=chunks, total=chunks)

    try:
        samples = ambience.mix(samples, rate, bed)
    except Exception as e:  # noqa: BLE001 — a dry tape beats a failed vend
        print(f"[lofinity] ambience mix failed ({e!r}), vending without the bed")

    # The tape rides back inline as a base64 data URI — no disk write, so it is
    # never cached on the Space nor shared with other visitors. The frontend
    # keeps the collection client-side, per browser session.
    return {"title": title, "audio": encode_wav(samples, rate)}


@app.get("/api/progress")
def progress() -> dict:
    """Progress for the brewing bar.

    Local/stub report real per-chunk progress via _PROGRESS. A ZeroGPU brew runs
    in a separate GPU worker that can't push progress back, so report a smooth
    time-based ESTIMATE instead: a fractional `done` (the frontend fills the bar
    to done/total) capped below 100% until the real tape lands and _PROGRESS
    tops it off.
    """
    if _BREW["active"] and _BREW["est"] > 0:
        elapsed = time.monotonic() - _BREW["start"]
        frac = min(0.92, elapsed / _BREW["est"])
        return {"done": round(frac * _BREW["total"], 3), "total": _BREW["total"]}
    return dict(_PROGRESS)


@app.get("/api/config")
def config() -> dict:
    """Frontend config: the tape lengths this backend allows.

    Hardware-dependent — a CPU-only fallback offers only 30s — so the slider
    reads it and adapts.
    """
    return {"allowed_seconds": list(ALLOWED_SECONDS)}


@app.get("/")
async def homepage():
    """Serve the single-page frontend."""
    return FileResponse(FRONTEND / "index.html")


app.mount("/static", StaticFiles(directory=FRONTEND), name="static")


if __name__ == "__main__":
    app.launch(show_error=True)