Spaces:
Running on Zero
Running on Zero
| """LoFinity — a vending machine that dispenses endless chill beats. | |
| Gradio Server backend: serves the Three.js frontend and exposes the | |
| generation API. | |
| Pipeline: user vibe -> a small LLM enriches it into a MusicGen prompt + | |
| cassette title + ambience pick -> MusicGen renders the music -> ambience.py | |
| loops a background bed (waves, crackle, rain…) underneath. MusicGen ignores | |
| texture words in prompts, hence the separate bed. The enrichment LLM is | |
| MiniCPM (on cuda) on a ZeroGPU Space, or a local Ollama daemon in dev. | |
| On a ZeroGPU Space it runs musicgen-medium and allows tapes up to 90s (chunked); | |
| without a GPU it falls back to musicgen-small and a single 30s shot (no chunking). | |
| Env knobs: | |
| LOFINITY_ENGINE musicgen (default) | stub | |
| LOFINITY_DURATION clip length in seconds (default 30, the single-shot max) | |
| LOFINITY_DEVICE cuda | mps | cpu (default: cuda on ZeroGPU, else mps if available) | |
| LOFINITY_MUSICGEN model id (default: musicgen-medium on ZeroGPU, else musicgen-small) | |
| LOFINITY_OVERLAP_S continuation seed length, seconds (default 2) | |
| LOFINITY_MAX_GEN_S cap on a continuation's total output, seconds (default 28) | |
| LOFINITY_ENRICHER MiniCPM model id for ZeroGPU enrichment (default MiniCPM5-1B) | |
| OLLAMA_URL default http://localhost:11434 (local enrichment) | |
| OLLAMA_MODEL default llama3.2:3b (local enrichment) | |
| """ | |
| import base64 | |
| import io | |
| import json | |
| import os | |
| import threading | |
| import time | |
| import wave | |
| from pathlib import Path | |
| import httpx | |
| from fastapi.responses import FileResponse | |
| from fastapi.staticfiles import StaticFiles | |
| from gradio.server import Server | |
| ROOT = Path(__file__).parent | |
| FRONTEND = ROOT / "frontend" | |
# ZeroGPU: on a Hugging Face ZeroGPU Space a GPU is attached only for the
# duration of a function wrapped in @spaces.GPU, then released. The `spaces`
# package exists only in that runtime; locally we shim @spaces.GPU to a no-op so
# the exact same code runs on mps/cpu untouched.
#
# Detect ZeroGPU via the spaces library's OWN config rather than parsing the env
# var ourselves: Config.zero_gpu is the very flag that gates @spaces.GPU (see
# spaces/zero/decorator.py), so IS_ZEROGPU ends up True precisely when the
# decorator will really allocate a GPU. (Our first attempt compared
# SPACES_ZERO_GPU to the literal "true", but the runtime sets it to a value the
# library parses loosely — "1"/"t"/"true" — so the exact match failed and the
# GPU path never fired: generation silently fell back to CPU.)
IS_ZEROGPU = False
try:
    import spaces  # provided by the ZeroGPU Space runtime
    from spaces.config import Config as _ZeroGPUConfig

    IS_ZEROGPU = bool(_ZeroGPUConfig.zero_gpu)
except ImportError:  # local dev / non-ZeroGPU — make the decorator harmless

    class _SpacesShim:
        """Local stand-in for the `spaces` package: `GPU` decorates nothing."""

        # Must be a staticmethod: it is called on an INSTANCE (`spaces.GPU`),
        # and without it the instance itself arrives as args[0], so a bare
        # `@spaces.GPU` would swap the decorated function for an identity lambda.
        @staticmethod
        def GPU(*args, **kwargs):
            """No-op decorator supporting both `@spaces.GPU` and
            `@spaces.GPU(duration=...)`."""
            if args and callable(args[0]):
                return args[0]  # bare form: hand the function straight back
            return lambda fn: fn  # parameterized form: return a pass-through decorator

    spaces = _SpacesShim()

print(
    f"[lofinity] startup: IS_ZEROGPU={IS_ZEROGPU} "
    f"(SPACES_ZERO_GPU={os.environ.get('SPACES_ZERO_GPU')!r})"
)
# Which audio engine renders the tape: "musicgen" (the real model) or "stub"
# (a synthetic tone for development).
ENGINE = os.getenv("LOFINITY_ENGINE", "musicgen")
# Model + tape length scale with the hardware: a ZeroGPU Space gets the bigger,
# cleaner-continuing musicgen-medium and full chunked tapes (up to 90s); without a
# GPU we fall back to the smaller, faster musicgen-small and a single 30s shot
# (medium + chunking on CPU would take minutes). The env var still overrides.
MUSICGEN_MODEL = os.getenv(
    "LOFINITY_MUSICGEN",
    "facebook/musicgen-medium" if IS_ZEROGPU else "facebook/musicgen-small",
)
# 30s is musicgen-small's single-shot max (1500 tokens). Longer tapes are
# stitched from 30s chunks: each one re-seeds the model with the last OVERLAP_S
# of the track so it keeps playing from there. musicgen-small's context is 2048
# tokens (~41s at 50 tok/s), so a 2s seed + 30s of new audio (~1600 tokens) fits.
CHUNK_S = 30  # length of each musicgen single-shot
OVERLAP_S = float(os.getenv("LOFINITY_OVERLAP_S", "2"))  # seconds of tail fed back as the continuation seed; shorter leans more on the text prompt
# musicgen is trained on 30s clips, so a single shot longer than ~30s degrades
# its tail into noise. A continuation prepends the seed THEN generates, so cap its
# total output (seed + new) at MAX_GEN_S to stay inside that window. Env-tunable.
MAX_GEN_S = float(os.getenv("LOFINITY_MAX_GEN_S", "28"))
SEAM_S = 0.4  # equal-power crossfade at each stitch, to hide the join
# the tape lengths the API allows (it snaps any request to the nearest). Only a
# GPU gets the longer, chunked tapes; a CPU-only fallback is capped to one 30s shot.
ALLOWED_SECONDS = (30, 60, 90) if IS_ZEROGPU else (30,)
# Default request length; generate_song still snaps it to ALLOWED_SECONDS.
DEFAULT_SECONDS = int(os.getenv("LOFINITY_DURATION", "30"))
# Local enrichment backend (used when not on ZeroGPU).
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434")
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3.2:3b")
# A ZeroGPU brew renders in a separate GPU worker process, so /api/progress can't
# read real per-chunk progress from it; it reports a time-based estimate from this
# rough per-length budget instead (tunable; only affects the cosmetic brew bar).
GPU_WARMUP_S = 5.0  # enrichment + cold-start/queue allowance before audio flows
GPU_SECS_PER_CHUNK = 25.0  # rough GPU render time per 30s chunk (musicgen-medium)
app = Server(title="LoFinity")
# How far along the current brew is, in whole 30s chunks. generate_song runs one
# at a time (concurrency_limit=1), so a single shared counter is enough; the
# frontend polls /api/progress to fill its brewing bar.
_PROGRESS = {"done": 0, "total": 1}
# A ZeroGPU brew's wall-clock start + estimated total, so /api/progress can report
# a smooth time-based estimate while the GPU worker is busy (see progress()).
# Keys: active (brew in flight), start (time.monotonic() at launch), est
# (estimated seconds), total (chunk count).
_BREW = {"active": False, "start": 0.0, "est": 1.0, "total": 1}
# --- prompt enrichment --------------------------------------------------------
# System prompt shared by both enrichment backends (MiniCPM and Ollama). It pins
# the reply to a three-key JSON object that _finalize_enrichment then validates.
ENRICH_SYSTEM = """\
You are the creative brain of LoFinity, a magical vending machine that sells
lofi cassette tapes. The user gives you a vibe. Reply ONLY with JSON with
exactly these three keys: {"music_prompt": "...", "title": "...", "ambience": "..."}
Build music_prompt from this template, in this order:
"lofi chill, <instrument 1>, <instrument 2>, <instrument 3>, <mood>, slow tempo, 75 bpm, instrumental"
- instruments: 2-3 picked to EVOKE the user's vibe, never a default set
(island -> ukulele, kalimba, steel pan; rainy city -> rhodes piano, soft
guitar; winter -> felt piano, soft strings; desert -> slide guitar, hand drums)
- mood: one or two calm words; never energetic, no vocals
ambience: the background sound layered under the music. Exactly one of:
vinyl_crackle, tape_hiss, soft_rain, ocean_waves, fireplace_crackle,
birdsong, night_crickets, cafe_murmur, wind_in_trees. Match it to the vibe.
title: a cozy cassette tape title inspired by the vibe, max 5 words,
Title Case, no quotes or emoji.
Examples:
user: island summer
{"music_prompt": "lofi chill, ukulele, kalimba, steel pan, breezy and warm, slow tempo, 75 bpm, instrumental", "title": "Coconut Daydream", "ambience": "ocean_waves"}
user: studying at midnight
{"music_prompt": "lofi chill, rhodes piano, muted guitar, soft bass, focused and calm, slow tempo, 75 bpm, instrumental", "title": "Midnight Study Session", "ambience": "vinyl_crackle"}"""
# MiniCPM enrichment LLM (ZeroGPU only — a Space has no Ollama daemon).
# MiniCPM5-1B is a standard LlamaForCausalLM (no trust_remote_code, fast
# tokenizer) with a switchable <think> mode we keep OFF so the reply is direct
# JSON. Needs transformers>=5.6 (the Space's latest satisfies it); no extra deps.
ENRICHER_MODEL = os.getenv("LOFINITY_ENRICHER", "openbmb/MiniCPM5-1B")
_enricher = None  # cached (tokenizer, model) pair, filled by load_enricher()
_enricher_lock = threading.Lock()  # serializes the one-time load
_enricher_disabled = False  # set if the model can't load; forces the fallback
def load_enricher():
    """Load the MiniCPM enrichment LLM once and return the cached pair.

    The first call loads the tokenizer and model for ENRICHER_MODEL, moves the
    model to cuda and switches it to eval mode; later calls return the cached
    `(tokenizer, model)` tuple. The lock keeps concurrent callers from loading
    twice. Any exception from the load propagates to the caller.
    """
    global _enricher
    with _enricher_lock:
        if _enricher is not None:
            return _enricher

        import torch  # noqa: F401 — needed so the .to('cuda') below resolves
        from transformers import AutoModelForCausalLM, AutoTokenizer

        # the log line previously carried a mis-encoded ellipsis; print a real one
        print(f"[lofinity] loading enricher {ENRICHER_MODEL} on cuda…")
        tokenizer = AutoTokenizer.from_pretrained(ENRICHER_MODEL)
        llm = AutoModelForCausalLM.from_pretrained(ENRICHER_MODEL, torch_dtype="auto")
        llm.to("cuda")
        llm.eval()
        _enricher = (tokenizer, llm)
        print("[lofinity] enricher ready")
        return _enricher
| def _parse_enrich_json(text: str) -> dict: | |
| """Pull the first {...} object out of an LLM reply (it may wrap the JSON in | |
| prose or ```json fences, or leak a <think> block); {} if nothing parses.""" | |
| import re | |
| if "</think>" in text: # belt-and-suspenders if thinking ever leaks through | |
| text = text.rsplit("</think>", 1)[1] | |
| m = re.search(r"\{.*\}", text, re.DOTALL) | |
| if not m: | |
| return {} | |
| try: | |
| return json.loads(m.group(0)) | |
| except Exception: # noqa: BLE001 | |
| return {} | |
def _finalize_enrichment(data: dict):
    """Validate and normalize a backend's parsed reply.

    Returns `(music_prompt, title, ambience_slug)`, or None when either the
    music prompt or the title is missing or blank. The title is trimmed and cut
    to 48 characters, a prompt that never mentions "lofi" gets the genre
    prefixed, and the ambience is passed through `ambience.normalize_slug`.
    """
    import ambience

    prompt_text = str(data.get("music_prompt") or "").strip()
    tape_title = str(data.get("title") or "").strip()[:48]
    if not prompt_text or not tape_title:
        return None

    # belt and suspenders: make sure the genre is present even if the LLM drifts
    mentions_genre = "lofi" in prompt_text.lower()
    if not mentions_genre:
        prompt_text = f"lofi chill, {prompt_text}"

    # whatever the LLM picked, snap it to a bed we can actually render
    bed = ambience.normalize_slug(data.get("ambience"))
    return prompt_text, tape_title, bed
def _enrich_minicpm(prompt: str):
    """Enrich a vibe with the MiniCPM model; returns a tuple or None.

    Returns None immediately when the enricher was disabled at startup, so the
    caller falls back. Otherwise the system prompt and the user's vibe go
    through the chat template with thinking switched off, up to 220 new tokens
    are sampled, and only the newly generated tokens are decoded and handed to
    the shared JSON parsing/validation. MUST run inside @spaces.GPU.
    """
    if _enricher_disabled:
        return None

    import torch

    tokenizer, llm = load_enricher()
    chat = [
        {"role": "system", "content": ENRICH_SYSTEM},
        {"role": "user", "content": prompt},
    ]
    encoded = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        add_generation_prompt=True,
        enable_thinking=False,
        return_dict=True,
        return_tensors="pt",
    )
    encoded = encoded.to(llm.device)

    with torch.no_grad():
        generated = llm.generate(
            **encoded,
            max_new_tokens=220,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
        )

    # generate() echoes the prompt tokens first; keep only what came after them
    prompt_len = encoded["input_ids"].shape[-1]
    answer = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
    return _finalize_enrichment(_parse_enrich_json(answer))
def _enrich_ollama(prompt: str):
    """Enrich a vibe through a local Ollama daemon's /api/chat endpoint.

    Sends one non-streaming JSON-format chat request with a 45s timeout.
    Returns the validated tuple, or None when the reply is unusable. HTTP,
    network and JSON errors are NOT caught here; they propagate to the caller.
    """
    payload = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": ENRICH_SYSTEM},
            {"role": "user", "content": prompt},
        ],
        "format": "json",
        "stream": False,
        "options": {"temperature": 0.8, "num_predict": 220},
    }
    response = httpx.post(f"{OLLAMA_URL}/api/chat", json=payload, timeout=45)
    response.raise_for_status()
    reply_text = response.json()["message"]["content"]
    return _finalize_enrichment(json.loads(reply_text))
def _enrich_fallback(prompt: str) -> tuple[str, str, str]:
    """Plain, LLM-free enrichment — used whenever the chosen backend fails.

    Returns `(music_prompt, title, ambience.DEFAULT)`. The vibe is trimmed
    first, so a blank, whitespace-only or missing vibe produces a clean generic
    prompt and "Untitled Tape" instead of a malformed "lofi chill, , ..." prompt
    or a crash in the one path that must never fail. Output for an ordinary
    vibe is unchanged.
    """
    import ambience

    vibe = (prompt or "").strip()
    # re-strip after the cut: a 28-char slice can end on a space
    title = f"{vibe[:28].strip().title()} Tape" if vibe else "Untitled Tape"
    parts = [
        "lofi chill",
        vibe,  # dropped below when empty, so no dangling ", ,"
        "mellow and warm",
        "soft drums",
        "slow tempo",
        "instrumental",
    ]
    music_prompt = ", ".join(part for part in parts if part)
    return music_prompt, title, ambience.DEFAULT
def enrich_prompt(prompt: str) -> tuple[str, str, str]:
    """Turn a vibe into (music_prompt, cassette title, ambience slug).

    The backend follows the environment: MiniCPM when IS_ZEROGPU is set, Ollama
    otherwise. An exception or an unusable (falsy) result from the backend is
    logged and answered with the plain fallback, so this always returns a
    tuple. On ZeroGPU this MUST be called inside @spaces.GPU (MiniCPM is cuda).
    """
    if IS_ZEROGPU:
        backend = _enrich_minicpm
    else:
        backend = _enrich_ollama

    try:
        enriched = backend(prompt)
    except Exception as err:  # noqa: BLE001 — any failure means "use fallback"
        print(f"[lofinity] enrichment failed ({err!r}), using fallback")
    else:
        if enriched:
            return enriched
        print("[lofinity] enrichment returned junk, using fallback")
    return _enrich_fallback(prompt)
| # --- audio engines ------------------------------------------------------------ | |
_musicgen = None  # cached (processor, model, device) triple, filled by load_musicgen()
_musicgen_lock = threading.Lock()  # serializes the one-time load


def load_musicgen():
    """Lazy-load MusicGen once and return the cached `(processor, model, device)`.

    The first call downloads the model from the Hub (musicgen-medium is
    ~3.5 GB). Device choice, in order: the LOFINITY_DEVICE env var, cuda on
    ZeroGPU, mps when available, else cpu.
    """
    global _musicgen
    with _musicgen_lock:
        if _musicgen is not None:
            return _musicgen

        # must be set before torch is imported so unsupported mps ops fall back to cpu
        os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
        import torch
        from transformers import AutoProcessor, MusicgenForConditionalGeneration

        requested = os.getenv("LOFINITY_DEVICE")
        if requested:
            device = requested
        elif IS_ZEROGPU:
            device = "cuda"
        elif torch.backends.mps.is_available():
            device = "mps"
        else:
            device = "cpu"

        # the log line previously carried a mis-encoded ellipsis; print a real one
        print(f"[lofinity] loading {MUSICGEN_MODEL} on {device}…")
        processor = AutoProcessor.from_pretrained(MUSICGEN_MODEL)
        model = MusicgenForConditionalGeneration.from_pretrained(MUSICGEN_MODEL)
        model.to(device)
        model.eval()
        _musicgen = (processor, model, device)
        print("[lofinity] musicgen ready")
        return _musicgen
# ZeroGPU wants models resident on cuda at startup (module import time), not
# lazily inside the @spaces.GPU call — outside the decorated function a CUDA
# emulation layer lets this .to('cuda') succeed without a real GPU attached, and
# placements done at startup are far more efficient than per-call transfers.
if IS_ZEROGPU and ENGINE != "stub":
    # MusicGen is mandatory: a failure here is allowed to abort startup.
    load_musicgen()
    try:
        load_enricher()
    except Exception as e:  # noqa: BLE001 — a bad enricher must not kill the app
        # _enrich_minicpm checks this flag and returns None, which sends
        # enrich_prompt to the plain fallback.
        _enricher_disabled = True
        print(f"[lofinity] enricher load failed ({e!r}); vends use the plain fallback")
def encode_wav(samples, rate: int) -> str:
    """Encode mono float samples as a base64 WAV data URI, entirely in memory.

    Nothing is written to disk: on a shared HF Space a songs directory is
    visible to every visitor and grows without bound. Returning the tape
    inline keeps it private to the one request that asked for it.

    `samples` may be any array-like of floats (nominally in [-1, 1]); `rate` is
    the sample rate in Hz. Output is 16-bit little-endian mono PCM. Empty input
    yields a valid zero-length WAV, and non-finite samples are written as
    silence rather than as undefined PCM values.
    """
    import numpy as np

    samples = np.asarray(samples)  # accept lists/tuples, not just ndarrays
    # NaN/inf would poison the peak and convert to garbage int16 values
    samples = np.nan_to_num(samples, nan=0.0, posinf=0.0, neginf=0.0)

    # max() of a zero-size array raises, so an empty tape gets a 0 peak instead
    peak = float(np.abs(samples).max()) if samples.size else 0.0
    # MusicGen can exceed [-1, 1]; normalize instead of hard-clipping
    if peak > 0.95:
        samples = samples * (0.95 / peak)
    pcm = (samples * 32767).astype("<i2")

    buf = io.BytesIO()
    with wave.open(buf, "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)
        w.setframerate(rate)
        w.writeframes(pcm.tobytes())
    b64 = base64.b64encode(buf.getvalue()).decode("ascii")
    return f"data:audio/wav;base64,{b64}"
| def _rms(x) -> float: | |
| import numpy as np | |
| return float(np.sqrt(np.mean(np.square(x)))) if len(x) else 0.0 | |
def _match_rms(chunk, target: float):
    """Re-level a fresh continuation to the first chunk's loudness.

    MusicGen continuations drift quieter each round; scaling each one to
    `target` RMS keeps a long tape even. The chunk is returned untouched when
    it or the target is effectively silent, and the gain never exceeds 4x.
    """
    level = _rms(chunk)
    effectively_silent = level < 1e-6 or target < 1e-6
    if effectively_silent:
        return chunk
    gain = min(4.0, target / level)  # cap the boost so a quiet tail can't blow up
    return chunk * gain
def _stitch(base, tail, rate: int):
    """Join `tail` onto `base`, hiding the seam with an equal-power crossfade.

    The last SEAM_S seconds of `base` fade out along a cosine while the first
    SEAM_S seconds of `tail` fade in along a sine. If the seam is empty or
    either piece is shorter than it, the two are simply concatenated.
    """
    import numpy as np

    n = int(SEAM_S * rate)
    can_crossfade = n > 0 and len(base) >= n and len(tail) >= n
    if not can_crossfade:
        return np.concatenate([base, tail])

    ramp = np.linspace(0.0, 1.0, n)
    fade_out = np.cos(ramp * np.pi / 2)
    fade_in = np.sin(ramp * np.pi / 2)
    crossfade = base[-n:] * fade_out + tail[:n] * fade_in
    return np.concatenate([base[:-n], crossfade, tail[n:]])
def musicgen_engine(music_prompt: str, seconds: int = CHUNK_S, progress_cb=None) -> tuple:
    """Render a tape with MusicGen and return `(samples, sample_rate)`.

    Tapes longer than one 30s shot are built by re-seeding the model with the
    last OVERLAP_S seconds of the track and stitching the continuation on.
    `progress_cb(done, total)`, when given, is called after each chunk
    finishes. If generation fails on mps the model is moved to cpu and the
    whole tape is rebuilt there; failures on any other device propagate.
    """
    global _musicgen
    import torch

    processor, model, device = load_musicgen()
    rate = model.config.audio_encoder.sampling_rate
    # OVERLAP_S is env-tunable: clamp it, because a zero overlap would make
    # track[-0:] select the WHOLE track as the seed, and a negative one would
    # slice from the wrong end
    overlap_s = max(0.0, OVERLAP_S)
    overlap = int(overlap_s * rate)
    # a continuation prepends the OVERLAP_S seed, so it may generate only
    # MAX_GEN_S - OVERLAP_S NEW seconds to keep the whole shot inside musicgen's
    # ~30s training window — generating past it is what turns the tail to noise
    cont_new_s = max(1.0, MAX_GEN_S - overlap_s)
    rounds = max(0, round(seconds / CHUNK_S) - 1)  # 30->0, 60->1, 90->2
    total = rounds + 1

    def shot(dev, seed=None, new_s=CHUNK_S):
        """One generate() call: text-only, or text plus an audio seed to continue."""
        if seed is None:
            inputs = processor(text=[music_prompt], padding=True, return_tensors="pt")
        else:
            inputs = processor(
                audio=seed, sampling_rate=rate, text=[music_prompt],
                padding=True, return_tensors="pt",
            )
        inputs = inputs.to(dev)
        with torch.no_grad():
            audio = model.generate(
                **inputs,
                do_sample=True,
                guidance_scale=3.0,
                max_new_tokens=int(new_s * 50),  # ~50 tokens per second
            )
        return audio[0, 0].cpu().float().numpy()

    def build(dev):
        """Render the full tape on `dev`: first shot plus `rounds` continuations."""
        track = shot(dev)  # first 30s straight from the prompt
        if progress_cb:
            progress_cb(1, total)
        base_rms = _rms(track)
        for i in range(rounds):
            if overlap > 0:
                out = shot(dev, seed=track[-overlap:], new_s=cont_new_s)  # capped continuation
                fresh = out[overlap:]  # drop the re-encoded seed
            else:
                # no seed requested: render an independent chunk from the prompt
                fresh = shot(dev, new_s=cont_new_s)
            fresh = _match_rms(fresh, base_rms)
            track = _stitch(track, fresh, rate)
            if progress_cb:
                progress_cb(i + 2, total)
            print(f"[lofinity] stitched chunk {i + 2}/{total}")
        return track

    try:
        samples = build(device)
    except Exception as e:  # noqa: BLE001
        if device != "mps":
            raise
        print(f"[lofinity] mps generation failed ({e!r}), retrying on cpu")
        model.to("cpu")
        # remember the move: the cached triple would otherwise keep saying
        # "mps", and every later call would fail once before retrying on cpu
        with _musicgen_lock:
            _musicgen = (processor, model, "cpu")
        samples = build("cpu")
    return samples, rate
def _gpu_budget(prompt: str, seconds: int = CHUNK_S) -> int:
    """GPU seconds to request from ZeroGPU for a brew of this length.

    Covers MiniCPM enrichment plus the per-chunk MusicGen render, with
    headroom. Tighter budgets earn better queue priority. The signature must
    mirror gpu_brew so ZeroGPU can pass it the same args; `prompt` is accepted
    for that reason only and is not used.
    """
    base_s = 40
    per_chunk_s = 40
    n_chunks = round(int(seconds) / CHUNK_S)
    if n_chunks < 1:
        n_chunks = 1
    # musicgen-medium renders slower than -small, so budget generously — a brew
    # that overruns its @spaces.GPU duration is KILLED mid-render; over-budgeting
    # only costs quota, under-budgeting loses the tape.
    return base_s + per_chunk_s * n_chunks  # 30s->80, 60s->120, 90s->160
# The decorator is what actually attaches a GPU on ZeroGPU: without it this
# "GPU entry point" ran with no real device and _gpu_budget was dead code.
# `duration` takes a callable that receives the same args as the decorated
# function. Off-Space, the local `spaces` shim makes this a no-op.
@spaces.GPU(duration=_gpu_budget)
def gpu_brew(prompt: str, seconds: int = CHUNK_S) -> tuple:
    """ZeroGPU entry point — enrichment (MiniCPM) AND MusicGen on the real GPU in
    a single acquisition.

    Takes the raw vibe and returns `(music_prompt, title, bed, samples, rate)`.
    It runs in a separate GPU worker process and can't push per-chunk progress
    back to the web process, so /api/progress reports a time-based estimate for
    the bar. This path is Space-only.
    """
    music_prompt, title, bed = enrich_prompt(prompt)
    samples, rate = musicgen_engine(music_prompt, seconds)
    return music_prompt, title, bed, samples, rate
def stub_engine(_music_prompt: str, seconds: int = CHUNK_S, progress_cb=None) -> tuple:
    """Development stand-in for MusicGen: returns `(samples, rate)` of a plain tone.

    Honors `seconds` and sleeps 0.8s per 30s chunk, reporting
    `progress_cb(done, total)` after each, so the length slider and the brewing
    bar can be exercised without the heavy model. The prompt is ignored.
    """
    import time

    import numpy as np

    rate = 22050
    total = max(1, round(seconds / CHUNK_S))
    for done in range(1, total + 1):
        time.sleep(0.8)  # pretend each 30s chunk takes a moment to render
        if progress_cb:
            progress_cb(done, total)

    secs = float(seconds)
    t = np.arange(int(rate * secs)) / rate
    # quarter-second linear fade in and out
    fade = np.minimum(1.0, np.minimum(t * 4, (secs - t) * 4))
    tone = np.sin(2 * np.pi * 220 * t)
    # a slow wobble so a longer stub is audibly (and visibly) longer
    wobble = 0.8 + 0.2 * np.sin(t)
    return 0.25 * fade * tone * wobble, rate
| # --- API ----------------------------------------------------------------------- | |
# NOTE(review): no route/API registration for this function is visible in this
# view of the file — confirm it is exposed on `app` as intended.
def generate_song(prompt: str, seconds: int = DEFAULT_SECONDS) -> dict:
    """Brew one tape for `prompt` and return {"title": ..., "audio": <WAV data URI>}.

    `seconds` is snapped to the nearest entry of ALLOWED_SECONDS. On ZeroGPU
    (and not the stub engine) enrichment and rendering both happen inside
    gpu_brew; otherwise the vibe is enriched in-process and rendered with live
    per-chunk progress. The ambience bed is mixed in afterwards, and a failed
    mix still vends the dry tape. Errors from gpu_brew or the engine propagate.
    """
    import ambience
    # snap whatever the slider sends to a length we can actually build
    seconds = min(ALLOWED_SECONDS, key=lambda s: abs(s - int(seconds)))
    # reset progress up front, BEFORE the (sometimes slow) enrich step, so a poll
    # arriving early sees this brew at 0% rather than the last one at 100%
    chunks = max(1, round(seconds / CHUNK_S))
    _PROGRESS.update(done=0, total=chunks)
    if IS_ZEROGPU and ENGINE != "stub":
        # On ZeroGPU enrichment (MiniCPM) and MusicGen share ONE @spaces.GPU
        # acquisition in a separate worker process, which can't push real progress
        # back — so /api/progress reports a smooth time-based ESTIMATE driven by
        # this brew's start + budget (capped <100% until the tape actually lands).
        est = GPU_WARMUP_S + GPU_SECS_PER_CHUNK * chunks
        _BREW.update(active=True, start=time.monotonic(), est=est, total=chunks)
        print(f"[lofinity] brewing on GPU :: {prompt!r} ({seconds}s, ~{est:.0f}s est)")
        try:
            music_prompt, title, bed, samples, rate = gpu_brew(prompt, seconds)
        finally:
            # top the bar off BEFORE clearing active, so a poll landing in between
            # reads 100% (from _PROGRESS), never the 0% this brew started at
            _PROGRESS.update(done=chunks, total=chunks)
            _BREW.update(active=False)
        print(f"[lofinity] brewed {title!r} :: {music_prompt} [+ {bed}]")
    else:
        # Local / stub: enrich in-process (Ollama or fallback), then render with
        # live per-chunk progress for the brewing garden.
        music_prompt, title, bed = enrich_prompt(prompt)
        print(f"[lofinity] brewing {title!r} ({seconds}s) :: {music_prompt} [+ {bed}]")
        engine = stub_engine if ENGINE == "stub" else musicgen_engine
        samples, rate = engine(
            music_prompt, seconds,
            progress_cb=lambda d, t: _PROGRESS.update(done=d, total=t),
        )
    # make sure the bar reads complete whichever path rendered the tape
    _PROGRESS.update(done=chunks, total=chunks)
    try:
        samples = ambience.mix(samples, rate, bed)
    except Exception as e:  # noqa: BLE001 — a dry tape beats a failed vend
        print(f"[lofinity] ambience mix failed ({e!r}), vending without the bed")
    # The tape rides back inline as a base64 data URI — no disk write, so it is
    # never cached on the Space nor shared with other visitors. The frontend
    # keeps the collection client-side, per browser session.
    return {"title": title, "audio": encode_wav(samples, rate)}
def progress() -> dict:
    """Report brew progress as {"done": ..., "total": ...} for the brewing bar.

    Local and stub brews publish real per-chunk counts in _PROGRESS, returned
    here as a copy. While a ZeroGPU brew is active its worker cannot report
    back, so `done` is a fractional time-based estimate, held at 92% of
    `total` at most until the tape lands and _PROGRESS tops the bar off.
    """
    estimating = _BREW["active"] and _BREW["est"] > 0
    if not estimating:
        return dict(_PROGRESS)

    elapsed = time.monotonic() - _BREW["start"]
    fraction = min(0.92, elapsed / _BREW["est"])
    estimated_done = round(fraction * _BREW["total"], 3)
    return {"done": estimated_done, "total": _BREW["total"]}
def config() -> dict:
    """Frontend config: the tape lengths this backend allows, as a fresh list.

    The set is hardware-dependent (a CPU-only fallback offers only 30s), so the
    slider reads it and adapts.
    """
    allowed = [*ALLOWED_SECONDS]
    return {"allowed_seconds": allowed}
# NOTE(review): no route registration for homepage is visible in this view of
# the file — confirm it is bound to "/" on `app`.
async def homepage():
    """Serve the frontend's index.html."""
    return FileResponse(FRONTEND / "index.html")


# Everything else in the frontend directory is served under /static.
app.mount("/static", StaticFiles(directory=FRONTEND), name="static")

if __name__ == "__main__":
    # show_error=True is passed through to launch() for direct runs of this file
    app.launch(show_error=True)