"""LoFinity — a vending machine that dispenses endless chill beats.
Gradio Server backend: serves the Three.js frontend and exposes the
generation API.
Pipeline: user vibe -> a small LLM enriches it into a MusicGen prompt +
cassette title + ambience pick -> MusicGen renders the music -> ambience.py
loops a background bed (waves, crackle, rain…) underneath. MusicGen ignores
texture words in prompts, hence the separate bed. The enrichment LLM is
MiniCPM (on cuda) on a ZeroGPU Space, or a local Ollama daemon in dev.
On a ZeroGPU Space it runs musicgen-medium and allows tapes up to 90s (chunked);
without a GPU it falls back to musicgen-small and a single 30s shot (no chunking).
Env knobs:
LOFINITY_ENGINE musicgen (default) | stub
LOFINITY_DURATION clip length in seconds (default 30, the single-shot max)
LOFINITY_DEVICE cuda | mps | cpu (default: cuda on ZeroGPU, else mps if available)
LOFINITY_MUSICGEN model id (default: musicgen-medium on ZeroGPU, else musicgen-small)
LOFINITY_OVERLAP_S continuation seed length, seconds (default 2)
LOFINITY_MAX_GEN_S cap on a continuation's total output, seconds (default 28)
LOFINITY_ENRICHER MiniCPM model id for ZeroGPU enrichment (default MiniCPM5-1B)
OLLAMA_URL default http://localhost:11434 (local enrichment)
OLLAMA_MODEL default llama3.2:3b (local enrichment)
"""
import base64
import io
import json
import os
import threading
import time
import wave
from pathlib import Path
import httpx
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from gradio.server import Server
ROOT = Path(__file__).parent
FRONTEND = ROOT / "frontend"
# ZeroGPU: on a Hugging Face ZeroGPU Space a GPU is attached only for the
# duration of a function wrapped in @spaces.GPU, then released. The `spaces`
# package exists only in that runtime; locally we shim @spaces.GPU to a no-op so
# the exact same code runs on mps/cpu untouched.
#
# Detect ZeroGPU via the spaces library's OWN config rather than parsing the env
# var ourselves: Config.zero_gpu is the very flag that gates @spaces.GPU (see
# spaces/zero/decorator.py), so IS_ZEROGPU ends up True precisely when the
# decorator will really allocate a GPU. (Our first attempt compared
# SPACES_ZERO_GPU to the literal "true", but the runtime sets it to a value the
# library parses loosely — "1"/"t"/"true" — so the exact match failed and the
# GPU path never fired: generation silently fell back to CPU.)
# Assume a plain host until the `spaces` runtime proves otherwise.
IS_ZEROGPU = False
try:
    import spaces  # provided by the ZeroGPU Space runtime
    from spaces.config import Config as _ZeroGPUConfig

    IS_ZEROGPU = bool(_ZeroGPUConfig.zero_gpu)
except ImportError:  # local dev / non-ZeroGPU — make the decorator harmless

    class _SpacesShim:
        """Stand-in for the `spaces` package whose GPU decorator does nothing."""

        @staticmethod
        def GPU(*args, **kwargs):
            """Act as both the bare ``@spaces.GPU`` and ``@spaces.GPU(duration=...)``."""
            used_bare = bool(args) and callable(args[0])
            if used_bare:
                # bare form: the decorated function itself was passed in
                return args[0]

            def passthrough(fn):
                # parameterized form: hand the function back untouched
                return fn

            return passthrough

    spaces = _SpacesShim()

# One startup line so the Space logs show which path was taken and why.
print(
    f"[lofinity] startup: IS_ZEROGPU={IS_ZEROGPU} "
    f"(SPACES_ZERO_GPU={os.environ.get('SPACES_ZERO_GPU')!r})"
)
# Which audio engine renders the tape; the code below treats "stub" specially
# and uses MusicGen for anything else.
ENGINE = os.getenv("LOFINITY_ENGINE", "musicgen")
# Model + tape length scale with the hardware: a ZeroGPU Space gets the bigger,
# cleaner-continuing musicgen-medium and full chunked tapes (up to 90s); without a
# GPU we fall back to the smaller, faster musicgen-small and a single 30s shot
# (medium + chunking on CPU would take minutes). The env var still overrides.
MUSICGEN_MODEL = os.getenv(
    "LOFINITY_MUSICGEN",
    "facebook/musicgen-medium" if IS_ZEROGPU else "facebook/musicgen-small",
)
# 30s is musicgen-small's single-shot max (1500 tokens). Longer tapes are
# stitched from 30s chunks: each one re-seeds the model with the last OVERLAP_S
# of the track so it keeps playing from there. musicgen-small's context is 2048
# tokens (~41s at 50 tok/s), so a 2s seed + 30s of new audio (~1600 tokens) fits.
CHUNK_S = 30  # length of each musicgen single-shot
OVERLAP_S = float(os.getenv("LOFINITY_OVERLAP_S", "2"))  # seconds of tail fed back as the continuation seed; shorter leans more on the text prompt
# musicgen is trained on 30s clips, so a single shot longer than ~30s degrades
# its tail into noise. A continuation prepends the seed THEN generates, so cap its
# total output (seed + new) at MAX_GEN_S to stay inside that window. Env-tunable.
MAX_GEN_S = float(os.getenv("LOFINITY_MAX_GEN_S", "28"))
SEAM_S = 0.4  # equal-power crossfade at each stitch, to hide the join
# the tape lengths the API allows (it snaps any request to the nearest). Only a
# GPU gets the longer, chunked tapes; a CPU-only fallback is capped to one 30s shot.
ALLOWED_SECONDS = (30, 60, 90) if IS_ZEROGPU else (30,)
# Tape length used when an API caller omits `seconds`; generate_song still
# snaps it to ALLOWED_SECONDS. A non-integer env value raises at import time.
DEFAULT_SECONDS = int(os.getenv("LOFINITY_DURATION", "30"))
# Local enrichment backend (Ollama daemon), used by _enrich_ollama.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434")
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3.2:3b")
# A ZeroGPU brew renders in a separate GPU worker process, so /api/progress can't
# read real per-chunk progress from it; it reports a time-based estimate from this
# rough per-length budget instead (tunable; only affects the cosmetic brew bar).
GPU_WARMUP_S = 5.0  # enrichment + cold-start/queue allowance before audio flows
GPU_SECS_PER_CHUNK = 25.0  # rough GPU render time per 30s chunk (musicgen-medium)
# The Gradio server; the API and HTTP routes further down register on it.
app = Server(title="LoFinity")
# How far along the current brew is, in whole 30s chunks. generate_song runs one
# at a time (concurrency_limit=1), so a single shared counter is enough; the
# frontend polls /api/progress to fill its brewing bar.
_PROGRESS = {"done": 0, "total": 1}
# A ZeroGPU brew's wall-clock start + estimated total, so /api/progress can report
# a smooth time-based estimate while the GPU worker is busy (see progress()).
_BREW = {"active": False, "start": 0.0, "est": 1.0, "total": 1}
# --- prompt enrichment --------------------------------------------------------
# System prompt shared by both enrichment backends (MiniCPM and Ollama). It
# pins the reply to a three-key JSON object; _finalize_enrichment validates
# whatever comes back.
ENRICH_SYSTEM = """\
You are the creative brain of LoFinity, a magical vending machine that sells
lofi cassette tapes. The user gives you a vibe. Reply ONLY with JSON with
exactly these three keys: {"music_prompt": "...", "title": "...", "ambience": "..."}
Build music_prompt from this template, in this order:
"lofi chill, <instrument 1>, <instrument 2>, <instrument 3>, <mood>, slow tempo, 75 bpm, instrumental"
- instruments: 2-3 picked to EVOKE the user's vibe, never a default set
(island -> ukulele, kalimba, steel pan; rainy city -> rhodes piano, soft
guitar; winter -> felt piano, soft strings; desert -> slide guitar, hand drums)
- mood: one or two calm words; never energetic, no vocals
ambience: the background sound layered under the music. Exactly one of:
vinyl_crackle, tape_hiss, soft_rain, ocean_waves, fireplace_crackle,
birdsong, night_crickets, cafe_murmur, wind_in_trees. Match it to the vibe.
title: a cozy cassette tape title inspired by the vibe, max 5 words,
Title Case, no quotes or emoji.
Examples:
user: island summer
{"music_prompt": "lofi chill, ukulele, kalimba, steel pan, breezy and warm, slow tempo, 75 bpm, instrumental", "title": "Coconut Daydream", "ambience": "ocean_waves"}
user: studying at midnight
{"music_prompt": "lofi chill, rhodes piano, muted guitar, soft bass, focused and calm, slow tempo, 75 bpm, instrumental", "title": "Midnight Study Session", "ambience": "vinyl_crackle"}"""
# MiniCPM enrichment LLM (ZeroGPU only — a Space has no Ollama daemon).
# MiniCPM5-1B is a standard LlamaForCausalLM (no trust_remote_code, fast
# tokenizer) with a switchable <think> mode we keep OFF so the reply is direct
# JSON. Needs transformers>=5.6 (the Space's latest satisfies it); no extra deps.
ENRICHER_MODEL = os.getenv("LOFINITY_ENRICHER", "openbmb/MiniCPM5-1B")
_enricher = None  # cached (tokenizer, model) pair once loaded
_enricher_lock = threading.Lock()  # serializes the one-time load
_enricher_disabled = False  # set if the model can't load; forces the fallback


def load_enricher():
    """Lazy-load the MiniCPM enrichment LLM onto cuda and cache it.

    The first call downloads/loads the tokenizer and model named by
    ENRICHER_MODEL, moves the model to "cuda" and switches it to eval mode.
    Later calls return the cached pair. The load is guarded by a lock so
    concurrent callers cannot load twice.

    Returns:
        The ``(tokenizer, model)`` tuple.

    Raises:
        Whatever transformers/torch raise if the model cannot be loaded or
        placed on cuda; the startup code catches that and disables enrichment.
    """
    global _enricher
    with _enricher_lock:
        if _enricher is None:
            import torch  # noqa: F401 — needed so the .to('cuda') below resolves
            from transformers import AutoModelForCausalLM, AutoTokenizer

            # plain ellipsis: the log line previously carried a mis-encoded one
            print(f"[lofinity] loading enricher {ENRICHER_MODEL} on cuda…")
            tok = AutoTokenizer.from_pretrained(ENRICHER_MODEL)
            model = AutoModelForCausalLM.from_pretrained(ENRICHER_MODEL, torch_dtype="auto")
            model.to("cuda")
            model.eval()
            _enricher = (tok, model)
            print("[lofinity] enricher ready")
    return _enricher
def _parse_enrich_json(text: str) -> dict:
"""Pull the first {...} object out of an LLM reply (it may wrap the JSON in
prose or ```json fences, or leak a <think> block); {} if nothing parses."""
import re
if "</think>" in text: # belt-and-suspenders if thinking ever leaks through
text = text.rsplit("</think>", 1)[1]
m = re.search(r"\{.*\}", text, re.DOTALL)
if not m:
return {}
try:
return json.loads(m.group(0))
except Exception: # noqa: BLE001
return {}
def _finalize_enrichment(data: dict):
"""Shared post-processing for any backend: validate, force the genre to lead,
snap the ambience to a renderable bed. Returns a tuple, or None if unusable."""
import ambience
music_prompt = str(data.get("music_prompt") or "").strip()
title = str(data.get("title") or "").strip()[:48]
if not (music_prompt and title):
return None
# belt and suspenders: the genre must lead even if the LLM drifts
if "lofi" not in music_prompt.lower():
music_prompt = f"lofi chill, {music_prompt}"
# whatever the LLM picked, snap it to a bed we can actually render
return music_prompt, title, ambience.normalize_slug(data.get("ambience"))
def _enrich_minicpm(prompt: str):
    """Enrich a vibe with MiniCPM on cuda; MUST run inside @spaces.GPU.

    Thinking mode is switched off so the reply is direct JSON.

    Returns:
        The tuple produced by _finalize_enrichment, or ``None`` when the
        enricher is disabled or the reply is unusable (the caller falls back).
    """
    if _enricher_disabled:
        return None
    import torch

    tokenizer, llm = load_enricher()
    chat = [
        {"role": "system", "content": ENRICH_SYSTEM},
        {"role": "user", "content": prompt},
    ]
    encoded = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        add_generation_prompt=True,
        enable_thinking=False,
        return_dict=True,
        return_tensors="pt",
    ).to(llm.device)
    # the output sequence starts with the prompt tokens; remember where they end
    prompt_len = encoded["input_ids"].shape[-1]
    with torch.no_grad():
        generated = llm.generate(
            **encoded, max_new_tokens=220, do_sample=True, temperature=0.7, top_p=0.95
        )
    reply = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
    parsed = _parse_enrich_json(reply)
    return _finalize_enrichment(parsed)
def _enrich_ollama(prompt: str):
    """Enrich a vibe through the local Ollama daemon's /api/chat endpoint.

    Returns:
        The tuple produced by _finalize_enrichment, or ``None`` if the reply
        is unusable.

    Raises:
        httpx errors (connection, timeout, non-2xx status) and JSON/key
        errors from a malformed reply; enrich_prompt catches them all.
    """
    payload = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": ENRICH_SYSTEM},
            {"role": "user", "content": prompt},
        ],
        "format": "json",
        "stream": False,
        "options": {"temperature": 0.8, "num_predict": 220},
    }
    response = httpx.post(f"{OLLAMA_URL}/api/chat", json=payload, timeout=45)
    response.raise_for_status()
    content = response.json()["message"]["content"]
    return _finalize_enrichment(json.loads(content))
def _enrich_fallback(prompt: str) -> tuple[str, str, str]:
    """Plain, LLM-free enrichment — used whenever the chosen backend fails.

    Args:
        prompt: The user's raw vibe; may be empty or whitespace-only.

    Returns:
        ``(music_prompt, title, ambience_slug)``. The slug is always
        ``ambience.DEFAULT``. A blank vibe yields a generic prompt and the
        title "Untitled Tape".
    """
    import string

    import ambience

    vibe = prompt.strip()
    if not vibe:
        # no vibe: leave the slot out instead of emitting "lofi chill, , mellow…"
        return (
            "lofi chill, mellow and warm, soft drums, slow tempo, instrumental",
            "Untitled Tape",
            ambience.DEFAULT,
        )
    # capwords only capitalizes after whitespace, so "don't" stays "Don't"
    # (str.title() would give "Don'T"); slicing the stripped vibe keeps
    # leading spaces from eating into the 28-character budget
    title = f"{string.capwords(vibe[:28])} Tape"
    return (
        f"lofi chill, {vibe}, mellow and warm, soft drums, slow tempo, instrumental",
        title,
        ambience.DEFAULT,
    )
def enrich_prompt(prompt: str) -> tuple[str, str, str]:
    """Turn a vibe into (music_prompt, cassette title, ambience slug).

    The backend follows the environment: MiniCPM on ZeroGPU, Ollama otherwise.
    Any exception or unusable reply drops through to the plain fallback. On
    ZeroGPU this MUST be called inside @spaces.GPU (MiniCPM lives on cuda).
    """
    if IS_ZEROGPU:
        backend = _enrich_minicpm
    else:
        backend = _enrich_ollama
    try:
        enriched = backend(prompt)
    except Exception as e:  # noqa: BLE001 — any failure means "use fallback"
        print(f"[lofinity] enrichment failed ({e!r}), using fallback")
    else:
        if enriched:
            return enriched
        print("[lofinity] enrichment returned junk, using fallback")
    return _enrich_fallback(prompt)
# --- audio engines ------------------------------------------------------------
_musicgen = None  # cached (processor, model, device) once loaded
_musicgen_lock = threading.Lock()  # serializes the one-time load


def load_musicgen():
    """Lazy-load MusicGen once and cache it.

    The first call downloads the model from the Hub (musicgen-medium is
    ~3.5 GB), moves it to the chosen device and switches it to eval mode.
    Device choice: LOFINITY_DEVICE if set, else "cuda" on ZeroGPU, else "mps"
    when available, else "cpu".

    Returns:
        The ``(processor, model, device)`` tuple.
    """
    global _musicgen
    with _musicgen_lock:
        if _musicgen is None:
            # let ops missing on mps fall back to cpu instead of raising
            os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
            import torch
            from transformers import AutoProcessor, MusicgenForConditionalGeneration

            requested = os.getenv("LOFINITY_DEVICE")
            if requested:
                device = requested
            elif IS_ZEROGPU:
                device = "cuda"
            elif torch.backends.mps.is_available():
                device = "mps"
            else:
                device = "cpu"
            # plain ellipsis: the log line previously carried a mis-encoded one
            print(f"[lofinity] loading {MUSICGEN_MODEL} on {device}…")
            processor = AutoProcessor.from_pretrained(MUSICGEN_MODEL)
            model = MusicgenForConditionalGeneration.from_pretrained(MUSICGEN_MODEL)
            model.to(device)
            model.eval()
            _musicgen = (processor, model, device)
            print("[lofinity] musicgen ready")
    return _musicgen
# ZeroGPU wants models resident on cuda at startup (module import time), not
# lazily inside the @spaces.GPU call — outside the decorated function a CUDA
# emulation layer lets this .to('cuda') succeed without a real GPU attached, and
# placements done at startup are far more efficient than per-call transfers.
# Preload both models at import time on a ZeroGPU Space (skipped for the stub
# engine, which needs neither).
if IS_ZEROGPU and ENGINE != "stub":
    # MusicGen is not guarded: a failure here propagates and aborts startup.
    load_musicgen()
    try:
        load_enricher()
    except Exception as e:  # noqa: BLE001 — a bad enricher must not kill the app
        # _enrich_minicpm checks this flag and returns None, so enrich_prompt
        # drops through to the plain fallback
        _enricher_disabled = True
        print(f"[lofinity] enricher load failed ({e!r}); vends use the plain fallback")
def encode_wav(samples, rate: int) -> str:
    """Encode mono float samples as a base64 WAV data URI, entirely in memory.

    Nothing is written to disk: on a shared HF Space a songs directory is
    visible to every visitor and grows without bound. Returning the tape
    inline keeps it private to the one request that asked for it.

    Args:
        samples: Mono audio as floats nominally in [-1, 1]; any array-like.
            An empty sequence yields a valid zero-frame WAV instead of raising.
        rate: Sample rate in Hz, written to the WAV header.

    Returns:
        A ``data:audio/wav;base64,...`` URI holding 16-bit little-endian PCM.
    """
    import numpy as np

    samples = np.asarray(samples)
    # .max() on an empty array raises, so an empty tape is treated as silent
    peak = float(np.abs(samples).max()) if samples.size else 0.0
    # MusicGen can exceed [-1, 1]; normalize instead of hard-clipping
    if peak > 0.95:
        samples = samples * (0.95 / peak)
    pcm = (samples * 32767).astype("<i2")
    buf = io.BytesIO()
    with wave.open(buf, "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)
        w.setframerate(rate)
        w.writeframes(pcm.tobytes())
    b64 = base64.b64encode(buf.getvalue()).decode("ascii")
    return f"data:audio/wav;base64,{b64}"
def _rms(x) -> float:
import numpy as np
return float(np.sqrt(np.mean(np.square(x)))) if len(x) else 0.0
def _match_rms(chunk, target: float):
    """Re-level a fresh continuation to the first chunk's loudness.

    MusicGen continuations drift quieter each round; scaling each one back to
    `target` keeps a long tape even. A (near-)silent chunk or target is
    returned untouched, and the gain never exceeds 4x.
    """
    level = _rms(chunk)
    too_quiet = level < 1e-6 or target < 1e-6
    if too_quiet:
        return chunk
    # cap the boost so a quiet tail can't blow up
    gain = min(4.0, target / level)
    return chunk * gain
def _stitch(base, tail, rate: int):
    """Join `tail` onto `base`, hiding the seam with an equal-power crossfade.

    The last SEAM_S seconds of `base` are blended with the first SEAM_S
    seconds of `tail`. If either side is shorter than the seam (or the seam is
    empty), the two are simply concatenated.
    """
    import numpy as np

    seam = int(SEAM_S * rate)
    can_crossfade = seam > 0 and len(base) >= seam and len(tail) >= seam
    if not can_crossfade:
        return np.concatenate([base, tail])
    t = np.linspace(0.0, 1.0, seam)
    # cos/sin ramps: their squares sum to 1
    fade_out = np.cos(t * np.pi / 2)
    fade_in = np.sin(t * np.pi / 2)
    head, outgoing = base[:-seam], base[-seam:]
    incoming, rest = tail[:seam], tail[seam:]
    blended = outgoing * fade_out + incoming * fade_in
    return np.concatenate([head, blended, rest])
def musicgen_engine(music_prompt: str, seconds: int = CHUNK_S, progress_cb=None) -> tuple:
    """Render a tape with MusicGen.

    Tapes longer than one 30s shot are built by re-seeding the model with the
    last OVERLAP_S seconds of the track and stitching the chunks together.

    Args:
        music_prompt: Text prompt handed to MusicGen for every chunk.
        seconds: Requested tape length; rounded to a whole number of
            CHUNK_S chunks (30 -> 1 shot, 60 -> 2, 90 -> 3).
        progress_cb: Optional ``progress_cb(done, total)`` called after each
            chunk finishes.

    Returns:
        ``(samples, sample_rate)``, with samples as a mono float numpy array.

    Raises:
        Any generation error, except on mps, where the render is retried once
        on cpu before giving up.
    """
    global _musicgen
    import torch

    processor, model, device = load_musicgen()
    rate = model.config.audio_encoder.sampling_rate
    overlap = max(0, int(OVERLAP_S * rate))
    # a continuation prepends the OVERLAP_S seed, so it may generate only
    # MAX_GEN_S - OVERLAP_S NEW seconds to keep the whole shot inside musicgen's
    # ~30s training window — generating past it is what turns the tail to noise
    cont_new_s = max(1.0, MAX_GEN_S - OVERLAP_S)
    rounds = max(0, round(seconds / CHUNK_S) - 1)  # 30->0, 60->1, 90->2
    total = rounds + 1

    def shot(dev, seed=None, new_s=CHUNK_S):
        """One model.generate call: text-only, or continuing from `seed` audio."""
        if seed is None:
            inputs = processor(text=[music_prompt], padding=True, return_tensors="pt")
        else:
            inputs = processor(
                audio=seed, sampling_rate=rate, text=[music_prompt],
                padding=True, return_tensors="pt",
            )
        inputs = inputs.to(dev)
        with torch.no_grad():
            audio = model.generate(
                **inputs,
                do_sample=True,
                guidance_scale=3.0,
                max_new_tokens=int(new_s * 50),  # ~50 tokens per second
            )
        return audio[0, 0].cpu().float().numpy()

    def build(dev):
        """Render the first shot, then append `rounds` stitched continuations."""
        track = shot(dev)  # first 30s straight from the prompt
        if progress_cb:
            progress_cb(1, total)
        base_rms = _rms(track)
        for i in range(rounds):
            # track[-0:] is the WHOLE track, so a zero overlap must mean
            # "no seed" rather than feeding the entire tape back in
            seed = track[-overlap:] if overlap > 0 else None
            out = shot(dev, seed=seed, new_s=cont_new_s)  # capped continuation
            fresh = _match_rms(out[overlap:], base_rms)  # drop the re-encoded seed
            track = _stitch(track, fresh, rate)
            if progress_cb:
                progress_cb(i + 2, total)
            print(f"[lofinity] stitched chunk {i + 2}/{total}")
        return track

    try:
        samples = build(device)
    except Exception as e:  # noqa: BLE001
        if device != "mps":
            raise
        print(f"[lofinity] mps generation failed ({e!r}), retrying on cpu")
        model.to("cpu")
        # record the move so later calls target cpu directly, rather than
        # sending inputs to mps against a cpu-resident model and failing again
        with _musicgen_lock:
            _musicgen = (processor, model, "cpu")
        samples = build("cpu")
    return samples, rate
def _gpu_budget(prompt: str, seconds: int = CHUNK_S) -> int:
    """GPU seconds to request from ZeroGPU for a brew of this length.

    Covers MiniCPM enrichment plus a per-chunk MusicGen render, with headroom.
    Tighter budgets earn better queue priority. The signature mirrors gpu_brew
    so ZeroGPU can pass it the same arguments; `prompt` itself is unused.
    """
    chunk_count = max(1, round(int(seconds) / CHUNK_S))
    # musicgen-medium renders slower than -small, so budget generously — a brew
    # that overruns its @spaces.GPU duration is KILLED mid-render; over-budgeting
    # only costs quota, under-budgeting loses the tape.
    base_s = 40
    per_chunk_s = 40
    return base_s + per_chunk_s * chunk_count  # 30s->80, 60s->120, 90s->160
@spaces.GPU(duration=_gpu_budget)
def gpu_brew(prompt: str, seconds: int = CHUNK_S) -> tuple:
    """ZeroGPU entry point: enrichment AND MusicGen in one GPU acquisition.

    Takes the raw vibe and returns (music_prompt, title, bed, samples, rate).
    It runs in a separate GPU worker process and can't push per-chunk progress
    back to the web process, so /api/progress reports a time-based estimate
    for the bar. This path is Space-only.
    """
    enriched = enrich_prompt(prompt)
    music_prompt, title, bed = enriched
    rendered = musicgen_engine(music_prompt, seconds)
    samples, rate = rendered
    return music_prompt, title, bed, samples, rate
def stub_engine(_music_prompt: str, seconds: int = CHUNK_S, progress_cb=None) -> tuple:
    """A short audible tone — handy when developing without the heavy model.

    Honors `seconds` and fakes per-chunk timing so the length slider and the
    brewing progress bar can be exercised without MusicGen.

    Returns:
        ``(samples, 22050)``: a 220 Hz sine with quarter-second fades at both
        ends and a slow amplitude wobble.
    """
    import time

    import numpy as np

    rate = 22050
    total = max(1, round(seconds / CHUNK_S))
    for done in range(1, total + 1):
        time.sleep(0.8)  # pretend each 30s chunk takes a moment to render
        if progress_cb:
            progress_cb(done, total)
    secs = float(seconds)
    t = np.arange(int(rate * secs)) / rate
    ramp_in = t * 4
    ramp_out = (secs - t) * 4
    fade = np.minimum(1.0, np.minimum(ramp_in, ramp_out))
    tone = np.sin(2 * np.pi * 220 * t)
    # a slow wobble so a longer stub is audibly (and visibly) longer
    wobble = 0.8 + 0.2 * np.sin(t)
    return 0.25 * fade * tone * wobble, rate
# --- API -----------------------------------------------------------------------
@app.api(name="generate_song", concurrency_limit=1)
def generate_song(prompt: str, seconds: int = DEFAULT_SECONDS) -> dict:
    """Brew one tape: enrich the vibe, render the music, mix in the ambience bed.

    Args:
        prompt: The user's raw vibe text.
        seconds: Requested tape length; snapped to the nearest entry of
            ALLOWED_SECONDS.

    Returns:
        ``{"title": <cassette title>, "audio": <base64 WAV data URI>}``.

    Side effects:
        Updates _PROGRESS (and _BREW on ZeroGPU), which /api/progress reads.
    """
    import ambience

    # snap whatever the slider sends to a length we can actually build
    seconds = min(ALLOWED_SECONDS, key=lambda s: abs(s - int(seconds)))
    # reset progress up front, BEFORE the (sometimes slow) enrich step, so a poll
    # arriving early sees this brew at 0% rather than the last one at 100%
    chunks = max(1, round(seconds / CHUNK_S))
    _PROGRESS.update(done=0, total=chunks)
    if IS_ZEROGPU and ENGINE != "stub":
        # On ZeroGPU enrichment (MiniCPM) and MusicGen share ONE @spaces.GPU
        # acquisition in a separate worker process, which can't push real progress
        # back — so /api/progress reports a smooth time-based ESTIMATE driven by
        # this brew's start + budget (capped <100% until the tape actually lands).
        est = GPU_WARMUP_S + GPU_SECS_PER_CHUNK * chunks
        _BREW.update(active=True, start=time.monotonic(), est=est, total=chunks)
        print(f"[lofinity] brewing on GPU :: {prompt!r} ({seconds}s, ~{est:.0f}s est)")
        try:
            music_prompt, title, bed, samples, rate = gpu_brew(prompt, seconds)
        finally:
            # top the bar off BEFORE clearing active, so a poll landing in between
            # reads 100% (from _PROGRESS), never the 0% this brew started at
            _PROGRESS.update(done=chunks, total=chunks)
            _BREW.update(active=False)
        print(f"[lofinity] brewed {title!r} :: {music_prompt} [+ {bed}]")
    else:
        # Local / stub: enrich in-process (Ollama or fallback), then render with
        # live per-chunk progress for the brewing garden.
        music_prompt, title, bed = enrich_prompt(prompt)
        print(f"[lofinity] brewing {title!r} ({seconds}s) :: {music_prompt} [+ {bed}]")
        engine = stub_engine if ENGINE == "stub" else musicgen_engine
        samples, rate = engine(
            music_prompt, seconds,
            progress_cb=lambda d, t: _PROGRESS.update(done=d, total=t),
        )
    # reached only on success in the local path: an engine exception above
    # propagates without topping the bar off
    _PROGRESS.update(done=chunks, total=chunks)
    try:
        samples = ambience.mix(samples, rate, bed)
    except Exception as e:  # noqa: BLE001 — a dry tape beats a failed vend
        print(f"[lofinity] ambience mix failed ({e!r}), vending without the bed")
    # The tape rides back inline as a base64 data URI — no disk write, so it is
    # never cached on the Space nor shared with other visitors. The frontend
    # keeps the collection client-side, per browser session.
    return {"title": title, "audio": encode_wav(samples, rate)}
@app.get("/api/progress")
def progress() -> dict:
    """Report how far along the current brew is, for the brewing bar.

    Local/stub brews report real per-chunk progress from _PROGRESS. A ZeroGPU
    brew runs in a separate GPU worker that can't push progress back, so while
    one is active this returns a time-based ESTIMATE instead: a fractional
    `done` (the frontend fills the bar to done/total), capped at 92% until the
    real tape lands and _PROGRESS tops it off.
    """
    estimating = _BREW["active"] and _BREW["est"] > 0
    if not estimating:
        return dict(_PROGRESS)
    elapsed = time.monotonic() - _BREW["start"]
    fraction = min(0.92, elapsed / _BREW["est"])
    total = _BREW["total"]
    return {"done": round(fraction * total, 3), "total": total}
@app.get("/api/config")
def config() -> dict:
    """Tell the frontend which tape lengths this backend allows.

    The list is hardware-dependent — a CPU-only fallback offers only 30s — so
    the slider reads it and adapts.
    """
    allowed = [*ALLOWED_SECONDS]
    return {"allowed_seconds": allowed}
@app.get("/")
async def homepage():
    """Serve the frontend's index.html at the site root."""
    return FileResponse(FRONTEND / "index.html")


# Everything in FRONTEND is served under /static.
app.mount("/static", StaticFiles(directory=FRONTEND), name="static")

if __name__ == "__main__":
    # show_error=True is passed through to the server's launch()
    app.launch(show_error=True)
|