File size: 24,787 Bytes
e4d14af
 
 
d777ba1
 
af047f2
 
 
 
 
d777ba1
613bdc6
 
 
d777ba1
 
 
3ae6607
613bdc6
 
 
af047f2
 
 
e4d14af
 
2916784
 
d777ba1
 
 
77907ff
e4d14af
 
 
d777ba1
e4d14af
 
 
 
 
 
 
3ae6607
 
53feffb
 
 
 
 
 
 
 
 
 
 
3ae6607
 
53feffb
 
 
3ae6607
 
 
 
 
 
 
 
 
 
 
53feffb
 
 
 
 
d777ba1
613bdc6
 
 
 
 
 
 
 
d0f696e
 
 
946690f
d0f696e
946690f
b97956f
 
 
 
d0f696e
613bdc6
 
 
d0f696e
d777ba1
 
77907ff
 
 
 
df643b3
d777ba1
e4d14af
 
d0f696e
 
 
 
 
77907ff
 
 
 
af047f2
e4d14af
d777ba1
 
d14073a
722a5d8
d777ba1
d14073a
722a5d8
d14073a
 
 
 
 
d777ba1
722a5d8
 
 
 
d777ba1
d14073a
 
 
 
722a5d8
d14073a
722a5d8
 
d777ba1
af047f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d777ba1
af047f2
 
 
 
 
 
 
 
 
 
d777ba1
af047f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d777ba1
af047f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d777ba1
af047f2
 
722a5d8
d777ba1
 
 
af047f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d777ba1
 
 
 
 
 
 
df643b3
 
d777ba1
 
 
 
 
 
 
 
3ae6607
 
 
 
 
 
 
 
df643b3
 
 
d777ba1
 
 
 
 
 
 
3ae6607
 
 
 
 
 
af047f2
 
 
 
 
3ae6607
 
2916784
 
 
 
 
 
d777ba1
 
 
 
 
 
 
2916784
 
d777ba1
 
 
 
2916784
 
d777ba1
 
d0f696e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d777ba1
 
 
d0f696e
 
b97956f
 
 
 
d0f696e
 
 
 
 
 
 
 
 
 
 
 
d777ba1
 
 
 
 
d0f696e
d777ba1
 
 
d0f696e
 
 
 
 
 
b97956f
d0f696e
 
 
 
 
 
 
d777ba1
d0f696e
d777ba1
 
 
 
d0f696e
d777ba1
 
d0f696e
d777ba1
 
af047f2
 
 
 
 
3ae6607
df643b3
 
 
 
3ae6607
 
 
af047f2
 
 
77907ff
 
 
 
af047f2
 
 
3ae6607
 
d0f696e
 
 
 
 
 
722a5d8
 
 
d0f696e
 
 
 
 
 
 
 
 
 
d777ba1
e4d14af
d777ba1
 
 
 
d0f696e
722a5d8
 
d0f696e
 
af047f2
 
d0f696e
 
af047f2
 
77907ff
 
 
 
 
 
 
 
 
 
 
 
 
 
af047f2
3ae6607
af047f2
 
 
 
 
 
3ae6607
 
 
af047f2
d0f696e
722a5d8
 
 
 
2916784
 
 
 
7b00de0
 
d0f696e
 
77907ff
 
 
 
 
 
 
 
 
d0f696e
 
 
613bdc6
 
 
 
 
 
 
e4d14af
 
 
 
 
 
 
 
2916784
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
"""LoFinity β€” a vending machine that dispenses endless chill beats.

Gradio Server backend: serves the Three.js frontend and exposes the
generation API.

Pipeline: user vibe -> a small LLM enriches it into a MusicGen prompt +
cassette title + ambience pick -> MusicGen renders the music -> ambience.py
loops a background bed (waves, crackle, rain…) underneath. MusicGen ignores
texture words in prompts, hence the separate bed. The enrichment LLM is
MiniCPM (on cuda) on a ZeroGPU Space, or a local Ollama daemon in dev.

On a ZeroGPU Space it runs musicgen-medium and allows tapes up to 90s (chunked);
without a GPU it falls back to musicgen-small and a single 30s shot (no chunking).

Env knobs:
  LOFINITY_ENGINE   musicgen (default) | stub
  LOFINITY_DURATION clip length in seconds (default 30, the single-shot max)
  LOFINITY_DEVICE   cuda | mps | cpu (default: cuda on ZeroGPU, else mps if available, else cpu)
  LOFINITY_MUSICGEN model id (default: musicgen-medium on ZeroGPU, else musicgen-small)
  LOFINITY_OVERLAP_S continuation seed length, seconds (default 2)
  LOFINITY_MAX_GEN_S cap on a continuation's total output, seconds (default 28)
  LOFINITY_ENRICHER MiniCPM model id for ZeroGPU enrichment (default MiniCPM5-1B)
  OLLAMA_URL        default http://localhost:11434  (local enrichment)
  OLLAMA_MODEL      default llama3.2:3b              (local enrichment)
"""

import base64
import io
import json
import os
import threading
import time
import wave
from pathlib import Path

import httpx
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from gradio.server import Server

ROOT = Path(__file__).parent
FRONTEND = ROOT / "frontend"

# ZeroGPU: a Hugging Face ZeroGPU Space attaches a GPU only while a function
# wrapped in @spaces.GPU is running, and releases it afterwards. The `spaces`
# package exists only in that runtime, so for local runs we swap in a shim whose
# GPU decorator does nothing and the same code runs on mps/cpu untouched.
#
# ZeroGPU is detected through the spaces library's OWN config instead of parsing
# the env var ourselves: Config.zero_gpu is the very flag that gates @spaces.GPU
# (see spaces/zero/decorator.py), so IS_ZEROGPU is True precisely when the
# decorator will really allocate a GPU. (A first attempt compared
# SPACES_ZERO_GPU to the literal "true", but the runtime sets it to a value the
# library parses loosely — "1"/"t"/"true" — so the exact match failed and the
# GPU path never fired: generation silently fell back to CPU.)
IS_ZEROGPU = False
try:
    import spaces  # provided by the ZeroGPU Space runtime
    from spaces.config import Config as _ZeroGPUConfig

    IS_ZEROGPU = bool(_ZeroGPUConfig.zero_gpu)
except ImportError:  # local dev / non-ZeroGPU — make the decorator harmless
    class _SpacesShim:
        """Stand-in for the `spaces` package: its GPU decorator is a no-op."""

        @staticmethod
        def GPU(*args, **kwargs):
            used_bare = bool(args) and callable(args[0])
            if not used_bare:
                # parametrised form, e.g. @spaces.GPU(duration=...): hand back
                # a decorator that returns the function unchanged
                def _identity(fn):
                    return fn

                return _identity
            # bare form, @spaces.GPU: the function itself is the first argument
            return args[0]

    spaces = _SpacesShim()

print(
    f"[lofinity] startup: IS_ZEROGPU={IS_ZEROGPU} "
    f"(SPACES_ZERO_GPU={os.environ.get('SPACES_ZERO_GPU')!r})"
)

# Which audio engine renders the tape: "musicgen" (default) or "stub".
ENGINE = os.getenv("LOFINITY_ENGINE", "musicgen")
# Model + tape length scale with the hardware: a ZeroGPU Space gets the bigger,
# cleaner-continuing musicgen-medium and full chunked tapes (up to 90s); without a
# GPU we fall back to the smaller, faster musicgen-small and a single 30s shot
# (medium + chunking on CPU would take minutes). The env var still overrides.
MUSICGEN_MODEL = os.getenv(
    "LOFINITY_MUSICGEN",
    "facebook/musicgen-medium" if IS_ZEROGPU else "facebook/musicgen-small",
)
# 30s is musicgen-small's single-shot max (1500 tokens). Longer tapes are
# stitched from 30s chunks: each one re-seeds the model with the last OVERLAP_S
# of the track so it keeps playing from there. musicgen-small's context is 2048
# tokens (~41s at 50 tok/s), so a 2s seed + 30s of new audio (~1600 tokens) fits.
CHUNK_S = 30  # length of each musicgen single-shot
OVERLAP_S = float(os.getenv("LOFINITY_OVERLAP_S", "2"))  # seconds of tail fed back as the continuation seed; shorter leans more on the text prompt
# musicgen is trained on 30s clips, so a single shot longer than ~30s degrades
# its tail into noise. A continuation prepends the seed THEN generates, so cap its
# total output (seed + new) at MAX_GEN_S to stay inside that window. Env-tunable.
MAX_GEN_S = float(os.getenv("LOFINITY_MAX_GEN_S", "28"))
SEAM_S = 0.4  # equal-power crossfade at each stitch, to hide the join
# the tape lengths the API allows (it snaps any request to the nearest). Only a
# GPU gets the longer, chunked tapes; a CPU-only fallback is capped to one 30s shot.
ALLOWED_SECONDS = (30, 60, 90) if IS_ZEROGPU else (30,)
# Default for generate_song's `seconds` argument; the request is still snapped
# to ALLOWED_SECONDS there, so an out-of-range value here is harmless.
DEFAULT_SECONDS = int(os.getenv("LOFINITY_DURATION", "30"))
# Local enrichment backend (used when not on ZeroGPU): Ollama base URL + model.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434")
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3.2:3b")
# A ZeroGPU brew renders in a separate GPU worker process, so /api/progress can't
# read real per-chunk progress from it; it reports a time-based estimate from this
# rough per-length budget instead (tunable; only affects the cosmetic brew bar).
GPU_WARMUP_S = 5.0  # enrichment + cold-start/queue allowance before audio flows
GPU_SECS_PER_CHUNK = 25.0  # rough GPU render time per 30s chunk (musicgen-medium)

# The Gradio Server instance every route and API endpoint below is registered on.
app = Server(title="LoFinity")

# How far along the current brew is, in whole 30s chunks. generate_song runs one
# at a time (concurrency_limit=1), so a single shared counter is enough; the
# frontend polls /api/progress to fill its brewing bar.
_PROGRESS = {"done": 0, "total": 1}

# A ZeroGPU brew's wall-clock start + estimated total, so /api/progress can report
# a smooth time-based estimate while the GPU worker is busy (see progress()).
# "start" is a time.monotonic() reading, "est" is in seconds, "total" in chunks.
_BREW = {"active": False, "start": 0.0, "est": 1.0, "total": 1}

# --- prompt enrichment --------------------------------------------------------

# System prompt sent verbatim by both enrichment backends (_enrich_minicpm and
# _enrich_ollama). The three JSON keys it asks for — music_prompt, title,
# ambience — are exactly the ones _finalize_enrichment reads back, so keep the
# two in sync when editing this text.
ENRICH_SYSTEM = """\
You are the creative brain of LoFinity, a magical vending machine that sells
lofi cassette tapes. The user gives you a vibe. Reply ONLY with JSON with
exactly these three keys: {"music_prompt": "...", "title": "...", "ambience": "..."}

Build music_prompt from this template, in this order:
"lofi chill, <instrument 1>, <instrument 2>, <instrument 3>, <mood>, slow tempo, 75 bpm, instrumental"

- instruments: 2-3 picked to EVOKE the user's vibe, never a default set
  (island -> ukulele, kalimba, steel pan; rainy city -> rhodes piano, soft
  guitar; winter -> felt piano, soft strings; desert -> slide guitar, hand drums)
- mood: one or two calm words; never energetic, no vocals

ambience: the background sound layered under the music. Exactly one of:
vinyl_crackle, tape_hiss, soft_rain, ocean_waves, fireplace_crackle,
birdsong, night_crickets, cafe_murmur, wind_in_trees. Match it to the vibe.

title: a cozy cassette tape title inspired by the vibe, max 5 words,
Title Case, no quotes or emoji.

Examples:
user: island summer
{"music_prompt": "lofi chill, ukulele, kalimba, steel pan, breezy and warm, slow tempo, 75 bpm, instrumental", "title": "Coconut Daydream", "ambience": "ocean_waves"}
user: studying at midnight
{"music_prompt": "lofi chill, rhodes piano, muted guitar, soft bass, focused and calm, slow tempo, 75 bpm, instrumental", "title": "Midnight Study Session", "ambience": "vinyl_crackle"}"""


# MiniCPM enrichment LLM (ZeroGPU only — a Space has no Ollama daemon).
# MiniCPM5-1B is a standard LlamaForCausalLM (no trust_remote_code, fast
# tokenizer) with a switchable <think> mode we keep OFF so the reply is direct
# JSON. Needs transformers>=5.6 (the Space's latest satisfies it); no extra deps.
ENRICHER_MODEL = os.getenv("LOFINITY_ENRICHER", "openbmb/MiniCPM5-1B")
_enricher = None
_enricher_lock = threading.Lock()
_enricher_disabled = False  # set if the model can't load; forces the fallback


def load_enricher():
    """Return the cached (tokenizer, model) pair for the MiniCPM enricher,
    loading it onto cuda the first time. The lock keeps concurrent callers from
    loading it twice. Standard Llama arch, so no remote code is needed."""
    global _enricher
    with _enricher_lock:
        if _enricher is not None:
            return _enricher

        import torch  # noqa: F401 — needed so the .to('cuda') below resolves
        from transformers import AutoModelForCausalLM, AutoTokenizer

        print(f"[lofinity] loading enricher {ENRICHER_MODEL} on cuda…")
        tokenizer = AutoTokenizer.from_pretrained(ENRICHER_MODEL)
        llm = AutoModelForCausalLM.from_pretrained(ENRICHER_MODEL, torch_dtype="auto")
        llm.to("cuda")
        llm.eval()
        _enricher = (tokenizer, llm)
        print("[lofinity] enricher ready")
        return _enricher


def _parse_enrich_json(text: str) -> dict:
    """Pull the first {...} object out of an LLM reply (it may wrap the JSON in
    prose or ```json fences, or leak a <think> block); {} if nothing parses."""
    import re

    if "</think>" in text:  # belt-and-suspenders if thinking ever leaks through
        text = text.rsplit("</think>", 1)[1]
    m = re.search(r"\{.*\}", text, re.DOTALL)
    if not m:
        return {}
    try:
        return json.loads(m.group(0))
    except Exception:  # noqa: BLE001
        return {}


def _finalize_enrichment(data: dict):
    """Backend-independent clean-up of an enrichment reply. Returns
    (music_prompt, title, ambience slug), or None when the reply lacks a usable
    music_prompt or title."""
    import ambience

    music_prompt = str(data.get("music_prompt") or "").strip()
    # titles are capped at 48 characters after trimming
    title = str(data.get("title") or "").strip()[:48]
    if not music_prompt or not title:
        return None

    # belt and suspenders: if the LLM drifted and never mentions lofi, put the
    # genre up front ourselves
    mentions_genre = "lofi" in music_prompt.lower()
    if not mentions_genre:
        music_prompt = f"lofi chill, {music_prompt}"

    # whatever bed the LLM named, map it onto one ambience.py can render
    bed = ambience.normalize_slug(data.get("ambience"))
    return music_prompt, title, bed


def _enrich_minicpm(prompt: str):
    """Enrich the vibe with MiniCPM on cuda; MUST run inside @spaces.GPU.
    Returns the finalized tuple, or None so the caller falls back (also when the
    enricher was disabled at startup). Thinking mode is off so the reply is
    direct JSON."""
    if _enricher_disabled:
        return None
    import torch

    tokenizer, llm = load_enricher()
    chat = [
        {"role": "system", "content": ENRICH_SYSTEM},
        {"role": "user", "content": prompt},
    ]
    encoded = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        add_generation_prompt=True,
        enable_thinking=False,
        return_dict=True,
        return_tensors="pt",
    )
    encoded = encoded.to(llm.device)
    # generate() returns prompt + completion; remember where the prompt ends
    prompt_len = encoded["input_ids"].shape[-1]
    with torch.no_grad():
        generated = llm.generate(
            **encoded, max_new_tokens=220, do_sample=True, temperature=0.7, top_p=0.95
        )
    completion = generated[0][prompt_len:]
    reply = tokenizer.decode(completion, skip_special_tokens=True)
    return _finalize_enrichment(_parse_enrich_json(reply))


def _enrich_ollama(prompt: str):
    """Enrich the vibe through the local Ollama daemon's /api/chat endpoint.
    Returns the finalized tuple, or None if the reply is unusable; HTTP and JSON
    errors propagate to the caller."""
    payload = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": ENRICH_SYSTEM},
            {"role": "user", "content": prompt},
        ],
        "format": "json",
        "stream": False,
        "options": {"temperature": 0.8, "num_predict": 220},
    }
    response = httpx.post(f"{OLLAMA_URL}/api/chat", json=payload, timeout=45)
    response.raise_for_status()
    # the model's JSON arrives as a string inside the chat message
    content = response.json()["message"]["content"]
    return _finalize_enrichment(json.loads(content))


def _enrich_fallback(prompt: str) -> tuple[str, str, str]:
    """Plain, LLM-free enrichment — used whenever the chosen backend fails.

    Returns (music_prompt, title, ambience slug). The vibe is trimmed first so
    surrounding whitespace never leaks into the title or the prompt; a blank
    (or missing) vibe yields a generic lofi prompt titled "Untitled Tape"
    instead of a prompt with an empty "lofi chill, , ..." segment."""
    import ambience

    vibe = (prompt or "").strip()
    if vibe:
        # the title uses at most the first 28 characters of the vibe
        title = f"{vibe[:28].strip().title()} Tape"
        music_prompt = (
            f"lofi chill, {vibe}, mellow and warm, soft drums, slow tempo, instrumental"
        )
    else:
        title = "Untitled Tape"
        music_prompt = "lofi chill, mellow and warm, soft drums, slow tempo, instrumental"
    return music_prompt, title, ambience.DEFAULT


def enrich_prompt(prompt: str) -> tuple[str, str, str]:
    """Turn a vibe into (music_prompt, cassette title, ambience slug).

    The backend follows the environment — MiniCPM on ZeroGPU, Ollama otherwise —
    and the plain fallback covers an exception or an unusable reply. On ZeroGPU
    this MUST be called inside @spaces.GPU (MiniCPM is on cuda)."""
    backend = _enrich_ollama
    if IS_ZEROGPU:
        backend = _enrich_minicpm

    try:
        enriched = backend(prompt)
    except Exception as e:  # noqa: BLE001 — any failure means "use fallback"
        print(f"[lofinity] enrichment failed ({e!r}), using fallback")
    else:
        if enriched:
            return enriched
        print("[lofinity] enrichment returned junk, using fallback")
    return _enrich_fallback(prompt)


# --- audio engines ------------------------------------------------------------

_musicgen = None
_musicgen_lock = threading.Lock()


def load_musicgen():
    """Return the cached (processor, model, device) triple, loading MusicGen on
    first use; that first call downloads the model from the Hub
    (musicgen-medium is ~3.5 GB). Device order: LOFINITY_DEVICE if set, else
    cuda on ZeroGPU, else mps when available, else cpu."""
    global _musicgen
    with _musicgen_lock:
        if _musicgen is not None:
            return _musicgen

        # must be set before torch is imported
        os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
        import torch
        from transformers import AutoProcessor, MusicgenForConditionalGeneration

        device = os.getenv("LOFINITY_DEVICE")
        if not device:  # unset or empty: choose from the hardware
            if IS_ZEROGPU:
                device = "cuda"
            elif torch.backends.mps.is_available():
                device = "mps"
            else:
                device = "cpu"

        print(f"[lofinity] loading {MUSICGEN_MODEL} on {device}…")
        processor = AutoProcessor.from_pretrained(MUSICGEN_MODEL)
        model = MusicgenForConditionalGeneration.from_pretrained(MUSICGEN_MODEL)
        model.to(device)
        model.eval()
        _musicgen = (processor, model, device)
        print("[lofinity] musicgen ready")
        return _musicgen


# ZeroGPU wants models resident on cuda at startup (module import time), not
# lazily inside the @spaces.GPU call — outside the decorated function a CUDA
# emulation layer lets this .to('cuda') succeed without a real GPU attached, and
# placements done at startup are far more efficient than per-call transfers.
# The stub engine needs neither model, so it skips the preload entirely.
if IS_ZEROGPU and ENGINE != "stub":
    # a MusicGen load failure is NOT caught: without it there is nothing to vend
    load_musicgen()
    try:
        load_enricher()
    except Exception as e:  # noqa: BLE001 — a bad enricher must not kill the app
        # _enrich_minicpm checks this flag and returns None, which makes
        # enrich_prompt use the plain fallback for every vend
        _enricher_disabled = True
        print(f"[lofinity] enricher load failed ({e!r}); vends use the plain fallback")


def encode_wav(samples, rate: int) -> str:
    """Encode mono float samples as a base64 WAV data URI, entirely in memory.

    `samples` may be any array-like (ndarray, list, tuple), including an empty
    one, which produces a valid zero-length WAV. `rate` is the sample rate in
    Hz written to the header. The result is a "data:audio/wav;base64,..." string
    holding 16-bit little-endian mono PCM.

    Nothing is written to disk: on a shared HF Space a songs directory is
    visible to every visitor and grows without bound. Returning the tape
    inline keeps it private to the one request that asked for it."""
    import numpy as np

    # coerce up front: `list * 32767` would repeat the list, not scale it
    audio = np.asarray(samples)
    if not np.issubdtype(audio.dtype, np.floating):
        audio = audio.astype(np.float64)

    # MusicGen can exceed [-1, 1]; normalize instead of hard-clipping.
    # max() of a zero-size array raises, so an empty tape counts as silent.
    peak = float(np.abs(audio).max()) if audio.size else 0.0
    if peak > 0.95:
        audio = audio * (0.95 / peak)
    pcm = (audio * 32767).astype("<i2")

    buf = io.BytesIO()
    with wave.open(buf, "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)
        w.setframerate(rate)
        w.writeframes(pcm.tobytes())
    b64 = base64.b64encode(buf.getvalue()).decode("ascii")
    return f"data:audio/wav;base64,{b64}"


def _rms(x) -> float:
    import numpy as np

    return float(np.sqrt(np.mean(np.square(x)))) if len(x) else 0.0


def _match_rms(chunk, target: float):
    """Re-level a fresh continuation to `target` (the first chunk's RMS).
    MusicGen continuations drift quieter each round, so this keeps a long tape
    even. A near-silent chunk or target is returned unchanged."""
    level = _rms(chunk)
    near_silent = level < 1e-6 or target < 1e-6
    if near_silent:
        return chunk
    # cap the boost at 4x so a quiet tail can't blow up
    gain = min(4.0, target / level)
    return chunk * gain


def _stitch(base, tail, rate: int):
    """Join `tail` onto `base`, overlapping them by SEAM_S seconds with an
    equal-power (cos/sin) crossfade. If either piece is shorter than the seam,
    or the seam is empty, they are simply concatenated."""
    import numpy as np

    n = int(SEAM_S * rate)
    cannot_fade = n <= 0 or len(base) < n or len(tail) < n
    if cannot_fade:
        return np.concatenate([base, tail])

    # quarter-turn ramp: cos falls 1 -> 0 while sin rises 0 -> 1
    phase = np.linspace(0.0, 1.0, n) * np.pi / 2
    outgoing = base[-n:] * np.cos(phase)
    incoming = tail[:n] * np.sin(phase)
    return np.concatenate([base[:-n], outgoing + incoming, tail[n:]])


def musicgen_engine(music_prompt: str, seconds: int = CHUNK_S, progress_cb=None) -> tuple:
    """Render `music_prompt` with MusicGen and return (samples, sample_rate).

    Tapes longer than one 30s shot are built by re-seeding the model with the
    last OVERLAP_S of the track and stitching the chunks together.
    progress_cb(done, total), when given, is called after each chunk finishes.
    If generation raises while the device is "mps", the model is moved to cpu
    and the whole tape is rebuilt there (progress restarts from chunk 1); on
    any other device the exception propagates."""
    import torch

    processor, model, device = load_musicgen()
    rate = model.config.audio_encoder.sampling_rate
    overlap = int(OVERLAP_S * rate)  # seed length in samples
    # a continuation prepends the OVERLAP_S seed, so it may generate only
    # MAX_GEN_S - OVERLAP_S NEW seconds to keep the whole shot inside musicgen's
    # ~30s training window — generating past it is what turns the tail to noise
    cont_new_s = max(1.0, MAX_GEN_S - OVERLAP_S)
    rounds = max(0, round(seconds / CHUNK_S) - 1)  # 30->0, 60->1, 90->2
    total = rounds + 1  # chunks overall: the first shot plus one per continuation

    def shot(dev, seed=None, new_s=CHUNK_S):
        # One generate() call: text-only when `seed` is None, otherwise an
        # audio-conditioned continuation of `seed`. Returns a 1-D numpy array.
        if seed is None:
            inputs = processor(text=[music_prompt], padding=True, return_tensors="pt")
        else:
            inputs = processor(
                audio=seed, sampling_rate=rate, text=[music_prompt],
                padding=True, return_tensors="pt",
            )
        inputs = inputs.to(dev)
        with torch.no_grad():
            audio = model.generate(
                **inputs,
                do_sample=True,
                guidance_scale=3.0,
                max_new_tokens=int(new_s * 50),  # ~50 tokens per second
            )
        return audio[0, 0].cpu().float().numpy()

    def build(dev):
        # Render the whole tape on `dev`: first shot, then `rounds` continuations.
        track = shot(dev)  # first 30s straight from the prompt
        if progress_cb:
            progress_cb(1, total)
        base_rms = _rms(track)  # loudness reference for every later chunk
        for i in range(rounds):
            out = shot(dev, seed=track[-overlap:], new_s=cont_new_s)  # capped continuation
            fresh = _match_rms(out[overlap:], base_rms)  # drop the re-encoded seed
            track = _stitch(track, fresh, rate)
            if progress_cb:
                progress_cb(i + 2, total)
            print(f"[lofinity]   stitched chunk {i + 2}/{total}")
        return track

    try:
        samples = build(device)
    except Exception as e:  # noqa: BLE001
        if device == "mps":
            print(f"[lofinity] mps generation failed ({e!r}), retrying on cpu")
            # NOTE(review): the tuple cached by load_musicgen still says "mps"
            # after this move, so the next call appears to start on "mps" again
            # with the model now on cpu — confirm whether that is intended.
            model.to("cpu")
            samples = build("cpu")
        else:
            raise
    return samples, rate


def _gpu_budget(prompt: str, seconds: int = CHUNK_S) -> int:
    """GPU seconds to request from ZeroGPU for a brew of this length: a fixed
    40s plus 40s per 30s chunk, covering MiniCPM enrichment, the MusicGen render
    and headroom. Tighter budgets earn better queue priority. The signature must
    mirror gpu_brew so ZeroGPU can pass it the same args; `prompt` is unused."""
    n_chunks = round(int(seconds) / CHUNK_S)
    if n_chunks < 1:
        n_chunks = 1
    # musicgen-medium renders slower than -small, so budget generously — a brew
    # that overruns its @spaces.GPU duration is KILLED mid-render; over-budgeting
    # only costs quota, under-budgeting loses the tape.
    return 40 * (n_chunks + 1)  # 30s->80, 60s->120, 90s->160


@spaces.GPU(duration=_gpu_budget)
def gpu_brew(prompt: str, seconds: int = CHUNK_S) -> tuple:
    """ZeroGPU entry point: one GPU acquisition covers both the MiniCPM
    enrichment and the MusicGen render. Takes the raw vibe and returns
    (music_prompt, title, bed, samples, rate).

    It runs in a separate GPU worker process and can't push per-chunk progress
    back to the web process, so /api/progress reports a time-based estimate for
    the bar. This path is Space-only."""
    enriched_prompt, tape_title, ambience_bed = enrich_prompt(prompt)
    audio, sample_rate = musicgen_engine(enriched_prompt, seconds)
    return enriched_prompt, tape_title, ambience_bed, audio, sample_rate


def stub_engine(_music_prompt: str, seconds: int = CHUNK_S, progress_cb=None) -> tuple:
    """Development stand-in for MusicGen: returns (samples, rate) holding a
    220 Hz tone of `seconds` length at 22050 Hz. It sleeps 0.8s per 30s chunk
    and reports progress_cb(done, total) after each, so the length slider and
    the brewing bar can be exercised without the heavy model."""
    import time

    import numpy as np

    rate = 22050
    total = max(1, round(seconds / CHUNK_S))
    for done in range(1, total + 1):
        time.sleep(0.8)  # pretend each 30s chunk takes a moment to render
        if progress_cb:
            progress_cb(done, total)

    secs = float(seconds)
    t = np.arange(int(rate * secs)) / rate
    # quarter-second linear ramps in and out, flat at 1.0 in between
    ramp_in = t * 4
    ramp_out = (secs - t) * 4
    fade = np.minimum(1.0, np.minimum(ramp_in, ramp_out))
    tone = np.sin(2 * np.pi * 220 * t)
    # a slow wobble so a longer stub is audibly (and visibly) longer
    wobble = 0.8 + 0.2 * np.sin(t)
    return 0.25 * fade * tone * wobble, rate


# --- API -----------------------------------------------------------------------


@app.api(name="generate_song", concurrency_limit=1)
def generate_song(prompt: str, seconds: int = DEFAULT_SECONDS) -> dict:
    """Brew one tape from the user's vibe.

    Snaps `seconds` to the nearest allowed length, enriches the vibe, renders
    the audio (one gpu_brew call on ZeroGPU; in-process otherwise), mixes the
    ambience bed underneath and returns {"title": ..., "audio": ...} where
    "audio" is a base64 WAV data URI. Progress is published through the
    module-level _PROGRESS / _BREW dicts that /api/progress reads."""
    import ambience

    # snap whatever the slider sends to a length we can actually build
    seconds = min(ALLOWED_SECONDS, key=lambda s: abs(s - int(seconds)))
    # reset progress up front, BEFORE the (sometimes slow) enrich step, so a poll
    # arriving early sees this brew at 0% rather than the last one at 100%
    chunks = max(1, round(seconds / CHUNK_S))
    _PROGRESS.update(done=0, total=chunks)

    if IS_ZEROGPU and ENGINE != "stub":
        # On ZeroGPU enrichment (MiniCPM) and MusicGen share ONE @spaces.GPU
        # acquisition in a separate worker process, which can't push real progress
        # back — so /api/progress reports a smooth time-based ESTIMATE driven by
        # this brew's start + budget (capped <100% until the tape actually lands).
        est = GPU_WARMUP_S + GPU_SECS_PER_CHUNK * chunks
        _BREW.update(active=True, start=time.monotonic(), est=est, total=chunks)
        print(f"[lofinity] brewing on GPU :: {prompt!r} ({seconds}s, ~{est:.0f}s est)")
        try:
            music_prompt, title, bed, samples, rate = gpu_brew(prompt, seconds)
        finally:
            # top the bar off BEFORE clearing active, so a poll landing in between
            # reads 100% (from _PROGRESS), never the 0% this brew started at.
            # This runs on failure too, so a crashed brew never leaves the
            # estimate active.
            _PROGRESS.update(done=chunks, total=chunks)
            _BREW.update(active=False)
        print(f"[lofinity] brewed {title!r} :: {music_prompt} [+ {bed}]")
    else:
        # Local / stub: enrich in-process (Ollama or fallback), then render with
        # live per-chunk progress for the brewing garden.
        music_prompt, title, bed = enrich_prompt(prompt)
        print(f"[lofinity] brewing {title!r} ({seconds}s) :: {music_prompt} [+ {bed}]")
        engine = stub_engine if ENGINE == "stub" else musicgen_engine
        samples, rate = engine(
            music_prompt, seconds,
            progress_cb=lambda d, t: _PROGRESS.update(done=d, total=t),
        )

    # both paths end with the bar full before the (fast) mix + encode steps
    _PROGRESS.update(done=chunks, total=chunks)
    try:
        samples = ambience.mix(samples, rate, bed)
    except Exception as e:  # noqa: BLE001 — a dry tape beats a failed vend
        print(f"[lofinity] ambience mix failed ({e!r}), vending without the bed")
    # The tape rides back inline as a base64 data URI — no disk write, so it is
    # never cached on the Space nor shared with other visitors. The frontend
    # keeps the collection client-side, per browser session.
    return {"title": title, "audio": encode_wav(samples, rate)}


@app.get("/api/progress")
def progress() -> dict:
    """Progress for the brewing bar, as {"done": ..., "total": ...}.

    Local/stub brews report real per-chunk progress from _PROGRESS. A ZeroGPU
    brew runs in a separate GPU worker that can't push progress back, so while
    one is active this returns a time-based ESTIMATE instead: a fractional
    `done` (the frontend fills the bar to done/total) held at 92% or below until
    the real tape lands and _PROGRESS tops it off."""
    estimating = _BREW["active"] and _BREW["est"] > 0
    if not estimating:
        return dict(_PROGRESS)

    elapsed = time.monotonic() - _BREW["start"]
    total = _BREW["total"]
    frac = min(0.92, elapsed / _BREW["est"])
    return {"done": round(frac * total, 3), "total": total}


@app.get("/api/config")
def config() -> dict:
    """Frontend config: the tape lengths this backend allows, as a list under
    "allowed_seconds". The set is hardware-dependent — a CPU-only fallback
    offers only 30s — so the slider reads it and adapts."""
    lengths = [*ALLOWED_SECONDS]
    return {"allowed_seconds": lengths}


@app.get("/")
async def homepage():
    """Serve the frontend's index.html as the landing page."""
    index_page = FRONTEND / "index.html"
    return FileResponse(index_page)


# Everything else under frontend/ (scripts, styles, assets) is served at /static.
app.mount("/static", StaticFiles(directory=FRONTEND), name="static")

# Only launch the server when this file is run as a script, not on import.
if __name__ == "__main__":
    app.launch(show_error=True)