Spaces:

build-small-hackathon
/

LoFinity

Running on Zero

App Files Files Community

LoFinity / app.py

eloigil6

Scale model + tape length to the hardware (GPU vs CPU)

613bdc6 11 days ago

Raw

History Blame Contribute Delete

24.8 kB

	"""LoFinity — a vending machine that dispenses endless chill beats.

	Gradio Server backend: serves the Three.js frontend and exposes the
	generation API.

	Pipeline: user vibe -> a small LLM enriches it into a MusicGen prompt +
	cassette title + ambience pick -> MusicGen renders the music -> ambience.py
	loops a background bed (waves, crackle, rain…) underneath. MusicGen ignores
	texture words in prompts, hence the separate bed. The enrichment LLM is
	MiniCPM (on cuda) on a ZeroGPU Space, or a local Ollama daemon in dev.

	On a ZeroGPU Space it runs musicgen-medium and allows tapes up to 90s (chunked);
	without a GPU it falls back to musicgen-small and a single 30s shot (no chunking).

	Env knobs:
	LOFINITY_ENGINE musicgen (default) | stub
	LOFINITY_DURATION clip length in seconds (default 30, the single-shot max)
	LOFINITY_DEVICE cuda | mps | cpu (default: cuda on ZeroGPU, else mps if available, else cpu)
	LOFINITY_MUSICGEN model id (default: musicgen-medium on ZeroGPU, else musicgen-small)
	LOFINITY_OVERLAP_S continuation seed length, seconds (default 2)
	LOFINITY_MAX_GEN_S cap on a continuation's total output, seconds (default 28)
	LOFINITY_ENRICHER MiniCPM model id for ZeroGPU enrichment (default MiniCPM5-1B)
	OLLAMA_URL default http://localhost:11434 (local enrichment)
	OLLAMA_MODEL default llama3.2:3b (local enrichment)
	"""

	import base64
	import io
	import json
	import os
	import threading
	import time
	import wave
	from pathlib import Path

	import httpx
	from fastapi.responses import FileResponse
	from fastapi.staticfiles import StaticFiles
	from gradio.server import Server

	# Directory containing this module; bundled assets are resolved against it.
	ROOT = Path(__file__).parent
	# The frontend (index.html plus its static assets) lives in a sibling folder.
	FRONTEND = ROOT.joinpath("frontend")

	# ZeroGPU: on a Hugging Face ZeroGPU Space a GPU is attached only for the
	# duration of a function wrapped in @spaces.GPU, then released. The `spaces`
	# package exists only in that runtime; locally we shim @spaces.GPU to a no-op so
	# the exact same code runs on mps/cpu untouched.
	#
	# Detect ZeroGPU via the spaces library's OWN config rather than parsing the env
	# var ourselves: Config.zero_gpu is the very flag that gates @spaces.GPU (see
	# spaces/zero/decorator.py), so IS_ZEROGPU ends up True precisely when the
	# decorator will really allocate a GPU. (Our first attempt compared
	# SPACES_ZERO_GPU to the literal "true", but the runtime sets it to a value the
	# library parses loosely — "1"/"t"/"true" — so the exact match failed and the
	# GPU path never fired: generation silently fell back to CPU.)
	IS_ZEROGPU = False
	try:
	    import spaces  # provided by the ZeroGPU Space runtime
	    from spaces.config import Config as _ZeroGPUConfig

	    # the library's own flag, so we agree with what @spaces.GPU will really do
	    IS_ZEROGPU = bool(_ZeroGPUConfig.zero_gpu)
	except ImportError:  # local dev / non-ZeroGPU — make the decorator harmless
	    class _SpacesShim:
	        """Minimal stand-in for the `spaces` package: `GPU` decorates as a no-op."""

	        @staticmethod
	        def GPU(*args, **kwargs):
	            """No-op replacement for `spaces.GPU`.

	            Supports both spellings used with the real decorator:
	            bare `@spaces.GPU` (the function arrives as the only positional
	            argument and is returned unchanged) and the configured form
	            `@spaces.GPU(duration=...)` (options are ignored and an identity
	            decorator is returned).
	            """
	            if args and callable(args[0]):
	                return args[0]
	            return lambda fn: fn

	    spaces = _SpacesShim()

	# one startup line so a misdetected runtime is visible in the Space logs
	print(
	    f"[lofinity] startup: IS_ZEROGPU={IS_ZEROGPU} "
	    f"(SPACES_ZERO_GPU={os.environ.get('SPACES_ZERO_GPU')!r})"
	)

	# Which audio engine renders the tape: the real model or the dev stub tone.
	ENGINE = os.environ.get("LOFINITY_ENGINE", "musicgen")

	# The model scales with the hardware. A ZeroGPU Space runs the bigger
	# musicgen-medium; without a GPU we use the smaller, faster musicgen-small
	# (medium + chunking on CPU would take minutes). LOFINITY_MUSICGEN overrides.
	MUSICGEN_MODEL = os.environ.get(
	    "LOFINITY_MUSICGEN",
	    "facebook/musicgen-small" if not IS_ZEROGPU else "facebook/musicgen-medium",
	)

	# 30s is musicgen-small's single-shot max (1500 tokens). Longer tapes are
	# stitched from 30s chunks, each re-seeded with the last OVERLAP_S of the track
	# so the model keeps playing from there. musicgen-small's context is 2048
	# tokens (~41s at 50 tok/s), so a 2s seed + 30s of new audio (~1600 tokens) fits.
	CHUNK_S = 30  # length of each musicgen single-shot

	# Seconds of tail fed back as the continuation seed; a shorter seed leans more
	# on the text prompt.
	OVERLAP_S = float(os.environ.get("LOFINITY_OVERLAP_S", "2"))

	# musicgen is trained on 30s clips, so a single shot longer than ~30s degrades
	# its tail into noise. A continuation prepends the seed THEN generates, so its
	# total output (seed + new) is capped at MAX_GEN_S to stay inside that window.
	MAX_GEN_S = float(os.environ.get("LOFINITY_MAX_GEN_S", "28"))

	SEAM_S = 0.4  # equal-power crossfade at each stitch, to hide the join

	# Tape lengths the API allows (any request is snapped to the nearest). Only a
	# GPU gets the longer, chunked tapes; the CPU fallback is one 30s shot.
	if IS_ZEROGPU:
	    ALLOWED_SECONDS = (30, 60, 90)
	else:
	    ALLOWED_SECONDS = (30,)
	DEFAULT_SECONDS = int(os.environ.get("LOFINITY_DURATION", "30"))

	# Local enrichment backend (Ollama daemon).
	OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
	OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3.2:3b")

	# A ZeroGPU brew renders in a separate GPU worker process, so /api/progress
	# cannot read real per-chunk progress; it reports a time-based estimate built
	# from this rough budget (tunable; only affects the cosmetic brew bar).
	GPU_WARMUP_S = 5.0  # enrichment + cold-start/queue allowance before audio flows
	GPU_SECS_PER_CHUNK = 25.0  # rough GPU render time per 30s chunk (musicgen-medium)

	app = Server(title="LoFinity")

	# Chunk-level progress of the brew in flight, counted in whole 30s chunks.
	# generate_song is registered with concurrency_limit=1, so one shared counter
	# suffices; the frontend polls /api/progress to fill its brewing bar.
	_PROGRESS = dict(done=0, total=1)

	# Wall-clock start and estimated duration of a ZeroGPU brew, letting
	# /api/progress report a smooth time-based estimate while the GPU worker is
	# busy (see progress()).
	_BREW = dict(active=False, start=0.0, est=1.0, total=1)

	# --- prompt enrichment --------------------------------------------------------

	# System prompt shared by both enrichment backends (MiniCPM and Ollama). It
	# asks the LLM for a JSON object with the three keys that
	# _finalize_enrichment() reads back: "music_prompt", "title" and "ambience".
	# The text is sent to the model verbatim, so any edit changes model behavior.
	ENRICH_SYSTEM = """\
	You are the creative brain of LoFinity, a magical vending machine that sells
	lofi cassette tapes. The user gives you a vibe. Reply ONLY with JSON with
	exactly these three keys: {"music_prompt": "...", "title": "...", "ambience": "..."}

	Build music_prompt from this template, in this order:
	"lofi chill, <instrument 1>, <instrument 2>, <instrument 3>, <mood>, slow tempo, 75 bpm, instrumental"

	- instruments: 2-3 picked to EVOKE the user's vibe, never a default set
	(island -> ukulele, kalimba, steel pan; rainy city -> rhodes piano, soft
	guitar; winter -> felt piano, soft strings; desert -> slide guitar, hand drums)
	- mood: one or two calm words; never energetic, no vocals

	ambience: the background sound layered under the music. Exactly one of:
	vinyl_crackle, tape_hiss, soft_rain, ocean_waves, fireplace_crackle,
	birdsong, night_crickets, cafe_murmur, wind_in_trees. Match it to the vibe.

	title: a cozy cassette tape title inspired by the vibe, max 5 words,
	Title Case, no quotes or emoji.

	Examples:
	user: island summer
	{"music_prompt": "lofi chill, ukulele, kalimba, steel pan, breezy and warm, slow tempo, 75 bpm, instrumental", "title": "Coconut Daydream", "ambience": "ocean_waves"}
	user: studying at midnight
	{"music_prompt": "lofi chill, rhodes piano, muted guitar, soft bass, focused and calm, slow tempo, 75 bpm, instrumental", "title": "Midnight Study Session", "ambience": "vinyl_crackle"}"""


	# MiniCPM enrichment LLM (ZeroGPU only — a Space has no Ollama daemon).
	# MiniCPM5-1B is a standard LlamaForCausalLM (no trust_remote_code, fast
	# tokenizer) with a switchable <think> mode we keep OFF so the reply is direct
	# JSON. Needs transformers>=5.6 (the Space's latest satisfies it); no extra deps.
	ENRICHER_MODEL = os.environ.get("LOFINITY_ENRICHER", "openbmb/MiniCPM5-1B")
	_enricher = None  # cached (tokenizer, model) pair, filled by load_enricher()
	_enricher_lock = threading.Lock()
	_enricher_disabled = False  # set if the model can't load; forces the fallback


	def load_enricher():
	    """Return the cached (tokenizer, model) pair for the enrichment LLM.

	    The first call loads ENRICHER_MODEL through transformers' Auto classes,
	    moves it to cuda and switches it to eval mode; later calls reuse the cached
	    pair. The lock keeps concurrent first calls from loading the model twice.
	    """
	    global _enricher
	    with _enricher_lock:
	        if _enricher is not None:
	            return _enricher

	        import torch  # noqa: F401 — needed so the .to('cuda') below resolves
	        from transformers import AutoModelForCausalLM, AutoTokenizer

	        print(f"[lofinity] loading enricher {ENRICHER_MODEL} on cuda…")
	        tokenizer = AutoTokenizer.from_pretrained(ENRICHER_MODEL)
	        llm = AutoModelForCausalLM.from_pretrained(ENRICHER_MODEL, torch_dtype="auto")
	        llm.to("cuda")
	        llm.eval()
	        _enricher = (tokenizer, llm)
	        print("[lofinity] enricher ready")
	        return _enricher


	def _parse_enrich_json(text: str) -> dict:
	    """Pull the first parseable JSON object out of an LLM reply.

	    The reply may wrap the JSON in prose or ```json fences, follow it with
	    extra text (even text containing braces), or leak a <think> block; only
	    what comes after the last </think> is considered.

	    Args:
	        text: the raw model reply.

	    Returns:
	        The first JSON object found, as a dict; {} if nothing parses.
	    """
	    if "</think>" in text:  # belt-and-suspenders if thinking ever leaks through
	        text = text.rsplit("</think>", 1)[1]

	    # Try every "{" as a possible start. raw_decode stops at the end of the
	    # first complete value, so trailing prose or a second object cannot spoil
	    # the parse the way a greedy first-{-to-last-} match does.
	    decoder = json.JSONDecoder()
	    start = text.find("{")
	    while start != -1:
	        try:
	            obj, _end = decoder.raw_decode(text, start)
	        except (ValueError, RecursionError):  # not JSON from here; try the next brace
	            start = text.find("{", start + 1)
	            continue
	        return obj
	    return {}


	def _finalize_enrichment(data: dict):
	    """Validate and normalize a backend's parsed reply.

	    Reads the "music_prompt", "title" and "ambience" keys of `data`. Returns
	    (music_prompt, title, ambience_slug), or None when either the prompt or
	    the title is missing/blank. The title is cut to 48 characters, "lofi chill"
	    is prepended when the prompt does not mention lofi, and the ambience is
	    passed through ambience.normalize_slug.
	    """
	    import ambience

	    prompt_text = str(data.get("music_prompt") or "").strip()
	    tape_title = str(data.get("title") or "").strip()[:48]
	    if not prompt_text or not tape_title:
	        return None

	    # keep the genre in the prompt even if the LLM drifted away from it
	    if "lofi" not in prompt_text.lower():
	        prompt_text = f"lofi chill, {prompt_text}"

	    # whatever the LLM picked, snap it to a bed we can actually render
	    bed = ambience.normalize_slug(data.get("ambience"))
	    return prompt_text, tape_title, bed


	def _enrich_minicpm(prompt: str):
	    """Enrich the vibe with the MiniCPM model; MUST run inside @spaces.GPU.

	    Returns the finalized (music_prompt, title, ambience) tuple, or None when
	    the enricher is disabled or its reply is unusable, so the caller can fall
	    back. The chat template is rendered with thinking off, aiming for a direct
	    JSON reply.
	    """
	    if _enricher_disabled:
	        return None
	    import torch

	    tokenizer, llm = load_enricher()
	    chat = [
	        {"role": "system", "content": ENRICH_SYSTEM},
	        {"role": "user", "content": prompt},
	    ]
	    encoded = tokenizer.apply_chat_template(
	        chat,
	        tokenize=True,
	        add_generation_prompt=True,
	        enable_thinking=False,
	        return_dict=True,
	        return_tensors="pt",
	    )
	    encoded = encoded.to(llm.device)
	    with torch.no_grad():
	        generated = llm.generate(
	            **encoded,
	            max_new_tokens=220,
	            do_sample=True,
	            temperature=0.7,
	            top_p=0.95,
	        )
	    # decode only the newly generated tokens, not the echoed prompt
	    prompt_len = encoded["input_ids"].shape[-1]
	    reply = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
	    return _finalize_enrichment(_parse_enrich_json(reply))


	def _enrich_ollama(prompt: str):
	    """Enrich the vibe through a local Ollama daemon's /api/chat endpoint.

	    Returns the finalized (music_prompt, title, ambience) tuple, or None when
	    the reply is unusable. HTTP and JSON errors propagate to the caller, which
	    treats any exception as "use the fallback".
	    """
	    payload = {
	        "model": OLLAMA_MODEL,
	        "messages": [
	            {"role": "system", "content": ENRICH_SYSTEM},
	            {"role": "user", "content": prompt},
	        ],
	        "format": "json",
	        "stream": False,
	        "options": {"temperature": 0.8, "num_predict": 220},
	    }
	    response = httpx.post(f"{OLLAMA_URL}/api/chat", json=payload, timeout=45)
	    response.raise_for_status()
	    content = response.json()["message"]["content"]
	    return _finalize_enrichment(json.loads(content))


	def _enrich_fallback(prompt: str) -> tuple[str, str, str]:
	    """Plain, LLM-free enrichment — used whenever the chosen backend fails.

	    Args:
	        prompt: the user's raw vibe; may be empty or whitespace.

	    Returns:
	        (music_prompt, title, ambience slug). The title is the first 28
	        characters of the vibe in title case plus " Tape", or "Untitled Tape"
	        for a blank vibe; the ambience is always ambience.DEFAULT.
	    """
	    import re

	    import ambience

	    vibe = prompt.strip()
	    if not vibe:
	        # no vibe: leave the slot out rather than emit "lofi chill, , mellow…"
	        return (
	            "lofi chill, mellow and warm, soft drums, slow tempo, instrumental",
	            "Untitled Tape",
	            ambience.DEFAULT,
	        )

	    # str.title() treats an apostrophe as a word break ("it's" -> "It'S"), so
	    # capitalize letter runs ourselves and keep an apostrophe suffix attached.
	    titled = re.sub(
	        r"[^\W\d_]+(?:['’][^\W\d_]+)?",
	        lambda mo: mo.group(0).capitalize(),
	        vibe[:28],
	    )
	    return (
	        f"lofi chill, {vibe}, mellow and warm, soft drums, slow tempo, instrumental",
	        f"{titled} Tape",
	        ambience.DEFAULT,
	    )


	def enrich_prompt(prompt: str) -> tuple[str, str, str]:
	    """Turn a vibe into (music_prompt, cassette title, ambience slug).

	    The backend follows the environment: MiniCPM on ZeroGPU, Ollama otherwise.
	    Any exception or unusable reply falls through to the plain LLM-free
	    fallback, so this always returns a tuple. On ZeroGPU it MUST be called
	    inside @spaces.GPU (MiniCPM lives on cuda).
	    """
	    if IS_ZEROGPU:
	        backend = _enrich_minicpm
	    else:
	        backend = _enrich_ollama

	    try:
	        enriched = backend(prompt)
	    except Exception as err:  # noqa: BLE001 — any failure means "use fallback"
	        print(f"[lofinity] enrichment failed ({err!r}), using fallback")
	    else:
	        if enriched:
	            return enriched
	        print("[lofinity] enrichment returned junk, using fallback")
	    return _enrich_fallback(prompt)


	# --- audio engines ------------------------------------------------------------

	_musicgen = None  # cached (processor, model, device) triple, filled by load_musicgen()
	_musicgen_lock = threading.Lock()


	def load_musicgen():
	    """Return the cached (processor, model, device) triple for MusicGen.

	    The first call loads MUSICGEN_MODEL (downloading it from the Hub if needed;
	    musicgen-medium is ~3.5 GB), moves it to the chosen device and switches it
	    to eval mode. Device choice: LOFINITY_DEVICE if set, else cuda on ZeroGPU,
	    else mps when available, else cpu.
	    """
	    global _musicgen
	    with _musicgen_lock:
	        if _musicgen is not None:
	            return _musicgen

	        os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
	        import torch
	        from transformers import AutoProcessor, MusicgenForConditionalGeneration

	        device = os.getenv("LOFINITY_DEVICE")
	        if not device:
	            if IS_ZEROGPU:
	                device = "cuda"
	            elif torch.backends.mps.is_available():
	                device = "mps"
	            else:
	                device = "cpu"

	        print(f"[lofinity] loading {MUSICGEN_MODEL} on {device}…")
	        processor = AutoProcessor.from_pretrained(MUSICGEN_MODEL)
	        model = MusicgenForConditionalGeneration.from_pretrained(MUSICGEN_MODEL)
	        model.to(device)
	        model.eval()
	        _musicgen = (processor, model, device)
	        print("[lofinity] musicgen ready")
	        return _musicgen


	# On ZeroGPU both models are made resident on cuda here, at import time,
	# instead of lazily inside the @spaces.GPU call: outside the decorated function
	# a CUDA emulation layer lets .to('cuda') succeed without a real GPU attached,
	# and startup placement is far cheaper than per-call transfers. The stub engine
	# needs no models, so it skips this entirely.
	if ENGINE != "stub" and IS_ZEROGPU:
	    load_musicgen()
	    try:
	        load_enricher()
	    except Exception as err:  # noqa: BLE001 — a bad enricher must not kill the app
	        # remember the failure so every vend goes straight to the plain fallback
	        _enricher_disabled = True
	        print(f"[lofinity] enricher load failed ({err!r}); vends use the plain fallback")


	def encode_wav(samples, rate: int) -> str:
	    """Encode mono float samples as a base64 WAV data URI, entirely in memory.

	    Nothing is written to disk: on a shared HF Space a songs directory is
	    visible to every visitor and grows without bound. Returning the tape
	    inline keeps it private to the one request that asked for it.

	    Args:
	        samples: mono audio as a NumPy array or any sequence of floats,
	            nominally in [-1, 1]. May be empty.
	        rate: sample rate in Hz, written into the WAV header.

	    Returns:
	        A "data:audio/wav;base64,..." URI holding a 16-bit mono WAV. Empty
	        input yields a valid header-only WAV.
	    """
	    import numpy as np

	    samples = np.asarray(samples)  # accept plain lists as well as arrays
	    # .max() raises on a zero-size array, so an empty tape gets peak 0
	    peak = float(np.abs(samples).max()) if samples.size else 0.0
	    # MusicGen can exceed [-1, 1]; normalize instead of hard-clipping
	    if peak > 0.95:
	        samples = samples * (0.95 / peak)
	    pcm = (samples * 32767).astype("<i2")  # little-endian signed 16-bit

	    buf = io.BytesIO()
	    with wave.open(buf, "wb") as w:
	        w.setnchannels(1)
	        w.setsampwidth(2)
	        w.setframerate(rate)
	        w.writeframes(pcm.tobytes())
	    b64 = base64.b64encode(buf.getvalue()).decode("ascii")
	    return f"data:audio/wav;base64,{b64}"


	def _rms(x) -> float:
	import numpy as np

	return float(np.sqrt(np.mean(np.square(x)))) if len(x) else 0.0


	def _match_rms(chunk, target: float):
	    """Re-level a fresh continuation to the loudness `target`.

	    MusicGen continuations drift quieter each round; scaling each one back to
	    the first chunk's RMS keeps a long tape even. A (near-)silent chunk or
	    target is returned untouched, and the boost is capped at 4x so a quiet
	    tail can't blow up.
	    """
	    level = _rms(chunk)
	    too_quiet = level < 1e-6 or target < 1e-6
	    if too_quiet:
	        return chunk
	    gain = min(4.0, target / level)
	    return chunk * gain


	def _stitch(base, tail, rate: int):
	    """Join `tail` onto `base`, hiding the seam with an equal-power crossfade.

	    The last SEAM_S seconds of `base` fade out on a cosine curve while the
	    first SEAM_S seconds of `tail` fade in on a sine curve. If either piece is
	    shorter than the seam (or the seam is zero) the two are simply
	    concatenated.
	    """
	    import numpy as np

	    n = int(SEAM_S * rate)
	    if n <= 0 or len(base) < n or len(tail) < n:
	        return np.concatenate([base, tail])

	    phase = np.linspace(0.0, 1.0, n) * np.pi / 2
	    head, outgoing = base[:-n], base[-n:]
	    incoming, rest = tail[:n], tail[n:]
	    crossfade = outgoing * np.cos(phase) + incoming * np.sin(phase)
	    return np.concatenate([head, crossfade, rest])


	def musicgen_engine(music_prompt: str, seconds: int = CHUNK_S, progress_cb=None) -> tuple:
	    """Render a tape with MusicGen.

	    Tapes longer than one 30s shot are built by re-seeding the model with the
	    last OVERLAP_S seconds of the track and stitching the chunks together.

	    Args:
	        music_prompt: the text prompt handed to MusicGen.
	        seconds: requested tape length; rounded to a whole number of CHUNK_S
	            chunks (30 -> 1, 60 -> 2, 90 -> 3).
	        progress_cb: optional callable(done, total), invoked after each chunk.

	    Returns:
	        (samples, sample_rate) with samples as a mono float NumPy array.

	    Raises:
	        Whatever generation raises, unless the device is "mps": then the model
	        is moved to cpu and the whole tape is rebuilt there.
	    """
	    global _musicgen
	    import torch

	    processor, model, device = load_musicgen()
	    rate = model.config.audio_encoder.sampling_rate
	    # a zero/negative env value must not turn into a negative-index slice
	    overlap_s = max(0.0, OVERLAP_S)
	    overlap = int(overlap_s * rate)
	    # a continuation prepends the seed, so it may generate only
	    # MAX_GEN_S - overlap NEW seconds to keep the whole shot inside musicgen's
	    # ~30s training window — generating past it is what turns the tail to noise
	    cont_new_s = max(1.0, MAX_GEN_S - overlap_s)
	    rounds = max(0, round(seconds / CHUNK_S) - 1)  # 30->0, 60->1, 90->2
	    total = rounds + 1

	    def shot(dev, seed=None, new_s=CHUNK_S):
	        """One generate() call: text-only, or text + audio seed to continue."""
	        if seed is None:
	            inputs = processor(text=[music_prompt], padding=True, return_tensors="pt")
	        else:
	            inputs = processor(
	                audio=seed, sampling_rate=rate, text=[music_prompt],
	                padding=True, return_tensors="pt",
	            )
	        inputs = inputs.to(dev)
	        with torch.no_grad():
	            audio = model.generate(
	                **inputs,
	                do_sample=True,
	                guidance_scale=3.0,
	                max_new_tokens=int(new_s * 50),  # ~50 tokens per second
	            )
	        return audio[0, 0].cpu().float().numpy()

	    def build(dev):
	        """Render the full tape on `dev`, chunk by chunk."""
	        track = shot(dev)  # first 30s straight from the prompt
	        if progress_cb:
	            progress_cb(1, total)
	        base_rms = _rms(track)
	        for i in range(rounds):
	            if overlap > 0:
	                out = shot(dev, seed=track[-overlap:], new_s=cont_new_s)  # capped continuation
	                fresh = out[overlap:]  # drop the re-encoded seed
	            else:
	                # track[-0:] is the WHOLE track, not an empty seed; with no
	                # overlap configured, generate the next chunk unseeded
	                fresh = shot(dev, new_s=cont_new_s)
	            fresh = _match_rms(fresh, base_rms)
	            track = _stitch(track, fresh, rate)
	            if progress_cb:
	                progress_cb(i + 2, total)
	            print(f"[lofinity] stitched chunk {i + 2}/{total}")
	        return track

	    try:
	        samples = build(device)
	    except Exception as e:  # noqa: BLE001
	        if device != "mps":
	            raise
	        print(f"[lofinity] mps generation failed ({e!r}), retrying on cpu")
	        model.to("cpu")
	        # the model now lives on cpu for good; record that in the cache so later
	        # brews don't first fail on a stale "mps" device before retrying
	        with _musicgen_lock:
	            _musicgen = (processor, model, "cpu")
	        samples = build("cpu")
	    return samples, rate


	def _gpu_budget(prompt: str, seconds: int = CHUNK_S) -> int:
	    """GPU seconds to request from ZeroGPU for a brew of `seconds` length.

	    Covers MiniCPM enrichment plus the per-chunk MusicGen render, with
	    headroom: 30s -> 80, 60s -> 120, 90s -> 160. `prompt` is unused; the
	    signature must mirror gpu_brew so ZeroGPU can pass it the same args.
	    """
	    n_chunks = round(int(seconds) / CHUNK_S)
	    if n_chunks < 1:
	        n_chunks = 1
	    # musicgen-medium renders slower than -small, so budget generously — a brew
	    # that overruns its @spaces.GPU duration is KILLED mid-render; over-budgeting
	    # only costs quota, under-budgeting loses the tape.
	    return 40 * (n_chunks + 1)


	@spaces.GPU(duration=_gpu_budget)
	def gpu_brew(prompt: str, seconds: int = CHUNK_S) -> tuple:
	    """ZeroGPU entry point: enrich the vibe and render the tape in one GPU grab.

	    Takes the raw vibe and returns (music_prompt, title, bed, samples, rate).
	    It runs in a separate GPU worker process and cannot push per-chunk
	    progress back to the web process, so /api/progress reports a time-based
	    estimate for the bar instead. This path is Space-only.
	    """
	    enriched_prompt, tape_title, ambience_bed = enrich_prompt(prompt)
	    audio, sample_rate = musicgen_engine(enriched_prompt, seconds)
	    return enriched_prompt, tape_title, ambience_bed, audio, sample_rate


	def stub_engine(_music_prompt: str, seconds: int = CHUNK_S, progress_cb=None) -> tuple:
	    """Development engine: a 220 Hz tone instead of the heavy model.

	    Honors `seconds` and pauses 0.8s per pretend chunk, calling
	    progress_cb(done, total) after each, so the length slider and the brewing
	    bar can be exercised without MusicGen. Returns (samples, 22050).
	    """
	    import time

	    import numpy as np

	    rate = 22050
	    total = max(1, round(seconds / CHUNK_S))
	    done = 0
	    while done < total:
	        time.sleep(0.8)  # pretend each 30s chunk takes a moment to render
	        done += 1
	        if progress_cb:
	            progress_cb(done, total)

	    secs = float(seconds)
	    t = np.arange(int(rate * secs)) / rate
	    # quarter-second linear fade at both ends
	    ramp_in = t * 4
	    ramp_out = (secs - t) * 4
	    fade = np.minimum(1.0, np.minimum(ramp_in, ramp_out))
	    tone = np.sin(2 * np.pi * 220 * t)
	    # a slow wobble so a longer stub is audibly (and visibly) longer
	    wobble = 0.8 + 0.2 * np.sin(t)
	    return 0.25 * fade * tone * wobble, rate


	# --- API -----------------------------------------------------------------------


	@app.api(name="generate_song", concurrency_limit=1)
	def generate_song(prompt: str, seconds: int = DEFAULT_SECONDS) -> dict:
	    """Brew one tape for the user's vibe.

	    Args:
	        prompt: the raw vibe typed by the user.
	        seconds: requested tape length; snapped to the nearest value in
	            ALLOWED_SECONDS.

	    Returns:
	        {"title": <cassette title>, "audio": <base64 WAV data URI>}.

	    Side effects: updates the shared _PROGRESS (and, on ZeroGPU, _BREW) state
	    that /api/progress reads. Exceptions from the render itself are not caught
	    here; only a failing ambience mix is tolerated (the tape is vended dry).
	    """
	    import ambience

	    # snap whatever the slider sends to a length we can actually build
	    seconds = min(ALLOWED_SECONDS, key=lambda s: abs(s - int(seconds)))
	    # reset progress up front, BEFORE the (sometimes slow) enrich step, so a poll
	    # arriving early sees this brew at 0% rather than the last one at 100%
	    chunks = max(1, round(seconds / CHUNK_S))
	    _PROGRESS.update(done=0, total=chunks)

	    if IS_ZEROGPU and ENGINE != "stub":
	        # On ZeroGPU enrichment (MiniCPM) and MusicGen share ONE @spaces.GPU
	        # acquisition in a separate worker process, which can't push real progress
	        # back — so /api/progress reports a smooth time-based ESTIMATE driven by
	        # this brew's start + budget (capped <100% until the tape actually lands).
	        est = GPU_WARMUP_S + GPU_SECS_PER_CHUNK * chunks
	        _BREW.update(active=True, start=time.monotonic(), est=est, total=chunks)
	        print(f"[lofinity] brewing on GPU :: {prompt!r} ({seconds}s, ~{est:.0f}s est)")
	        try:
	            music_prompt, title, bed, samples, rate = gpu_brew(prompt, seconds)
	        finally:
	            # runs on failure too, so the estimate never stays "active" forever.
	            # top the bar off BEFORE clearing active, so a poll landing in between
	            # reads 100% (from _PROGRESS), never the 0% this brew started at
	            _PROGRESS.update(done=chunks, total=chunks)
	            _BREW.update(active=False)
	        print(f"[lofinity] brewed {title!r} :: {music_prompt} [+ {bed}]")
	    else:
	        # Local / stub: enrich in-process (Ollama or fallback), then render with
	        # live per-chunk progress for the brewing garden.
	        music_prompt, title, bed = enrich_prompt(prompt)
	        print(f"[lofinity] brewing {title!r} ({seconds}s) :: {music_prompt} [+ {bed}]")
	        engine = stub_engine if ENGINE == "stub" else musicgen_engine
	        samples, rate = engine(
	            music_prompt, seconds,
	            progress_cb=lambda d, t: _PROGRESS.update(done=d, total=t),
	        )

	    # both paths end with the bar full, whatever the engine last reported
	    _PROGRESS.update(done=chunks, total=chunks)
	    try:
	        samples = ambience.mix(samples, rate, bed)
	    except Exception as e:  # noqa: BLE001 — a dry tape beats a failed vend
	        print(f"[lofinity] ambience mix failed ({e!r}), vending without the bed")
	    # The tape rides back inline as a base64 data URI — no disk write, so it is
	    # never cached on the Space nor shared with other visitors. The frontend
	    # keeps the collection client-side, per browser session.
	    return {"title": title, "audio": encode_wav(samples, rate)}


	@app.get("/api/progress")
	def progress() -> dict:
	    """Report how far the current brew is, as {"done": ..., "total": ...}.

	    Local and stub brews publish real per-chunk counts in _PROGRESS, which is
	    returned as a copy. A ZeroGPU brew runs in a separate worker that cannot
	    report back, so while one is active this returns a time-based ESTIMATE: a
	    fractional `done`, held at 92% of `total` at most until the tape lands and
	    _PROGRESS tops the bar off.
	    """
	    if not (_BREW["active"] and _BREW["est"] > 0):
	        return dict(_PROGRESS)

	    total = _BREW["total"]
	    elapsed = time.monotonic() - _BREW["start"]
	    frac = min(0.92, elapsed / _BREW["est"])
	    return {"done": round(frac * total, 3), "total": total}


	@app.get("/api/config")
	def config() -> dict:
	    """Expose the tape lengths this backend allows.

	    The set depends on the hardware (the CPU-only fallback offers 30s alone),
	    so the frontend slider reads it and adapts.
	    """
	    return {"allowed_seconds": [*ALLOWED_SECONDS]}


	@app.get("/")
	async def homepage():
	    """Serve the frontend's index.html at the site root."""
	    index_page = FRONTEND.joinpath("index.html")
	    return FileResponse(index_page)


	# Serve everything in the frontend directory (scripts, styles, assets) under
	# the /static URL prefix.
	app.mount("/static", StaticFiles(directory=FRONTEND), name="static")

	# Launch the server only when this file is run as a script, not on import.
	if __name__ == "__main__":
	    app.launch(show_error=True)