Spaces:

build-small-hackathon
/

LoFinity

Running on Zero

File size: 24,787 Bytes

"""LoFinity — a vending machine that dispenses endless chill beats.

Gradio Server backend: serves the Three.js frontend and exposes the
generation API.

Pipeline: user vibe -> a small LLM enriches it into a MusicGen prompt +
cassette title + ambience pick -> MusicGen renders the music -> ambience.py
loops a background bed (waves, crackle, rain…) underneath. MusicGen ignores
texture words in prompts, hence the separate bed. The enrichment LLM is
MiniCPM (on cuda) on a ZeroGPU Space, or a local Ollama daemon in dev.

On a ZeroGPU Space it runs musicgen-medium and allows tapes up to 90s (chunked);
without a GPU it falls back to musicgen-small and a single 30s shot (no chunking).

Env knobs:
  LOFINITY_ENGINE   musicgen (default) | stub
  LOFINITY_DURATION clip length in seconds (default 30, the single-shot max)
  LOFINITY_DEVICE   cuda | mps | cpu (default: cuda on ZeroGPU, else mps if available, else cpu)
  LOFINITY_MUSICGEN model id (default: musicgen-medium on ZeroGPU, else musicgen-small)
  LOFINITY_OVERLAP_S continuation seed length, seconds (default 2)
  LOFINITY_MAX_GEN_S cap on a continuation's total output, seconds (default 28)
  LOFINITY_ENRICHER MiniCPM model id for ZeroGPU enrichment (default MiniCPM5-1B)
  OLLAMA_URL        default http://localhost:11434  (local enrichment)
  OLLAMA_MODEL      default llama3.2:3b              (local enrichment)
"""

import base64
import io
import json
import os
import threading
import time
import wave
from pathlib import Path

import httpx
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from gradio.server import Server

# Directory that holds this module; everything else is resolved relative to it.
ROOT = Path(__file__).parent
# Frontend bundle: index.html is returned for "/" and the directory is mounted
# at /static further down.
FRONTEND = ROOT / "frontend"

# ZeroGPU: on a Hugging Face ZeroGPU Space a GPU is attached only for the
# duration of a function wrapped in @spaces.GPU, then released. The `spaces`
# package exists only in that runtime; locally we shim @spaces.GPU to a no-op so
# the exact same code runs on mps/cpu untouched.
#
# Detect ZeroGPU via the spaces library's OWN config rather than parsing the env
# var ourselves: Config.zero_gpu is the very flag that gates @spaces.GPU (see
# spaces/zero/decorator.py), so IS_ZEROGPU ends up True precisely when the
# decorator will really allocate a GPU. (Our first attempt compared
# SPACES_ZERO_GPU to the literal "true", but the runtime sets it to a value the
# library parses loosely — "1"/"t"/"true" — so the exact match failed and the
# GPU path never fired: generation silently fell back to CPU.)
#
# IS_ZEROGPU stays False whenever either import below raises ImportError; in that
# case the module-level name `spaces` is rebound to the shim instance.
IS_ZEROGPU = False
try:
    import spaces  # provided by the ZeroGPU Space runtime
    from spaces.config import Config as _ZeroGPUConfig

    IS_ZEROGPU = bool(_ZeroGPUConfig.zero_gpu)
except ImportError:  # local dev / non-ZeroGPU — make the decorator harmless
    class _SpacesShim:
        """Stand-in for the `spaces` module; provides only the GPU decorator."""

        @staticmethod
        def GPU(*args, **kwargs):
            """No-op @spaces.GPU: the decorated function is returned unchanged."""
            # handle both bare @spaces.GPU and @spaces.GPU(duration=...)
            if args and callable(args[0]):
                return args[0]
            return lambda fn: fn

    spaces = _SpacesShim()

# Log the detection result next to the raw env var, so a mismatch between the
# two (the failure mode described above) is visible in the startup log.
print(
    f"[lofinity] startup: IS_ZEROGPU={IS_ZEROGPU} "
    f"(SPACES_ZERO_GPU={os.environ.get('SPACES_ZERO_GPU')!r})"
)

# Audio engine selector: "stub" swaps in stub_engine's test tone (and skips the
# ZeroGPU startup preload); any other value renders with MusicGen.
ENGINE = os.getenv("LOFINITY_ENGINE", "musicgen")
# Model + tape length scale with the hardware: a ZeroGPU Space gets the bigger,
# cleaner-continuing musicgen-medium and full chunked tapes (up to 90s); without a
# GPU we fall back to the smaller, faster musicgen-small and a single 30s shot
# (medium + chunking on CPU would take minutes). The env var still overrides.
MUSICGEN_MODEL = os.getenv(
    "LOFINITY_MUSICGEN",
    "facebook/musicgen-medium" if IS_ZEROGPU else "facebook/musicgen-small",
)
# 30s is musicgen-small's single-shot max (1500 tokens). Longer tapes are
# stitched from chunks: the first is a full CHUNK_S shot from the prompt, and
# each later one re-seeds the model with the last OVERLAP_S of the track so it
# keeps playing from there. musicgen-small's context is 2048 tokens (~41s at
# 50 tok/s), so a 2s seed plus a capped continuation (see MAX_GEN_S) fits.
CHUNK_S = 30  # length of the first musicgen single-shot, and the unit tape lengths are counted in
# seconds of tail fed back as the continuation seed; shorter leans more on the
# text prompt
OVERLAP_S = float(os.getenv("LOFINITY_OVERLAP_S", "2"))
# musicgen is trained on 30s clips, so a single shot longer than ~30s degrades
# its tail into noise. A continuation prepends the seed THEN generates, so cap its
# total output (seed + new) at MAX_GEN_S to stay inside that window: each
# continuation adds MAX_GEN_S - OVERLAP_S seconds of new audio. Env-tunable.
MAX_GEN_S = float(os.getenv("LOFINITY_MAX_GEN_S", "28"))
SEAM_S = 0.4  # equal-power crossfade at each stitch, to hide the join
# the tape lengths the API allows (it snaps any request to the nearest). Only a
# GPU gets the longer, chunked tapes; a CPU-only fallback is capped to one 30s shot.
ALLOWED_SECONDS = (30, 60, 90) if IS_ZEROGPU else (30,)
# default for generate_song's `seconds` argument; still snapped to ALLOWED_SECONDS
DEFAULT_SECONDS = int(os.getenv("LOFINITY_DURATION", "30"))
# local (non-ZeroGPU) enrichment backend: Ollama daemon address and model tag
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434")
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3.2:3b")
# A ZeroGPU brew renders in a separate GPU worker process, so /api/progress can't
# read real per-chunk progress from it; it reports a time-based estimate from this
# rough per-length budget instead (tunable; only affects the cosmetic brew bar).
GPU_WARMUP_S = 5.0  # enrichment + cold-start/queue allowance before audio flows
GPU_SECS_PER_CHUNK = 25.0  # rough GPU render time per 30s chunk (musicgen-medium)

# Gradio Server instance: the API endpoint and the plain HTTP routes below are
# all registered on it.
app = Server(title="LoFinity")

# How far along the current brew is, in whole 30s chunks. generate_song runs one
# at a time (concurrency_limit=1), so a single shared counter is enough; the
# frontend polls /api/progress to fill its brewing bar.
_PROGRESS = {"done": 0, "total": 1}

# A ZeroGPU brew's wall-clock start + estimated total, so /api/progress can report
# a smooth time-based estimate while the GPU worker is busy (see progress()).
# "start" is a time.monotonic() reading; "est" is in seconds; "total" is chunks.
_BREW = {"active": False, "start": 0.0, "est": 1.0, "total": 1}

# --- prompt enrichment --------------------------------------------------------

# System prompt sent by both enrichment backends (MiniCPM and Ollama). It pins
# the reply to a JSON object with the three keys _finalize_enrichment reads:
# music_prompt, title and ambience.
ENRICH_SYSTEM = """\
You are the creative brain of LoFinity, a magical vending machine that sells
lofi cassette tapes. The user gives you a vibe. Reply ONLY with JSON with
exactly these three keys: {"music_prompt": "...", "title": "...", "ambience": "..."}

Build music_prompt from this template, in this order:
"lofi chill, <instrument 1>, <instrument 2>, <instrument 3>, <mood>, slow tempo, 75 bpm, instrumental"

- instruments: 2-3 picked to EVOKE the user's vibe, never a default set
  (island -> ukulele, kalimba, steel pan; rainy city -> rhodes piano, soft
  guitar; winter -> felt piano, soft strings; desert -> slide guitar, hand drums)
- mood: one or two calm words; never energetic, no vocals

ambience: the background sound layered under the music. Exactly one of:
vinyl_crackle, tape_hiss, soft_rain, ocean_waves, fireplace_crackle,
birdsong, night_crickets, cafe_murmur, wind_in_trees. Match it to the vibe.

title: a cozy cassette tape title inspired by the vibe, max 5 words,
Title Case, no quotes or emoji.

Examples:
user: island summer
{"music_prompt": "lofi chill, ukulele, kalimba, steel pan, breezy and warm, slow tempo, 75 bpm, instrumental", "title": "Coconut Daydream", "ambience": "ocean_waves"}
user: studying at midnight
{"music_prompt": "lofi chill, rhodes piano, muted guitar, soft bass, focused and calm, slow tempo, 75 bpm, instrumental", "title": "Midnight Study Session", "ambience": "vinyl_crackle"}"""


# MiniCPM enrichment LLM (ZeroGPU only — a Space has no Ollama daemon).
# MiniCPM5-1B is a standard LlamaForCausalLM (no trust_remote_code, fast
# tokenizer) with a switchable <think> mode we keep OFF so the reply is direct
# JSON. Needs transformers>=5.6 (the Space's latest satisfies it); no extra deps.
ENRICHER_MODEL = os.getenv("LOFINITY_ENRICHER", "openbmb/MiniCPM5-1B")
# Cache for the loaded (tokenizer, model) pair; filled once by load_enricher().
_enricher = None
# Serializes the lazy load so concurrent first callers don't load the model twice.
_enricher_lock = threading.Lock()
_enricher_disabled = False  # set if the model can't load; forces the fallback


def load_enricher():
    """Return the cached (tokenizer, model) pair for the MiniCPM enricher.

    The first call loads ENRICHER_MODEL, moves the model to cuda and puts it in
    eval mode; every later call hands back the same pair. The load runs under
    _enricher_lock so it happens at most once. Standard Llama architecture, so
    no remote code is needed."""
    global _enricher
    with _enricher_lock:
        if _enricher is not None:
            return _enricher

        import torch  # noqa: F401 — needed so the .to('cuda') below resolves
        from transformers import AutoModelForCausalLM, AutoTokenizer

        print(f"[lofinity] loading enricher {ENRICHER_MODEL} on cuda…")
        tokenizer = AutoTokenizer.from_pretrained(ENRICHER_MODEL)
        llm = AutoModelForCausalLM.from_pretrained(ENRICHER_MODEL, torch_dtype="auto")
        llm.to("cuda")
        llm.eval()
        _enricher = (tokenizer, llm)
        print("[lofinity] enricher ready")
        return _enricher


def _parse_enrich_json(text: str) -> dict:
    """Pull the first JSON object out of an LLM reply.

    The reply may wrap the JSON in prose or ```json fences, or leak a <think>
    block. Every `{` is tried in order as the start of an object and the first
    one that decodes wins, so braces later in the reply (trailing prose, a
    second object) cannot spoil an otherwise valid answer.

    Args:
        text: raw reply text from the model.

    Returns:
        The first decodable JSON object as a dict, or {} if nothing parses.
    """
    if "</think>" in text:  # belt-and-suspenders if thinking ever leaks through
        text = text.rsplit("</think>", 1)[1]
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the object and ignores what follows
            obj, _end = decoder.raw_decode(text, start)
        except (ValueError, RecursionError):  # not JSON from this brace; try the next
            obj = None
        if isinstance(obj, dict):
            return obj
        start = text.find("{", start + 1)
    return {}


def _finalize_enrichment(data: dict):
    """Shared post-processing for any enrichment backend.

    Validates the parsed reply, forces the genre to lead the music prompt, and
    snaps the ambience to a bed that can be rendered.

    Args:
        data: the decoded JSON reply. Anything that is not a dict (a backend
            can return a valid JSON list, string or number) counts as unusable.

    Returns:
        (music_prompt, title, ambience_slug), or None when the reply is
        unusable: not a dict, or missing a non-empty music_prompt or title.
    """
    # guard first: .get() on a list/str would raise instead of returning None
    if not isinstance(data, dict):
        return None

    music_prompt = str(data.get("music_prompt") or "").strip()
    title = str(data.get("title") or "").strip()[:48]  # keep the label short
    if not (music_prompt and title):
        return None
    # belt and suspenders: the genre must lead even if the LLM drifts
    if "lofi" not in music_prompt.lower():
        music_prompt = f"lofi chill, {music_prompt}"

    # imported only once the reply is known to be usable
    import ambience

    # whatever the LLM picked, snap it to a bed we can actually render
    return music_prompt, title, ambience.normalize_slug(data.get("ambience"))


def _enrich_minicpm(prompt: str):
    """Enrich the vibe with MiniCPM on cuda; MUST run inside @spaces.GPU.

    Thinking mode is switched off so the reply is direct JSON. Returns the
    finalized (music_prompt, title, ambience) tuple, or None when the enricher
    is disabled or the reply is unusable (the caller then falls back)."""
    if _enricher_disabled:
        return None
    import torch

    tokenizer, llm = load_enricher()
    chat = [
        {"role": "system", "content": ENRICH_SYSTEM},
        {"role": "user", "content": prompt},
    ]
    encoded = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        add_generation_prompt=True,
        enable_thinking=False,
        return_dict=True,
        return_tensors="pt",
    ).to(llm.device)
    with torch.no_grad():
        generated = llm.generate(
            **encoded,
            max_new_tokens=220,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
        )
    # the output echoes the prompt tokens first; decode only what follows them
    prompt_len = encoded["input_ids"].shape[-1]
    reply = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
    parsed = _parse_enrich_json(reply)
    return _finalize_enrichment(parsed)


def _enrich_ollama(prompt: str):
    """Enrich the vibe through a local Ollama daemon's /api/chat endpoint.

    Returns the finalized (music_prompt, title, ambience) tuple, or None when
    the reply is unusable. HTTP and JSON errors propagate to the caller."""
    payload = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": ENRICH_SYSTEM},
            {"role": "user", "content": prompt},
        ],
        "format": "json",
        "stream": False,
        "options": {"temperature": 0.8, "num_predict": 220},
    }
    response = httpx.post(f"{OLLAMA_URL}/api/chat", json=payload, timeout=45)
    response.raise_for_status()
    # the chat reply carries the model's JSON as a string in message.content
    content = response.json()["message"]["content"]
    return _finalize_enrichment(json.loads(content))


def _enrich_fallback(prompt: str) -> tuple[str, str, str]:
    """LLM-free enrichment, used whenever the chosen backend fails.

    Wraps the raw vibe in a fixed lofi template, titles the tape after the
    first 28 characters of the vibe ("Untitled Tape" when it is blank), and
    picks the default ambience bed."""
    import ambience

    if prompt.strip():
        title = f"{prompt[:28].title()} Tape"
    else:
        title = "Untitled Tape"
    music_prompt = (
        f"lofi chill, {prompt}, mellow and warm, soft drums, slow tempo, instrumental"
    )
    return music_prompt, title, ambience.DEFAULT


def enrich_prompt(prompt: str) -> tuple[str, str, str]:
    """Turn a vibe into (music_prompt, cassette title, ambience slug).

    The backend follows the environment: MiniCPM on ZeroGPU, Ollama otherwise.
    Any exception or unusable reply drops to the plain fallback, so this always
    returns a tuple. On ZeroGPU it MUST be called inside @spaces.GPU, because
    MiniCPM lives on cuda."""
    if IS_ZEROGPU:
        backend = _enrich_minicpm
    else:
        backend = _enrich_ollama

    try:
        enriched = backend(prompt)
    except Exception as e:  # noqa: BLE001 — any failure means "use fallback"
        print(f"[lofinity] enrichment failed ({e!r}), using fallback")
    else:
        if enriched:
            return enriched
        print("[lofinity] enrichment returned junk, using fallback")
    return _enrich_fallback(prompt)


# --- audio engines ------------------------------------------------------------

# Cache for the loaded (processor, model, device) triple; filled once by
# load_musicgen().
_musicgen = None
# Serializes the lazy load so concurrent first callers don't load the model twice.
_musicgen_lock = threading.Lock()


def load_musicgen():
    """Return the cached (processor, model, device) triple for MusicGen.

    The first call loads MUSICGEN_MODEL (downloading it from the Hub if needed —
    musicgen-medium is ~3.5 GB), moves it to the chosen device and puts it in
    eval mode. Device choice: LOFINITY_DEVICE if set, else cuda on ZeroGPU, else
    mps when available, else cpu."""
    global _musicgen
    with _musicgen_lock:
        if _musicgen is not None:
            return _musicgen

        # must be set before torch is imported below
        os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
        import torch
        from transformers import AutoProcessor, MusicgenForConditionalGeneration

        device = os.getenv("LOFINITY_DEVICE")
        if not device:
            if IS_ZEROGPU:
                device = "cuda"
            elif torch.backends.mps.is_available():
                device = "mps"
            else:
                device = "cpu"

        print(f"[lofinity] loading {MUSICGEN_MODEL} on {device}…")
        processor = AutoProcessor.from_pretrained(MUSICGEN_MODEL)
        model = MusicgenForConditionalGeneration.from_pretrained(MUSICGEN_MODEL)
        model.to(device)
        model.eval()
        _musicgen = (processor, model, device)
        print("[lofinity] musicgen ready")
        return _musicgen


# ZeroGPU wants models resident on cuda at startup (module import time), not
# lazily inside the @spaces.GPU call — outside the decorated function a CUDA
# emulation layer lets this .to('cuda') succeed without a real GPU attached, and
# placements done at startup are far more efficient than per-call transfers.
#
# The two loads are treated differently on purpose: load_musicgen() is not
# guarded, so a MusicGen load failure aborts startup, while an enricher failure
# only flips _enricher_disabled so _enrich_minicpm returns None from then on.
if IS_ZEROGPU and ENGINE != "stub":
    load_musicgen()
    try:
        load_enricher()
    except Exception as e:  # noqa: BLE001 — a bad enricher must not kill the app
        _enricher_disabled = True
        print(f"[lofinity] enricher load failed ({e!r}); vends use the plain fallback")


def encode_wav(samples, rate: int) -> str:
    """Encode mono float samples as a base64 WAV data URI, entirely in memory.

    Nothing is written to disk: on a shared HF Space a songs directory is
    visible to every visitor and grows without bound. Returning the tape
    inline keeps it private to the one request that asked for it.

    Args:
        samples: mono samples as a numpy array or any array-like (e.g. a list).
            Nominal range is [-1, 1]; louder input is scaled down, never
            clipped. An empty input yields a valid WAV with zero frames.
        rate: sample rate in Hz, written to the WAV header.

    Returns:
        A "data:audio/wav;base64,..." URI holding 16-bit little-endian PCM.
    """
    import numpy as np

    # accept plain sequences too; an ndarray passes through without a copy
    samples = np.asarray(samples)
    # MusicGen can exceed [-1, 1]; normalize instead of hard-clipping. An empty
    # clip has no peak (.max() would raise on it), so it skips the leveling.
    peak = float(np.abs(samples).max()) if samples.size else 0.0
    if peak > 0.95:
        samples = samples * (0.95 / peak)
    pcm = (samples * 32767).astype("<i2")
    buf = io.BytesIO()
    with wave.open(buf, "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)
        w.setframerate(rate)
        w.writeframes(pcm.tobytes())
    b64 = base64.b64encode(buf.getvalue()).decode("ascii")
    return f"data:audio/wav;base64,{b64}"


def _rms(x) -> float:
    """Root-mean-square level of `x` as a plain float; 0.0 for an empty input."""
    import numpy as np

    if not len(x):
        return 0.0
    mean_square = np.mean(np.square(x))
    return float(np.sqrt(mean_square))


def _match_rms(chunk, target: float):
    """Re-level a fresh continuation to `target`, the first chunk's loudness.

    MusicGen continuations drift quieter each round; scaling each one back up
    keeps a long tape even. A near-silent chunk or target is returned untouched,
    and the gain is capped at 4x so a quiet tail can't blow up."""
    level = _rms(chunk)
    too_quiet = level < 1e-6 or target < 1e-6
    if too_quiet:
        return chunk
    gain = min(4.0, target / level)
    return chunk * gain


def _stitch(base, tail, rate: int):
    """Join `tail` onto `base`, hiding the seam with an equal-power crossfade.

    The last SEAM_S seconds of `base` fade out on a cosine curve while the first
    SEAM_S seconds of `tail` fade in on a sine curve. If the seam is empty or
    either side is shorter than it, the two are simply concatenated."""
    import numpy as np

    n = int(SEAM_S * rate)
    can_fade = n > 0 and len(base) >= n and len(tail) >= n
    if not can_fade:
        return np.concatenate([base, tail])

    # quarter-turn phase ramp: cos falls 1 -> 0 while sin rises 0 -> 1
    phase = np.linspace(0.0, 1.0, n) * np.pi / 2
    crossfade = base[-n:] * np.cos(phase) + tail[:n] * np.sin(phase)
    return np.concatenate([base[:-n], crossfade, tail[n:]])


def musicgen_engine(music_prompt: str, seconds: int = CHUNK_S, progress_cb=None) -> tuple:
    """Render a tape for `music_prompt` with MusicGen.

    The first chunk is a CHUNK_S-second shot from the text prompt alone. Each
    further chunk is a continuation seeded with the last OVERLAP_S seconds of
    the track: the re-encoded seed is dropped from its output, the rest is
    levelled to the first chunk's RMS and crossfaded onto the track.

    Args:
        music_prompt: text prompt handed to the MusicGen processor.
        seconds: requested tape length; round(seconds / CHUNK_S) chunks are
            rendered, at least one.
        progress_cb: optional callable(done, total), called after each chunk.

    Returns:
        (samples, sample_rate): a numpy array of samples and the sampling rate
        read from the model's audio encoder config.

    Raises:
        Whatever generation raises — except on mps, where the model is moved to
        cpu and the whole build is retried there once.
    """
    global _musicgen
    import torch

    processor, model, device = load_musicgen()
    rate = model.config.audio_encoder.sampling_rate
    # Clamp the env-tunable overlap at zero. With overlap == 0 the slice
    # track[-0:] is the WHOLE track, not an empty seed, and a negative value
    # would mis-slice both the seed and the continuation.
    overlap_s = max(0.0, OVERLAP_S)
    overlap = int(overlap_s * rate)
    # a continuation prepends the OVERLAP_S seed, so it may generate only
    # MAX_GEN_S - OVERLAP_S NEW seconds to keep the whole shot inside musicgen's
    # ~30s training window — generating past it is what turns the tail to noise
    cont_new_s = max(1.0, MAX_GEN_S - overlap_s)
    rounds = max(0, round(seconds / CHUNK_S) - 1)  # 30->0, 60->1, 90->2
    total = rounds + 1

    def shot(dev, seed=None, new_s=CHUNK_S):
        # one generate() call: text-only when seed is None, else audio-prompted
        if seed is None:
            inputs = processor(text=[music_prompt], padding=True, return_tensors="pt")
        else:
            inputs = processor(
                audio=seed, sampling_rate=rate, text=[music_prompt],
                padding=True, return_tensors="pt",
            )
        inputs = inputs.to(dev)
        with torch.no_grad():
            audio = model.generate(
                **inputs,
                do_sample=True,
                guidance_scale=3.0,
                max_new_tokens=int(new_s * 50),  # ~50 tokens per second
            )
        return audio[0, 0].cpu().float().numpy()

    def build(dev):
        # render the whole tape on `dev`, reporting progress after each chunk
        track = shot(dev)  # first 30s straight from the prompt
        if progress_cb:
            progress_cb(1, total)
        base_rms = _rms(track)
        for i in range(rounds):
            # with no overlap there is nothing to seed from: take a fresh shot
            seed = track[-overlap:] if overlap > 0 else None
            out = shot(dev, seed=seed, new_s=cont_new_s)  # capped continuation
            fresh = _match_rms(out[overlap:], base_rms)  # drop the re-encoded seed
            track = _stitch(track, fresh, rate)
            if progress_cb:
                progress_cb(i + 2, total)
            print(f"[lofinity]   stitched chunk {i + 2}/{total}")
        return track

    try:
        samples = build(device)
    except Exception as e:  # noqa: BLE001
        if device != "mps":
            raise
        print(f"[lofinity] mps generation failed ({e!r}), retrying on cpu")
        model.to("cpu")
        # .to() moved the cached model in place, so record the new device too;
        # otherwise every later call would first fail on "mps" before retrying
        with _musicgen_lock:
            _musicgen = (processor, model, "cpu")
        samples = build("cpu")
    return samples, rate


def _gpu_budget(prompt: str, seconds: int = CHUNK_S) -> int:
    """GPU seconds to request from ZeroGPU for a brew of this length.

    Covers MiniCPM enrichment plus the per-chunk MusicGen render, with headroom.
    Tighter budgets earn better queue priority. The signature must mirror
    gpu_brew so ZeroGPU can pass it the same args; `prompt` itself is unused."""
    n_chunks = round(int(seconds) / CHUNK_S)
    if n_chunks < 1:
        n_chunks = 1
    # musicgen-medium renders slower than -small, so budget generously — a brew
    # that overruns its @spaces.GPU duration is KILLED mid-render; over-budgeting
    # only costs quota, under-budgeting loses the tape.
    # 40s base + 40s per chunk: 30s->80, 60s->120, 90s->160
    return 40 * (n_chunks + 1)


@spaces.GPU(duration=_gpu_budget)
def gpu_brew(prompt: str, seconds: int = CHUNK_S) -> tuple:
    """ZeroGPU entry point: enrich the vibe AND render the music in one GPU
    acquisition.

    Takes the raw vibe and returns (music_prompt, title, bed, samples, rate).
    It runs in a separate GPU worker process and can't push per-chunk progress
    back to the web process, so /api/progress reports a time-based estimate for
    the bar. This path is Space-only."""
    enriched = enrich_prompt(prompt)
    music_prompt, title, bed = enriched
    rendered = musicgen_engine(music_prompt, seconds)
    samples, rate = rendered
    return music_prompt, title, bed, samples, rate


def stub_engine(_music_prompt: str, seconds: int = CHUNK_S, progress_cb=None) -> tuple:
    """Stand-in engine for developing without the heavy model.

    Returns (samples, rate): a 220 Hz tone of `seconds` length at 22050 Hz with
    short fades at both ends. It sleeps briefly per 30s chunk and reports each
    one through progress_cb(done, total), so the length slider and the brewing
    bar can be exercised without MusicGen. The prompt is ignored."""
    import time

    import numpy as np

    rate = 22050
    total = max(1, round(seconds / CHUNK_S))
    for done in range(1, total + 1):
        time.sleep(0.8)  # pretend each 30s chunk takes a moment to render
        if progress_cb:
            progress_cb(done, total)

    duration = float(seconds)
    t = np.arange(int(rate * duration)) / rate
    # ramp up over the first 0.25s and down over the last 0.25s, flat in between
    envelope = np.minimum(1.0, np.minimum(t * 4, (duration - t) * 4))
    tone = np.sin(2 * np.pi * 220 * t)
    # a slow wobble so a longer stub is audibly (and visibly) longer
    wobble = 0.8 + 0.2 * np.sin(t)
    return 0.25 * envelope * tone * wobble, rate


# --- API -----------------------------------------------------------------------


@app.api(name="generate_song", concurrency_limit=1)
def generate_song(prompt: str, seconds: int = DEFAULT_SECONDS) -> dict:
    """Brew one tape for the given vibe and return it inline.

    Args:
        prompt: the user's vibe, passed to enrichment as-is.
        seconds: requested tape length; snapped to the nearest value in
            ALLOWED_SECONDS before anything is rendered.

    Returns:
        {"title": <cassette title>, "audio": <base64 WAV data URI>}.

    Raises:
        Whatever the render path raises. On the ZeroGPU path the progress state
        is still topped off and the brew marked inactive first (the `finally`).
        A failing ambience mix is not fatal: the tape is vended without a bed.

    Side effects: updates the shared _PROGRESS and _BREW dicts that
    /api/progress reads.
    """
    import ambience

    # snap whatever the slider sends to a length we can actually build
    seconds = min(ALLOWED_SECONDS, key=lambda s: abs(s - int(seconds)))
    # reset progress up front, BEFORE the (sometimes slow) enrich step, so a poll
    # arriving early sees this brew at 0% rather than the last one at 100%
    chunks = max(1, round(seconds / CHUNK_S))
    _PROGRESS.update(done=0, total=chunks)

    if IS_ZEROGPU and ENGINE != "stub":
        # On ZeroGPU enrichment (MiniCPM) and MusicGen share ONE @spaces.GPU
        # acquisition in a separate worker process, which can't push real progress
        # back — so /api/progress reports a smooth time-based ESTIMATE driven by
        # this brew's start + budget (capped <100% until the tape actually lands).
        est = GPU_WARMUP_S + GPU_SECS_PER_CHUNK * chunks
        _BREW.update(active=True, start=time.monotonic(), est=est, total=chunks)
        print(f"[lofinity] brewing on GPU :: {prompt!r} ({seconds}s, ~{est:.0f}s est)")
        try:
            music_prompt, title, bed, samples, rate = gpu_brew(prompt, seconds)
        finally:
            # top the bar off BEFORE clearing active, so a poll landing in between
            # reads 100% (from _PROGRESS), never the 0% this brew started at
            _PROGRESS.update(done=chunks, total=chunks)
            _BREW.update(active=False)
        print(f"[lofinity] brewed {title!r} :: {music_prompt} [+ {bed}]")
    else:
        # Local / stub: enrich in-process (Ollama or fallback), then render with
        # live per-chunk progress for the brewing garden.
        music_prompt, title, bed = enrich_prompt(prompt)
        print(f"[lofinity] brewing {title!r} ({seconds}s) :: {music_prompt} [+ {bed}]")
        engine = stub_engine if ENGINE == "stub" else musicgen_engine
        samples, rate = engine(
            music_prompt, seconds,
            progress_cb=lambda d, t: _PROGRESS.update(done=d, total=t),
        )

    # both paths end at 100%, whatever the engine last reported
    _PROGRESS.update(done=chunks, total=chunks)
    try:
        samples = ambience.mix(samples, rate, bed)
    except Exception as e:  # noqa: BLE001 — a dry tape beats a failed vend
        print(f"[lofinity] ambience mix failed ({e!r}), vending without the bed")
    # The tape rides back inline as a base64 data URI — no disk write, so it is
    # never cached on the Space nor shared with other visitors. The frontend
    # keeps the collection client-side, per browser session.
    return {"title": title, "audio": encode_wav(samples, rate)}


@app.get("/api/progress")
def progress() -> dict:
    """Report brewing progress as {"done": ..., "total": ...}.

    Local and stub brews report real per-chunk progress from _PROGRESS. A
    ZeroGPU brew runs in a separate GPU worker that can't push progress back, so
    while one is active this returns a time-based ESTIMATE instead: a fractional
    `done` (the frontend fills the bar to done/total), held at 92% or below
    until the real tape lands and _PROGRESS tops it off."""
    estimating = _BREW["active"] and _BREW["est"] > 0
    if not estimating:
        return dict(_PROGRESS)

    total = _BREW["total"]
    elapsed = time.monotonic() - _BREW["start"]
    frac = min(0.92, elapsed / _BREW["est"])
    return {"done": round(frac * total, 3), "total": total}


@app.get("/api/config")
def config() -> dict:
    """Frontend config: the tape lengths this backend allows.

    The list is hardware-dependent — a CPU-only fallback offers only 30s — so
    the slider reads it and adapts."""
    lengths = list(ALLOWED_SECONDS)
    return {"allowed_seconds": lengths}


@app.get("/")
async def homepage():
    """Serve the frontend's index.html as the landing page."""
    index_page = FRONTEND / "index.html"
    return FileResponse(index_page)


# Expose the frontend directory at /static (StaticFiles serves from FRONTEND,
# the same directory "/" reads index.html from).
app.mount("/static", StaticFiles(directory=FRONTEND), name="static")

# Launch the server only when run as a script, not when imported.
if __name__ == "__main__":
    app.launch(show_error=True)