magenta-rt-jam

Runtime error

App Files Files Community

linoyts HF Staff committed 4 days ago

Commit

dfd4eb6

1 Parent(s): a8fed3a

Revert "jam: single-dispatch CUDA-graph stepping in the WS worker (eager fallback)"

Browse files

This reverts commit a8fed3aa371c9b349e9ebf31ba5b4c5cc378644f.

Files changed (5) hide show

app.py +4 -27
magenta_rt/torch/configuration_magenta_rt2.py +0 -115
magenta_rt/torch/depthformer.py +4 -21
magenta_rt/torch/modeling_magenta_rt2.py +27 -339
magenta_rt/torch/processing_musiccoca.py +0 -77

app.py CHANGED Viewed

@@ -162,9 +162,6 @@ async def banks(session_id: str = ""):
 def gpu_stream(session_id):
     """Continuous gen; switches model live when the dropdown changes."""
     from magenta_rt.torch.system import make_sampler, discretize_cfg, _float_to_int16, FRAME_SAMPLES
-    from magenta_rt.torch.modeling_magenta_rt2 import CudaGraphStreamer
-    USE_CG = os.environ.get('MRT_CUDAGRAPH', '1') == '1'   # single-dispatch CUDA-graph stepping (eager fallback)
-    cg_ok = True
     if style_model.device != "cuda":
         style_model.to("cuda")
     dev, dt = "cuda", torch.bfloat16
@@ -186,7 +183,6 @@ def gpu_stream(session_id):
         print("[warmup]", repr(_e), flush=True)
     notes, drums = [-1] * 128, [-1]
     cur_name = model = dstate = source = None
-    streamer = last_src_for_graph = None
     decode_state = {}
     emitted_samples = 0
     t0 = time.time()
@@ -211,14 +207,12 @@ def gpu_stream(session_id):
             decode_state = model.init_decode_state()
             emitted_samples, source, prev_active = 0, None, set()
             cur_style_sig = cur_note_sig = cur_tokens = None; had_onsets = False
-            streamer = last_src_for_graph = None     # rebuild CUDA graph on model switch / reset
         seed = int(c.get("seed", 0))
         if seed != cur_seed:
             cur_seed = seed
             gen = torch.Generator(device=dev).manual_seed(seed)
-            streamer = None                          # re-seed => re-capture (graph RNG fixed at capture)
         bop = c.get("bank_op")
-        if bop and int(bop.get("ver", 0)) != cur_bank_ver and not USE_CG:   # save/recall (eager only; cudagraph KV is static)
             cur_bank_ver = int(bop.get("ver", 0))
             bpath = os.path.join(SESSION_DIR, f"{os.path.basename(session_id)}_bank{int(bop.get('idx', 0))}.pt")
             try:
@@ -297,26 +291,9 @@ def gpu_stream(session_id):
                 cond = model._conditioning((list(cur_tokens) + [-1] * model.num_musiccoca)[:model.num_musiccoca],
                                            nvec, drm, cfgs)
                 source = model.model.encode(cond).to(dt)
-            temp = c.get("temperature", 1.1); topk = int(c.get("top_k", 50))
-            ok = False
-            if USE_CG and cg_ok:                          # single-dispatch CUDA-graph step
-                try:
-                    if streamer is None:                  # build + capture on first frame (~2-3s warmup)
-                        streamer = CudaGraphStreamer(model.model.decoder, source, dt,
-                                                     temperature=temp, top_k=topk, seed=cur_seed)
-                        last_src_for_graph = source
-                    elif source is not last_src_for_graph:  # conditioning changed -> update static buffer
-                        streamer.set_source(source); last_src_for_graph = source
-                    streamer.set_temperature(temp)
-                    toks.append(streamer.step()); ok = True
-                except Exception as _cge:
-                    print("[cudagraph] fallback to eager:", repr(_cge), flush=True)
-                    cg_ok = False; streamer = None
-                    dstate = model.model.decoder.init_streaming_f(1, dev, dt)
-            if not ok:                                    # eager fallback path
-                sampler = make_sampler(temp, topk, gen)
-                toks.append(model.model.decoder.step_f(dstate, source, sampler=sampler,
-                            temporal_step=model._temporal_step, depth_step=model._depth_step))
         new_codes = torch.cat(toks, dim=1)
         audio = model.decode_stream(new_codes, decode_state)      # FLOP-optimal stateful streaming decode
         emitted_samples += audio.shape[1]

 def gpu_stream(session_id):
     """Continuous gen; switches model live when the dropdown changes."""
     from magenta_rt.torch.system import make_sampler, discretize_cfg, _float_to_int16, FRAME_SAMPLES
     if style_model.device != "cuda":
         style_model.to("cuda")
     dev, dt = "cuda", torch.bfloat16
         print("[warmup]", repr(_e), flush=True)
     notes, drums = [-1] * 128, [-1]
     cur_name = model = dstate = source = None
     decode_state = {}
     emitted_samples = 0
     t0 = time.time()
             decode_state = model.init_decode_state()
             emitted_samples, source, prev_active = 0, None, set()
             cur_style_sig = cur_note_sig = cur_tokens = None; had_onsets = False
         seed = int(c.get("seed", 0))
         if seed != cur_seed:
             cur_seed = seed
             gen = torch.Generator(device=dev).manual_seed(seed)
         bop = c.get("bank_op")
+        if bop and int(bop.get("ver", 0)) != cur_bank_ver:    # save/recall generation state
             cur_bank_ver = int(bop.get("ver", 0))
             bpath = os.path.join(SESSION_DIR, f"{os.path.basename(session_id)}_bank{int(bop.get('idx', 0))}.pt")
             try:
                 cond = model._conditioning((list(cur_tokens) + [-1] * model.num_musiccoca)[:model.num_musiccoca],
                                            nvec, drm, cfgs)
                 source = model.model.encode(cond).to(dt)
+            sampler = make_sampler(c.get("temperature", 1.1), c.get("top_k", 50), gen)
+            toks.append(model.model.decoder.step_f(dstate, source, sampler=sampler,
+                        temporal_step=model._temporal_step, depth_step=model._depth_step))
         new_codes = torch.cat(toks, dim=1)
         audio = model.decode_stream(new_codes, decode_state)      # FLOP-optimal stateful streaming decode
         emitted_samples += audio.shape[1]

magenta_rt/torch/configuration_magenta_rt2.py DELETED Viewed

@@ -1,115 +0,0 @@
-# Copyright 2026 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""HF config for the Magenta RealTime 2 PyTorch model."""
-from transformers import PretrainedConfig
-class MagentaRT2Config(PretrainedConfig):
-    """Config for `MagentaRT2ForConditionalGeneration`.
-    `temporal` / `depth` are [num_layers, model_dims, hidden_dims, num_heads,
-    dim_per_head] for the two Depthformer transformer stacks.
-    """
-    model_type = "magenta_rt2"
-    def __init__(
-        self,
-        size="mrt2_small",
-        encoder_model_dims=256,
-        temporal=(12, 1024, 4096, 8, 128),
-        depth=(2, 768, 3072, 6, 128),
-        temporal_max_past=41,
-        depth_max_past=12,
-        musiccoca_rvq=12,
-        musiccoca_per_rvq_vocab=1031,
-        musiccoca_embed_dim=768,
-        regular_num_embeddings_per_channel=None,
-        regular_num_channels=132,
-        num_sinks=1,
-        num_codebooks=12,
-        codebook_size=1024,
-        num_reserved_tokens=6,
-        vocab_size=12294,
-        soft_cap_logits=30.0,
-        temperature=1.3,
-        top_k=40,
-        cfg_musiccoca=3.0,
-        cfg_notes=1.0,
-        cfg_drums=1.0,
-        num_notes=128,
-        num_drums=1,
-        sample_rate=48000,
-        frame_samples=1920,
-        codec_param_shapes=None,
-        **kwargs,
-    ):
-        self.size = size
-        self.codec_param_shapes = codec_param_shapes
-        self.encoder_model_dims = encoder_model_dims
-        self.temporal = list(temporal)
-        self.depth = list(depth)
-        self.temporal_max_past = temporal_max_past
-        self.depth_max_past = depth_max_past
-        self.musiccoca_rvq = musiccoca_rvq
-        self.musiccoca_per_rvq_vocab = musiccoca_per_rvq_vocab
-        self.musiccoca_embed_dim = musiccoca_embed_dim
-        self.regular_num_embeddings_per_channel = regular_num_embeddings_per_channel
-        self.regular_num_channels = regular_num_channels
-        self.num_sinks = num_sinks
-        self.num_codebooks = num_codebooks
-        self.codebook_size = codebook_size
-        self.num_reserved_tokens = num_reserved_tokens
-        self.vocab_size = vocab_size
-        self.soft_cap_logits = soft_cap_logits
-        self.temperature = temperature
-        self.top_k = top_k
-        self.cfg_musiccoca = cfg_musiccoca
-        self.cfg_notes = cfg_notes
-        self.cfg_drums = cfg_drums
-        self.num_notes = num_notes
-        self.num_drums = num_drums
-        self.sample_rate = sample_rate
-        self.frame_samples = frame_samples
-        super().__init__(**kwargs)
-    @classmethod
-    def from_size(cls, size):
-        from .depthformer import config_for
-        from dataclasses import astuple
-        c = config_for(size)
-        return cls(
-            size=size,
-            encoder_model_dims=c.encoder_model_dims,
-            temporal=list(astuple(c.temporal)),
-            depth=list(astuple(c.depth)),
-            temporal_max_past=c.temporal_max_past,
-            depth_max_past=c.depth_max_past,
-            musiccoca_rvq=c.musiccoca_rvq,
-            musiccoca_per_rvq_vocab=c.musiccoca_per_rvq_vocab,
-            musiccoca_embed_dim=c.musiccoca_embed_dim,
-            regular_num_embeddings_per_channel=list(c.regular_num_embeddings_per_channel),
-            regular_num_channels=c.regular_num_channels,
-            num_sinks=c.num_sinks,
-            num_codebooks=c.num_codebooks,
-            codebook_size=c.codebook_size,
-            num_reserved_tokens=c.num_reserved_tokens,
-            vocab_size=c.vocab_size,
-            soft_cap_logits=c.soft_cap_logits,
-        )
-__all__ = ["MagentaRT2Config"]

magenta_rt/torch/depthformer.py CHANGED Viewed

@@ -256,15 +256,9 @@ class MultivariateDecoder(nn.Module):
         }
     def step_f(self, state, source_frame, sampler=None, forced=None,
-               temporal_step=None, depth_step=None, cfg_scales=None):
         """One functional frame. temporal_step/depth_step override the eager fns
-        (e.g. with AOTI-compiled callables). Updates state in place; returns [b,1,Q].
-        cfg_scales: optional tuple of classifier-free-guidance scales. When set,
-        `source_frame`/`state` are batched as [positive, neg_1, ...] with
-        arity = 1 + len(cfg_scales); per-codebook logits are combined as
-        ``cond + sum_i scale_i*(cond - neg_i)`` before sampling (the native
-        MLX/.mlxfn path). The single sampled token is broadcast to all rows."""
         cfg = self.cfg
         tstep = temporal_step or self.temporal_step_fn
         dstep = depth_step or self.depth_step_fn
@@ -281,21 +275,10 @@ class MultivariateDecoder(nn.Module):
             logits, depth_kv = dstep(depth_input, depth_kv)
             lo = cfg.num_reserved_tokens + q * cfg.codebook_size
             hi = lo + cfg.codebook_size
-            if cfg_scales is not None:                       # classifier-free guidance combine
-                cond = logits[0:1]
-                comb = cond
-                for i, s in enumerate(cfg_scales, start=1):
-                    comb = comb + s * (cond - logits[i:i + 1])
-                tok = forced[..., q] if forced is not None else sampler(comb.float(), q, lo, hi)
-                depth_input = self.embed(tok.expand(logits.shape[0], -1))
-            else:
-                tok = forced[..., q] if forced is not None else sampler(logits.float(), q, lo, hi)
-                depth_input = self.embed(tok)
             samples.append(tok)
         frame = torch.stack(samples, dim=-1)
-        if cfg_scales is not None:
-            state["prev"] = frame.expand(to.shape[0], -1, -1)
-            return frame[:1]
         state["prev"] = frame
         return frame

         }
     def step_f(self, state, source_frame, sampler=None, forced=None,
+               temporal_step=None, depth_step=None):
         """One functional frame. temporal_step/depth_step override the eager fns
+        (e.g. with AOTI-compiled callables). Updates state in place; returns [b,1,Q]."""
         cfg = self.cfg
         tstep = temporal_step or self.temporal_step_fn
         dstep = depth_step or self.depth_step_fn
             logits, depth_kv = dstep(depth_input, depth_kv)
             lo = cfg.num_reserved_tokens + q * cfg.codebook_size
             hi = lo + cfg.codebook_size
+            tok = forced[..., q] if forced is not None else sampler(logits.float(), q, lo, hi)
             samples.append(tok)
+            depth_input = self.embed(tok)
         frame = torch.stack(samples, dim=-1)
         state["prev"] = frame
         return frame

magenta_rt/torch/modeling_magenta_rt2.py CHANGED Viewed

@@ -24,7 +24,6 @@ loop is a single token stream). MusicCoCa style encoding is a separate
 import json
 import os
-import warnings
 import numpy as np
 import torch
@@ -239,32 +238,6 @@ class MagentaRT2ForConditionalGeneration(MagentaRT2PreTrainedModel):
         ]
         return self._conditioning(style_tokens, notes, drums, cfgs)
-    def _guidance_source(self, style, notes, drums, cfg_musiccoca, cfg_notes):
-        """OPTIONAL classifier-free-guidance conditioning (the native MLX/.mlxfn path).
-        Builds a 3-row batch [positive, neg_musiccoca, neg_notes] + per-component scales.
-        cfg tokens are neutralized (guidance replaces them); negatives mask style / notes.
-        Returns (source[3,Tc,enc], (cfg_musiccoca, cfg_notes))."""
-        c = self.config
-        if style is None:
-            st = [-1] * self.num_musiccoca
-        elif isinstance(style, (list, np.ndarray)) and np.asarray(style).ndim == 1 \
-                and np.asarray(style).dtype.kind in "iu" and len(style) == self.num_musiccoca:
-            st = list(style)
-        else:
-            st = self._tokenize_style(style)
-        st = (list(st) + [-1] * self.num_musiccoca)[:self.num_musiccoca]
-        notes = notes if notes is not None else [-1] * self.num_notes
-        drums = drums if drums is not None else [-1] * self.num_drums
-        CM = [-1, -1, -1]                                   # neutralized cfg tokens
-        cond   = self._conditioning(st, notes, drums, CM)
-        neg_mc = self._conditioning([-1] * self.num_musiccoca, notes, drums, CM)
-        neg_n  = self._conditioning(st, [-1] * self.num_notes, drums, CM)
-        source = self.depthformer.encode(torch.cat([cond, neg_mc, neg_n], 0)).to(self._dt)
-        cfg_mc = c.cfg_musiccoca if cfg_musiccoca is None else cfg_musiccoca
-        cfg_n = c.cfg_notes if cfg_notes is None else cfg_notes
-        _warn_high_cfg(cfg_mc, cfg_n)
-        return source, (float(cfg_mc), float(cfg_n))
     # ---- codec ----
     def _decode_stream(self, history, emitted, context=STREAM_DECODE_CONTEXT,
                        margin=STREAM_DECODE_MARGIN, flush=False):
@@ -287,25 +260,6 @@ class MagentaRT2ForConditionalGeneration(MagentaRT2PreTrainedModel):
         """Fresh state dict for streaming decode (decode_stream)."""
         return {}
-    @torch.no_grad()
-    def prefill_f(self, dstate, source_frame, seed_codes):
-        """Teacher-force seed_codes [1,N,Q] (raw 0..codebook_size-1) through the
-        temporal transformer to populate its KV cache (native mlx_engine prefill
-        parity), so generation CONTINUES from the seed. Advances `dstate` in place.
-        Returns unique-code frames [1,N,Q] for the codec decoder."""
-        dec = self.depthformer.decoder
-        Q = self.config.num_codebooks
-        per_cb = (torch.arange(Q, device=seed_codes.device) * self.codebook_size
-                  + self.num_reserved_tokens).view(1, 1, Q)
-        unique = seed_codes.to(torch.long) + per_cb
-        N = unique.shape[1]
-        for step in range(max(0, N - 1)):
-            dec.step_f(dstate, source_frame, forced=unique[:, step:step + 1, :],
-                       temporal_step=self._temporal_step, depth_step=self._depth_step)
-        if N > 0:
-            dstate["prev"] = unique[:, N - 1:N, :]
-        return unique
     def decode_stream(self, new_codes, state):
         """Incremental codec decode of new token frames [b, t_new, Q] -> audio [b, N, 2].
         FLOP-optimal stateful streaming (no overlap-save re-decode); bf16-equivalent to
@@ -328,35 +282,26 @@ class MagentaRT2ForConditionalGeneration(MagentaRT2PreTrainedModel):
     @torch.no_grad()
     def generate(self, style=None, notes=None, drums=None, cfg_musiccoca=None,
                  cfg_notes=None, cfg_drums=None, temperature=None, top_k=None,
-                 frames=25, seed=0, state=None, flush=False, return_int16=False,
-                 guidance=False):
-        """`guidance=False` (default): cfg_* are discretized conditioning tokens — the
-        validated in-process/JAX path, unchanged. `guidance=True`: cfg_musiccoca/cfg_notes
-        become classifier-free-guidance scales (negatives + per-codebook logit combine),
-        matching the native MLX/Mac-app path. Guidance uses eager steps (batch>1)."""
         c = self.config
         temperature = c.temperature if temperature is None else temperature
         top_k = c.top_k if top_k is None else top_k
-        if guidance:
-            source, cfg_scales = self._guidance_source(style, notes, drums, cfg_musiccoca, cfg_notes)
-            arity = len(cfg_scales) + 1
-        else:
-            cond = self._resolve_conditioning(style, notes, drums, cfg_musiccoca, cfg_notes, cfg_drums)
-            source = self.depthformer.encode(cond).to(self._dt)
-            cfg_scales, arity = None, 1
         if state is None:
-            dstate = self.depthformer.decoder.init_streaming_f(arity, self._dev, self._dt)
             gen = torch.Generator(device=self._dev).manual_seed(seed)
-            decode_state = self.init_decode_state()
         else:
-            dstate, gen, decode_state = state["dstate"], state["gen"], state["decode_state"]
         sampler = make_sampler(temperature, top_k, gen)
-        # dynamic-batch AOTI (or eager fallback) handles guidance B>1 and no-guidance B=1 alike.
         toks = [self.depthformer.decoder.step_f(
-            dstate, source, sampler=sampler, cfg_scales=cfg_scales,
             temporal_step=self._temporal_step, depth_step=self._depth_step) for _ in range(frames)]
-        audio = self.decode_stream(torch.cat(toks, dim=1), decode_state)   # stateful per-frame streaming decode (40ms frames)
-        new_state = {"dstate": dstate, "gen": gen, "decode_state": decode_state}
         wav = audio[0].float().cpu().numpy()
         i16 = _float_to_int16(wav)
         out = i16 if return_int16 else i16.astype(np.float32) / 32768.0
@@ -364,19 +309,9 @@ class MagentaRT2ForConditionalGeneration(MagentaRT2PreTrainedModel):
     @torch.no_grad()
     def stream(self, control, chunk_frames=10, max_seconds=55.0, seed=0,
-               time_fn=None, sleep_fn=None, notes=None, drums=None, guidance=False,
-               cudagraph=False):
         """Continuous generation. `control()` returns {style_tokens, temperature,
-        top_k, cfg_*} read every chunk for mid-stream steering. Yields int16 [N,2].
-        guidance=False (default): cfg_* are conditioning tokens (validated token path,
-        unchanged). guidance=True: cfg_musiccoca/cfg_notes are classifier-free-guidance
-        scales read live every chunk. cudagraph=True: single-dispatch CUDA-graph stepping
-        (one capture at start, ~4-5x faster), steered via static input buffers."""
-        if cudagraph:
-            yield from self._stream_cudagraph(control, chunk_frames, max_seconds, seed,
-                                              time_fn, sleep_fn, notes, drums, guidance)
-            return
         import time as _time
         time_fn = time_fn or _time.time
         sleep_fn = sleep_fn or _time.sleep
@@ -384,11 +319,10 @@ class MagentaRT2ForConditionalGeneration(MagentaRT2PreTrainedModel):
         dev, dt = self._dev, self._dt
         notes = notes if notes is not None else [-1] * self.num_notes
         drums = drums if drums is not None else [-1] * self.num_drums
-        arity = 3 if guidance else 1
-        dstate = self.depthformer.decoder.init_streaming_f(arity, dev, dt)
         gen = torch.Generator(device=dev).manual_seed(seed)
-        decode_state = self.init_decode_state()
-        emitted_samples = 0
         cur_tokens = None
         source = None
         t0 = time_fn()
@@ -400,269 +334,23 @@ class MagentaRT2ForConditionalGeneration(MagentaRT2PreTrainedModel):
             tokens = ctl["style_tokens"]
             if tokens != cur_tokens:
                 cur_tokens = tokens
-                st = (list(tokens) + [-1] * self.num_musiccoca)[:self.num_musiccoca]
-                if guidance:                                  # [pos, neg_mc, neg_n]; cfg tokens neutralized
-                    source, _ = self._guidance_source(st, notes, drums, None, None)
-                else:
-                    cfgs = [discretize_cfg(ctl.get("cfg_musiccoca", c.cfg_musiccoca), 0.2, 40),
-                            discretize_cfg(ctl.get("cfg_notes", c.cfg_notes), 0.2, 40),
-                            discretize_cfg(ctl.get("cfg_drums", c.cfg_drums), 1.0, 8)]
-                    source = self.depthformer.encode(self._conditioning(st, notes, drums, cfgs)).to(dt)
-            cfg_scales = ((float(ctl.get("cfg_musiccoca", c.cfg_musiccoca)),     # live scales (unclamped)
-                           float(ctl.get("cfg_notes", c.cfg_notes))) if guidance else None)
             sampler = make_sampler(ctl.get("temperature", c.temperature), ctl.get("top_k", c.top_k), gen)
             toks = [self.depthformer.decoder.step_f(
-                dstate, source, sampler=sampler, cfg_scales=cfg_scales,
                 temporal_step=self._temporal_step, depth_step=self._depth_step) for _ in range(chunk_frames)]
-            audio = self.decode_stream(torch.cat(toks, dim=1), decode_state)
-            emitted_samples += audio.shape[1]
             if audio.shape[1] > 0:
                 yield _float_to_int16(audio[0].float().cpu().numpy())
-            ahead = (emitted_samples / SR) - (time_fn() - t0)
             if ahead > 1.0:
                 sleep_fn(min(ahead - 1.0, 0.5))
-    @torch.no_grad()
-    def _stream_cudagraph(self, control, chunk_frames, max_seconds, seed,
-                          time_fn, sleep_fn, notes, drums, guidance):
-        """CUDA-graph backend for stream(cudagraph=True): one capture at start
-        (warmup ~KEEP frames), then single-dispatch replay per frame. Steering
-        goes through the streamer's static input buffers — cfg/temperature are
-        buffer writes; a style change re-encodes + set_source (windowed ramp)."""
-        import time as _time
-        time_fn = time_fn or _time.time
-        sleep_fn = sleep_fn or _time.sleep
-        c = self.config
-        dt = self._dt
-        notes = notes if notes is not None else [-1] * self.num_notes
-        drums = drums if drums is not None else [-1] * self.num_drums
-        def encode_src(tokens, cfg_mc, cfg_n):
-            st = (list(tokens) + [-1] * self.num_musiccoca)[:self.num_musiccoca]
-            if guidance:
-                return self._guidance_source(st, notes, drums, cfg_mc, cfg_n)[0]
-            cfgs = [discretize_cfg(cfg_mc, 0.2, 40), discretize_cfg(cfg_n, 0.2, 40),
-                    discretize_cfg(c.cfg_drums, 1.0, 8)]
-            return self.depthformer.encode(self._conditioning(st, notes, drums, cfgs)).to(dt)
-        # bounded wait for the first conditioning, then build + capture the graph
-        t0 = time_fn()
-        ctl = control()
-        while ctl is None and time_fn() - t0 < max_seconds:
-            sleep_fn(0.02); ctl = control()
-        if ctl is None:
-            return
-        cur_tokens = ctl["style_tokens"]
-        cur_cfg = (float(ctl.get("cfg_musiccoca", c.cfg_musiccoca)),
-                   float(ctl.get("cfg_notes", c.cfg_notes)))
-        streamer = self.make_cudagraph_streamer(
-            style=cur_tokens, notes=notes, drums=drums,
-            cfg_musiccoca=cur_cfg[0], cfg_notes=cur_cfg[1],
-            temperature=ctl.get("temperature", c.temperature),
-            top_k=ctl.get("top_k", c.top_k), seed=seed, guidance=guidance)
-        decode_state = self.init_decode_state()
-        emitted_samples = 0
-        t0 = time_fn()
-        while time_fn() - t0 < max_seconds:
-            ctl = control()
-            if ctl is None:
-                sleep_fn(0.005); continue
-            tokens = ctl["style_tokens"]
-            cfg_mc = float(ctl.get("cfg_musiccoca", c.cfg_musiccoca))
-            cfg_n = float(ctl.get("cfg_notes", c.cfg_notes))
-            if guidance:
-                if (cfg_mc, cfg_n) != cur_cfg:
-                    streamer.set_cfg([cfg_mc, cfg_n]); cur_cfg = (cfg_mc, cfg_n)
-                if tokens != cur_tokens:
-                    streamer.set_source(encode_src(tokens, cfg_mc, cfg_n)); cur_tokens = tokens
-            elif tokens != cur_tokens or (cfg_mc, cfg_n) != cur_cfg:   # token path: cfg lives in source
-                streamer.set_source(encode_src(tokens, cfg_mc, cfg_n))
-                cur_tokens, cur_cfg = tokens, (cfg_mc, cfg_n)
-            streamer.set_temperature(ctl.get("temperature", c.temperature))
-            toks = [streamer.step() for _ in range(chunk_frames)]
-            audio = self.decode_stream(torch.cat(toks, dim=1), decode_state)
-            emitted_samples += audio.shape[1]
-            if audio.shape[1] > 0:
-                yield _float_to_int16(audio[0].float().cpu().numpy())
-            ahead = (emitted_samples / SR) - (time_fn() - t0)
-            if ahead > 1.0:
-                sleep_fn(min(ahead - 1.0, 0.5))
-    @torch.no_grad()
-    def make_cudagraph_streamer(self, style=None, notes=None, drums=None,
-                                cfg_musiccoca=None, cfg_notes=None, cfg_drums=None,
-                                temperature=None, top_k=None, seed=0, guidance=False,
-                                warmup=None):
-        """One-dispatch-per-frame CUDA-graph streaming: captures the whole frame
-        (temporal + N-codebook depth + in-graph sampler + optional CFG) as a single
-        `torch.cuda.graph` replay over fixed-size static KV buffers — ~MLX `.mlxfn`.
-        Returns a `CudaGraphStreamer`; call `.step()` for the next frame [1,1,Q]
-        (decode with `decode_stream`), and `.set_cfg/.set_temperature/.set_source`
-        for live steering (no re-capture). `top_k` is fixed at capture time."""
-        if guidance:
-            source, scales = self._guidance_source(style, notes, drums, cfg_musiccoca, cfg_notes)
-            num_neg = len(scales)
-        else:
-            cond = self._resolve_conditioning(style, notes, drums, cfg_musiccoca, cfg_notes, cfg_drums)
-            source = self.depthformer.encode(cond).to(self._dt)
-            scales, num_neg = None, 0
-        temperature = self.config.temperature if temperature is None else temperature
-        top_k = self.config.top_k if top_k is None else top_k
-        return CudaGraphStreamer(self.depthformer.decoder, source, self._dt, num_neg, scales,
-                                 temperature, top_k, seed, warmup)
-# Classifier-free guidance scales above this can run away / collapse the output
-# to silence under *sustained constant* conditioning over long runs (the native
-# UI uses a 0-5 slider, default 2.4). We don't clamp — values pass through to
-# match the native range — but we warn once so the caller knows the risk.
-GUIDANCE_CFG_WARN = 3.5
-def _warn_high_cfg(*scales):
-    hi = [round(float(s), 2) for s in scales if float(s) > GUIDANCE_CFG_WARN]
-    if hi:
-        warnings.warn(
-            f"CFG guidance scale(s) {hi} exceed ~{GUIDANCE_CFG_WARN}; sustained high "
-            "guidance on constant conditioning can make the output run away / collapse "
-            "to silence over long runs. (Changing notes/style during play avoids this.)",
-            stacklevel=3)
-        return True
-    return False
-class CudaGraphStreamer:
-    """Single-dispatch CUDA-graph frame stepper over fixed-size static KV buffers.
-    Warms `KEEP` frames eagerly to fill the temporal/cross KV to steady state,
-    snapshots them into static buffers, then captures one frame (temporal + depth +
-    sampler) with `torch.cuda.graph`. `.step()` replays it (one GPU dispatch) and
-    returns the new frame tokens. Live steering writes into static input buffers
-    (`source`, `cfg`, `temperature`) — the captured graph reads them, no re-capture.
-    Conditioning changes ramp in via the windowed cross-KV (optional hard flush)."""
-    def __init__(self, decoder, source, decode_dtype, num_neg=0, cfg_scales=None,
-                 temperature=1.1, top_k=50, seed=0, warmup=None):
-        """decoder: a MultivariateDecoder (`model.depthformer.decoder` for the
-        modeling class, `model.model.decoder` for the system class). `source` is the
-        pre-encoded conditioning [B, Tc, enc] (B = 1 + num_neg); `decode_dtype` the
-        compute dtype. Class-agnostic so both model wrappers can build it."""
-        dec = decoder
-        c = dec.cfg
-        self.dec = dec
-        self.Q, self.CB, self.NR = c.num_codebooks, c.codebook_size, c.num_reserved_tokens
-        self.KEEP = c.temporal_max_past + 1
-        self.num_neg = num_neg
-        self.top_k = int(top_k)
-        dev, dt = source.device, decode_dtype
-        B = source.shape[0]; self.B = B
-        # live-steering static inputs
-        self.source = source.clone()
-        self.cfg = (torch.zeros(0, device=dev, dtype=torch.float32) if not num_neg
-                    else torch.tensor([float(s) for s in cfg_scales], device=dev, dtype=torch.float32))
-        self.temp = torch.tensor(float(temperature), device=dev, dtype=torch.float32)
-        torch.manual_seed(seed)
-        # 1) prime to steady state (KV == KEEP on every layer)
-        st = dec.init_streaming_f(B, dev, dt)
-        K = self.KEEP
-        for _ in range(K + 8 if warmup is None else warmup):
-            to, ns, nc = dec.temporal_step_fn(st["prev"], st["self"], st["cross"], self.source)
-            st["self"] = [(k[:, -K:], v[:, -K:]) for k, v in ns]
-            st["cross"] = [(k[:, -K:], v[:, -K:]) for k, v in nc]
-            frame = self._depth_sample(to)
-            st["prev"] = frame.expand(B, -1, -1)
-        # 2) static KV + state buffers
-        L = len(st["self"]); self.L = L
-        self.SK = [st["self"][i][0].clone() for i in range(L)]; self.SV = [st["self"][i][1].clone() for i in range(L)]
-        self.CK = [st["cross"][i][0].clone() for i in range(L)]; self.CV = [st["cross"][i][1].clone() for i in range(L)]
-        self.prev = st["prev"].clone()
-        self.out = torch.zeros(1, 1, self.Q, dtype=torch.long, device=dev)
-        # 3) capture (side-stream warmup is required before graph capture)
-        s = torch.cuda.Stream(); s.wait_stream(torch.cuda.current_stream())
-        with torch.cuda.stream(s):
-            for _ in range(3):
-                self._frame_static()
-        torch.cuda.current_stream().wait_stream(s)
-        self.graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(self.graph):
-            self._frame_static()
-    def _depth_sample(self, to):
-        dec = self.dec; B = self.B; Q, CB, NR = self.Q, self.CB, self.NR
-        dd = dec.cfg.depth
-        z = torch.zeros(B, 0, dd.num_heads, dd.dim_per_head, device=to.device, dtype=to.dtype)
-        dk = [(z, z) for _ in range(dd.num_layers)]
-        di = to; toks = []
-        for q in range(Q):
-            logits, dk = dec.depth_step_fn(di, dk)            # [B,1,V]
-            lo = NR + q * CB
-            ls = logits[..., lo:lo + CB]
-            cond = ls[0:1]; comb = cond
-            for i in range(self.num_neg):                     # classifier-free guidance combine
-                comb = comb + self.cfg[i] * (cond - ls[i + 1:i + 2])
-            kth = torch.topk(comb, self.top_k, dim=-1).values[..., -1:]
-            comb = torch.where(comb >= kth, comb, torch.full_like(comb, -1e9))
-            u = torch.rand(1, 1, CB, device=to.device, dtype=torch.float32)   # graph-safe RNG
-            g = -torch.log(-torch.log(u.clamp(1e-10, 1 - 1e-7)))
-            tok = (comb + g * self.temp).argmax(-1) + lo
-            toks.append(tok)
-            di = dec.embed(tok.expand(B, -1))
-        return torch.stack(toks, dim=-1)                       # [1,1,Q]
-    def _frame_static(self):
-        dec = self.dec; K = self.KEEP; L = self.L
-        to, ns, nc = dec.temporal_step_fn(
-            self.prev, [(self.SK[i], self.SV[i]) for i in range(L)],
-            [(self.CK[i], self.CV[i]) for i in range(L)], self.source)
-        for i in range(L):
-            self.SK[i].copy_(ns[i][0][:, -K:]); self.SV[i].copy_(ns[i][1][:, -K:])
-            self.CK[i].copy_(nc[i][0][:, -K:]); self.CV[i].copy_(nc[i][1][:, -K:])
-        frame = self._depth_sample(to)
-        self.out.copy_(frame)
-        self.prev.copy_(frame.expand(self.B, -1, -1))
-    # ---- live steering (no re-capture) ----
-    def set_cfg(self, scales):
-        if self.num_neg:
-            if not getattr(self, "_cfg_warned", False):
-                self._cfg_warned = _warn_high_cfg(*scales)
-            self.cfg.copy_(torch.tensor([float(s) for s in scales],
-                                        device=self.cfg.device, dtype=torch.float32))
-    def set_temperature(self, t):
-        self.temp.fill_(float(t))
-    def set_source(self, source, flush=False):
-        """Update conditioning. Ramps in via the windowed cross-KV; flush=True
-        overwrites all cross-KV slots for an immediate change."""
-        self.source.copy_(source if source.shape[0] == self.B else source.expand(self.B, -1, -1))
-        if flush:
-            for i in range(self.L):
-                sk, sv = self.dec.temporal_body.layers[i]["cross_attention"]._kv(self.source)
-                self.CK[i].copy_(sk[:, -self.KEEP:]); self.CV[i].copy_(sv[:, -self.KEEP:])
-    def step(self):
-        """Advance one frame (single CUDA-graph dispatch). Returns tokens [1,1,Q]."""
-        self.graph.replay()
-        return self.out.clone()
-    def close(self):
-        """Free the captured CUDA graph + its private memory pool. Idempotent;
-        call at session end (the WS worker should). Safe during interpreter
-        shutdown — swallows teardown-ordering errors."""
-        g = getattr(self, "graph", None)
-        if g is not None:
-            try:
-                g.reset()
-            except Exception:
-                pass
-            self.graph = None
-    def __del__(self):
-        try:
-            self.close()
-        except Exception:
-            pass
-__all__ = ["MagentaRT2ForConditionalGeneration", "MagentaRT2PreTrainedModel", "CudaGraphStreamer"]

 import json
 import os
 import numpy as np
 import torch
         ]
         return self._conditioning(style_tokens, notes, drums, cfgs)
     # ---- codec ----
     def _decode_stream(self, history, emitted, context=STREAM_DECODE_CONTEXT,
                        margin=STREAM_DECODE_MARGIN, flush=False):
         """Fresh state dict for streaming decode (decode_stream)."""
         return {}
     def decode_stream(self, new_codes, state):
         """Incremental codec decode of new token frames [b, t_new, Q] -> audio [b, N, 2].
         FLOP-optimal stateful streaming (no overlap-save re-decode); bf16-equivalent to
     @torch.no_grad()
     def generate(self, style=None, notes=None, drums=None, cfg_musiccoca=None,
                  cfg_notes=None, cfg_drums=None, temperature=None, top_k=None,
+                 frames=25, seed=0, state=None, flush=False, return_int16=False):
         c = self.config
         temperature = c.temperature if temperature is None else temperature
         top_k = c.top_k if top_k is None else top_k
+        cond = self._resolve_conditioning(style, notes, drums, cfg_musiccoca, cfg_notes, cfg_drums)
+        source = self.depthformer.encode(cond).to(self._dt)
         if state is None:
+            dstate = self.depthformer.decoder.init_streaming_f(1, self._dev, self._dt)
             gen = torch.Generator(device=self._dev).manual_seed(seed)
+            history = torch.zeros((1, 0, c.num_codebooks), dtype=torch.long, device=self._dev)
+            emitted = 0
         else:
+            dstate, gen, history, emitted = state["dstate"], state["gen"], state["history"], state["emitted"]
         sampler = make_sampler(temperature, top_k, gen)
         toks = [self.depthformer.decoder.step_f(
+            dstate, source, sampler=sampler,
             temporal_step=self._temporal_step, depth_step=self._depth_step) for _ in range(frames)]
+        history = torch.cat([history] + toks, dim=1)
+        audio, emitted = self._decode_stream(history, emitted, flush=flush)
+        new_state = {"dstate": dstate, "gen": gen, "history": history, "emitted": emitted}
         wav = audio[0].float().cpu().numpy()
         i16 = _float_to_int16(wav)
         out = i16 if return_int16 else i16.astype(np.float32) / 32768.0
     @torch.no_grad()
     def stream(self, control, chunk_frames=10, max_seconds=55.0, seed=0,
+               time_fn=None, sleep_fn=None, notes=None, drums=None):
         """Continuous generation. `control()` returns {style_tokens, temperature,
+        top_k, cfg_*} read every chunk for mid-stream steering. Yields int16 [N,2]."""
         import time as _time
         time_fn = time_fn or _time.time
         sleep_fn = sleep_fn or _time.sleep
         dev, dt = self._dev, self._dt
         notes = notes if notes is not None else [-1] * self.num_notes
         drums = drums if drums is not None else [-1] * self.num_drums
+        dstate = self.depthformer.decoder.init_streaming_f(1, dev, dt)
         gen = torch.Generator(device=dev).manual_seed(seed)
+        history = torch.zeros((1, 0, c.num_codebooks), dtype=torch.long, device=dev)
+        emitted = 0
         cur_tokens = None
         source = None
         t0 = time_fn()
             tokens = ctl["style_tokens"]
             if tokens != cur_tokens:
                 cur_tokens = tokens
+                cfgs = [discretize_cfg(ctl.get("cfg_musiccoca", c.cfg_musiccoca), 0.2, 40),
+                        discretize_cfg(ctl.get("cfg_notes", c.cfg_notes), 0.2, 40),
+                        discretize_cfg(ctl.get("cfg_drums", c.cfg_drums), 1.0, 8)]
+                cond = self._conditioning((list(tokens) + [-1] * self.num_musiccoca)[:self.num_musiccoca],
+                                          notes, drums, cfgs)
+                source = self.depthformer.encode(cond).to(dt)
             sampler = make_sampler(ctl.get("temperature", c.temperature), ctl.get("top_k", c.top_k), gen)
             toks = [self.depthformer.decoder.step_f(
+                dstate, source, sampler=sampler,
                 temporal_step=self._temporal_step, depth_step=self._depth_step) for _ in range(chunk_frames)]
+            history = torch.cat([history] + toks, dim=1)
+            audio, emitted = self._decode_stream(history, emitted)
             if audio.shape[1] > 0:
                 yield _float_to_int16(audio[0].float().cpu().numpy())
+            ahead = (emitted * FRAME_SAMPLES / SR) - (time_fn() - t0)
             if ahead > 1.0:
                 sleep_fn(min(ahead - 1.0, 0.5))
+__all__ = ["MagentaRT2ForConditionalGeneration", "MagentaRT2PreTrainedModel"]

magenta_rt/torch/processing_musiccoca.py DELETED Viewed

@@ -1,77 +0,0 @@
-# Copyright 2026 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""MusicCoCa style processor for Magenta RealTime 2.
-A processor-style component (like a feature extractor / tokenizer): turns a text
-prompt OR an audio clip into 12 RVQ style tokens that condition the model. Pure
-torch + sentencepiece (text tower, audio tower, RVQ all torch-native).
-"""
-import os
-import numpy as np
-class MusicCoCaProcessor:
-    """Text/audio -> 12 RVQ style tokens (and 768-d embeddings, for layering)."""
-    def __init__(self, musiccoca):
-        self._mc = musiccoca
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, device="cpu", **kwargs):
-        from .musiccoca import MusicCoCa
-        p = pretrained_model_name_or_path
-        if p is not None and os.path.isdir(p) and os.path.exists(os.path.join(p, "text_encoder.pt")):
-            mc = MusicCoCa(resource_dir=p, device=device)
-        else:
-            mc = MusicCoCa(repo_id=p, device=device) if p else MusicCoCa(device=device)
-        return cls(mc)
-    def save_pretrained(self, save_directory, **kwargs):
-        # Artifacts live in the MusicCoCa hub repo; nothing extra to serialize here.
-        os.makedirs(save_directory, exist_ok=True)
-    @property
-    def device(self):
-        return self._mc.device
-    def to(self, device):
-        self._mc.to(device)
-        return self
-    def embed(self, text_or_audio):
-        """Text str / audio (Waveform | (samples, sr) | np@16kHz) -> [768] torch."""
-        return self._mc.embed(text_or_audio)
-    def tokenize(self, embedding):
-        """[768] embedding -> [12] int RVQ tokens (np.int64)."""
-        return self._mc.tokenize(embedding)
-    def layer(self, prompts, weights=None):
-        """Blend several prompts (text/audio) by weighted-mean of embeddings,
-        then tokenize. `prompts` is a list; `weights` defaults to uniform."""
-        embs = [self.embed(p) for p in prompts]
-        w = weights or [1.0 / len(embs)] * len(embs)
-        emb = sum(wi * e for wi, e in zip(w, embs))
-        return self.tokenize(emb).tolist()
-    def __call__(self, text_or_audio, return_tokens=True):
-        """-> 12 style tokens (list[int]) by default, or the [768] embedding."""
-        emb = self.embed(text_or_audio)
-        return self.tokenize(emb).tolist() if return_tokens else emb
-__all__ = ["MusicCoCaProcessor"]