Spaces:

thecollabagepatch
/

magenta-retry

Running

App Files Files Community

thecollabagepatch committed on Jan 24

Commit

c8f33f0

1 Parent(s): 3fbec97

i think we did it? gradio with fastrtc v1

Browse files

Files changed (3) hide show

Dockerfile +2 -2
app.py +27 -21
fastrtc_magenta.py +228 -254

Dockerfile CHANGED Viewed

@@ -134,7 +134,7 @@ RUN uv pip install --system -c /tmp/constraints.txt \
 # Ensure compatible protobuf version
 RUN uv pip install --system --force-reinstall "protobuf>=5.27.0"
-# RUN uv pip install --system fastrtc
 # Set working directory and create cache
 WORKDIR /app
@@ -152,7 +152,7 @@ COPY lil_demo_540p.mp4 /app/
 COPY magentaRT_rt_tester.html /app/
 COPY magenta_prompts.js /app/
 COPY docs/ /app/docs/
-# COPY fastrtc_magenta.py /app/
 EXPOSE 7860
 CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

 # Ensure compatible protobuf version
 RUN uv pip install --system --force-reinstall "protobuf>=5.27.0"
+RUN uv pip install --system fastrtc
 # Set working directory and create cache
 WORKDIR /app
 COPY magentaRT_rt_tester.html /app/
 COPY magenta_prompts.js /app/
 COPY docs/ /app/docs/
+COPY fastrtc_magenta.py /app/
 EXPOSE 7860
 CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py CHANGED Viewed

@@ -1,5 +1,15 @@
 import os
 # ---- Space mode gating (place above any JAX import!) ----
 SPACE_MODE = os.getenv("SPACE_MODE")
 if SPACE_MODE is None:
@@ -1741,27 +1751,23 @@ async def ws_jam(websocket: WebSocket):
         except Exception:
             pass
-# --- FastRTC Gradio Integration (optional) ---
-# if FASTRTC_AVAILABLE:
-#     try:
-#         magenta_stream = create_magenta_stream(
-#             get_mrt_fn=get_mrt,
-#             build_style_fn=build_style_vector,
-#             asset_manager=asset_manager,
-#             concurrency_limit=1,  # Single GPU
-#             time_limit=3600,  # 1 hour max per session
-#         )
-#         # Mount FastRTC routes at /rtc
-#         magenta_stream.mount(app, path="/rtc")
-#         # Mount Gradio UI at /gradio
-#         import gradio as gr
-#         app = gr.mount_gradio_app(app, magenta_stream.ui, path="/gradio")
-#         print("✓ FastRTC Gradio interface available at /gradio")
-#     except Exception as e:
-#         print(f"FastRTC integration skipped: {e}")
 @app.get("/ping")

 import os
+# Runs at the very top of app.py, before any TensorFlow/JAX/Magenta imports.
+try:
+    import pylibsrtp
+    # This is the key: call srtp_init via cffi binding
+    from pylibsrtp import _binding
+    _binding.lib.srtp_init()
+    print("SRTP init OK (pre-TF)", flush=True)
+except Exception as e:
+    print("SRTP init failed early:", e, flush=True)
 # ---- Space mode gating (place above any JAX import!) ----
 SPACE_MODE = os.getenv("SPACE_MODE")
 if SPACE_MODE is None:
         except Exception:
             pass
+# --- FastRTC Gradio Integration (only in serve mode) ---
+if SPACE_MODE == "serve":
+    try:
+        from fastrtc_magenta import create_magenta_stream, FASTRTC_AVAILABLE
+        if FASTRTC_AVAILABLE:
+            magenta_stream = create_magenta_stream(
+                get_mrt_fn=get_mrt,
+                build_style_fn=build_style_vector,
+                asset_manager=asset_manager,
+                concurrency_limit=1,
+                time_limit=3600,
+            )
+            magenta_stream.mount(app, path="/rtc")
+            app = gr.mount_gradio_app(app, magenta_stream.ui, path="/gradio")
+            print("✓ FastRTC Gradio interface available at /gradio")
+    except Exception as e:
+        print(f"⚠ FastRTC integration skipped: {e}")
 @app.get("/ping")

fastrtc_magenta.py CHANGED Viewed

@@ -2,28 +2,24 @@
 FastRTC integration for MagentaRT real-time streaming.
 This module provides a Gradio-native interface for MagentaRT using FastRTC,
-enabling real-time audio streaming with live parameter updates through
-a proper Gradio UI.
-Usage:
-    from fastrtc_magenta import create_magenta_stream
-    # In your existing FastAPI app:
-    magenta_stream = create_magenta_stream(get_mrt_fn=get_mrt)
-    magenta_stream.mount(app, path="/rtc")
-    # Or standalone:
-    magenta_stream.ui.launch()
 """
 import numpy as np
 import gradio as gr
 from typing import Callable, Optional
-from dataclasses import dataclass, field
 # FastRTC imports
 try:
-    from fastrtc import Stream, StreamHandler, AdditionalOutputs
     FASTRTC_AVAILABLE = True
 except ImportError:
     FASTRTC_AVAILABLE = False
@@ -32,7 +28,6 @@ except ImportError:
 @dataclass
 class MagentaRTParams:
-    """Live-updatable parameters for MagentaRT generation."""
     temperature: float = 1.1
     guidance_weight: float = 1.1
     topk: int = 40
@@ -45,97 +40,175 @@ class MagentaRTParams:
 class MagentaRTStreamHandler(StreamHandler):
     """
-    FastRTC StreamHandler for continuous MagentaRT audio generation.
-    This handler generates ~2s audio chunks continuously, with support
-    for live parameter updates via FastRTC's set_input mechanism.
-    The MagentaRT system handles crossfading internally, so chunks
-    can be played back-to-back without client-side processing.
     """
     def __init__(
         self,
         get_mrt_fn: Callable,
         build_style_fn: Callable,
         asset_manager=None,
     ):
-        # MagentaRT outputs stereo 48kHz audio
         super().__init__(
             expected_layout="stereo",
             output_sample_rate=48000,
-            input_sample_rate=48000,  # Not used in receive-only mode
         )
         self.get_mrt_fn = get_mrt_fn
         self.build_style_fn = build_style_fn
         self.asset_manager = asset_manager
-        # Will be initialized in start_up()
         self.mrt = None
         self.state = None
         self.style_cur = None
         self.style_tgt = None
-        self.params = MagentaRTParams()
-        # Track chunk timing for style ramping
-        self.chunk_duration = 2.0  # Will be updated from mrt config
     def copy(self) -> "MagentaRTStreamHandler":
-        """Create a fresh handler for each new connection."""
         return MagentaRTStreamHandler(
             get_mrt_fn=self.get_mrt_fn,
             build_style_fn=self.build_style_fn,
             asset_manager=self.asset_manager,
         )
     def start_up(self) -> None:
-        """Initialize MagentaRT state when stream starts."""
         self.mrt = self.get_mrt_fn()
         self.state = self.mrt.init_state()
-        # Calculate chunk duration from config
         codec_fps = float(self.mrt.codec.frame_rate)
         self.chunk_duration = (
-            self.mrt.config.chunk_length_frames *
-            self.mrt.config.frame_length_samples
         ) / float(self.mrt.sample_rate)
-        # Build silent context (10s) tokens
         ctx_seconds = float(self.mrt.config.context_length_frames) / codec_fps
         sr = int(self.mrt.sample_rate)
-        samples = int(max(1, round(ctx_seconds * sr)))
-        # Import here to avoid circular deps
-        from magenta_rt import audio as au
-        silent = au.Waveform(np.zeros((samples, 2), np.float32), sr)
         tokens = self.mrt.codec.encode(silent).astype(np.int32)
         tokens = tokens[:, :self.mrt.config.decoder_codec_rvq_depth]
         self.state.context_tokens = tokens
-        # Ensure assets loaded for style building
         if self.asset_manager:
             self.asset_manager.ensure_assets_loaded(self.mrt)
-        # Build initial style
         self._rebuild_style()
         self.style_cur = self.style_tgt.copy()
     def _rebuild_style(self) -> None:
-        """Rebuild target style vector from current params."""
         text_list = [s.strip() for s in self.params.styles.split(",") if s.strip()]
         try:
             text_w = [float(x) for x in self.params.style_weights.split(",") if x.strip()]
         except ValueError:
             text_w = []
         try:
             cw = [float(x) for x in self.params.centroid_weights.split(",") if x.strip()]
         except ValueError:
             cw = []
         self.style_tgt = self.build_style_fn(
             self.mrt,
             text_styles=text_list,
@@ -145,15 +218,18 @@ class MagentaRTStreamHandler(StreamHandler):
             mean_weight=self.params.mean_weight,
             centroid_weights=cw,
         )
     def _apply_param_updates(self) -> None:
-        """Check latest_args for parameter updates from Gradio UI."""
-        # latest_args format: [webrtc_value, temp, guidance, topk, styles, style_weights, mean, centroids, ramp]
         args = self.latest_args
         if not args or len(args) < 2:
             return
-        # Skip index 0 which is the dummy webrtc value
         try:
             if len(args) > 1 and args[1] is not None:
                 self.params.temperature = float(args[1])
@@ -172,66 +248,89 @@ class MagentaRTStreamHandler(StreamHandler):
             if len(args) > 8 and args[8] is not None:
                 self.params.style_ramp_seconds = float(args[8])
         except (ValueError, TypeError):
-            pass  # Ignore malformed updates
-        # Apply to MRT
         self.mrt.temperature = self.params.temperature
         self.mrt.guidance_weight = self.params.guidance_weight
         self.mrt.topk = self.params.topk
-        # Rebuild target style
-        self._rebuild_style()
-    def receive(self, frame: tuple[int, np.ndarray]) -> None:
-        """
-        Receive incoming audio frame.
-        For MagentaRT rt-mode, we ignore input audio - this is output-only.
-        In the future, we could use input for style conditioning or feedback.
-        """
-        pass
-    def emit(self) -> Optional[tuple[int, np.ndarray]]:
         """
-        Generate and emit the next audio chunk.
-        Returns:
-            Tuple of (sample_rate, audio_array) where audio_array has shape
-            (num_channels, num_samples) for stereo output.
         """
-        if self.mrt is None or self.state is None:
-            return None
-        # Check for parameter updates from UI
-        self._apply_param_updates()
-        # Ramp style toward target
-        ramp = self.params.style_ramp_seconds
-        if ramp <= 0.0:
-            self.style_cur = self.style_tgt
-        else:
-            step = min(1.0, self.chunk_duration / ramp)
-            self.style_cur = self.style_cur + step * (self.style_tgt - self.style_cur)
-        # Generate chunk (crossfading handled internally by MagentaRT)
-        wav, new_state = self.mrt.generate_chunk(
-            state=self.state,
-            style=self.style_cur
-        )
-        self.state = new_state
-        # Convert to FastRTC format: (sample_rate, array)
-        # FastRTC expects shape (num_channels, num_samples) for stereo
-        # MagentaRT outputs shape (num_samples, num_channels)
-        audio = wav.samples.astype(np.float32)
-        audio = audio.T  # Transpose to (channels, samples)
-        return (int(self.mrt.sample_rate), audio)
-    def shutdown(self) -> None:
-        """Clean up when stream closes."""
-        self.mrt = None
-        self.state = None
 def create_magenta_stream(
@@ -241,156 +340,31 @@ def create_magenta_stream(
     concurrency_limit: int = 1,
     time_limit: Optional[float] = None,
 ) -> "Stream":
-    """
-    Create a FastRTC Stream for MagentaRT real-time generation.
-    Args:
-        get_mrt_fn: Function that returns the MagentaRT instance
-        build_style_fn: Function to build style vectors (build_style_vector from app.py)
-        asset_manager: Optional AssetManager for finetune steering
-        concurrency_limit: Max concurrent streams (default 1 for single GPU)
-        time_limit: Optional max stream duration in seconds
-    Returns:
-        FastRTC Stream object that can be mounted or launched
-    Example:
-        stream = create_magenta_stream(get_mrt, build_style_vector, asset_manager)
-        stream.ui.launch()  # Standalone Gradio UI
-        # or
-        stream.mount(app, path="/rtc")  # Mount on existing FastAPI
-    """
     if not FASTRTC_AVAILABLE:
         raise ImportError("FastRTC not installed. Run: pip install fastrtc")
     handler = MagentaRTStreamHandler(
         get_mrt_fn=get_mrt_fn,
         build_style_fn=build_style_fn,
         asset_manager=asset_manager,
     )
     stream = Stream(
         handler=handler,
         modality="audio",
-        mode="receive",  # Server-to-client only (we generate, client listens)
         concurrency_limit=concurrency_limit,
         time_limit=time_limit,
         additional_inputs=[
-            gr.Slider(
-                minimum=0.1, maximum=2.0, step=0.01, value=1.1,
-                label="Temperature",
-                info="Higher = more random, lower = more deterministic"
-            ),
-            gr.Slider(
-                minimum=0.0, maximum=8.0, step=0.1, value=1.1,
-                label="Guidance Weight",
-                info="How strongly to follow the style"
-            ),
-            gr.Slider(
-                minimum=1, maximum=256, step=1, value=40,
-                label="Top-K",
-                info="Number of token candidates to sample from"
-            ),
-            gr.Textbox(
-                value="warmup",
-                label="Styles",
-                info="Comma-separated style prompts (e.g., 'acid house, dreamy pads')"
-            ),
-            gr.Textbox(
-                value="1.0",
-                label="Style Weights",
-                info="Comma-separated weights for each style"
-            ),
-            gr.Slider(
-                minimum=0.0, maximum=2.0, step=0.01, value=0.0,
-                label="Mean Weight",
-                info="Weight for finetune mean embedding (if available)"
-            ),
-            gr.Textbox(
-                value="",
-                label="Centroid Weights",
-                info="Comma-separated weights for finetune centroids"
-            ),
-            gr.Slider(
-                minimum=0.0, maximum=10.0, step=0.1, value=2.0,
-                label="Style Ramp (seconds)",
-                info="How long to transition between style changes"
-            ),
-        ],
-    )
-    return stream
-# -----------------------------------------------------------------------------
-# Alternative: Simpler generator-based approach (if StreamHandler is overkill)
-# -----------------------------------------------------------------------------
-def create_simple_magenta_stream(
-    get_mrt_fn: Callable,
-    build_style_fn: Callable,
-    asset_manager=None,
-) -> "Stream":
-    """
-    Simpler generator-based MagentaRT stream.
-    This approach is less flexible but easier to understand.
-    Parameter updates won't work as smoothly - they'll only apply
-    when a new stream starts.
-    """
-    if not FASTRTC_AVAILABLE:
-        raise ImportError("FastRTC not installed. Run: pip install fastrtc")
-    def generate_audio(
-        temperature: float = 1.1,
-        guidance: float = 1.1,
-        topk: int = 40,
-        styles: str = "warmup",
-    ):
-        """Generator that yields MagentaRT audio chunks."""
-        from magenta_rt import audio as au
-        mrt = get_mrt_fn()
-        state = mrt.init_state()
-        # Set params
-        mrt.temperature = temperature
-        mrt.guidance_weight = guidance
-        mrt.topk = topk
-        # Build silent context
-        codec_fps = float(mrt.codec.frame_rate)
-        ctx_seconds = float(mrt.config.context_length_frames) / codec_fps
-        sr = int(mrt.sample_rate)
-        samples = int(max(1, round(ctx_seconds * sr)))
-        silent = au.Waveform(np.zeros((samples, 2), np.float32), sr)
-        tokens = mrt.codec.encode(silent).astype(np.int32)
-        tokens = tokens[:, :mrt.config.decoder_codec_rvq_depth]
-        state.context_tokens = tokens
-        # Build style
-        if asset_manager:
-            asset_manager.ensure_assets_loaded(mrt)
-        text_list = [s.strip() for s in styles.split(",") if s.strip()]
-        style = build_style_fn(mrt, text_styles=text_list)
-        # Generate forever
-        while True:
-            wav, state = mrt.generate_chunk(state=state, style=style)
-            audio = wav.samples.astype(np.float32).T
-            yield (int(mrt.sample_rate), audio)
-    stream = Stream(
-        handler=generate_audio,
-        modality="audio",
-        mode="receive",
-        concurrency_limit=1,
-        additional_inputs=[
-            gr.Slider(0.1, 2.0, value=1.1, label="Temperature"),
-            gr.Slider(0.0, 8.0, value=1.1, label="Guidance"),
-            gr.Slider(1, 256, value=40, step=1, label="Top-K"),
             gr.Textbox(value="warmup", label="Styles"),
         ],
     )
-    return stream

 FastRTC integration for MagentaRT real-time streaming.
 This module provides a Gradio-native interface for MagentaRT using FastRTC,
+enabling real-time audio streaming with live parameter updates.
+Key notes:
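+- The MagentaRT system handles crossfading internally.
+- Many FastRTC builds assume mono in the outgoing PyAV path; the generation loop
+  nevertheless yields C-contiguous planar float32 audio tagged "stereo". The
+  mono/int16 helpers (_downmix_to_mono, _float_to_int16) are defined but are not
+  called by that loop.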
 """
+from __future__ import annotations
 import numpy as np
 import gradio as gr
 from typing import Callable, Optional
+from dataclasses import dataclass
 # FastRTC imports
 try:
+    from fastrtc import Stream, StreamHandler
     FASTRTC_AVAILABLE = True
 except ImportError:
     FASTRTC_AVAILABLE = False
 @dataclass
 class MagentaRTParams:
     temperature: float = 1.1
     guidance_weight: float = 1.1
     topk: int = 40
 class MagentaRTStreamHandler(StreamHandler):
     """
+    StreamHandler for continuous MagentaRT audio generation (server -> client).
+    FastRTC versions differ in how they consume handlers; some require emit()
+    (abstract), so we implement emit() as the canonical “produce next frame” API.
+    We also keep __call__ as a generator adapter because some versions call that.
     """
     def __init__(
         self,
         get_mrt_fn: Callable,
         build_style_fn: Callable,
         asset_manager=None,
     ):
         super().__init__(
             expected_layout="stereo",
             output_sample_rate=48000,
+            input_sample_rate=48000,
         )
         self.get_mrt_fn = get_mrt_fn
         self.build_style_fn = build_style_fn
         self.asset_manager = asset_manager
         self.mrt = None
         self.state = None
+        self.params = MagentaRTParams()
         self.style_cur = None
         self.style_tgt = None
+        self.chunk_duration = 2.0
+        self.latest_args = None
+        # Internal generator used by emit()
+        self._gen = None
     def copy(self) -> "MagentaRTStreamHandler":
         return MagentaRTStreamHandler(
             get_mrt_fn=self.get_mrt_fn,
             build_style_fn=self.build_style_fn,
             asset_manager=self.asset_manager,
         )
+    # -------------------------------------------------------------------------
+    # Lifecycle
+    # -------------------------------------------------------------------------
     def start_up(self) -> None:
+        """Initialize MagentaRT + state."""
         self.mrt = self.get_mrt_fn()
         self.state = self.mrt.init_state()
+        # Compute chunk duration from MRT config
         codec_fps = float(self.mrt.codec.frame_rate)
         self.chunk_duration = (
+            self.mrt.config.chunk_length_frames * self.mrt.config.frame_length_samples
         ) / float(self.mrt.sample_rate)
+        # Build silent context tokens
+        from magenta_rt import audio as au
         ctx_seconds = float(self.mrt.config.context_length_frames) / codec_fps
         sr = int(self.mrt.sample_rate)
+        n = int(max(1, round(ctx_seconds * sr)))
+        silent = au.Waveform(np.zeros((n, 2), np.float32), sr)
         tokens = self.mrt.codec.encode(silent).astype(np.int32)
         tokens = tokens[:, :self.mrt.config.decoder_codec_rvq_depth]
         self.state.context_tokens = tokens
+        # Load assets if needed
         if self.asset_manager:
             self.asset_manager.ensure_assets_loaded(self.mrt)
+        # Initial style
         self._rebuild_style()
         self.style_cur = self.style_tgt.copy()
+        # Create internal generator for emit()
+        self._gen = self._generate_forever()
+    def shutdown(self) -> None:
+        self.mrt = None
+        self.state = None
+        self.style_cur = None
+        self.style_tgt = None
+        self.latest_args = None
+        self._gen = None
+    # -------------------------------------------------------------------------
+    # FastRTC entrypoints
+    # -------------------------------------------------------------------------
+    def __call__(self, *args):
+        """
+        Some FastRTC versions call handler(*ui_args) and expect a generator.
+        We provide that by yielding emit() forever.
+        """
+        self.latest_args = [None, *args]
+        self.start_up()
+        try:
+            while True:
+                out = self.emit()
+                if out is None:
+                    continue
+                yield out
+        finally:
+            self.shutdown()
+    def emit(self):
+        """
+        REQUIRED by some FastRTC versions (abstract method).
+        Produce the next (sample_rate, audio) chunk.
+        """
+        if self._gen is None:
+            # If FastRTC calls emit() without calling __call__ first,
+            # we still need to be able to start up.
+            self.latest_args = self.latest_args or [None]
+            self.start_up()
+        try:
+            return next(self._gen)
+        except StopIteration:
+            return None
+    def receive(self, frame: tuple[int, np.ndarray]) -> None:
+        # output-only mode
+        return
+    # -------------------------------------------------------------------------
+    # Core generation loop
+    # -------------------------------------------------------------------------
+    def _generate_forever(self):
+        """Internal generator that yields audio chunks forever."""
+        while True:
+            self._apply_param_updates()
+            self._ramp_style()
+            wav, self.state = self.mrt.generate_chunk(state=self.state, style=self.style_cur)
+            samples = np.asarray(wav.samples)
+            if samples.dtype != np.float32:
+                samples = samples.astype(np.float32, copy=False)
+            # Ensure stereo planar float32: (2, N)
+            audio_stereo = self._ensure_stereo_planar(samples)
+            # Yield (sample_rate, ndarray, layout) so FastRTC sets the layout properly
+            yield (48000, audio_stereo, "stereo")
+    # -------------------------------------------------------------------------
+    # Params + style
+    # -------------------------------------------------------------------------
     def _rebuild_style(self) -> None:
         text_list = [s.strip() for s in self.params.styles.split(",") if s.strip()]
         try:
             text_w = [float(x) for x in self.params.style_weights.split(",") if x.strip()]
         except ValueError:
             text_w = []
         try:
             cw = [float(x) for x in self.params.centroid_weights.split(",") if x.strip()]
         except ValueError:
             cw = []
         self.style_tgt = self.build_style_fn(
             self.mrt,
             text_styles=text_list,
             mean_weight=self.params.mean_weight,
             centroid_weights=cw,
         )
     def _apply_param_updates(self) -> None:
         args = self.latest_args
         if not args or len(args) < 2:
+            # no UI args yet
             return
+        prev_styles = self.params.styles
+        prev_style_weights = self.params.style_weights
+        prev_mean = self.params.mean_weight
+        prev_centroids = self.params.centroid_weights
         try:
             if len(args) > 1 and args[1] is not None:
                 self.params.temperature = float(args[1])
             if len(args) > 8 and args[8] is not None:
                 self.params.style_ramp_seconds = float(args[8])
         except (ValueError, TypeError):
+            return
+        # Apply sampler params
         self.mrt.temperature = self.params.temperature
         self.mrt.guidance_weight = self.params.guidance_weight
         self.mrt.topk = self.params.topk
+        style_changed = (
+            self.params.styles != prev_styles or
+            self.params.style_weights != prev_style_weights or
+            self.params.mean_weight != prev_mean or
+            self.params.centroid_weights != prev_centroids
+        )
+        if style_changed:
+            self._rebuild_style()
+    def _ramp_style(self) -> None:
+        if self.style_cur is None or self.style_tgt is None:
+            return
+        ramp = float(self.params.style_ramp_seconds or 0.0)
+        if ramp <= 0.0:
+            self.style_cur = self.style_tgt.copy()
+            return
+        alpha = min(1.0, max(0.0, self.chunk_duration / ramp))
+        self.style_cur = (1.0 - alpha) * self.style_cur + alpha * self.style_tgt
+    # -------------------------------------------------------------------------
+    # Audio helpers
+    # -------------------------------------------------------------------------
+    @staticmethod
+    def _downmix_to_mono(samples: np.ndarray) -> np.ndarray:
+        if samples.ndim == 1:
+            return samples
+        if samples.ndim == 2:
+            # assume (num_samples, channels)
+            if samples.shape[1] == 1:
+                return samples[:, 0]
+            return samples.mean(axis=1)
+        return samples.reshape(-1)
+    @staticmethod
+    def _ensure_stereo_planar(samples: np.ndarray) -> np.ndarray:
         """
+        Convert waveform samples into PyAV-friendly planar audio (C-contiguous).
+        PyAV expects planar audio for format="fltp":
+          - mono:   (1, N)
+          - stereo: (2, N)
+        We also MUST return a C-contiguous ndarray, or PyAV will raise:
+          ValueError: ndarray is not C-contiguous
         """
+        x = np.asarray(samples, dtype=np.float32)
+        # Mono 1D -> (1, N)
+        if x.ndim == 1:
+            return np.ascontiguousarray(x.reshape(1, -1))
+        # (N, 1) -> (1, N)
+        if x.ndim == 2 and x.shape[1] == 1:
+            return np.ascontiguousarray(x[:, 0].reshape(1, -1))
+        # Interleaved stereo (N, 2) -> planar (2, N)
+        if x.ndim == 2 and x.shape[1] == 2:
+            # x.T is typically non-contiguous, so force contiguous
+            return np.ascontiguousarray(x.T)
+        # Already planar stereo (2, N) -> ensure contiguous anyway
+        if x.ndim == 2 and x.shape[0] == 2:
+            return np.ascontiguousarray(x)
+        # Fallback: flatten to mono
+        return np.ascontiguousarray(x.reshape(-1).reshape(1, -1))
+    @staticmethod
+    def _float_to_int16(x: np.ndarray) -> np.ndarray:
+        x = np.asarray(x, dtype=np.float32)
+        x = np.clip(x, -1.0, 1.0)
+        return (x * 32767.0).astype(np.int16)
 def create_magenta_stream(
     concurrency_limit: int = 1,
     time_limit: Optional[float] = None,
 ) -> "Stream":
     if not FASTRTC_AVAILABLE:
         raise ImportError("FastRTC not installed. Run: pip install fastrtc")
     handler = MagentaRTStreamHandler(
         get_mrt_fn=get_mrt_fn,
         build_style_fn=build_style_fn,
         asset_manager=asset_manager,
     )
     stream = Stream(
         handler=handler,
         modality="audio",
+        mode="receive",
         concurrency_limit=concurrency_limit,
         time_limit=time_limit,
         additional_inputs=[
+            gr.Slider(0.1, 2.0, step=0.01, value=1.1, label="Temperature"),
+            gr.Slider(0.0, 8.0, step=0.1, value=1.1, label="Guidance Weight"),
+            gr.Slider(1, 256, step=1, value=40, label="Top-K"),
             gr.Textbox(value="warmup", label="Styles"),
+            gr.Textbox(value="1.0", label="Style Weights"),
+            gr.Slider(0.0, 2.0, step=0.01, value=0.0, label="Mean Weight"),
+            gr.Textbox(value="", label="Centroid Weights"),
+            gr.Slider(0.0, 10.0, step=0.1, value=2.0, label="Style Ramp (seconds)"),
         ],
     )
+    return stream