Kuangwei Chen committed on
Commit
1c2bf4d
·
1 Parent(s): d4a3b2c

Update for realtime

Browse files
Files changed (1) hide show
  1. modeling_moss_audio_tokenizer.py +808 -21
modeling_moss_audio_tokenizer.py CHANGED
@@ -17,14 +17,25 @@ from __future__ import annotations
17
 
18
  import copy
19
  import math
 
 
20
  from contextlib import ExitStack, contextmanager
21
  from dataclasses import dataclass
 
22
  from typing import cast
23
 
24
  import torch
25
  import torch.nn as nn
26
  import torch.nn.functional as F
27
 
 
 
 
 
 
 
 
 
28
  try:
29
  from transformers.modeling_utils import PreTrainedAudioTokenizerBase
30
  except ImportError:
@@ -32,9 +43,12 @@ except ImportError:
32
  from transformers.utils import ModelOutput, logging
33
 
34
  try:
35
- from transformers.utils import auto_docstring
36
  except ImportError:
37
- def auto_docstring(*args, **kwargs):
 
 
 
38
  if len(args) == 1 and callable(args[0]) and not kwargs:
39
  return args[0]
40
 
@@ -43,9 +57,35 @@ except ImportError:
43
 
44
  return decorator
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  try:
47
  from .configuration_moss_audio_tokenizer import MossAudioTokenizerConfig
48
  except ImportError:
 
 
 
49
  from configuration_moss_audio_tokenizer import MossAudioTokenizerConfig
50
 
51
 
@@ -64,6 +104,25 @@ SUPPORTED_ATTENTION_IMPLEMENTATIONS = {"sdpa", "flash_attention_2"}
64
  SUPPORTED_COMPUTE_DTYPES = {"fp32": None, "bf16": torch.bfloat16, "fp16": torch.float16}
65
 
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  def resolve_compute_dtype(compute_dtype: str) -> torch.dtype | None:
68
  if compute_dtype not in SUPPORTED_COMPUTE_DTYPES:
69
  raise ValueError(
@@ -83,6 +142,7 @@ def disable_cuda_autocast():
83
  # =============================================================================
84
 
85
 
 
86
  @dataclass
87
  @auto_docstring
88
  class MossAudioTokenizerEncoderOutput(ModelOutput):
@@ -100,6 +160,7 @@ class MossAudioTokenizerEncoderOutput(ModelOutput):
100
  encoder_hidden_states: torch.Tensor | None = None
101
 
102
 
 
103
  @dataclass
104
  @auto_docstring
105
  class MossAudioTokenizerDecoderOutput(ModelOutput):
@@ -114,6 +175,7 @@ class MossAudioTokenizerDecoderOutput(ModelOutput):
114
  audio_lengths: torch.Tensor | None = None
115
 
116
 
 
117
  @dataclass
118
  @auto_docstring
119
  class MossAudioTokenizerOutput(ModelOutput):
@@ -139,6 +201,7 @@ class MossAudioTokenizerOutput(ModelOutput):
139
  # =============================================================================
140
 
141
 
 
142
  @dataclass
143
  class StreamingState:
144
  """Base state for streaming modules."""
@@ -228,6 +291,463 @@ class StreamingContainer(StreamingModule):
228
  pass
229
 
230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  # =============================================================================
232
  # Normalization Layers
233
  # =============================================================================
@@ -598,6 +1118,7 @@ class RingKVCache:
598
  # =============================================================================
599
 
600
 
 
601
  @dataclass
602
  class MHAState(StreamingState):
603
  cached_keys: torch.Tensor | None
@@ -677,6 +1198,7 @@ class MossAudioTokenizerMultiheadAttention(StreamingModule):
677
  f"Expected one of {sorted(SUPPORTED_ATTENTION_IMPLEMENTATIONS)}."
678
  )
679
  self.attention_implementation = attention_implementation
 
680
  self.in_proj = nn.Linear(embed_dim, 3 * embed_dim, bias=False, **factory_kwargs)
681
  self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False, **factory_kwargs)
682
 
@@ -811,6 +1333,34 @@ class MossAudioTokenizerMultiheadAttention(StreamingModule):
811
  state.cached_positions = state.cached_positions.to(device=device)
812
  return state.cached_keys, state.cached_values, state.cached_positions
813
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
814
  def _build_streaming_kv(
815
  self,
816
  cached_k: torch.Tensor,
@@ -845,12 +1395,15 @@ class MossAudioTokenizerMultiheadAttention(StreamingModule):
845
  state.cached_positions = pos_k.contiguous()
846
  return
847
 
 
 
 
848
  new_cached_k = k_all[:, :, -self.context :, :].contiguous()
849
  new_cached_v = v_all[:, :, -self.context :, :].contiguous()
850
  new_cached_pos = pos_k[:, -self.context :].contiguous()
851
- state.cached_keys = torch.where(exec_mask, new_cached_k, cached_k)
852
- state.cached_values = torch.where(exec_mask, new_cached_v, cached_v)
853
- state.cached_positions = torch.where(exec_mask_pos, new_cached_pos, cached_pos)
854
 
855
  def _build_streaming_sdpa_bias(self, pos_q: torch.Tensor, pos_k: torch.Tensor) -> torch.Tensor:
856
  delta = pos_q[:, :, None] - pos_k[:, None, :]
@@ -890,16 +1443,19 @@ class MossAudioTokenizerMultiheadAttention(StreamingModule):
890
  if flash_attn_varlen_func is None:
891
  raise RuntimeError("flash-attn is not installed.")
892
  window_size = (self.context, 0) if (self.context is not None and self.causal) else (-1, -1)
893
- return flash_attn_varlen_func(
894
- q.contiguous(),
895
- k.contiguous(),
896
- v.contiguous(),
897
- cu_seqlens_q,
898
- cu_seqlens_k,
899
- max_seqlen_q,
900
- max_seqlen_k,
901
- causal=self.causal,
902
- window_size=window_size,
 
 
 
903
  )
904
 
905
  def _forward_streaming_sdpa(self, x: torch.Tensor, state: MHAState) -> torch.Tensor:
@@ -968,6 +1524,46 @@ class MossAudioTokenizerMultiheadAttention(StreamingModule):
968
  state.offset[:] = torch.where(state.exec_mask, state.offset + chunk_length, state.offset)
969
  return out
970
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
971
  def _forward_non_streaming_sdpa(self, x: torch.Tensor, input_lengths: torch.Tensor) -> torch.Tensor:
972
  batch_size, max_seqlen, _ = x.shape
973
  q, k, v = self._project_qkv(x)
@@ -1009,11 +1605,12 @@ class MossAudioTokenizerMultiheadAttention(StreamingModule):
1009
  if state is not None:
1010
  if query.dim() != 3:
1011
  raise ValueError(f"Streaming attention expects a 3D tensor, got shape {tuple(query.shape)}")
1012
- out = (
1013
- self._forward_streaming_flash(query, state)
1014
- if backend == "flash_attention_2"
1015
- else self._forward_streaming_sdpa(query, state)
1016
- )
 
1017
  return self.out_proj(out)
1018
 
1019
  if backend == "flash_attention_2":
@@ -1037,6 +1634,7 @@ class MossAudioTokenizerMultiheadAttention(StreamingModule):
1037
  # =============================================================================
1038
 
1039
 
 
1040
  @dataclass
1041
  class LayerState(StreamingState):
1042
  pass
@@ -1128,6 +1726,7 @@ class MossAudioTokenizerTransformerLayer(StreamingModule):
1128
  # =============================================================================
1129
 
1130
 
 
1131
  @dataclass
1132
  class TransformerState(StreamingState):
1133
  offsets: torch.Tensor
@@ -1800,9 +2399,129 @@ class MossAudioTokenizerModel(MossAudioTokenizerPreTrainedModel):
1800
  )
1801
 
1802
  self.post_init()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1803
 
1804
  def _start_streaming(self, batch_size: int):
1805
  """Start streaming mode for all modules."""
 
 
 
1806
 
1807
  def _start(module):
1808
  if isinstance(module, StreamingModule):
@@ -1812,6 +2531,9 @@ class MossAudioTokenizerModel(MossAudioTokenizerPreTrainedModel):
1812
 
1813
  def _stop_streaming(self):
1814
  """Stop streaming mode for all modules."""
 
 
 
1815
 
1816
  def _stop(module):
1817
  if isinstance(module, StreamingModule):
@@ -2183,7 +2905,27 @@ class MossAudioTokenizerModel(MossAudioTokenizerPreTrainedModel):
2183
  codes_list: list[torch.Tensor],
2184
  num_quantizers: int | None = None,
2185
  chunk_duration: float | None = None,
 
 
 
 
 
2186
  ) -> MossAudioTokenizerDecoderOutput:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2187
  audio_codes, audio_codes_lengths, num_quantizers_used = self._prepare_codes_batch(
2188
  codes_list,
2189
  num_quantizers=num_quantizers,
@@ -2191,9 +2933,53 @@ class MossAudioTokenizerModel(MossAudioTokenizerPreTrainedModel):
2191
  batch_size = len(codes_list)
2192
  device = audio_codes.device
2193
 
2194
- if chunk_duration is None:
2195
  return self._decode_frame(audio_codes, audio_codes_lengths)
2196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2197
  if chunk_duration <= 0:
2198
  raise ValueError("`chunk_duration` must be > 0 when provided.")
2199
 
@@ -2366,6 +3152,7 @@ class MossAudioTokenizerModel(MossAudioTokenizerPreTrainedModel):
2366
  `MossAudioTokenizerDecoderOutput` or tuple containing decoded audio.
2367
  """
2368
  return_dict = return_dict if return_dict is not None else self.config.return_dict
 
2369
 
2370
  if audio_codes.dim() == 2:
2371
  codes_list = [audio_codes[:num_quantizers] if num_quantizers is not None else audio_codes]
 
17
 
18
  import copy
19
  import math
20
+ import sys
21
+ import types
22
  from contextlib import ExitStack, contextmanager
23
  from dataclasses import dataclass
24
+ from pathlib import Path
25
  from typing import cast
26
 
27
  import torch
28
  import torch.nn as nn
29
  import torch.nn.functional as F
30
 
31
+ if __name__ not in sys.modules:
32
+ _module_proxy = types.ModuleType(__name__)
33
+ sys.modules[__name__] = _module_proxy
34
+
35
+
36
def _sync_module_proxy() -> None:
    """Push this module's current globals onto its proxy entry in ``sys.modules``.

    Keeps the module object registered under ``__name__`` in step with the
    namespace that is actually executing (used when the file runs outside a
    normal package import, e.g. via ``trust_remote_code`` loading).
    """
    proxy_namespace = vars(sys.modules[__name__])
    proxy_namespace.update(globals())
38
+
39
  try:
40
  from transformers.modeling_utils import PreTrainedAudioTokenizerBase
41
  except ImportError:
 
43
  from transformers.utils import ModelOutput, logging
44
 
45
  try:
46
+ from transformers.utils import auto_docstring as _hf_auto_docstring
47
  except ImportError:
48
+ _hf_auto_docstring = None
49
+
50
+ def auto_docstring(*args, **kwargs):
51
+ if _hf_auto_docstring is None:
52
  if len(args) == 1 and callable(args[0]) and not kwargs:
53
  return args[0]
54
 
 
57
 
58
  return decorator
59
 
60
+ if len(args) == 1 and callable(args[0]) and not kwargs:
61
+ obj = args[0]
62
+ try:
63
+ return _hf_auto_docstring(obj)
64
+ except Exception:
65
+ return obj
66
+
67
+ try:
68
+ decorator = _hf_auto_docstring(*args, **kwargs)
69
+ except Exception:
70
+ def decorator(obj):
71
+ return obj
72
+
73
+ return decorator
74
+
75
+ def safe_decorator(obj):
76
+ try:
77
+ return decorator(obj)
78
+ except Exception:
79
+ return obj
80
+
81
+ return safe_decorator
82
+
83
  try:
84
  from .configuration_moss_audio_tokenizer import MossAudioTokenizerConfig
85
  except ImportError:
86
+ _module_dir = str(Path(__file__).resolve().parent)
87
+ if _module_dir not in sys.path:
88
+ sys.path.insert(0, _module_dir)
89
  from configuration_moss_audio_tokenizer import MossAudioTokenizerConfig
90
 
91
 
 
104
  SUPPORTED_COMPUTE_DTYPES = {"fp32": None, "bf16": torch.bfloat16, "fp16": torch.float16}
105
 
106
 
107
+ _ACTIVE_DECODE_SESSION_ERROR_MESSAGE = "MossAudioTokenizerModel only supports one active decode session at a time."
108
+ _CLOSED_DECODE_SESSION_ERROR_MESSAGE = "This decode session is closed."
109
+ _MODEL_STREAMING_CONFLICT_ERROR_MESSAGE = "Model-level streaming helpers cannot be used while a decode session is active."
110
+ _PLAIN_DECODE_SESSION_CONFLICT_ERROR_MESSAGE = "Plain decode helpers cannot be used while a decode session is active."
111
+ _DUPLICATE_DECODE_REQUEST_ERROR_TEMPLATE = "Decode session already contains request_id={request_id!r}."
112
+ _UNKNOWN_DECODE_REQUEST_ERROR_TEMPLATE = "Decode session does not contain an active request_id={request_id!r}."
113
+ _DECODE_SESSION_FULL_ERROR_TEMPLATE = "Decode session has no free slots remaining (max_batch_size={max_batch_size})."
114
+ _INVALID_DECODE_STEP_REQUEST_IDS_ERROR_MESSAGE = (
115
+ "`request_ids` must exactly match the current active decode request order."
116
+ )
117
+ _BATCH_DECODE_STREAMING_DUPLICATE_FINALIZE_INDICES_ERROR_MESSAGE = "`finalize_indices` must not contain duplicates."
118
+ _BATCH_DECODE_STREAMING_FINALIZE_INDEX_OUT_OF_RANGE_ERROR_TEMPLATE = (
119
+ "`finalize_indices` index {index} is out of range for the pre-call logical batch of size {batch_size}."
120
+ )
121
+ _BATCH_DECODE_STREAMING_SHRINK_ERROR_MESSAGE = (
122
+ "`batch_decode(streaming=True)` must include all pre-call active rows in the current call before applying `finalize_indices`."
123
+ )
124
+
125
+
126
  def resolve_compute_dtype(compute_dtype: str) -> torch.dtype | None:
127
  if compute_dtype not in SUPPORTED_COMPUTE_DTYPES:
128
  raise ValueError(
 
142
  # =============================================================================
143
 
144
 
145
+ _sync_module_proxy()
146
  @dataclass
147
  @auto_docstring
148
  class MossAudioTokenizerEncoderOutput(ModelOutput):
 
160
  encoder_hidden_states: torch.Tensor | None = None
161
 
162
 
163
+ _sync_module_proxy()
164
  @dataclass
165
  @auto_docstring
166
  class MossAudioTokenizerDecoderOutput(ModelOutput):
 
175
  audio_lengths: torch.Tensor | None = None
176
 
177
 
178
+ _sync_module_proxy()
179
  @dataclass
180
  @auto_docstring
181
  class MossAudioTokenizerOutput(ModelOutput):
 
201
  # =============================================================================
202
 
203
 
204
+ _sync_module_proxy()
205
  @dataclass
206
  class StreamingState:
207
  """Base state for streaming modules."""
 
291
  pass
292
 
293
 
294
+ class MossAudioTokenizerDecodeSession:
295
+ model: MossAudioTokenizerModel
296
+ max_batch_size: int
297
+ _use_cuda_graph: bool
298
+ active_request_ids: list[str | int]
299
+ request_id_to_slot_index: dict[str | int, int]
300
+ slot_index_to_request_id: list[str | int | None]
301
+ slot_is_free: list[bool]
302
+ request_id_to_code_offset: dict[str | int, int]
303
+ request_id_to_audio_offset: dict[str | int, int]
304
+ _flash_kvcache_attention_modules: list[MossAudioTokenizerMultiheadAttention]
305
+ _graph_num_quantizers_capacity: int | None
306
+ _graph_input_codes: torch.Tensor | None
307
+ _graph_input_code_lengths: torch.Tensor | None
308
+ _graph_output_audio: torch.Tensor | None
309
+ _graph_output_audio_lengths: torch.Tensor | None
310
+ _cuda_graph: torch.cuda.CUDAGraph | None
311
+ _cuda_graph_key: tuple[str, int, int, str] | None
312
+ _decode_streaming_exit_stack: ExitStack | None
313
+ _closed: bool
314
+
315
+ def __init__(self, model: MossAudioTokenizerModel, max_batch_size: int, use_cuda_graph: bool = False):
316
+ if max_batch_size <= 0:
317
+ raise ValueError("`max_batch_size` must be > 0.")
318
+
319
+ decoder_attention_modules: list[MossAudioTokenizerMultiheadAttention] = []
320
+ for decoder_module in model.decoder:
321
+ for module in decoder_module.modules():
322
+ if isinstance(module, MossAudioTokenizerMultiheadAttention):
323
+ if module.context is None:
324
+ raise ValueError(
325
+ "MossAudioTokenizerDecodeSession requires all decoder MHA modules to have a finite "
326
+ "`context` (context=None is unsupported for continuous-batch streaming)."
327
+ )
328
+ decoder_attention_modules.append(module)
329
+
330
+ flash_kvcache_attention_modules: list[MossAudioTokenizerMultiheadAttention] = []
331
+ if use_cuda_graph and HAS_FLASH_ATTN:
332
+ for module in decoder_attention_modules:
333
+ module._use_flash_kvcache = True
334
+ flash_kvcache_attention_modules.append(module)
335
+
336
+ decode_streaming_exit_stack = ExitStack()
337
+ try:
338
+ for decoder_module in model.decoder:
339
+ if isinstance(decoder_module, StreamingModule):
340
+ inner_stack = decoder_module.streaming(batch_size=max_batch_size)
341
+ _ = decode_streaming_exit_stack.enter_context(inner_stack)
342
+ except Exception:
343
+ decode_streaming_exit_stack.close()
344
+ for module in flash_kvcache_attention_modules:
345
+ module._use_flash_kvcache = False
346
+ raise
347
+
348
+ self.model = model
349
+ self.max_batch_size = max_batch_size
350
+ self._use_cuda_graph = use_cuda_graph
351
+ self.active_request_ids: list[str | int] = []
352
+ self.request_id_to_slot_index: dict[str | int, int] = {}
353
+ self.slot_index_to_request_id: list[str | int | None] = [None] * max_batch_size
354
+ self.slot_is_free: list[bool] = [True] * max_batch_size
355
+ self.request_id_to_code_offset: dict[str | int, int] = {}
356
+ self.request_id_to_audio_offset: dict[str | int, int] = {}
357
+ self._flash_kvcache_attention_modules = flash_kvcache_attention_modules
358
+ self._graph_num_quantizers_capacity = int(getattr(model.quantizer, "num_quantizers", 0)) if use_cuda_graph else None
359
+ self._graph_input_codes = None
360
+ self._graph_input_code_lengths = None
361
+ self._graph_output_audio = None
362
+ self._graph_output_audio_lengths = None
363
+ self._cuda_graph = None
364
+ self._cuda_graph_key = None
365
+ self._decode_streaming_exit_stack: ExitStack | None = decode_streaming_exit_stack
366
+ self._closed = False
367
+ if use_cuda_graph:
368
+ device = next(iter(model.parameters())).device
369
+ if device.type == "cuda":
370
+ self._ensure_cuda_graph_buffers(device)
371
+ model._active_decode_session = self
372
+
373
+ def _ensure_open(self) -> None:
374
+ if self._closed:
375
+ raise RuntimeError(_CLOSED_DECODE_SESSION_ERROR_MESSAGE)
376
+
377
+ def append(self, request_id: str | int) -> None:
378
+ self._ensure_open()
379
+
380
+ if request_id in self.request_id_to_slot_index:
381
+ raise RuntimeError(_DUPLICATE_DECODE_REQUEST_ERROR_TEMPLATE.format(request_id=request_id))
382
+
383
+ slot_index = next((index for index, is_free in enumerate(self.slot_is_free) if is_free), None)
384
+ if slot_index is None:
385
+ raise RuntimeError(_DECODE_SESSION_FULL_ERROR_TEMPLATE.format(max_batch_size=self.max_batch_size))
386
+
387
+ self.active_request_ids.append(request_id)
388
+ self.request_id_to_slot_index[request_id] = slot_index
389
+ self.slot_index_to_request_id[slot_index] = request_id
390
+ self.slot_is_free[slot_index] = False
391
+ self.request_id_to_code_offset[request_id] = 0
392
+ self.request_id_to_audio_offset[request_id] = 0
393
+
394
+ def _decoder_streaming_states(self) -> list[StreamingState]:
395
+ decoder_streaming_states: list[StreamingState] = []
396
+ for decoder_module in self.model.decoder:
397
+ for module in decoder_module.modules():
398
+ if isinstance(module, StreamingModule) and module._streaming_state is not None:
399
+ decoder_streaming_states.append(module._streaming_state)
400
+ return decoder_streaming_states
401
+
402
+ def _ensure_cuda_graph_buffers(self, device: torch.device) -> None:
403
+ if not self._use_cuda_graph or device.type != "cuda":
404
+ return
405
+ graph_num_quantizers_capacity = self._graph_num_quantizers_capacity
406
+ if graph_num_quantizers_capacity is None:
407
+ graph_num_quantizers_capacity = int(getattr(self.model.quantizer, "num_quantizers", 0))
408
+ self._graph_num_quantizers_capacity = graph_num_quantizers_capacity
409
+ if graph_num_quantizers_capacity <= 0:
410
+ raise RuntimeError("`use_cuda_graph=True` requires a quantizer with `num_quantizers > 0`.")
411
+ if self._graph_input_codes is None or self._graph_input_codes.device != device:
412
+ self._graph_input_codes = torch.zeros(
413
+ (graph_num_quantizers_capacity, self.max_batch_size, 1),
414
+ device=device,
415
+ dtype=torch.long,
416
+ )
417
+ self._graph_input_code_lengths = torch.zeros(self.max_batch_size, device=device, dtype=torch.long)
418
+ self._graph_output_audio = None
419
+ self._graph_output_audio_lengths = None
420
+ self._cuda_graph = None
421
+ self._cuda_graph_key = None
422
+
423
+ def _snapshot_decoder_streaming_states(self) -> list[tuple[StreamingState, dict[str, torch.Tensor | None]]]:
424
+ snapshots: list[tuple[StreamingState, dict[str, torch.Tensor | None]]] = []
425
+ for streaming_state in self._decoder_streaming_states():
426
+ state_snapshot: dict[str, torch.Tensor | None] = {"exec_mask": streaming_state.exec_mask.clone()}
427
+ if isinstance(streaming_state, TransformerState):
428
+ state_snapshot["offsets"] = streaming_state.offsets.clone()
429
+ if isinstance(streaming_state, MHAState):
430
+ state_snapshot["offset"] = streaming_state.offset.clone()
431
+ state_snapshot["cached_keys"] = None if streaming_state.cached_keys is None else streaming_state.cached_keys.clone()
432
+ state_snapshot["cached_values"] = None if streaming_state.cached_values is None else streaming_state.cached_values.clone()
433
+ state_snapshot["cached_positions"] = (
434
+ None if streaming_state.cached_positions is None else streaming_state.cached_positions.clone()
435
+ )
436
+ state_snapshot["flash_cached_keys"] = (
437
+ None
438
+ if getattr(streaming_state, "_flash_cached_keys", None) is None
439
+ else cast(torch.Tensor, getattr(streaming_state, "_flash_cached_keys")).clone()
440
+ )
441
+ state_snapshot["flash_cached_values"] = (
442
+ None
443
+ if getattr(streaming_state, "_flash_cached_values", None) is None
444
+ else cast(torch.Tensor, getattr(streaming_state, "_flash_cached_values")).clone()
445
+ )
446
+ snapshots.append((streaming_state, state_snapshot))
447
+ return snapshots
448
+
449
+ def _restore_decoder_streaming_states(
450
+ self,
451
+ snapshots: list[tuple[StreamingState, dict[str, torch.Tensor | None]]],
452
+ ) -> None:
453
+ for streaming_state, state_snapshot in snapshots:
454
+ exec_mask = state_snapshot["exec_mask"]
455
+ assert exec_mask is not None
456
+ streaming_state.exec_mask.copy_(exec_mask)
457
+ if isinstance(streaming_state, TransformerState):
458
+ offsets = state_snapshot.get("offsets")
459
+ assert offsets is not None
460
+ streaming_state.offsets.copy_(offsets)
461
+ if isinstance(streaming_state, MHAState):
462
+ offset = state_snapshot.get("offset")
463
+ assert offset is not None
464
+ streaming_state.offset.copy_(offset)
465
+ cached_keys = state_snapshot.get("cached_keys")
466
+ cached_values = state_snapshot.get("cached_values")
467
+ cached_positions = state_snapshot.get("cached_positions")
468
+ if cached_keys is None or cached_values is None or cached_positions is None:
469
+ if streaming_state.cached_keys is not None:
470
+ streaming_state.cached_keys.zero_()
471
+ if streaming_state.cached_values is not None:
472
+ streaming_state.cached_values.zero_()
473
+ if streaming_state.cached_positions is not None:
474
+ streaming_state.cached_positions.fill_(-1)
475
+ else:
476
+ if streaming_state.cached_keys is None or streaming_state.cached_keys.shape != cached_keys.shape:
477
+ streaming_state.cached_keys = cached_keys.clone()
478
+ else:
479
+ streaming_state.cached_keys.copy_(cached_keys)
480
+ if streaming_state.cached_values is None or streaming_state.cached_values.shape != cached_values.shape:
481
+ streaming_state.cached_values = cached_values.clone()
482
+ else:
483
+ streaming_state.cached_values.copy_(cached_values)
484
+ if streaming_state.cached_positions is None or streaming_state.cached_positions.shape != cached_positions.shape:
485
+ streaming_state.cached_positions = cached_positions.clone()
486
+ else:
487
+ streaming_state.cached_positions.copy_(cached_positions)
488
+
489
+ flash_cached_keys = state_snapshot.get("flash_cached_keys")
490
+ flash_cached_values = state_snapshot.get("flash_cached_values")
491
+ current_flash_cached_keys = cast(torch.Tensor | None, getattr(streaming_state, "_flash_cached_keys", None))
492
+ current_flash_cached_values = cast(torch.Tensor | None, getattr(streaming_state, "_flash_cached_values", None))
493
+ if flash_cached_keys is None or flash_cached_values is None:
494
+ if current_flash_cached_keys is not None:
495
+ current_flash_cached_keys.zero_()
496
+ if current_flash_cached_values is not None:
497
+ current_flash_cached_values.zero_()
498
+ else:
499
+ if current_flash_cached_keys is None or current_flash_cached_keys.shape != flash_cached_keys.shape:
500
+ setattr(streaming_state, "_flash_cached_keys", flash_cached_keys.clone())
501
+ else:
502
+ current_flash_cached_keys.copy_(flash_cached_keys)
503
+ if current_flash_cached_values is None or current_flash_cached_values.shape != flash_cached_values.shape:
504
+ setattr(streaming_state, "_flash_cached_values", flash_cached_values.clone())
505
+ else:
506
+ current_flash_cached_values.copy_(flash_cached_values)
507
+
508
+ def _graphed_decode_frame(
509
+ self,
510
+ codes: torch.Tensor,
511
+ code_lengths: torch.Tensor,
512
+ ) -> MossAudioTokenizerDecoderOutput:
513
+ self._ensure_cuda_graph_buffers(codes.device)
514
+ graph_input_codes = self._graph_input_codes
515
+ graph_input_code_lengths = self._graph_input_code_lengths
516
+ if graph_input_codes is None or graph_input_code_lengths is None:
517
+ raise RuntimeError("CUDA graph buffers are unavailable.")
518
+
519
+ num_quantizers = codes.shape[0]
520
+ graph_input_codes_view = graph_input_codes[:num_quantizers]
521
+ graph_input_codes_view.copy_(codes)
522
+ graph_input_code_lengths.copy_(code_lengths)
523
+ cuda_graph_key = (str(codes.device), self.max_batch_size, num_quantizers, self.model.compute_dtype_name)
524
+
525
+ if self._cuda_graph is None or self._cuda_graph_key != cuda_graph_key:
526
+ state_snapshots = self._snapshot_decoder_streaming_states()
527
+ current_stream = torch.cuda.current_stream(device=codes.device)
528
+ warmup_stream = torch.cuda.Stream(device=codes.device)
529
+ warmup_stream.wait_stream(current_stream)
530
+ with torch.cuda.stream(warmup_stream):
531
+ _ = self.model._decode_frame(graph_input_codes_view, graph_input_code_lengths)
532
+ current_stream.wait_stream(warmup_stream)
533
+ self._restore_decoder_streaming_states(state_snapshots)
534
+
535
+ cuda_graph = torch.cuda.CUDAGraph()
536
+ with torch.cuda.graph(cuda_graph):
537
+ decoder_output = self.model._decode_frame(graph_input_codes_view, graph_input_code_lengths)
538
+
539
+ self._cuda_graph = cuda_graph
540
+ self._cuda_graph_key = cuda_graph_key
541
+ self._graph_output_audio = decoder_output.audio
542
+ self._graph_output_audio_lengths = decoder_output.audio_lengths
543
+ else:
544
+ self._cuda_graph.replay()
545
+
546
+ return MossAudioTokenizerDecoderOutput(
547
+ audio=self._graph_output_audio,
548
+ audio_lengths=self._graph_output_audio_lengths,
549
+ )
550
+
551
+ def _reset_slot(self, slot_index: int) -> None:
552
+ for streaming_state in self._decoder_streaming_states():
553
+ reset_mask = torch.zeros(streaming_state.batch_size, dtype=torch.bool, device=streaming_state.exec_mask.device)
554
+ reset_mask[slot_index] = True
555
+ streaming_state.reset(reset_mask)
556
+
557
+ def _pack_logical_codes_to_physical_slots(
558
+ self,
559
+ request_ids: list[str | int],
560
+ codes: torch.Tensor,
561
+ code_lengths: torch.Tensor,
562
+ ) -> tuple[torch.Tensor, torch.Tensor, list[int], torch.Tensor]:
563
+ if request_ids != self.active_request_ids:
564
+ raise ValueError(_INVALID_DECODE_STEP_REQUEST_IDS_ERROR_MESSAGE)
565
+
566
+ if not request_ids:
567
+ raise ValueError("`step()` requires at least one active request.")
568
+
569
+ if codes.dim() == 2:
570
+ codes = codes.unsqueeze(1)
571
+ if codes.dim() != 3:
572
+ raise ValueError(f"`codes` must be 3D with shape `(num_quantizers, batch_size, sequence_length)`, got {codes.shape}.")
573
+
574
+ code_lengths = code_lengths.to(device=codes.device, dtype=torch.long)
575
+ if code_lengths.dim() != 1:
576
+ raise ValueError(f"`code_lengths` must be 1D with shape `(batch_size,)`, got {code_lengths.shape}.")
577
+
578
+ num_quantizers, logical_batch_size, max_code_length = codes.shape
579
+ if logical_batch_size != len(request_ids):
580
+ raise ValueError(
581
+ f"`codes.shape[1]` ({logical_batch_size}) must match len(`request_ids`) ({len(request_ids)})."
582
+ )
583
+ if code_lengths.shape[0] != logical_batch_size:
584
+ raise ValueError(
585
+ f"`code_lengths.shape[0]` ({code_lengths.shape[0]}) must match len(`request_ids`) ({len(request_ids)})."
586
+ )
587
+ if torch.any(code_lengths < 0):
588
+ raise ValueError("`code_lengths` must be >= 0.")
589
+ if torch.any(code_lengths > max_code_length):
590
+ raise ValueError(f"`code_lengths` must be <= codes.shape[-1] ({max_code_length}).")
591
+
592
+ packed_codes = codes.new_zeros((num_quantizers, self.max_batch_size, max_code_length))
593
+ packed_code_lengths = code_lengths.new_zeros((self.max_batch_size,))
594
+ logical_row_to_slot_index: list[int] = []
595
+
596
+ for logical_row_index, request_id in enumerate(request_ids):
597
+ slot_index = self.request_id_to_slot_index[request_id]
598
+ logical_row_to_slot_index.append(slot_index)
599
+ row_length = int(code_lengths[logical_row_index].item())
600
+ if row_length > 0:
601
+ packed_codes[:, slot_index, :row_length] = codes[:, logical_row_index, :row_length]
602
+ packed_code_lengths[slot_index] = row_length
603
+
604
+ return packed_codes, packed_code_lengths, logical_row_to_slot_index, code_lengths
605
+
606
+ def _advance_request_progress(
607
+ self,
608
+ request_ids: list[str | int],
609
+ code_lengths: torch.Tensor,
610
+ audio_lengths: torch.Tensor,
611
+ ) -> None:
612
+ for logical_row_index, request_id in enumerate(request_ids):
613
+ self.request_id_to_code_offset[request_id] += int(code_lengths[logical_row_index].item())
614
+ self.request_id_to_audio_offset[request_id] += int(audio_lengths[logical_row_index].item())
615
+
616
    def step(
        self,
        request_ids: list[str | int],
        codes: torch.Tensor,
        code_lengths: torch.Tensor,
    ) -> tuple[list[str | int], torch.Tensor, torch.Tensor]:
        """Decode one batched chunk of codes for the given active requests.

        Args:
            request_ids: Logical request identifiers, one per row of `codes`.
            codes: Code tensor for this step; frames are taken along the last
                dim as `[:, :, frame]`, so presumably shaped
                (batch, num_quantizers, frames) -- TODO confirm against caller.
            code_lengths: Per-row count of valid frames in `codes`.

        Returns:
            `(request_ids, audio, audio_lengths)` where `audio` is a
            zero-padded (batch, channels, samples) tensor in caller order and
            `audio_lengths` holds each row's valid sample count.

        Raises:
            ValueError: If no row has `code_length > 0`.
            RuntimeError: If the decoder yields no audio (internal error).
        """
        self._ensure_open()

        # Re-pack the caller-ordered (logical) rows into this session's
        # physical slot layout expected by the streaming decoder state.
        packed_codes, packed_code_lengths, logical_row_to_slot_index, logical_code_lengths = (
            self._pack_logical_codes_to_physical_slots(
                request_ids=request_ids,
                codes=codes,
                code_lengths=code_lengths,
            )
        )
        max_step_length = int(packed_code_lengths.max().item())

        if max_step_length <= 0:
            raise ValueError("`step()` requires at least one row with `code_length > 0`.")

        decoder_streaming_states = self._decoder_streaming_states()
        # One chunk list per logical row; concatenated along time afterwards.
        logical_audio_chunks: list[list[torch.Tensor]] = [[] for _ in request_ids]
        audio_device: torch.device | None = None
        audio_dtype: torch.dtype | None = None
        audio_num_channels: int | None = None

        try:
            # Decode frame by frame; rows shorter than the current frame index
            # are masked out via the per-state exec mask so their streaming
            # state does not advance.
            for frame_index in range(max_step_length):
                frame_exec_mask = packed_code_lengths > frame_index
                for streaming_state in decoder_streaming_states:
                    streaming_state.set_exec_mask(frame_exec_mask)

                frame_codes = packed_codes[:, :, frame_index : frame_index + 1]
                frame_code_lengths = frame_exec_mask.to(dtype=packed_code_lengths.dtype)
                # Replay the captured CUDA graph only when enabled and the
                # inputs actually live on a CUDA device.
                if self._use_cuda_graph and frame_codes.is_cuda:
                    decoder_output = self._graphed_decode_frame(frame_codes, frame_code_lengths)
                else:
                    decoder_output = self.model._decode_frame(frame_codes, frame_code_lengths)

                if decoder_output.audio is None or decoder_output.audio_lengths is None:
                    raise RuntimeError("Internal error: `_decode_frame` returned empty audio.")

                audio = decoder_output.audio
                audio_lengths = decoder_output.audio_lengths
                audio_device = audio.device
                audio_dtype = audio.dtype
                audio_num_channels = audio.shape[1]

                # Copy each active slot's valid samples back to its logical row.
                for logical_row_index, slot_index in enumerate(logical_row_to_slot_index):
                    audio_length = int(audio_lengths[slot_index].item())
                    if audio_length <= 0:
                        continue
                    logical_audio_chunks[logical_row_index].append(audio[slot_index : slot_index + 1, :, :audio_length])
        except Exception:
            # Streaming state may be inconsistent after a partial step; tear
            # the whole session down rather than continue from bad state.
            self.close()
            raise
        finally:
            # Restore all-rows-active exec masks regardless of outcome.
            for streaming_state in decoder_streaming_states:
                streaming_state.set_exec_mask(torch.ones_like(streaming_state.exec_mask))

        if audio_device is None or audio_dtype is None or audio_num_channels is None:
            raise RuntimeError("Internal error: `step()` produced no decoder outputs.")

        # Concatenate each row's per-frame chunks along time; rows that
        # produced nothing get an empty (1, C, 0) placeholder.
        logical_audio_rows: list[torch.Tensor] = []
        logical_audio_lengths: list[int] = []
        for row_chunks in logical_audio_chunks:
            if row_chunks:
                row_audio = torch.cat(row_chunks, dim=-1)
            else:
                row_audio = torch.zeros((1, audio_num_channels, 0), device=audio_device, dtype=audio_dtype)
            logical_audio_rows.append(row_audio)
            logical_audio_lengths.append(row_audio.shape[-1])

        audio_lengths = torch.tensor(logical_audio_lengths, device=audio_device, dtype=torch.long)
        max_audio_length = max(logical_audio_lengths)
        # Zero-pad every row to the longest row so the batch is rectangular.
        audio = torch.zeros(
            (len(request_ids), audio_num_channels, max_audio_length),
            device=audio_device,
            dtype=audio_dtype,
        )
        for logical_row_index, row_audio in enumerate(logical_audio_rows):
            row_audio_length = row_audio.shape[-1]
            if row_audio_length > 0:
                audio[logical_row_index, :, :row_audio_length] = row_audio[0]

        logical_request_ids = list(request_ids)
        # Advance per-request code/audio offsets for the frames just consumed.
        self._advance_request_progress(
            request_ids=logical_request_ids,
            code_lengths=logical_code_lengths,
            audio_lengths=audio_lengths,
        )

        return logical_request_ids, audio, audio_lengths
709
+
710
+ def remove(self, request_id: str | int) -> None:
711
+ self._ensure_open()
712
+
713
+ slot_index = self.request_id_to_slot_index.get(request_id)
714
+ if slot_index is None or request_id not in self.active_request_ids:
715
+ raise RuntimeError(_UNKNOWN_DECODE_REQUEST_ERROR_TEMPLATE.format(request_id=request_id))
716
+ if self.slot_is_free[slot_index] or self.slot_index_to_request_id[slot_index] != request_id:
717
+ raise RuntimeError(_UNKNOWN_DECODE_REQUEST_ERROR_TEMPLATE.format(request_id=request_id))
718
+
719
+ self.active_request_ids.remove(request_id)
720
+ self._reset_slot(slot_index)
721
+ _ = self.request_id_to_slot_index.pop(request_id)
722
+ self.slot_index_to_request_id[slot_index] = None
723
+ self.slot_is_free[slot_index] = True
724
+ _ = self.request_id_to_code_offset.pop(request_id, None)
725
+ _ = self.request_id_to_audio_offset.pop(request_id, None)
726
+
727
+ def close(self) -> None:
728
+ if self._closed:
729
+ return
730
+
731
+ self._closed = True
732
+ decode_streaming_exit_stack = self._decode_streaming_exit_stack
733
+ self._decode_streaming_exit_stack = None
734
+ try:
735
+ if decode_streaming_exit_stack is not None:
736
+ decode_streaming_exit_stack.close()
737
+ finally:
738
+ for module in self._flash_kvcache_attention_modules:
739
+ module._use_flash_kvcache = False
740
+ self._flash_kvcache_attention_modules = []
741
+ self._cuda_graph = None
742
+ self._cuda_graph_key = None
743
+ self._graph_input_codes = None
744
+ self._graph_input_code_lengths = None
745
+ self._graph_output_audio = None
746
+ self._graph_output_audio_lengths = None
747
+ if self.model._active_decode_session is self:
748
+ self.model._active_decode_session = None
749
+
750
+
751
  # =============================================================================
752
  # Normalization Layers
753
  # =============================================================================
 
1118
  # =============================================================================
1119
 
1120
 
1121
+ _sync_module_proxy()
1122
  @dataclass
1123
  class MHAState(StreamingState):
1124
  cached_keys: torch.Tensor | None
 
1198
  f"Expected one of {sorted(SUPPORTED_ATTENTION_IMPLEMENTATIONS)}."
1199
  )
1200
  self.attention_implementation = attention_implementation
1201
+ self._use_flash_kvcache = False
1202
  self.in_proj = nn.Linear(embed_dim, 3 * embed_dim, bias=False, **factory_kwargs)
1203
  self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False, **factory_kwargs)
1204
 
 
1333
  state.cached_positions = state.cached_positions.to(device=device)
1334
  return state.cached_keys, state.cached_values, state.cached_positions
1335
 
1336
+ def _ensure_flash_kvcache(
1337
+ self,
1338
+ state: MHAState,
1339
+ batch_size: int,
1340
+ device: torch.device,
1341
+ dtype: torch.dtype,
1342
+ ) -> tuple[torch.Tensor, torch.Tensor]:
1343
+ if self.context is None:
1344
+ raise RuntimeError("flash_attn_with_kvcache requires a finite streaming context.")
1345
+ head_dim = self.embed_dim // self.num_heads
1346
+ flash_cached_keys = cast(torch.Tensor | None, getattr(state, "_flash_cached_keys", None))
1347
+ flash_cached_values = cast(torch.Tensor | None, getattr(state, "_flash_cached_values", None))
1348
+ if flash_cached_keys is None or flash_cached_values is None:
1349
+ flash_cached_keys = torch.zeros(
1350
+ (batch_size, self.context, self.num_heads, head_dim),
1351
+ device=device,
1352
+ dtype=dtype,
1353
+ )
1354
+ flash_cached_values = torch.zeros_like(flash_cached_keys)
1355
+ else:
1356
+ if flash_cached_keys.device != device or flash_cached_keys.dtype != dtype:
1357
+ flash_cached_keys = flash_cached_keys.to(device=device, dtype=dtype)
1358
+ if flash_cached_values.device != device or flash_cached_values.dtype != dtype:
1359
+ flash_cached_values = flash_cached_values.to(device=device, dtype=dtype)
1360
+ setattr(state, "_flash_cached_keys", flash_cached_keys)
1361
+ setattr(state, "_flash_cached_values", flash_cached_values)
1362
+ return flash_cached_keys, flash_cached_values
1363
+
1364
  def _build_streaming_kv(
1365
  self,
1366
  cached_k: torch.Tensor,
 
1395
  state.cached_positions = pos_k.contiguous()
1396
  return
1397
 
1398
+ assert state.cached_keys is not None
1399
+ assert state.cached_values is not None
1400
+ assert state.cached_positions is not None
1401
  new_cached_k = k_all[:, :, -self.context :, :].contiguous()
1402
  new_cached_v = v_all[:, :, -self.context :, :].contiguous()
1403
  new_cached_pos = pos_k[:, -self.context :].contiguous()
1404
+ state.cached_keys.copy_(torch.where(exec_mask, new_cached_k, cached_k))
1405
+ state.cached_values.copy_(torch.where(exec_mask, new_cached_v, cached_v))
1406
+ state.cached_positions.copy_(torch.where(exec_mask_pos, new_cached_pos, cached_pos))
1407
 
1408
  def _build_streaming_sdpa_bias(self, pos_q: torch.Tensor, pos_k: torch.Tensor) -> torch.Tensor:
1409
  delta = pos_q[:, :, None] - pos_k[:, None, :]
 
1443
  if flash_attn_varlen_func is None:
1444
  raise RuntimeError("flash-attn is not installed.")
1445
  window_size = (self.context, 0) if (self.context is not None and self.causal) else (-1, -1)
1446
+ return cast(
1447
+ torch.Tensor,
1448
+ flash_attn_varlen_func(
1449
+ q.contiguous(),
1450
+ k.contiguous(),
1451
+ v.contiguous(),
1452
+ cu_seqlens_q,
1453
+ cu_seqlens_k,
1454
+ max_seqlen_q,
1455
+ max_seqlen_k,
1456
+ causal=self.causal,
1457
+ window_size=window_size,
1458
+ ),
1459
  )
1460
 
1461
  def _forward_streaming_sdpa(self, x: torch.Tensor, state: MHAState) -> torch.Tensor:
 
1524
  state.offset[:] = torch.where(state.exec_mask, state.offset + chunk_length, state.offset)
1525
  return out
1526
 
1527
    def _forward_streaming_flash_kvcache(self, x: torch.Tensor, state: MHAState) -> torch.Tensor:
        """Streaming attention step backed by `flash_attn_with_kvcache`.

        Processes one chunk `x` of shape (batch, chunk_length, embed_dim),
        appending the chunk's keys/values into the persistent KV cache held on
        `state` and advancing `state.offset` only for rows enabled by
        `state.exec_mask`.

        Raises:
            RuntimeError: If this layer has no finite streaming context.
        """
        # Imported lazily so the module stays importable without flash-attn.
        from flash_attn import flash_attn_with_kvcache

        if self.context is None:
            raise RuntimeError("flash_attn_with_kvcache requires a finite streaming context.")

        batch_size, chunk_length, _ = x.shape
        q, k_cur, v_cur = self._project_qkv(x)
        if self.rope is not None:
            # Rotary embedding applied at the absolute per-row stream offset.
            q, k_cur = self.rope(q, k_cur, state.offset, time_before_heads=False)

        # flash_attn_with_kvcache expects (batch, seqlen, nheads, head_dim);
        # assumes _project_qkv returns heads-before-time -- TODO confirm.
        q = q.transpose(1, 2).contiguous()
        k_cur = k_cur.transpose(1, 2).contiguous()
        v_cur = v_cur.transpose(1, 2).contiguous()

        # Zero the new K/V for masked-out rows so inactive rows append zeros
        # instead of stale chunk data into the shared cache buffers.
        exec_mask = state.exec_mask.view(batch_size, 1, 1, 1).to(dtype=k_cur.dtype)
        k_cur = k_cur * exec_mask
        v_cur = v_cur * exec_mask

        k_cache, v_cache = self._ensure_flash_kvcache(state, batch_size, k_cur.device, k_cur.dtype)
        # NOTE(review): cache_seqlens is clamped to `context`; this relies on
        # flash-attn's in-place KV append semantics once the fixed-size cache
        # is full -- confirm overwrite behavior against the flash-attn docs.
        cache_seqlens = state.offset.clamp(max=self.context).to(torch.int32)
        # Left window of context-1 past positions plus the current one.
        window_size = (self.context - 1, 0)

        out = cast(
            torch.Tensor,
            flash_attn_with_kvcache(
                q,
                k_cache,
                v_cache,
                k=k_cur,
                v=v_cur,
                cache_seqlens=cache_seqlens,
                causal=True,
                window_size=window_size,
            ),
        )
        out = out.reshape(batch_size, chunk_length, self.embed_dim)
        # Advance stream offsets only for rows that actually executed.
        state.offset[:] = torch.where(state.exec_mask, state.offset + chunk_length, state.offset)
        return out
1566
+
1567
  def _forward_non_streaming_sdpa(self, x: torch.Tensor, input_lengths: torch.Tensor) -> torch.Tensor:
1568
  batch_size, max_seqlen, _ = x.shape
1569
  q, k, v = self._project_qkv(x)
 
1605
  if state is not None:
1606
  if query.dim() != 3:
1607
  raise ValueError(f"Streaming attention expects a 3D tensor, got shape {tuple(query.shape)}")
1608
+ if backend == "flash_attention_2" and self._use_flash_kvcache:
1609
+ out = self._forward_streaming_flash_kvcache(query, state)
1610
+ elif backend == "flash_attention_2":
1611
+ out = self._forward_streaming_flash(query, state)
1612
+ else:
1613
+ out = self._forward_streaming_sdpa(query, state)
1614
  return self.out_proj(out)
1615
 
1616
  if backend == "flash_attention_2":
 
1634
  # =============================================================================
1635
 
1636
 
1637
+ _sync_module_proxy()
1638
  @dataclass
1639
  class LayerState(StreamingState):
1640
  pass
 
1726
  # =============================================================================
1727
 
1728
 
1729
+ _sync_module_proxy()
1730
  @dataclass
1731
  class TransformerState(StreamingState):
1732
  offsets: torch.Tensor
 
2399
  )
2400
 
2401
  self.post_init()
2402
+ self._active_decode_session: "MossAudioTokenizerDecodeSession | None" = None
2403
+ self._batch_decode_streaming_max_batch_size: int | None = None
2404
+ self._batch_decode_streaming_batch_size: int | None = None
2405
+ self._batch_decode_streaming_session: "MossAudioTokenizerDecodeSession | None" = None
2406
+ self._batch_decode_streaming_next_request_id: int = 0
2407
+
2408
+ def create_decode_session(
2409
+ self,
2410
+ max_batch_size: int,
2411
+ use_cuda_graph: bool = False,
2412
+ ) -> MossAudioTokenizerDecodeSession:
2413
+ active_session = self._active_decode_session
2414
+ if active_session is not None and not active_session._closed:
2415
+ raise RuntimeError(_ACTIVE_DECODE_SESSION_ERROR_MESSAGE)
2416
+
2417
+ for module in self.modules():
2418
+ if isinstance(module, StreamingModule) and module._streaming_state is not None:
2419
+ raise RuntimeError(_MODEL_STREAMING_CONFLICT_ERROR_MESSAGE)
2420
+
2421
+ session = MossAudioTokenizerDecodeSession(self, max_batch_size, use_cuda_graph=use_cuda_graph)
2422
+ return session
2423
+
2424
+ def _reset_batch_decode_streaming_state(self) -> None:
2425
+ streaming_session = self._batch_decode_streaming_session
2426
+ self._batch_decode_streaming_session = None
2427
+ self._batch_decode_streaming_max_batch_size = None
2428
+ self._batch_decode_streaming_batch_size = None
2429
+ self._batch_decode_streaming_next_request_id = 0
2430
+ if streaming_session is not None and not streaming_session._closed:
2431
+ streaming_session.close()
2432
+
2433
+ def _prepare_batch_decode_streaming_state(
2434
+ self,
2435
+ batch_size: int,
2436
+ max_batch_size: int | None,
2437
+ reset_stream: bool,
2438
+ ) -> int:
2439
+ if reset_stream:
2440
+ self._reset_batch_decode_streaming_state()
2441
+
2442
+ if max_batch_size is not None and max_batch_size <= 0:
2443
+ raise ValueError("`max_batch_size` must be > 0 when provided.")
2444
+
2445
+ streaming_max_batch_size = self._batch_decode_streaming_max_batch_size
2446
+ if streaming_max_batch_size is None:
2447
+ streaming_max_batch_size = batch_size if max_batch_size is None else max_batch_size
2448
+ elif max_batch_size is not None and max_batch_size != streaming_max_batch_size:
2449
+ raise ValueError(
2450
+ "`max_batch_size` can only be set on the first streaming `batch_decode()` call for now. "
2451
+ f"Expected {streaming_max_batch_size}, got {max_batch_size}."
2452
+ )
2453
+
2454
+ if batch_size > streaming_max_batch_size:
2455
+ raise ValueError(
2456
+ "Streaming `batch_decode()` received a batch larger than the reserved `max_batch_size`. "
2457
+ f"Got batch_size={batch_size}, max_batch_size={streaming_max_batch_size}."
2458
+ )
2459
+
2460
+ return streaming_max_batch_size
2461
+
2462
+ def _ensure_batch_decode_streaming_session(
2463
+ self,
2464
+ max_batch_size: int,
2465
+ use_cuda_graph: bool = False,
2466
+ ) -> MossAudioTokenizerDecodeSession:
2467
+ session = self._batch_decode_streaming_session
2468
+ if session is not None and not session._closed:
2469
+ if session._use_cuda_graph != use_cuda_graph:
2470
+ raise ValueError(
2471
+ "`use_cuda_graph` must match the existing streaming `batch_decode()` session configuration. "
2472
+ f"Expected {session._use_cuda_graph}, got {use_cuda_graph}."
2473
+ )
2474
+ return session
2475
+
2476
+ session = self.create_decode_session(max_batch_size=max_batch_size, use_cuda_graph=use_cuda_graph)
2477
+ self._batch_decode_streaming_session = session
2478
+ self._batch_decode_streaming_max_batch_size = max_batch_size
2479
+ self._batch_decode_streaming_next_request_id = 0
2480
+ return session
2481
+
2482
+ def _append_batch_decode_streaming_requests(
2483
+ self,
2484
+ session: MossAudioTokenizerDecodeSession,
2485
+ target_batch_size: int,
2486
+ ) -> None:
2487
+ requests_to_append = target_batch_size - len(session.active_request_ids)
2488
+ for _ in range(requests_to_append):
2489
+ request_id = self._batch_decode_streaming_next_request_id
2490
+ session.append(request_id)
2491
+ self._batch_decode_streaming_next_request_id += 1
2492
+
2493
+ def _resolve_batch_decode_streaming_finalize_request_ids(
2494
+ self,
2495
+ request_ids: list[str | int],
2496
+ finalize_indices: list[int] | tuple[int, ...] | None,
2497
+ ) -> list[str | int]:
2498
+ normalized_finalize_indices = tuple(finalize_indices) if finalize_indices is not None else ()
2499
+ if len(set(normalized_finalize_indices)) != len(normalized_finalize_indices):
2500
+ raise ValueError(_BATCH_DECODE_STREAMING_DUPLICATE_FINALIZE_INDICES_ERROR_MESSAGE)
2501
+
2502
+ batch_size = len(request_ids)
2503
+ finalize_request_ids: list[str | int] = []
2504
+ for index in normalized_finalize_indices:
2505
+ if index < 0 or index >= batch_size:
2506
+ raise ValueError(
2507
+ _BATCH_DECODE_STREAMING_FINALIZE_INDEX_OUT_OF_RANGE_ERROR_TEMPLATE.format(
2508
+ index=index, batch_size=batch_size
2509
+ )
2510
+ )
2511
+ finalize_request_ids.append(request_ids[index])
2512
+
2513
+ return finalize_request_ids
2514
+
2515
+ def _raise_if_plain_decode_conflicts_with_active_session(self) -> None:
2516
+ active_session = self._active_decode_session
2517
+ if active_session is not None and not getattr(active_session, "_closed", False):
2518
+ raise RuntimeError(_PLAIN_DECODE_SESSION_CONFLICT_ERROR_MESSAGE)
2519
 
2520
  def _start_streaming(self, batch_size: int):
2521
  """Start streaming mode for all modules."""
2522
+ active_session = self._active_decode_session
2523
+ if active_session is not None and not getattr(active_session, "_closed", False):
2524
+ raise RuntimeError(_MODEL_STREAMING_CONFLICT_ERROR_MESSAGE)
2525
 
2526
  def _start(module):
2527
  if isinstance(module, StreamingModule):
 
2531
 
2532
  def _stop_streaming(self):
2533
  """Stop streaming mode for all modules."""
2534
+ active_session = self._active_decode_session
2535
+ if active_session is not None and not getattr(active_session, "_closed", False):
2536
+ raise RuntimeError(_MODEL_STREAMING_CONFLICT_ERROR_MESSAGE)
2537
 
2538
  def _stop(module):
2539
  if isinstance(module, StreamingModule):
 
2905
  codes_list: list[torch.Tensor],
2906
  num_quantizers: int | None = None,
2907
  chunk_duration: float | None = None,
2908
+ streaming: bool = False,
2909
+ max_batch_size: int | None = None,
2910
+ finalize_indices: list[int] | tuple[int, ...] | None = None,
2911
+ reset_stream: bool = False,
2912
+ use_cuda_graph: bool = False,
2913
  ) -> MossAudioTokenizerDecoderOutput:
2914
+ if len(codes_list) == 0:
2915
+ raise ValueError("`codes_list` must contain at least one code tensor.")
2916
+
2917
+ streaming_max_batch_size: int | None = None
2918
+ if streaming:
2919
+ streaming_max_batch_size = self._prepare_batch_decode_streaming_state(
2920
+ batch_size=len(codes_list),
2921
+ max_batch_size=max_batch_size,
2922
+ reset_stream=reset_stream,
2923
+ )
2924
+ else:
2925
+ if reset_stream:
2926
+ self._reset_batch_decode_streaming_state()
2927
+ self._raise_if_plain_decode_conflicts_with_active_session()
2928
+
2929
  audio_codes, audio_codes_lengths, num_quantizers_used = self._prepare_codes_batch(
2930
  codes_list,
2931
  num_quantizers=num_quantizers,
 
2933
  batch_size = len(codes_list)
2934
  device = audio_codes.device
2935
 
2936
+ if not streaming and chunk_duration is None:
2937
  return self._decode_frame(audio_codes, audio_codes_lengths)
2938
 
2939
+ if streaming:
2940
+ assert streaming_max_batch_size is not None
2941
+ existing_session = self._batch_decode_streaming_session
2942
+ reusing_streaming_session = existing_session is not None and not existing_session._closed
2943
+ session = self._ensure_batch_decode_streaming_session(
2944
+ max_batch_size=streaming_max_batch_size,
2945
+ use_cuda_graph=use_cuda_graph,
2946
+ )
2947
+ pre_call_request_ids = list(session.active_request_ids)
2948
+ pre_call_batch_size = len(pre_call_request_ids)
2949
+ if batch_size < pre_call_batch_size:
2950
+ raise ValueError(_BATCH_DECODE_STREAMING_SHRINK_ERROR_MESSAGE)
2951
+
2952
+ try:
2953
+ finalize_request_ids = self._resolve_batch_decode_streaming_finalize_request_ids(
2954
+ request_ids=pre_call_request_ids,
2955
+ finalize_indices=finalize_indices,
2956
+ )
2957
+ except Exception:
2958
+ if not reusing_streaming_session and pre_call_batch_size == 0:
2959
+ self._reset_batch_decode_streaming_state()
2960
+ raise
2961
+
2962
+ try:
2963
+ if batch_size > pre_call_batch_size:
2964
+ self._append_batch_decode_streaming_requests(session=session, target_batch_size=batch_size)
2965
+
2966
+ request_ids = list(session.active_request_ids)
2967
+ _, audio, audio_lengths = session.step(
2968
+ request_ids=request_ids,
2969
+ codes=audio_codes,
2970
+ code_lengths=audio_codes_lengths,
2971
+ )
2972
+ for request_id in finalize_request_ids:
2973
+ session.remove(request_id)
2974
+ except Exception:
2975
+ self._reset_batch_decode_streaming_state()
2976
+ raise
2977
+
2978
+ self._batch_decode_streaming_max_batch_size = session.max_batch_size
2979
+ self._batch_decode_streaming_batch_size = len(session.active_request_ids)
2980
+ return MossAudioTokenizerDecoderOutput(audio=audio, audio_lengths=audio_lengths)
2981
+
2982
+ assert chunk_duration is not None
2983
  if chunk_duration <= 0:
2984
  raise ValueError("`chunk_duration` must be > 0 when provided.")
2985
 
 
3152
  `MossAudioTokenizerDecoderOutput` or tuple containing decoded audio.
3153
  """
3154
  return_dict = return_dict if return_dict is not None else self.config.return_dict
3155
+ self._raise_if_plain_decode_conflicts_with_active_session()
3156
 
3157
  if audio_codes.dim() == 2:
3158
  codes_list = [audio_codes[:num_quantizers] if num_quantizers is not None else audio_codes]