# File size: 138,814 Bytes

# Copyright 2026 OpenMOSS and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch MossAudioTokenizer model."""

from __future__ import annotations

import copy
import importlib
import math
import sys
import types
from contextlib import ExitStack, contextmanager
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import cast

import torch
import torch.nn as nn
import torch.nn.functional as F

# If this file is being executed without having been registered in
# `sys.modules` (presumably when loaded dynamically, e.g. via remote-code
# loading -- TODO confirm), install an empty placeholder module under our name
# so that code looking up `sys.modules[__name__]` finds something.
if __name__ not in sys.modules:
    _module_proxy = types.ModuleType(__name__)
    sys.modules[__name__] = _module_proxy


def _sync_module_proxy() -> None:
    """Copy the current module globals into the `sys.modules[__name__]` entry.

    Called before each `@dataclass` definition in this file so the registered
    module object exposes every name defined so far.
    NOTE(review): presumably needed because `dataclasses` resolves string
    annotations through `sys.modules[cls.__module__].__dict__` -- confirm.
    """
    sys.modules[__name__].__dict__.update(globals())

try:
    from transformers.modeling_utils import PreTrainedAudioTokenizerBase
except ImportError:
    from transformers.modeling_utils import PreTrainedModel as PreTrainedAudioTokenizerBase
from transformers.utils import ModelOutput, logging

try:
    from transformers.utils import auto_docstring as _hf_auto_docstring
except ImportError:
    _hf_auto_docstring = None

def auto_docstring(*args, **kwargs):
    """Best-effort wrapper around `transformers.utils.auto_docstring`.

    Supports both the bare form (`@auto_docstring`) and the parameterized form
    (`@auto_docstring(...)`). Whenever the upstream decorator is unavailable or
    raises, the decorated object is returned unchanged instead of failing.
    """

    def _passthrough(obj):
        return obj

    # Bare usage: the single positional argument is the object being decorated.
    used_bare = len(args) == 1 and callable(args[0]) and not kwargs

    if _hf_auto_docstring is None:
        return args[0] if used_bare else _passthrough

    if used_bare:
        target = args[0]
        try:
            return _hf_auto_docstring(target)
        except Exception:
            return target

    try:
        hf_decorator = _hf_auto_docstring(*args, **kwargs)
    except Exception:
        return _passthrough

    def safe_decorator(obj):
        try:
            return hf_decorator(obj)
        except Exception:
            return obj

    return safe_decorator

try:
    from .configuration_moss_audio_tokenizer import MossAudioTokenizerConfig
except ImportError:
    _module_dir = str(Path(__file__).resolve().parent)
    if _module_dir not in sys.path:
        sys.path.insert(0, _module_dir)
    from configuration_moss_audio_tokenizer import MossAudioTokenizerConfig


logger = logging.get_logger(__name__)


@lru_cache(maxsize=1)
def _get_flash_attn_module():
    """Import `flash_attn` once and cache the result.

    Returns the imported module, or `None` when the import fails for any
    reason (the failure is deliberately swallowed so the package is optional).
    """
    try:
        flash_attn = importlib.import_module("flash_attn")
    except Exception:
        flash_attn = None
    return flash_attn


def _has_flash_attn() -> bool:
    """Return True when the `flash_attn` package could be imported."""
    flash_attn = _get_flash_attn_module()
    return flash_attn is not None


def _get_flash_attn_varlen_func():
    """Return `flash_attn.flash_attn_varlen_func`, or `None` if unavailable."""
    module = _get_flash_attn_module()
    return None if module is None else getattr(module, "flash_attn_varlen_func", None)


def _get_flash_attn_with_kvcache():
    """Return `flash_attn.flash_attn_with_kvcache`, or `None` if unavailable."""
    module = _get_flash_attn_module()
    return None if module is None else getattr(module, "flash_attn_with_kvcache", None)


# Names of the attention backends this file recognizes.
# NOTE(review): the consumer of this set is outside this section -- confirm usage.
SUPPORTED_ATTENTION_IMPLEMENTATIONS = {"sdpa", "flash_attention_2"}
# Maps a compute-dtype name to the torch dtype returned by
# `resolve_compute_dtype`; "fp32" maps to None.
SUPPORTED_COMPUTE_DTYPES = {"fp32": None, "bf16": torch.bfloat16, "fp16": torch.float16}


# Error messages and templates for the decode-session / streaming helpers.
# Templates (`*_TEMPLATE`) are filled in with `str.format` at the raise site.
_ACTIVE_DECODE_SESSION_ERROR_MESSAGE = "MossAudioTokenizerModel only supports one active decode session at a time."
_CLOSED_DECODE_SESSION_ERROR_MESSAGE = "This decode session is closed."
_MODEL_STREAMING_CONFLICT_ERROR_MESSAGE = "Model-level streaming helpers cannot be used while a decode session is active."
_PLAIN_DECODE_SESSION_CONFLICT_ERROR_MESSAGE = "Plain decode helpers cannot be used while a decode session is active."
_DUPLICATE_DECODE_REQUEST_ERROR_TEMPLATE = "Decode session already contains request_id={request_id!r}."
_UNKNOWN_DECODE_REQUEST_ERROR_TEMPLATE = "Decode session does not contain an active request_id={request_id!r}."
_DECODE_SESSION_FULL_ERROR_TEMPLATE = "Decode session has no free slots remaining (max_batch_size={max_batch_size})."
_INVALID_DECODE_STEP_REQUEST_IDS_ERROR_MESSAGE = (
    "`request_ids` must exactly match the current active decode request order."
)
_BATCH_DECODE_STREAMING_DUPLICATE_FINALIZE_INDICES_ERROR_MESSAGE = "`finalize_indices` must not contain duplicates."
_BATCH_DECODE_STREAMING_FINALIZE_INDEX_OUT_OF_RANGE_ERROR_TEMPLATE = (
    "`finalize_indices` index {index} is out of range for the pre-call logical batch of size {batch_size}."
)
_BATCH_DECODE_STREAMING_SHRINK_ERROR_MESSAGE = (
    "`batch_decode(streaming=True)` must include all pre-call active rows in the current call before applying `finalize_indices`."
)


def resolve_compute_dtype(compute_dtype: str) -> torch.dtype | None:
    """Translate a compute-dtype name into its torch dtype.

    Returns the value registered in `SUPPORTED_COMPUTE_DTYPES` (which is `None`
    for "fp32").

    Raises:
        ValueError: if `compute_dtype` is not a registered name.
    """
    if compute_dtype in SUPPORTED_COMPUTE_DTYPES:
        return SUPPORTED_COMPUTE_DTYPES[compute_dtype]
    raise ValueError(
        f"Unsupported compute_dtype={compute_dtype!r}. Expected one of {sorted(SUPPORTED_COMPUTE_DTYPES)}."
    )


@contextmanager
def disable_cuda_autocast():
    """Context manager that turns CUDA autocast off for the enclosed block."""
    autocast_off = torch.autocast(device_type="cuda", enabled=False)
    with autocast_off:
        yield


# =============================================================================
# Output Classes
# =============================================================================


# Refresh the registered module entry before `@dataclass` processes the class
# below (see `_sync_module_proxy`).
_sync_module_proxy()
# Result container for the encode path: discrete codes, their valid lengths and
# (optionally) the pre-quantization encoder hidden states. All fields default
# to None. The docstring is left as-is because it is handed to `auto_docstring`.
@dataclass
@auto_docstring
class MossAudioTokenizerEncoderOutput(ModelOutput):
    r"""
    audio_codes (`torch.LongTensor` of shape `(num_quantizers, batch_size, sequence_length)`, *optional*):
        Discrete audio codes computed using the encoder and quantizer.
    audio_codes_lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Valid lengths for each sample's audio codes.
    encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, hidden_size, sequence_length)`, *optional*):
        Hidden states from the encoder before quantization.
    """

    audio_codes: torch.Tensor | None = None
    audio_codes_lengths: torch.Tensor | None = None
    encoder_hidden_states: torch.Tensor | None = None


# Refresh the registered module entry before `@dataclass` processes the class
# below (see `_sync_module_proxy`).
_sync_module_proxy()
# Result container for the decode path: the waveform and per-sample valid
# lengths. Both fields default to None. The docstring is left as-is because it
# is handed to `auto_docstring`.
@dataclass
@auto_docstring
class MossAudioTokenizerDecoderOutput(ModelOutput):
    r"""
    audio (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`, *optional*):
        Decoded audio waveform.
    audio_lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Valid lengths for each sample's audio.
    """

    audio: torch.Tensor | None = None
    audio_lengths: torch.Tensor | None = None


# Refresh the registered module entry before `@dataclass` processes the class
# below (see `_sync_module_proxy`).
_sync_module_proxy()
# Combined result container carrying both the decoded waveform (with lengths)
# and the discrete codes (with lengths). All fields default to None. The
# docstring is left as-is because it is handed to `auto_docstring`.
@dataclass
@auto_docstring
class MossAudioTokenizerOutput(ModelOutput):
    r"""
    audio (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`, *optional*):
        Decoded audio waveform.
    audio_lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Valid lengths for each sample's audio.
    audio_codes (`torch.LongTensor` of shape `(num_quantizers, batch_size, sequence_length)`, *optional*):
        Discrete audio codes computed using the encoder and quantizer.
    audio_codes_lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Valid lengths for each sample's audio codes.
    """

    audio: torch.Tensor | None = None
    audio_lengths: torch.Tensor | None = None
    audio_codes: torch.Tensor | None = None
    audio_codes_lengths: torch.Tensor | None = None


# =============================================================================
# Streaming Module Base Classes
# =============================================================================


# Refresh the registered module entry before the `@dataclass` that follows
# (see `_sync_module_proxy`).
_sync_module_proxy()
@dataclass
class StreamingState:
    """Base per-module state used while a module is in streaming mode.

    Holds a boolean `exec_mask` with one entry per batch row; it is created
    all-True on `device`. The object is also a no-op context manager so it can
    be registered on an `ExitStack`.
    """

    batch_size: int
    device: torch.device

    def __post_init__(self):
        # Every batch row starts out enabled.
        self.exec_mask = torch.ones(self.batch_size, dtype=torch.bool, device=self.device)

    def set_exec_mask(self, exec_mask: torch.Tensor):
        """Overwrite the execution mask in place with `exec_mask`."""
        self.exec_mask[...] = exec_mask

    def reset(self, reset_mask: torch.Tensor) -> None:
        """Re-enable (set to True) every row selected by `reset_mask`."""
        all_enabled = torch.ones_like(self.exec_mask)
        merged = torch.where(reset_mask, all_enabled, self.exec_mask)
        self.exec_mask[...] = merged

    def __enter__(self):
        # Returning self lets `ExitStack.enter_context` hand the state back.
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Nothing to release in the base state.
        return None


class StreamingModule(nn.Module):
    """Base class for streaming components.

    A streaming module owns an optional `StreamingState`. Entering streaming
    mode (via `streaming()`) creates a state for this module and for every
    nested `StreamingModule` that is not marked `_streaming_detached`; leaving
    it clears those states again.
    """

    def __init__(self) -> None:
        super().__init__()
        self._streaming_state: StreamingState | None = None
        self._streaming_detached: bool = False
        # Lazily-built list of (qualified name, module) pairs visited by
        # `_apply_named_streaming`; None until the first traversal.
        self._cached_children: list[tuple[str, StreamingModule]] | None = None

    @property
    def is_streaming(self):
        """Whether this module currently holds a streaming state."""
        return self._streaming_state is not None

    def _apply_named_streaming(self, fn):
        """Call `fn(name, module)` on this module and its streaming descendants.

        The traversal result is computed once and cached. Detached streaming
        modules (other than the root) and everything below them are skipped.
        """

        def _handle_module(prefix: str, module: nn.Module):
            if isinstance(module, StreamingModule):
                if module._streaming_detached and prefix != "":
                    return
                if self._cached_children is None:
                    raise RuntimeError("Internal error: _cached_children should be initialized before traversal.")
                self._cached_children.append((prefix, module))
            for name, child in module.named_children():
                new_prefix = f"{prefix}.{name}" if prefix else name
                _handle_module(new_prefix, child)

        if self._cached_children is None:
            self._cached_children = []
            _handle_module("", self)
        for name, child in self._cached_children:
            fn(name, child)

    def _start_streaming(self, batch_size: int, exit_stack: ExitStack):
        """Create and attach a streaming state on every reachable module.

        Each state is entered on `exit_stack`. If any module fails to start
        (for example because it is already streaming), the modules started by
        this call are rolled back so they are not left stuck in streaming
        mode; modules that were already streaming beforehand are untouched.

        Raises:
            RuntimeError: if a reachable module is already streaming.
        """
        started: list[StreamingModule] = []

        def _start_streaming_fn(name: str, module: StreamingModule):
            if module._streaming_state is not None:
                raise RuntimeError(f"{name} is already streaming!")
            state = module._init_streaming_state(batch_size)
            exit_stack.enter_context(state)
            module._streaming_state = state
            started.append(module)

        try:
            self._apply_named_streaming(_start_streaming_fn)
        except BaseException:
            # Undo only what this call attached; a pre-existing state belongs
            # to whoever started it.
            for module in started:
                module._streaming_state = None
            raise

    def _stop_streaming(self) -> None:
        """Clear the streaming state on every reachable module."""

        def _stop_streaming_fn(name: str, module: StreamingModule):
            module._streaming_state = None

        self._apply_named_streaming(_stop_streaming_fn)

    def _init_streaming_state(self, batch_size: int) -> StreamingState:
        """Build the state for this module, placed on its first parameter's device."""
        device = next(iter(self.parameters())).device
        return StreamingState(batch_size, device)

    def streaming(self, batch_size: int) -> ExitStack:
        """Context manager to enter streaming mode.

        Returns an `ExitStack` that, when closed, exits every state and clears
        streaming mode. If starting fails, the partially-built stack is closed
        before the error propagates.
        """
        exit_stack = ExitStack()
        try:
            self._start_streaming(batch_size, exit_stack)
        except BaseException:
            exit_stack.close()
            raise
        exit_stack.callback(self._stop_streaming)
        return exit_stack


class StreamingContainer(StreamingModule):
    """Streaming module that only groups other streaming modules.

    Adds no behavior of its own on top of `StreamingModule`.
    """


class MossAudioTokenizerDecodeSession:
    """Stateful streaming-decode session with a fixed pool of batch slots.

    Requests are registered with `append`, decoded incrementally with `step`
    and released with `remove`. Each request is pinned to one physical slot of
    a `max_batch_size`-wide decoder batch for its whole lifetime, while callers
    address rows in "logical" order (the order of `active_request_ids`).
    `close` leaves streaming mode and detaches the session from the model.
    """

    # Model whose decoder modules are driven by this session.
    model: MossAudioTokenizerModel
    # Number of physical batch slots.
    max_batch_size: int
    # Whether single-frame decoding goes through a captured CUDA graph.
    _use_cuda_graph: bool
    # Active request ids in logical (caller-visible) order.
    active_request_ids: list[str | int]
    # request id -> physical slot, and the inverse mapping (None = empty slot).
    request_id_to_slot_index: dict[str | int, int]
    slot_index_to_request_id: list[str | int | None]
    # True for slots that can be handed to a new request.
    slot_is_free: list[bool]
    # Running totals of codes consumed / audio samples produced per request.
    request_id_to_code_offset: dict[str | int, int]
    request_id_to_audio_offset: dict[str | int, int]
    # Attention modules whose `_use_flash_kvcache` flag this session switched on.
    _flash_kvcache_attention_modules: list[MossAudioTokenizerMultiheadAttention]
    # Static buffers and bookkeeping for the CUDA-graph path (None when unused).
    _graph_num_quantizers_capacity: int | None
    _graph_input_codes: torch.Tensor | None
    _graph_input_code_lengths: torch.Tensor | None
    _graph_output_audio: torch.Tensor | None
    _graph_output_audio_lengths: torch.Tensor | None
    _cuda_graph: torch.cuda.CUDAGraph | None
    # (device, max_batch_size, num_quantizers, compute dtype name) the graph was captured for.
    _cuda_graph_key: tuple[str, int, int, str] | None
    # Exit stack holding the decoder modules' streaming contexts; None once closed.
    _decode_streaming_exit_stack: ExitStack | None
    _closed: bool

    def __init__(self, model: MossAudioTokenizerModel, max_batch_size: int, use_cuda_graph: bool = False):
        if max_batch_size <= 0:
            raise ValueError("`max_batch_size` must be > 0.")

        decoder_attention_modules: list[MossAudioTokenizerMultiheadAttention] = []
        for decoder_module in model.decoder:
            for module in decoder_module.modules():
                if isinstance(module, MossAudioTokenizerMultiheadAttention):
                    if module.context is None:
                        raise ValueError(
                            "MossAudioTokenizerDecodeSession requires all decoder MHA modules to have a finite "
                            "`context` (context=None is unsupported for continuous-batch streaming)."
                        )
                    decoder_attention_modules.append(module)

        flash_kvcache_attention_modules: list[MossAudioTokenizerMultiheadAttention] = []
        if use_cuda_graph and _has_flash_attn():
            for module in decoder_attention_modules:
                module._use_flash_kvcache = True
                flash_kvcache_attention_modules.append(module)

        decode_streaming_exit_stack = ExitStack()
        try:
            for decoder_module in model.decoder:
                if isinstance(decoder_module, StreamingModule):
                    inner_stack = decoder_module.streaming(batch_size=max_batch_size)
                    _ = decode_streaming_exit_stack.enter_context(inner_stack)
        except Exception:
            decode_streaming_exit_stack.close()
            for module in flash_kvcache_attention_modules:
                module._use_flash_kvcache = False
            raise

        self.model = model
        self.max_batch_size = max_batch_size
        self._use_cuda_graph = use_cuda_graph
        self.active_request_ids: list[str | int] = []
        self.request_id_to_slot_index: dict[str | int, int] = {}
        self.slot_index_to_request_id: list[str | int | None] = [None] * max_batch_size
        self.slot_is_free: list[bool] = [True] * max_batch_size
        self.request_id_to_code_offset: dict[str | int, int] = {}
        self.request_id_to_audio_offset: dict[str | int, int] = {}
        self._flash_kvcache_attention_modules = flash_kvcache_attention_modules
        self._graph_num_quantizers_capacity = int(getattr(model.quantizer, "num_quantizers", 0)) if use_cuda_graph else None
        self._graph_input_codes = None
        self._graph_input_code_lengths = None
        self._graph_output_audio = None
        self._graph_output_audio_lengths = None
        self._cuda_graph = None
        self._cuda_graph_key = None
        self._decode_streaming_exit_stack: ExitStack | None = decode_streaming_exit_stack
        self._closed = False
        if use_cuda_graph:
            device = next(iter(model.parameters())).device
            if device.type == "cuda":
                self._ensure_cuda_graph_buffers(device)
        model._active_decode_session = self

    def _ensure_open(self) -> None:
        if self._closed:
            raise RuntimeError(_CLOSED_DECODE_SESSION_ERROR_MESSAGE)

    def append(self, request_id: str | int) -> None:
        self._ensure_open()

        if request_id in self.request_id_to_slot_index:
            raise RuntimeError(_DUPLICATE_DECODE_REQUEST_ERROR_TEMPLATE.format(request_id=request_id))

        slot_index = next((index for index, is_free in enumerate(self.slot_is_free) if is_free), None)
        if slot_index is None:
            raise RuntimeError(_DECODE_SESSION_FULL_ERROR_TEMPLATE.format(max_batch_size=self.max_batch_size))

        self.active_request_ids.append(request_id)
        self.request_id_to_slot_index[request_id] = slot_index
        self.slot_index_to_request_id[slot_index] = request_id
        self.slot_is_free[slot_index] = False
        self.request_id_to_code_offset[request_id] = 0
        self.request_id_to_audio_offset[request_id] = 0

    def _decoder_streaming_states(self) -> list[StreamingState]:
        decoder_streaming_states: list[StreamingState] = []
        for decoder_module in self.model.decoder:
            for module in decoder_module.modules():
                if isinstance(module, StreamingModule) and module._streaming_state is not None:
                    decoder_streaming_states.append(module._streaming_state)
        return decoder_streaming_states

    def _ensure_cuda_graph_buffers(self, device: torch.device) -> None:
        if not self._use_cuda_graph or device.type != "cuda":
            return
        graph_num_quantizers_capacity = self._graph_num_quantizers_capacity
        if graph_num_quantizers_capacity is None:
            graph_num_quantizers_capacity = int(getattr(self.model.quantizer, "num_quantizers", 0))
            self._graph_num_quantizers_capacity = graph_num_quantizers_capacity
        if graph_num_quantizers_capacity <= 0:
            raise RuntimeError("`use_cuda_graph=True` requires a quantizer with `num_quantizers > 0`.")
        if self._graph_input_codes is None or self._graph_input_codes.device != device:
            self._graph_input_codes = torch.zeros(
                (graph_num_quantizers_capacity, self.max_batch_size, 1),
                device=device,
                dtype=torch.long,
            )
            self._graph_input_code_lengths = torch.zeros(self.max_batch_size, device=device, dtype=torch.long)
            self._graph_output_audio = None
            self._graph_output_audio_lengths = None
            self._cuda_graph = None
            self._cuda_graph_key = None

    def _snapshot_decoder_streaming_states(self) -> list[tuple[StreamingState, dict[str, torch.Tensor | None]]]:
        snapshots: list[tuple[StreamingState, dict[str, torch.Tensor | None]]] = []
        for streaming_state in self._decoder_streaming_states():
            state_snapshot: dict[str, torch.Tensor | None] = {"exec_mask": streaming_state.exec_mask.clone()}
            if isinstance(streaming_state, TransformerState):
                state_snapshot["offsets"] = streaming_state.offsets.clone()
            if isinstance(streaming_state, MHAState):
                state_snapshot["offset"] = streaming_state.offset.clone()
                state_snapshot["cached_keys"] = None if streaming_state.cached_keys is None else streaming_state.cached_keys.clone()
                state_snapshot["cached_values"] = None if streaming_state.cached_values is None else streaming_state.cached_values.clone()
                state_snapshot["cached_positions"] = (
                    None if streaming_state.cached_positions is None else streaming_state.cached_positions.clone()
                )
                state_snapshot["flash_cached_keys"] = (
                    None
                    if getattr(streaming_state, "_flash_cached_keys", None) is None
                    else cast(torch.Tensor, getattr(streaming_state, "_flash_cached_keys")).clone()
                )
                state_snapshot["flash_cached_values"] = (
                    None
                    if getattr(streaming_state, "_flash_cached_values", None) is None
                    else cast(torch.Tensor, getattr(streaming_state, "_flash_cached_values")).clone()
                )
            snapshots.append((streaming_state, state_snapshot))
        return snapshots

    def _restore_decoder_streaming_states(
        self,
        snapshots: list[tuple[StreamingState, dict[str, torch.Tensor | None]]],
    ) -> None:
        for streaming_state, state_snapshot in snapshots:
            exec_mask = state_snapshot["exec_mask"]
            assert exec_mask is not None
            streaming_state.exec_mask.copy_(exec_mask)
            if isinstance(streaming_state, TransformerState):
                offsets = state_snapshot.get("offsets")
                assert offsets is not None
                streaming_state.offsets.copy_(offsets)
            if isinstance(streaming_state, MHAState):
                offset = state_snapshot.get("offset")
                assert offset is not None
                streaming_state.offset.copy_(offset)
                cached_keys = state_snapshot.get("cached_keys")
                cached_values = state_snapshot.get("cached_values")
                cached_positions = state_snapshot.get("cached_positions")
                if cached_keys is None or cached_values is None or cached_positions is None:
                    if streaming_state.cached_keys is not None:
                        streaming_state.cached_keys.zero_()
                    if streaming_state.cached_values is not None:
                        streaming_state.cached_values.zero_()
                    if streaming_state.cached_positions is not None:
                        streaming_state.cached_positions.fill_(-1)
                else:
                    if streaming_state.cached_keys is None or streaming_state.cached_keys.shape != cached_keys.shape:
                        streaming_state.cached_keys = cached_keys.clone()
                    else:
                        streaming_state.cached_keys.copy_(cached_keys)
                    if streaming_state.cached_values is None or streaming_state.cached_values.shape != cached_values.shape:
                        streaming_state.cached_values = cached_values.clone()
                    else:
                        streaming_state.cached_values.copy_(cached_values)
                    if streaming_state.cached_positions is None or streaming_state.cached_positions.shape != cached_positions.shape:
                        streaming_state.cached_positions = cached_positions.clone()
                    else:
                        streaming_state.cached_positions.copy_(cached_positions)

                flash_cached_keys = state_snapshot.get("flash_cached_keys")
                flash_cached_values = state_snapshot.get("flash_cached_values")
                current_flash_cached_keys = cast(torch.Tensor | None, getattr(streaming_state, "_flash_cached_keys", None))
                current_flash_cached_values = cast(torch.Tensor | None, getattr(streaming_state, "_flash_cached_values", None))
                if flash_cached_keys is None or flash_cached_values is None:
                    if current_flash_cached_keys is not None:
                        current_flash_cached_keys.zero_()
                    if current_flash_cached_values is not None:
                        current_flash_cached_values.zero_()
                else:
                    if current_flash_cached_keys is None or current_flash_cached_keys.shape != flash_cached_keys.shape:
                        setattr(streaming_state, "_flash_cached_keys", flash_cached_keys.clone())
                    else:
                        current_flash_cached_keys.copy_(flash_cached_keys)
                    if current_flash_cached_values is None or current_flash_cached_values.shape != flash_cached_values.shape:
                        setattr(streaming_state, "_flash_cached_values", flash_cached_values.clone())
                    else:
                        current_flash_cached_values.copy_(flash_cached_values)

    def _graphed_decode_frame(
        self,
        codes: torch.Tensor,
        code_lengths: torch.Tensor,
    ) -> MossAudioTokenizerDecoderOutput:
        self._ensure_cuda_graph_buffers(codes.device)
        graph_input_codes = self._graph_input_codes
        graph_input_code_lengths = self._graph_input_code_lengths
        if graph_input_codes is None or graph_input_code_lengths is None:
            raise RuntimeError("CUDA graph buffers are unavailable.")

        num_quantizers = codes.shape[0]
        graph_input_codes_view = graph_input_codes[:num_quantizers]
        graph_input_codes_view.copy_(codes)
        graph_input_code_lengths.copy_(code_lengths)
        cuda_graph_key = (str(codes.device), self.max_batch_size, num_quantizers, self.model.compute_dtype_name)

        if self._cuda_graph is None or self._cuda_graph_key != cuda_graph_key:
            state_snapshots = self._snapshot_decoder_streaming_states()
            current_stream = torch.cuda.current_stream(device=codes.device)
            warmup_stream = torch.cuda.Stream(device=codes.device)
            warmup_stream.wait_stream(current_stream)
            with torch.cuda.stream(warmup_stream):
                _ = self.model._decode_frame(graph_input_codes_view, graph_input_code_lengths)
            current_stream.wait_stream(warmup_stream)
            self._restore_decoder_streaming_states(state_snapshots)

            cuda_graph = torch.cuda.CUDAGraph()
            with torch.cuda.graph(cuda_graph):
                decoder_output = self.model._decode_frame(graph_input_codes_view, graph_input_code_lengths)

            self._cuda_graph = cuda_graph
            self._cuda_graph_key = cuda_graph_key
            self._graph_output_audio = decoder_output.audio
            self._graph_output_audio_lengths = decoder_output.audio_lengths
        else:
            self._cuda_graph.replay()

        return MossAudioTokenizerDecoderOutput(
            audio=self._graph_output_audio,
            audio_lengths=self._graph_output_audio_lengths,
        )

    def _reset_slot(self, slot_index: int) -> None:
        for streaming_state in self._decoder_streaming_states():
            reset_mask = torch.zeros(streaming_state.batch_size, dtype=torch.bool, device=streaming_state.exec_mask.device)
            reset_mask[slot_index] = True
            streaming_state.reset(reset_mask)

    def _pack_logical_codes_to_physical_slots(
        self,
        request_ids: list[str | int],
        codes: torch.Tensor,
        code_lengths: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, list[int], torch.Tensor]:
        if request_ids != self.active_request_ids:
            raise ValueError(_INVALID_DECODE_STEP_REQUEST_IDS_ERROR_MESSAGE)

        if not request_ids:
            raise ValueError("`step()` requires at least one active request.")

        if codes.dim() == 2:
            codes = codes.unsqueeze(1)
        if codes.dim() != 3:
            raise ValueError(f"`codes` must be 3D with shape `(num_quantizers, batch_size, sequence_length)`, got {codes.shape}.")

        code_lengths = code_lengths.to(device=codes.device, dtype=torch.long)
        if code_lengths.dim() != 1:
            raise ValueError(f"`code_lengths` must be 1D with shape `(batch_size,)`, got {code_lengths.shape}.")

        num_quantizers, logical_batch_size, max_code_length = codes.shape
        if logical_batch_size != len(request_ids):
            raise ValueError(
                f"`codes.shape[1]` ({logical_batch_size}) must match len(`request_ids`) ({len(request_ids)})."
            )
        if code_lengths.shape[0] != logical_batch_size:
            raise ValueError(
                f"`code_lengths.shape[0]` ({code_lengths.shape[0]}) must match len(`request_ids`) ({len(request_ids)})."
            )
        if torch.any(code_lengths < 0):
            raise ValueError("`code_lengths` must be >= 0.")
        if torch.any(code_lengths > max_code_length):
            raise ValueError(f"`code_lengths` must be <= codes.shape[-1] ({max_code_length}).")

        packed_codes = codes.new_zeros((num_quantizers, self.max_batch_size, max_code_length))
        packed_code_lengths = code_lengths.new_zeros((self.max_batch_size,))
        logical_row_to_slot_index: list[int] = []

        for logical_row_index, request_id in enumerate(request_ids):
            slot_index = self.request_id_to_slot_index[request_id]
            logical_row_to_slot_index.append(slot_index)
            row_length = int(code_lengths[logical_row_index].item())
            if row_length > 0:
                packed_codes[:, slot_index, :row_length] = codes[:, logical_row_index, :row_length]
            packed_code_lengths[slot_index] = row_length

        return packed_codes, packed_code_lengths, logical_row_to_slot_index, code_lengths

    def _advance_request_progress(
        self,
        request_ids: list[str | int],
        code_lengths: torch.Tensor,
        audio_lengths: torch.Tensor,
    ) -> None:
        for logical_row_index, request_id in enumerate(request_ids):
            self.request_id_to_code_offset[request_id] += int(code_lengths[logical_row_index].item())
            self.request_id_to_audio_offset[request_id] += int(audio_lengths[logical_row_index].item())

    def step(
        self,
        request_ids: list[str | int],
        codes: torch.Tensor,
        code_lengths: torch.Tensor,
    ) -> tuple[list[str | int], torch.Tensor, torch.Tensor]:
        self._ensure_open()

        packed_codes, packed_code_lengths, logical_row_to_slot_index, logical_code_lengths = (
            self._pack_logical_codes_to_physical_slots(
                request_ids=request_ids,
                codes=codes,
                code_lengths=code_lengths,
            )
        )
        max_step_length = int(packed_code_lengths.max().item())

        if max_step_length <= 0:
            raise ValueError("`step()` requires at least one row with `code_length > 0`.")

        decoder_streaming_states = self._decoder_streaming_states()
        logical_audio_chunks: list[list[torch.Tensor]] = [[] for _ in request_ids]
        audio_device: torch.device | None = None
        audio_dtype: torch.dtype | None = None
        audio_num_channels: int | None = None

        try:
            for frame_index in range(max_step_length):
                frame_exec_mask = packed_code_lengths > frame_index
                for streaming_state in decoder_streaming_states:
                    streaming_state.set_exec_mask(frame_exec_mask)

                frame_codes = packed_codes[:, :, frame_index : frame_index + 1]
                frame_code_lengths = frame_exec_mask.to(dtype=packed_code_lengths.dtype)
                if self._use_cuda_graph and frame_codes.is_cuda:
                    decoder_output = self._graphed_decode_frame(frame_codes, frame_code_lengths)
                else:
                    decoder_output = self.model._decode_frame(frame_codes, frame_code_lengths)

                if decoder_output.audio is None or decoder_output.audio_lengths is None:
                    raise RuntimeError("Internal error: `_decode_frame` returned empty audio.")

                audio = decoder_output.audio
                audio_lengths = decoder_output.audio_lengths
                audio_device = audio.device
                audio_dtype = audio.dtype
                audio_num_channels = audio.shape[1]

                for logical_row_index, slot_index in enumerate(logical_row_to_slot_index):
                    audio_length = int(audio_lengths[slot_index].item())
                    if audio_length <= 0:
                        continue
                    logical_audio_chunks[logical_row_index].append(audio[slot_index : slot_index + 1, :, :audio_length])
        except Exception:
            self.close()
            raise
        finally:
            for streaming_state in decoder_streaming_states:
                streaming_state.set_exec_mask(torch.ones_like(streaming_state.exec_mask))

        if audio_device is None or audio_dtype is None or audio_num_channels is None:
            raise RuntimeError("Internal error: `step()` produced no decoder outputs.")

        logical_audio_rows: list[torch.Tensor] = []
        logical_audio_lengths: list[int] = []
        for row_chunks in logical_audio_chunks:
            if row_chunks:
                row_audio = torch.cat(row_chunks, dim=-1)
            else:
                row_audio = torch.zeros((1, audio_num_channels, 0), device=audio_device, dtype=audio_dtype)
            logical_audio_rows.append(row_audio)
            logical_audio_lengths.append(row_audio.shape[-1])

        audio_lengths = torch.tensor(logical_audio_lengths, device=audio_device, dtype=torch.long)
        max_audio_length = max(logical_audio_lengths)
        audio = torch.zeros(
            (len(request_ids), audio_num_channels, max_audio_length),
            device=audio_device,
            dtype=audio_dtype,
        )
        for logical_row_index, row_audio in enumerate(logical_audio_rows):
            row_audio_length = row_audio.shape[-1]
            if row_audio_length > 0:
                audio[logical_row_index, :, :row_audio_length] = row_audio[0]

        logical_request_ids = list(request_ids)
        self._advance_request_progress(
            request_ids=logical_request_ids,
            code_lengths=logical_code_lengths,
            audio_lengths=audio_lengths,
        )

        return logical_request_ids, audio, audio_lengths

    def remove(self, request_id: str | int) -> None:
        self._ensure_open()

        slot_index = self.request_id_to_slot_index.get(request_id)
        if slot_index is None or request_id not in self.active_request_ids:
            raise RuntimeError(_UNKNOWN_DECODE_REQUEST_ERROR_TEMPLATE.format(request_id=request_id))
        if self.slot_is_free[slot_index] or self.slot_index_to_request_id[slot_index] != request_id:
            raise RuntimeError(_UNKNOWN_DECODE_REQUEST_ERROR_TEMPLATE.format(request_id=request_id))

        self.active_request_ids.remove(request_id)
        self._reset_slot(slot_index)
        _ = self.request_id_to_slot_index.pop(request_id)
        self.slot_index_to_request_id[slot_index] = None
        self.slot_is_free[slot_index] = True
        _ = self.request_id_to_code_offset.pop(request_id, None)
        _ = self.request_id_to_audio_offset.pop(request_id, None)

    def close(self) -> None:
        if self._closed:
            return

        self._closed = True
        decode_streaming_exit_stack = self._decode_streaming_exit_stack
        self._decode_streaming_exit_stack = None
        try:
            if decode_streaming_exit_stack is not None:
                decode_streaming_exit_stack.close()
        finally:
            for module in self._flash_kvcache_attention_modules:
                module._use_flash_kvcache = False
            self._flash_kvcache_attention_modules = []
            self._cuda_graph = None
            self._cuda_graph_key = None
            self._graph_input_codes = None
            self._graph_input_code_lengths = None
            self._graph_output_audio = None
            self._graph_output_audio_lengths = None
            if self.model._active_decode_session is self:
                self.model._active_decode_session = None


# =============================================================================
# Normalization Layers
# =============================================================================


class MossAudioTokenizerRMSNorm(nn.Module):
    """Root-mean-square normalization with a learnable per-channel gain `alpha`.

    The input is scaled by `alpha / sqrt(eps + mean(x**2))`, where the mean runs
    over the last dimension. When `dtype` is given, the computation happens in
    that dtype and the result is cast back to the input dtype.
    """

    def __init__(
        self,
        dim: int,
        eps: float = 1e-5,
        dtype: torch.dtype | None = None,
        device=None,
    ):
        super().__init__()
        self.eps = eps
        self.dtype = dtype
        # Gain starts at one and is stored as [1, 1, dim] so it broadcasts over [B, T, dim].
        initial_gain = torch.full((1, 1, dim), 1.0, device=device, dtype=dtype)
        self.alpha = nn.Parameter(initial_gain, requires_grad=True)

    def forward(self, x: torch.Tensor):
        input_dtype = x.dtype
        hidden = x if self.dtype is None else x.to(self.dtype)

        mean_square = hidden.pow(2).mean(dim=-1, keepdim=True)
        denominator = self.eps + mean_square

        gain = self.alpha.to(denominator)
        if hidden.dim() == 2:
            # 2-D inputs get a [1, dim] gain instead of the stored [1, 1, dim].
            gain = gain.view(1, -1)

        normalized = hidden * (gain * torch.rsqrt(denominator))
        return normalized.to(input_dtype)


class MossAudioTokenizerLayerScale(nn.Module):
    """Layer scale from Touvron et al. 2021: a learnable per-channel multiplier.

    With `channel_last=True` the scale multiplies the last dimension of the input;
    otherwise it is reshaped to `[channels, 1]` so it multiplies the
    second-to-last dimension.
    """

    def __init__(
        self,
        channels: int,
        init: float = 1e-4,
        channel_last: bool = True,
        device=None,
        dtype=None,
    ):
        super().__init__()
        self.channel_last = channel_last
        initial_scale = torch.full((channels,), init, device=device, dtype=dtype)
        self.scale = nn.Parameter(initial_scale, requires_grad=True)

    def forward(self, x: torch.Tensor):
        multiplier = self.scale if self.channel_last else self.scale[:, None]
        return multiplier * x


def create_norm_fn(norm_type: str, dim: int, **kwargs) -> nn.Module:
    """Build the normalization layer selected by `norm_type`.

    Args:
        norm_type: `"layer_norm"`, `"rms_norm"` or `"rms_norm_f32"`.
        dim: Size of the normalized (last) dimension.
        **kwargs: Forwarded to the layer constructor. For `"rms_norm_f32"` any
            `dtype` entry is discarded, because that variant always uses float32.

    Raises:
        ValueError: If `norm_type` is not one of the supported names.
    """
    if norm_type == "layer_norm":
        return nn.LayerNorm(dim, eps=1e-5, **kwargs)
    if norm_type == "rms_norm":
        return MossAudioTokenizerRMSNorm(dim, eps=1e-5, **kwargs)
    if norm_type == "rms_norm_f32":
        # The float32 variant pins its own dtype (and uses a smaller epsilon).
        kwargs.pop("dtype", None)
        return MossAudioTokenizerRMSNorm(dim, eps=1e-8, dtype=torch.float, **kwargs)
    raise ValueError(f"Unknown norm type: {norm_type}")


# =============================================================================
# Rotary Position Embedding
# =============================================================================


def apply_rope(
    q: torch.Tensor,
    k: torch.Tensor,
    offset: torch.Tensor,
    max_period: float = 10_000,
    time_before_heads: bool = False,
):
    """Rotate queries and keys with rotary position embedding (RoPE).

    Args:
        q: 4-D queries, laid out `[B, T, H, D]` when `time_before_heads` is true
            and `[B, H, T, D]` otherwise.
        k: Keys with exactly the same shape as `q`.
        offset: Position of the first time step; it is flattened to one value per
            row and added to `arange(T)`.
        max_period: Base of the geometric frequency progression.
        time_before_heads: Selects which of the two layouts above is used.

    Returns:
        `(q_rotated, k_rotated)` with the input shape and the dtype of `q`.

    Raises:
        ValueError: If the shapes differ or `D` is not a positive even number.
    """
    if time_before_heads:
        batch, steps, _, head_dim = q.shape
    else:
        batch, _, steps, head_dim = q.shape
    if k.shape != q.shape:
        raise ValueError(f"Expected k.shape == q.shape, got k={tuple(k.shape)} q={tuple(q.shape)}")
    if head_dim <= 0 or (head_dim % 2) != 0:
        raise ValueError(f"RoPE requires an even last dimension, got D={head_dim}")

    half = head_dim // 2
    pair_index = torch.arange(half, device=q.device, dtype=torch.float32)
    inv_freq = torch.exp(pair_index * (-math.log(max_period) * 2 / head_dim))
    absolute_time = offset.float().view(-1, 1) + torch.arange(steps, device=q.device, dtype=torch.float32)

    # Put the time axis where the chosen layout expects it so the angles broadcast.
    time_shape = (batch, -1, 1, 1) if time_before_heads else (batch, 1, -1, 1)
    angles = inv_freq * absolute_time.view(time_shape)
    cos, sin = torch.cos(angles), torch.sin(angles)

    leading = q.shape[:-1]
    out_dtype = q.dtype
    # Adjacent channel pairs are treated as (real, imaginary) and rotated in float32.
    q_re, q_im = (part.float() for part in q.view(*leading, half, 2).unbind(-1))
    k_re, k_im = (part.float() for part in k.view(*leading, half, 2).unbind(-1))

    q_rotated = torch.stack(
        [(q_re * cos - q_im * sin).to(out_dtype), (q_re * sin + q_im * cos).to(out_dtype)],
        dim=-1,
    )
    k_rotated = torch.stack(
        [(k_re * cos - k_im * sin).to(out_dtype), (k_re * sin + k_im * cos).to(out_dtype)],
        dim=-1,
    )
    return q_rotated.view(*leading, head_dim), k_rotated.view(*leading, head_dim)


def apply_rope_with_positions(
    q: torch.Tensor,
    k: torch.Tensor,
    positions: torch.Tensor,
    max_period: float = 10_000,
):
    """Apply rotary position embedding to packed `[N, H, D]` tensors.

    Args:
        q: Packed queries of shape `[N, H, D]`.
        k: Packed keys with the same shape as `q`.
        positions: One position per packed token (`N` values in total).
        max_period: Base of the geometric frequency progression.

    Returns:
        `(q_rotated, k_rotated)` with shape `[N, H, D]` and the dtype of `q`.

    Raises:
        ValueError: If the shapes differ or `D` is not a positive even number.
    """
    num_tokens, num_heads, head_dim = q.shape
    if k.shape != q.shape:
        raise ValueError(f"Expected k.shape == q.shape, got k={tuple(k.shape)} q={tuple(q.shape)}")
    if head_dim <= 0 or (head_dim % 2) != 0:
        raise ValueError(f"RoPE requires an even last dimension, got D={head_dim}")

    half = head_dim // 2
    pair_index = torch.arange(half, device=q.device, dtype=torch.float32)
    inv_freq = torch.exp(pair_index * (-math.log(max_period) * 2 / head_dim))
    token_time = positions.to(torch.float32).view(num_tokens, 1, 1)

    angles = token_time * inv_freq.view(1, 1, -1)
    cos, sin = torch.cos(angles), torch.sin(angles)

    # Adjacent channel pairs are treated as (real, imaginary) and rotated in float32.
    q_re, q_im = q.float().view(num_tokens, num_heads, half, 2).unbind(-1)
    k_re, k_im = k.float().view(num_tokens, num_heads, half, 2).unbind(-1)

    out_dtype = q.dtype
    q_rotated = torch.stack(
        [(q_re * cos - q_im * sin).to(out_dtype), (q_re * sin + q_im * cos).to(out_dtype)],
        dim=-1,
    )
    k_rotated = torch.stack(
        [(k_re * cos - k_im * sin).to(out_dtype), (k_re * sin + k_im * cos).to(out_dtype)],
        dim=-1,
    )
    return q_rotated.view(num_tokens, num_heads, head_dim), k_rotated.view(num_tokens, num_heads, head_dim)


class MossAudioTokenizerRotaryEmbedding(nn.Module):
    """Rotary positional embedding (RoPE) as a parameter-free module.

    It only stores `max_period`; the rotation itself is delegated to `apply_rope`.
    """

    def __init__(self, max_period: float = 10000.0):
        super().__init__()
        self.max_period = max_period

    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        offset: torch.Tensor,
        time_before_heads: bool = False,
    ):
        """Return `apply_rope(q, k, offset, ...)` using this module's `max_period`."""
        rotated = apply_rope(
            q,
            k,
            offset,
            max_period=self.max_period,
            time_before_heads=time_before_heads,
        )
        return rotated


# =============================================================================
# Gating Modules
# =============================================================================


class MossAudioTokenizerActivationGating(nn.Module):
    """Gated feed-forward layer: `linear_out(activation(gate) * value)`.

    `linear_in` produces `2 * hidden` features; the first half is the gate and the
    second half is the value. `hidden` is `21 * dim // 8` when
    `dim_feedforward == 4 * dim`, and `2 * dim_feedforward // 3` otherwise.

    Args:
        dim: Input and output feature size.
        dim_feedforward: Nominal feed-forward width used to derive `hidden`.
        activation: Callable applied to the gate half.
        **factory_kwargs: Forwarded to both `nn.Linear` layers (e.g. device, dtype).
    """

    def __init__(self, dim: int, dim_feedforward: int, activation, **factory_kwargs):
        super().__init__()
        if dim_feedforward == 4 * dim:
            hidden = (21 * dim) // 8
        else:
            hidden = (2 * dim_feedforward) // 3

        self.linear_in = nn.Linear(dim, 2 * hidden, bias=False, **factory_kwargs)
        self.linear_out = nn.Linear(hidden, dim, bias=False, **factory_kwargs)
        self.activation = activation

    def forward(self, x: torch.Tensor):
        """Apply the gated FFN over the last dimension of `x`.

        Any number of leading dimensions is accepted, so both dense `[B, T, C]`
        and packed `[N, C]` inputs work. The hidden size is taken from the
        projection width instead of being inferred with `-1`, which also keeps
        empty batches valid.
        """
        projected = self.linear_in(x)
        hidden = projected.shape[-1] // 2
        gate, value = projected.reshape(*projected.shape[:-1], 2, hidden).unbind(dim=-2)
        return self.linear_out(self.activation(gate) * value)


def _get_activation(name: str):
    """Resolve an activation name to a callable.

    `sigmoid`, `tanh` and `relu` come from `torch`; `leaky_relu`, `elu`, `gelu`,
    `silu`, `mish` and `softsign` come from `torch.nn.functional`; `identity`
    yields a fresh `nn.Identity()`.

    Raises:
        ValueError: If `name` is none of the above.
    """
    namespaces = (
        (torch, ("sigmoid", "tanh", "relu")),
        (F, ("leaky_relu", "elu", "gelu", "silu", "mish", "softsign")),
    )
    for namespace, supported_names in namespaces:
        if name in supported_names:
            return getattr(namespace, name)
    if name == "identity":
        return nn.Identity()
    raise ValueError(f"Unknown activation {name}")


def make_gating(name: str, dim: int, dim_feedforward: int, **factory_kwargs) -> nn.Module:
    """Build a gated FFN whose gate uses the activation called `name`.

    `name` is resolved with `_get_activation` (which raises `ValueError` for
    unknown names); the remaining arguments go to
    `MossAudioTokenizerActivationGating`.
    """
    activation = _get_activation(name)
    return MossAudioTokenizerActivationGating(dim, dim_feedforward, activation, **factory_kwargs)


# =============================================================================
# Positional Embeddings
# =============================================================================


def create_sin_embedding(
    positions: torch.Tensor,
    dim: int,
    max_period: float = 10000,
    dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """Create a sinusoidal positional embedding of shape `[..., dim]`.

    The first `dim // 2` channels hold cosines and the last `dim // 2` hold sines
    of `position / max_period ** (i / (dim // 2 - 1))`. A 0-dim `positions`
    tensor is treated as a single position.

    Raises:
        ValueError: If `dim` is odd or smaller than 4.
    """
    half_dim, remainder = divmod(dim, 2)
    if remainder != 0:
        raise ValueError(f"Sinusoidal embedding requires even dim, got dim={dim}")
    if half_dim <= 1:
        # half_dim - 1 is the divisor of the exponent below, so it must be positive.
        raise ValueError(f"Sinusoidal embedding requires dim >= 4, got dim={dim}")

    if positions.dim() == 0:
        positions = positions.view(1)
    device = positions.device
    position_column = positions.to(dtype).unsqueeze(-1)

    channel_index = torch.arange(half_dim, device=device, dtype=dtype)
    base = torch.full([], max_period, device=device, dtype=dtype)
    period = base ** (channel_index / (half_dim - 1))
    phase = position_column / period
    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)


def pack_padded_sequence(
    x: torch.Tensor,
    input_lengths: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Pack a padded `[B, T, D]` tensor into `[N, D]` plus metadata.

    Returns:
        A tuple `(packed_x, valid_mask, cu_seqlens, position_ids)`:
        `packed_x` holds the rows selected by `valid_mask`; `valid_mask` is the
        `[B, T]` boolean mask of time steps below each length; `cu_seqlens` is an
        int32 tensor of `B + 1` cumulative lengths starting at 0; `position_ids`
        is the within-sequence index of every packed row.
    """
    batch_size, max_seqlen, _ = x.shape
    time_index = torch.arange(max_seqlen, device=x.device, dtype=torch.long)
    time_grid = time_index.view(1, max_seqlen)
    valid_mask = time_grid < input_lengths.view(batch_size, 1)

    # Slot 0 stays zero; slots 1..B receive the running total of the lengths.
    cu_seqlens = torch.zeros(batch_size + 1, device=x.device, dtype=torch.int32)
    cu_seqlens[1:] = input_lengths.to(torch.int32).cumsum(dim=0)

    position_ids = time_grid.expand(batch_size, -1)[valid_mask]
    return x[valid_mask], valid_mask, cu_seqlens, position_ids


def unpack_packed_sequence(
    packed_x: torch.Tensor,
    valid_mask: torch.Tensor,
    batch_size: int,
    max_seqlen: int,
) -> torch.Tensor:
    """Unpack a packed `[N, D]` tensor back into `[B, T, D]`.

    Positions where `valid_mask` is false are left at zero. The result has the
    dtype and device of `packed_x`.
    """
    feature_dim = packed_x.shape[-1]
    padded = torch.zeros(
        (batch_size, max_seqlen, feature_dim),
        dtype=packed_x.dtype,
        device=packed_x.device,
    )
    padded[valid_mask] = packed_x
    return padded


# =============================================================================
# KV Cache for Attention
# =============================================================================


class KVCacheResult:
    """Bundle of keys, values and key positions that can be tuple-unpacked."""

    __slots__ = ("keys", "values", "positions")

    def __init__(self, keys: torch.Tensor, values: torch.Tensor, positions: torch.Tensor):
        self.keys = keys
        self.values = values
        self.positions = positions

    def __iter__(self):
        """Yield `keys`, `values`, `positions` so `k, v, p = result` works."""
        yield self.keys
        yield self.values
        yield self.positions

    @staticmethod
    def from_kv(keys: torch.Tensor, values: torch.Tensor) -> KVCacheResult:
        """Wrap dense `[B, H, T, D]` keys/values with positions `0..T-1` for every row."""
        batch, _, steps, _ = keys.shape
        time_index = torch.arange(steps, device=keys.device, dtype=torch.long)
        return KVCacheResult(keys, values, time_index.expand(batch, -1))


class RingKVCache:
    """Fixed-capacity ring buffer for streaming keys and values.

    `cache` has shape `[2, B, H, capacity, D]` (index 0 = keys, 1 = values) and
    `end_offset` counts how many steps have been written so far: one counter per
    batch row when `respect_exec_mask` is true, a single shared counter otherwise.
    """

    def __init__(
        self,
        batch_size: int,
        num_heads: int,
        dim_per_head: int,
        capacity: int,
        respect_exec_mask: bool = True,
        device: torch.device = torch.device("cuda"),
        dtype: torch.dtype = torch.bfloat16,
    ):
        self.capacity = capacity
        storage_shape = (2, batch_size, num_heads, capacity, dim_per_head)
        self.cache = torch.zeros(storage_shape, device=device, dtype=dtype)
        self.respect_exec_mask = respect_exec_mask
        counter_count = batch_size if self.respect_exec_mask else 1
        self.end_offset = torch.zeros(counter_count, device=device, dtype=torch.long)

    def reset(self, reset_mask: torch.Tensor) -> None:
        """Zero the write counter wherever `reset_mask` is true (cache data is untouched)."""
        cleared = torch.zeros_like(self.end_offset)
        self.end_offset[:] = torch.where(reset_mask, cleared, self.end_offset)

    def complete(self, k: torch.Tensor, v: torch.Tensor, exec_mask: torch.Tensor) -> KVCacheResult:
        """Write `[B, H, T, D]` keys/values into the ring and return the whole buffer.

        Args:
            k: New keys of shape `[B, H, T, D]`.
            v: New values with the same shape as `k`.
            exec_mask: Rows whose counter advances; only consulted when
                `respect_exec_mask` is true.

        Returns:
            A `KVCacheResult` holding the full key and value buffers plus, for
            every slot, the absolute position stored there (-1 for slots at or
            beyond the updated write counter).

        Raises:
            ValueError: If `T` is not positive.
        """
        batch, heads, steps, head_dim = k.shape
        if steps <= 0:
            raise ValueError(f"Expected T > 0, got T={steps}")

        device = self.end_offset.device
        step_index = torch.arange(steps, device=device, dtype=self.end_offset.dtype)
        write_slots = (step_index + self.end_offset.view(-1, 1)) % self.capacity

        key_store = self.cache[0]
        value_store = self.cache[1]
        if self.respect_exec_mask:
            # Each row has its own write slots, so the write is a scatter along the time axis.
            scatter_index = write_slots.view(batch, 1, steps, 1).expand(-1, heads, steps, head_dim)
            key_store.scatter_(2, scatter_index, k)
            value_store.scatter_(2, scatter_index, v)
        else:
            # One shared counter: every row writes to the same slots.
            shared_slots = write_slots[0]
            key_store.index_copy_(2, shared_slots, k)
            value_store.index_copy_(2, shared_slots, v)

        # Absolute position held by each slot, computed from the pre-update counter.
        slots = torch.arange(self.capacity, device=device, dtype=torch.long)
        newest_position = self.end_offset.view(-1, 1) + steps - 1
        newest_slot = newest_position % self.capacity
        slot_distance = slots - newest_slot
        positions = torch.where(
            slot_distance <= 0,
            newest_position + slot_distance,
            newest_position + slot_distance - self.capacity,
        )

        if self.respect_exec_mask:
            self.end_offset[:] = torch.where(exec_mask, self.end_offset + steps, self.end_offset)
        else:
            self.end_offset.add_(steps)

        # Slots at or past the updated counter are reported as position -1.
        beyond_counter = slots >= self.end_offset.view(-1, 1)
        positions = torch.where(beyond_counter, torch.full_like(positions, -1), positions)

        return KVCacheResult(key_store, value_store, positions)


# =============================================================================
# Multi-Head Attention
# =============================================================================


# Copy the globals defined so far into this module's `sys.modules` entry before the
# dataclass below is built. NOTE(review): presumably so `@dataclass` can resolve
# names through `sys.modules` when the file is loaded dynamically - confirm.
_sync_module_proxy()
@dataclass
class MHAState(StreamingState):
    """Streaming state of one attention module: cached K/V, their positions, and the offset."""

    cached_keys: torch.Tensor | None
    cached_values: torch.Tensor | None
    cached_positions: torch.Tensor | None
    offset: torch.Tensor

    def reset(self, reset_mask: torch.Tensor):
        """Reset the rows selected by `reset_mask`.

        After the base-class reset, the selected offsets become 0, cached
        positions become -1, and cached keys/values become 0. Caches that have
        not been allocated yet (`None`) are skipped.
        """
        super().reset(reset_mask)
        self.offset[:] = torch.where(reset_mask, torch.zeros_like(self.offset), self.offset)
        cache_fill_values = (
            (self.cached_positions, -1),
            (self.cached_keys, 0),
            (self.cached_values, 0),
        )
        for cache, fill_value in cache_fill_values:
            if cache is not None:
                cache[reset_mask] = fill_value


def apply_weights_per_step(
    modules: nn.ModuleList,
    schedule: list[int] | None,
    x: torch.Tensor,
    offset: int | None,
) -> torch.Tensor:
    """Apply a (possibly different) module to every time step of `x`.

    With a single module it is simply applied to all of `x`. Otherwise step `t`
    uses module index `t + offset`, optionally remapped through `schedule`, and
    the per-step outputs are concatenated along dim 1.

    Raises:
        ValueError: With several modules, if `offset` is missing, `x` is not 3-D,
            or a schedule/module index falls out of range.
    """
    if len(modules) == 1:
        return modules[0](x)

    if offset is None:
        raise ValueError("offset must be provided when using per-step weights (len(modules) > 1).")
    if x.dim() != 3:
        raise ValueError(
            f"Per-step weights require a dense `[B, T, C]` tensor when len(modules) > 1, got shape {tuple(x.shape)}."
        )

    def resolve_module_index(step: int) -> int:
        """Map a time step to the index of the module that must process it."""
        module_index = step + offset
        if schedule is not None:
            if not 0 <= module_index < len(schedule):
                raise ValueError(
                    f"weights_per_step_schedule is too short for module_index={module_index} (len={len(schedule)})."
                )
            module_index = schedule[module_index]
        if not 0 <= module_index < len(modules):
            raise ValueError(f"module_index={module_index} out of range for len(modules)={len(modules)}.")
        return module_index

    num_steps = x.shape[1]
    step_outputs = []
    # Indices are resolved lazily, step by step, so earlier modules run before a later index error.
    for step in range(num_steps):
        step_module = modules[resolve_module_index(step)]
        step_outputs.append(step_module(x[:, step : step + 1]))
    return torch.cat(step_outputs, 1)


class MossAudioTokenizerMultiheadAttention(StreamingModule):
    """Multi-head attention with streaming support."""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        causal: bool = False,
        context: int | None = None,
        rope: MossAudioTokenizerRotaryEmbedding | None = None,
        attention_implementation: str = "sdpa",
        device=None,
        dtype=None,
    ):
        """Create the fused QKV and output projections.

        Args:
            embed_dim: Model width; it is split evenly across the heads.
            num_heads: Number of attention heads.
            causal: Whether non-streaming attention applies a causal mask.
            context: Optional attention window length; `None` means unbounded.
            rope: Optional rotary embedding applied to queries and keys.
            attention_implementation: One of `SUPPORTED_ATTENTION_IMPLEMENTATIONS`.
            device: Device for the projection weights.
            dtype: Dtype for the projection weights.

        Raises:
            ValueError: If `embed_dim` is not divisible by a positive `num_heads`,
                or `attention_implementation` is not supported.
        """
        super().__init__()
        # The QKV reshape in `_project_qkv` needs embed_dim == num_heads * head_dim;
        # fail here with a clear message rather than later with a reshape error.
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim must be divisible by a positive num_heads, "
                f"got embed_dim={embed_dim} num_heads={num_heads}."
            )
        factory_kwargs = {"device": device, "dtype": dtype}

        self.embed_dim = embed_dim
        self.causal = causal
        self.context = context
        self.rope = rope
        self.num_heads = num_heads
        # Validate and store through the public setter so both entry points share one check.
        self.set_attention_implementation(attention_implementation)
        self._use_flash_kvcache = False
        self.in_proj = nn.Linear(embed_dim, 3 * embed_dim, bias=False, **factory_kwargs)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False, **factory_kwargs)

        # Rename legacy checkpoint keys before the state dict is loaded.
        self._register_load_state_dict_pre_hook(self._load_hook, with_module=True)

    def set_attention_implementation(self, attention_implementation: str) -> None:
        """Select the attention backend.

        Raises:
            ValueError: If the name is not in `SUPPORTED_ATTENTION_IMPLEMENTATIONS`.
        """
        if attention_implementation in SUPPORTED_ATTENTION_IMPLEMENTATIONS:
            self.attention_implementation = attention_implementation
            return
        raise ValueError(
            f"Unsupported attention_implementation={attention_implementation!r}. "
            f"Expected one of {sorted(SUPPORTED_ATTENTION_IMPLEMENTATIONS)}."
        )

    @staticmethod
    def _load_hook(module, state_dict, prefix, *_):
        """Rename legacy projection keys in `state_dict` in place.

        `in_proj_weight` and `in_projs.0.weight` become `in_proj.weight`, and
        `out_projs.0.weight` becomes `out_proj.weight`. Each name is checked both
        as-is and with an `_scb` suffix, always under `prefix`.
        """
        legacy_to_current = (
            ("in_proj_weight", "in_proj.weight"),
            ("in_projs.0.weight", "in_proj.weight"),
            ("out_projs.0.weight", "out_proj.weight"),
        )
        for suffix in ("", "_scb"):
            for legacy_name, current_name in legacy_to_current:
                legacy_key = f"{prefix}{legacy_name}{suffix}"
                if legacy_key in state_dict:
                    state_dict[f"{prefix}{current_name}{suffix}"] = state_dict.pop(legacy_key)

    def _init_streaming_state(self, batch_size: int) -> MHAState:
        """Create an empty streaming state on the device of the input projection.

        The K/V caches start unallocated (`None`) and every row's offset is 0.
        """
        device = cast(torch.device, self.in_proj.weight.device)
        initial_offset = torch.zeros(batch_size, device=device, dtype=torch.long)
        return MHAState(
            batch_size,
            device,
            cached_keys=None,
            cached_values=None,
            cached_positions=None,
            offset=initial_offset,
        )

    def _supports_flash_attention(self, device: torch.device, dtype: torch.dtype) -> bool:
        """Tell whether flash attention is usable: installed, on CUDA, in fp16 or bf16."""
        flash_available = _has_flash_attn()
        on_cuda = device.type == "cuda"
        is_half_precision = dtype in {torch.float16, torch.bfloat16}
        return flash_available and on_cuda and is_half_precision

    def _get_backend_check_dtype(self, x: torch.Tensor) -> torch.dtype:
        """Return the dtype used to decide which attention backend can run on `x`.

        Off CUDA, or when CUDA autocast is disabled, this is `x.dtype`. Under
        CUDA autocast it is the autocast dtype.

        Both autocast queries are tried with the device-type argument first and
        then fall back to the argument-free legacy API.
        """
        if x.device.type != "cuda":
            return x.dtype
        try:
            autocast_enabled = torch.is_autocast_enabled("cuda")
        except TypeError:
            # Legacy signature: `is_autocast_enabled()` takes no device argument.
            autocast_enabled = torch.is_autocast_enabled()
        if not autocast_enabled:
            return x.dtype
        try:
            return torch.get_autocast_dtype("cuda")
        except (AttributeError, TypeError):
            # On builds without `torch.get_autocast_dtype` the lookup itself raises
            # AttributeError, so catching TypeError alone never reached this fallback.
            return torch.get_autocast_gpu_dtype()

    def resolve_attention_implementation(self, x: torch.Tensor, is_streaming: bool) -> str:
        """Pick the backend that will actually run for `x`.

        `"sdpa"` is returned as-is. Any other setting resolves to
        `"flash_attention_2"` when flash attention is usable for the device and
        (autocast-aware) dtype of `x`, and to `"sdpa"` otherwise; a warning is
        logged once when flash attention was explicitly requested but is
        unavailable. `is_streaming` is not consulted.
        """
        requested = self.attention_implementation
        if requested == "sdpa":
            return "sdpa"

        backend_dtype = self._get_backend_check_dtype(x)
        if self._supports_flash_attention(x.device, backend_dtype):
            return "flash_attention_2"

        if requested == "flash_attention_2":
            logger.warning_once(
                "Falling back to SDPA because flash_attention_2 is unavailable for device=%s dtype=%s "
                "(HAS_FLASH_ATTN=%s).",
                x.device,
                backend_dtype,
                _has_flash_attn(),
            )
        return "sdpa"

    def _project_qkv(
        self,
        x: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Project `x` to per-head queries, keys and values.

        A dense `[B, T, C]` input yields three `[B, H, T, D]` tensors; a packed
        `[N, C]` input yields three `[N, H, D]` tensors.

        Raises:
            ValueError: If `x` is neither 2-D nor 3-D.
        """
        head_dim = self.embed_dim // self.num_heads
        rank = x.dim()
        if rank not in (2, 3):
            raise ValueError(f"Expected a 2D or 3D tensor, got shape {tuple(x.shape)}")

        fused = self.in_proj(x)
        if rank == 3:
            batch, steps = x.shape[0], x.shape[1]
            # [B, T, 3, H, D] -> [3, B, H, T, D]
            fused = fused.reshape(batch, steps, 3, self.num_heads, head_dim).permute(2, 0, 3, 1, 4)
            return fused[0], fused[1], fused[2]

        fused = fused.view(x.shape[0], 3, self.num_heads, head_dim)
        return fused[:, 0], fused[:, 1], fused[:, 2]

    def _apply_dense_rope(self, q: torch.Tensor, k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Apply RoPE to dense `[B, H, T, D]` q/k with every row starting at position 0."""
        rope = self.rope
        if rope is None:
            return q, k
        zero_offset = torch.zeros(q.shape[0], device=q.device, dtype=torch.long)
        return rope(q, k, zero_offset, time_before_heads=False)

    def _apply_packed_rope(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        position_ids: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Apply RoPE to packed q/k using explicit per-token positions.

        Returns the inputs unchanged when the module has no rotary embedding.
        """
        rope = self.rope
        if rope is None:
            return q, k
        return apply_rope_with_positions(q, k, position_ids, max_period=rope.max_period)

    def _ensure_streaming_cache(
        self,
        state: MHAState,
        batch_size: int,
        device: torch.device,
        dtype: torch.dtype,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return the state's K/V/position caches, allocating or converting as needed.

        If any of the three caches is missing, all three are (re)allocated:
        zero-filled K/V of shape `[B, H, L, D]` and positions of shape `[B, L]`
        filled with -1, where `L` is `context` (or 0 when `context` is `None`).
        Otherwise existing caches are moved to `device` (and K/V to `dtype`) when
        they differ.
        """
        head_dim = self.embed_dim // self.num_heads
        cache_length = self.context if self.context is not None else 0

        existing = (state.cached_keys, state.cached_values, state.cached_positions)
        if any(cache is None for cache in existing):
            kv_shape = (batch_size, self.num_heads, cache_length, head_dim)
            state.cached_keys = torch.zeros(kv_shape, device=device, dtype=dtype)
            state.cached_values = torch.zeros_like(state.cached_keys)
            state.cached_positions = torch.full(
                (batch_size, cache_length),
                -1,
                device=device,
                dtype=torch.long,
            )
            return state.cached_keys, state.cached_values, state.cached_positions

        if state.cached_keys.device != device or state.cached_keys.dtype != dtype:
            state.cached_keys = state.cached_keys.to(device=device, dtype=dtype)
        if state.cached_values.device != device or state.cached_values.dtype != dtype:
            state.cached_values = state.cached_values.to(device=device, dtype=dtype)
        if state.cached_positions.device != device:
            state.cached_positions = state.cached_positions.to(device=device)
        return state.cached_keys, state.cached_values, state.cached_positions

    def _ensure_flash_kvcache(
        self,
        state: MHAState,
        batch_size: int,
        device: torch.device,
        dtype: torch.dtype,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Return the `[B, context, H, D]` K/V buffers used by the flash kv-cache path.

        The buffers live on `state` as `_flash_cached_keys` and
        `_flash_cached_values`. They are zero-allocated on first use and converted
        when their device or dtype no longer matches.

        Raises:
            RuntimeError: If `context` is `None`.
        """
        window = self.context
        if window is None:
            raise RuntimeError("flash_attn_with_kvcache requires a finite streaming context.")
        head_dim = self.embed_dim // self.num_heads

        key_buffer = cast(torch.Tensor | None, getattr(state, "_flash_cached_keys", None))
        value_buffer = cast(torch.Tensor | None, getattr(state, "_flash_cached_values", None))
        if key_buffer is None or value_buffer is None:
            buffer_shape = (batch_size, window, self.num_heads, head_dim)
            key_buffer = torch.zeros(buffer_shape, device=device, dtype=dtype)
            value_buffer = torch.zeros_like(key_buffer)
        else:
            if key_buffer.device != device or key_buffer.dtype != dtype:
                key_buffer = key_buffer.to(device=device, dtype=dtype)
            if value_buffer.device != device or value_buffer.dtype != dtype:
                value_buffer = value_buffer.to(device=device, dtype=dtype)

        # Always store back: allocation and conversion both produce new tensors.
        setattr(state, "_flash_cached_keys", key_buffer)
        setattr(state, "_flash_cached_values", value_buffer)
        return key_buffer, value_buffer

    def _build_streaming_kv(
        self,
        cached_k: torch.Tensor,
        cached_v: torch.Tensor,
        cached_pos: torch.Tensor,
        k_cur: torch.Tensor,
        v_cur: torch.Tensor,
        pos_q: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Append the current chunk after the cached entries.

        Keys and values are concatenated along dim 2; the query positions are
        appended to the cached positions along dim 1.
        """
        keys = torch.cat([cached_k, k_cur], dim=2)
        values = torch.cat([cached_v, v_cur], dim=2)
        key_positions = torch.cat([cached_pos, pos_q], dim=1)
        return keys, values, key_positions

    def _update_streaming_cache(
        self,
        state: MHAState,
        cached_k: torch.Tensor,
        cached_v: torch.Tensor,
        cached_pos: torch.Tensor,
        k_all: torch.Tensor,
        v_all: torch.Tensor,
        pos_k: torch.Tensor,
    ) -> None:
        """Store the K/V/positions that the next streaming step should see.

        Without a context limit the caches are replaced by the full concatenated
        tensors, which requires every row to be executing. With a limit, the last
        `context` entries are copied into the existing buffers, but only for rows
        whose `exec_mask` is set; other rows keep their previous cache.

        Raises:
            RuntimeError: If `context` is `None` and some row is not executing.
        """
        row_mask_kv = state.exec_mask.view(-1, 1, 1, 1)
        row_mask_pos = state.exec_mask.view(-1, 1)

        window = self.context
        if window is None:
            if not bool(state.exec_mask.all().item()):
                raise RuntimeError("Streaming exec_mask with context=None is not supported.")
            state.cached_keys = k_all.contiguous()
            state.cached_values = v_all.contiguous()
            state.cached_positions = pos_k.contiguous()
            return

        assert state.cached_keys is not None
        assert state.cached_values is not None
        assert state.cached_positions is not None

        latest_k = k_all[:, :, -window:, :].contiguous()
        latest_v = v_all[:, :, -window:, :].contiguous()
        latest_pos = pos_k[:, -window:].contiguous()

        # In-place copies keep the cache tensors' identity stable across steps.
        state.cached_keys.copy_(torch.where(row_mask_kv, latest_k, cached_k))
        state.cached_values.copy_(torch.where(row_mask_kv, latest_v, cached_v))
        state.cached_positions.copy_(torch.where(row_mask_pos, latest_pos, cached_pos))

    def _build_streaming_sdpa_bias(self, pos_q: torch.Tensor, pos_k: torch.Tensor) -> torch.Tensor:
        """Build the boolean `[B, 1, Tq, Tk]` mask for streaming SDPA.

        A key is visible when its position is non-negative, it is not in the
        query's future, and (with a context limit) it is fewer than `context`
        steps behind the query.
        """
        key_positions = pos_k[:, None, :]
        distance = pos_q[:, :, None] - key_positions
        visible = (key_positions >= 0) & (distance >= 0)
        if self.context is not None:
            visible = visible & (distance < self.context)
        return visible[:, None, :, :]

    def _build_non_streaming_sdpa_bias(
        self,
        input_lengths: torch.Tensor,
        max_seqlen: int,
        device: torch.device,
    ) -> torch.Tensor:
        """Build the boolean `[B, 1, T, T]` mask for dense SDPA.

        Padded keys (index >= the row's length) are always hidden. When `causal`
        is set, future keys are hidden too; when `context` is set, keys that are
        `context` or more steps behind the query are hidden.
        """
        time_index = torch.arange(max_seqlen, device=device, dtype=torch.long)
        key_is_real = time_index.view(1, 1, max_seqlen) < input_lengths.view(-1, 1, 1)

        needs_relative_mask = self.causal or self.context is not None
        if not needs_relative_mask:
            # Only padding matters: broadcast the key mask over every query row.
            return key_is_real[:, None, :, :].expand(-1, 1, max_seqlen, -1)

        distance = time_index.view(1, max_seqlen, 1) - time_index.view(1, 1, max_seqlen)
        visible = torch.ones((1, max_seqlen, max_seqlen), device=device, dtype=torch.bool)
        if self.causal:
            visible = visible & (distance >= 0)
        if self.context is not None:
            visible = visible & (distance < self.context)
        return (visible & key_is_real)[:, None, :, :]

    def _run_flash_attention(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        cu_seqlens_q: torch.Tensor,
        cu_seqlens_k: torch.Tensor,
        max_seqlen_q: int,
        max_seqlen_k: int,
    ) -> torch.Tensor:
        """Run variable-length flash attention on packed q/k/v.

        Args:
            q: Packed queries.
            k: Packed keys.
            v: Packed values.
            cu_seqlens_q: Cumulative query sequence lengths.
            cu_seqlens_k: Cumulative key sequence lengths.
            max_seqlen_q: Longest query sequence in the batch.
            max_seqlen_k: Longest key sequence in the batch.

        Returns:
            The attention output returned by `flash_attn_varlen_func`.

        Raises:
            RuntimeError: If flash-attn is not installed.
        """
        flash_attn_varlen_func = _get_flash_attn_varlen_func()
        if flash_attn_varlen_func is None:
            raise RuntimeError("flash-attn is not installed.")
        if self.context is not None and self.causal:
            # flash-attn's window bounds are inclusive, so a left window of `context - 1`
            # exposes exactly `context` keys (the query itself plus `context - 1` earlier
            # ones). That matches the `delta < context` rule of both SDPA masks and the
            # `(context - 1, 0)` window of the kv-cache path; `(context, 0)` let each
            # query see one extra key.
            window_size = (self.context - 1, 0)
        else:
            # NOTE(review): a non-causal module with a finite `context` gets no window
            # here, while the dense SDPA mask still applies `delta < context` - confirm
            # whether that configuration is ever used with flash attention.
            window_size = (-1, -1)
        return cast(
            torch.Tensor,
            flash_attn_varlen_func(
                q.contiguous(),
                k.contiguous(),
                v.contiguous(),
                cu_seqlens_q,
                cu_seqlens_k,
                max_seqlen_q,
                max_seqlen_k,
                causal=self.causal,
                window_size=window_size,
            ),
        )

    def _forward_streaming_sdpa(self, x: torch.Tensor, state: MHAState) -> torch.Tensor:
        """Attend one streaming chunk `[B, T, C]` with SDPA and advance the state.

        Returns the attention output reshaped to `[B, T, embed_dim]`, before the
        output projection. The cache and offsets only advance for rows whose
        `exec_mask` is set.
        """
        batch_size, chunk_length, _ = x.shape
        q, k_new, v_new = self._project_qkv(x)
        if self.rope is not None:
            q, k_new = self.rope(q, k_new, state.offset, time_before_heads=False)

        # Absolute position of every query in the chunk, per row.
        step_index = torch.arange(chunk_length, device=x.device, dtype=torch.long)
        pos_q = state.offset.view(-1, 1) + step_index.view(1, -1)

        cached_k, cached_v, cached_pos = self._ensure_streaming_cache(state, batch_size, k_new.device, k_new.dtype)
        k_all, v_all, pos_k = self._build_streaming_kv(cached_k, cached_v, cached_pos, k_new, v_new, pos_q)

        visibility = self._build_streaming_sdpa_bias(pos_q, pos_k)
        attended = F.scaled_dot_product_attention(q, k_all, v_all, visibility, dropout_p=0.0)
        attended = attended.transpose(1, 2).reshape(batch_size, chunk_length, self.embed_dim)

        self._update_streaming_cache(state, cached_k, cached_v, cached_pos, k_all, v_all, pos_k)
        advanced_offset = state.offset + chunk_length
        state.offset[:] = torch.where(state.exec_mask, advanced_offset, state.offset)
        return attended

    def _forward_streaming_flash(self, x: torch.Tensor, state: MHAState) -> torch.Tensor:
        """Attend one streaming chunk `[B, T, C]` with variable-length flash attention.

        Each row's queries and its valid keys/values (cached entries with a
        non-negative position plus the current chunk) are packed into one varlen
        batch. Returns the output reshaped to `[B, T, embed_dim]`, before the
        output projection, and advances cache and offsets for executing rows.
        """
        batch_size, chunk_length, _ = x.shape
        q, k_new, v_new = self._project_qkv(x)
        if self.rope is not None:
            q, k_new = self.rope(q, k_new, state.offset, time_before_heads=False)

        step_index = torch.arange(chunk_length, device=x.device, dtype=torch.long)
        pos_q = state.offset.view(-1, 1) + step_index.view(1, -1)
        cached_k, cached_v, cached_pos = self._ensure_streaming_cache(state, batch_size, k_new.device, k_new.dtype)
        k_all, v_all, pos_k = self._build_streaming_kv(cached_k, cached_v, cached_pos, k_new, v_new, pos_q)

        packed_q, packed_k, packed_v = [], [], []
        q_boundaries, k_boundaries = [0], [0]
        for row in range(batch_size):
            # Cache slots that were never filled carry position -1 and are dropped.
            keep = pos_k[row] >= 0
            row_q = q[row].transpose(0, 1).contiguous()
            row_k = k_all[row, :, keep, :].transpose(0, 1).contiguous()
            row_v = v_all[row, :, keep, :].transpose(0, 1).contiguous()
            packed_q.append(row_q)
            packed_k.append(row_k)
            packed_v.append(row_v)
            q_boundaries.append(q_boundaries[-1] + row_q.shape[0])
            k_boundaries.append(k_boundaries[-1] + row_k.shape[0])
        longest_kv = max((int(row_k.shape[0]) for row_k in packed_k), default=0)

        out_flat = self._run_flash_attention(
            torch.cat(packed_q, dim=0),
            torch.cat(packed_k, dim=0),
            torch.cat(packed_v, dim=0),
            torch.tensor(q_boundaries, device=x.device, dtype=torch.int32),
            torch.tensor(k_boundaries, device=x.device, dtype=torch.int32),
            max_seqlen_q=chunk_length,
            max_seqlen_k=longest_kv,
        )

        # Every row contributes exactly `chunk_length` queries, so the packed
        # [B * T, H, D] output folds straight back into [B, T, H * D].
        attended = out_flat.reshape(batch_size, chunk_length, self.embed_dim)

        self._update_streaming_cache(state, cached_k, cached_v, cached_pos, k_all, v_all, pos_k)
        advanced_offset = state.offset + chunk_length
        state.offset[:] = torch.where(state.exec_mask, advanced_offset, state.offset)
        return attended

    def _forward_streaming_flash_kvcache(self, x: torch.Tensor, state: MHAState) -> torch.Tensor:
        """Attend one streaming chunk `[B, T, C]` through `flash_attn_with_kvcache`.

        Returns the output reshaped to `[B, T, embed_dim]`, before the output
        projection, and advances `state.offset` for rows whose `exec_mask` is set.

        Raises:
            RuntimeError: If `context` is `None` or flash-attn is not installed.
        """
        flash_attn_with_kvcache = _get_flash_attn_with_kvcache()
        if self.context is None:
            raise RuntimeError("flash_attn_with_kvcache requires a finite streaming context.")
        if flash_attn_with_kvcache is None:
            raise RuntimeError("flash-attn is not installed.")

        batch_size, chunk_length, _ = x.shape
        q, k_cur, v_cur = self._project_qkv(x)
        if self.rope is not None:
            q, k_cur = self.rope(q, k_cur, state.offset, time_before_heads=False)

        # [B, H, T, D] -> [B, T, H, D], the same layout as the buffers from `_ensure_flash_kvcache`.
        q = q.transpose(1, 2).contiguous()
        k_cur = k_cur.transpose(1, 2).contiguous()
        v_cur = v_cur.transpose(1, 2).contiguous()

        # New keys/values of rows that are not executing are zeroed out.
        exec_mask = state.exec_mask.view(batch_size, 1, 1, 1).to(dtype=k_cur.dtype)
        k_cur = k_cur * exec_mask
        v_cur = v_cur * exec_mask

        k_cache, v_cache = self._ensure_flash_kvcache(state, batch_size, k_cur.device, k_cur.dtype)
        # Per-row count of cached entries, capped at the cache length.
        # NOTE(review): the kernel presumably appends `k`/`v` in place at `cache_seqlens`;
        # confirm how rows with offset >= context are handled by the caller.
        cache_seqlens = state.offset.clamp(max=self.context).to(torch.int32)
        # A left window of `context - 1` plus the query itself gives `context` visible keys.
        window_size = (self.context - 1, 0)

        out = cast(
            torch.Tensor,
            flash_attn_with_kvcache(
                q,
                k_cache,
                v_cache,
                k=k_cur,
                v=v_cur,
                cache_seqlens=cache_seqlens,
                causal=True,
                window_size=window_size,
            ),
        )
        out = out.reshape(batch_size, chunk_length, self.embed_dim)
        # Only executing rows advance their offset.
        state.offset[:] = torch.where(state.exec_mask, state.offset + chunk_length, state.offset)
        return out

    def _forward_non_streaming_sdpa(self, x: torch.Tensor, input_lengths: torch.Tensor) -> torch.Tensor:
        """Attend a dense padded batch `[B, T, C]` with SDPA.

        Returns `[B, T, embed_dim]` (before the output projection) in which rows
        at padded query positions are exact zeros.
        """
        batch_size, max_seqlen, _ = x.shape
        q, k, v = self._project_qkv(x)
        q, k = self._apply_dense_rope(q, k)
        visibility = self._build_non_streaming_sdpa_bias(input_lengths, max_seqlen, x.device)
        attended = F.scaled_dot_product_attention(q, k, v, visibility, dropout_p=0.0)

        time_index = torch.arange(max_seqlen, device=x.device)
        query_is_real = time_index.view(1, max_seqlen) < input_lengths.view(-1, 1)
        query_is_real = query_is_real.view(batch_size, 1, max_seqlen, 1)
        # Some SDPA backends return NaNs for fully-masked padded query rows in local-causal
        # attention. Multiplying by zero would not help (NaN * 0 is still NaN), so padded rows
        # are replaced with exact zeros via torch.where before they can leak into later
        # layers as masked K/V values.
        zero = torch.zeros((), device=attended.device, dtype=attended.dtype)
        attended = torch.where(query_is_real, attended, zero)
        return attended.transpose(1, 2).reshape(batch_size, max_seqlen, self.embed_dim)

    def _forward_non_streaming_flash(
        self,
        x: torch.Tensor,
        cu_seqlens: torch.Tensor,
        max_seqlen: int,
        position_ids: torch.Tensor,
    ) -> torch.Tensor:
        """Attend a packed `[N, C]` batch with variable-length flash attention.

        Queries and keys share `cu_seqlens` / `max_seqlen` (self-attention).
        Returns `[N, embed_dim]`, before the output projection.
        """
        num_tokens = x.shape[0]
        q, k, v = self._project_qkv(x)
        q, k = self._apply_packed_rope(q, k, position_ids)
        attended = self._run_flash_attention(
            q,
            k,
            v,
            cu_seqlens,
            cu_seqlens,
            max_seqlen,
            max_seqlen,
        )
        return attended.reshape(num_tokens, self.embed_dim)

    def forward(
        self,
        query: torch.Tensor,
        cu_seqlens: torch.Tensor | None = None,
        max_seqlen: int | None = None,
        position_ids: torch.Tensor | None = None,
        input_lengths: torch.Tensor | None = None,
    ):
        """Run self-attention on `query` and apply the output projection.

        The path depends on whether a streaming state is active and on the
        resolved backend:

        * streaming: `query` must be `[B, T, C]`;
        * non-streaming flash attention: `query` must be packed `[N, C]` and
          `cu_seqlens`, `max_seqlen` and `position_ids` are required;
        * non-streaming SDPA: `query` must be `[B, T, C]` and `input_lengths`
          is required.

        Raises:
            ValueError: If the rank of `query` or the supplied metadata does not
                match the selected path.
        """
        state = cast(MHAState | None, self._streaming_state)
        backend = self.resolve_attention_implementation(query, is_streaming=state is not None)
        use_flash = backend == "flash_attention_2"

        if state is not None:
            if query.dim() != 3:
                raise ValueError(f"Streaming attention expects a 3D tensor, got shape {tuple(query.shape)}")
            if not use_flash:
                attended = self._forward_streaming_sdpa(query, state)
            elif self._use_flash_kvcache:
                attended = self._forward_streaming_flash_kvcache(query, state)
            else:
                attended = self._forward_streaming_flash(query, state)
        elif use_flash:
            if query.dim() != 2:
                raise ValueError(f"Packed flash attention expects a 2D tensor, got shape {tuple(query.shape)}")
            if cu_seqlens is None or max_seqlen is None or position_ids is None:
                raise ValueError("Packed flash attention requires cu_seqlens, max_seqlen, and position_ids.")
            attended = self._forward_non_streaming_flash(query, cu_seqlens, max_seqlen, position_ids)
        else:
            if query.dim() != 3:
                raise ValueError(f"Non-streaming SDPA expects a 3D tensor, got shape {tuple(query.shape)}")
            if input_lengths is None:
                raise ValueError("Non-streaming SDPA requires input_lengths.")
            attended = self._forward_non_streaming_sdpa(query, input_lengths)

        return self.out_proj(attended)


# =============================================================================
# Transformer Layer
# =============================================================================


# Copy the globals defined so far into this module's `sys.modules` entry before the
# dataclass below is built. NOTE(review): presumably so `@dataclass` can resolve
# names through `sys.modules` when the file is loaded dynamically - confirm.
_sync_module_proxy()
@dataclass
class LayerState(StreamingState):
    """Streaming state of a transformer layer; it adds no fields to `StreamingState`."""


class MossAudioTokenizerTransformerLayer(StreamingModule):
    """Pre-norm transformer block with streaming support.

    Each of the two sub-blocks (self-attention, feed-forward) is applied as
    ``x + layer_scale(sublayer(norm(x)))``.

    Args:
        d_model: Width of the residual stream.
        num_heads: Number of attention heads.
        dim_feedforward: Hidden width of the feed-forward network.
        causal: Forwarded to the attention module.
        context: Forwarded to the attention module.
        rope: Optional rotary embedding forwarded to the attention module.
        attention_implementation: Attention backend name forwarded to the attention module.
        norm: Normalization kind passed to `create_norm_fn`.
        layer_scale: Initial value of the layer-scale modules; `None` replaces them with identity.
        gating: `"none"` builds a Linear-GELU-Linear feed-forward; any other value is passed to
            `make_gating`.
        device: Device passed to the created sub-modules.
        dtype: Dtype passed to the created sub-modules.
    """

    def __init__(
        self,
        d_model: int,
        num_heads: int,
        dim_feedforward: int = 2048,
        causal: bool = False,
        context: int | None = None,
        rope: MossAudioTokenizerRotaryEmbedding | None = None,
        attention_implementation: str = "sdpa",
        norm: str = "layer_norm",
        layer_scale: float | None = None,
        gating: str = "none",
        device=None,
        dtype=None,
    ):
        super().__init__()
        factory_kwargs = {"device": device, "dtype": dtype}

        # Sub-modules are created in a fixed order so parameter registration stays stable.
        self.self_attn = MossAudioTokenizerMultiheadAttention(
            embed_dim=d_model,
            num_heads=num_heads,
            causal=causal,
            context=context,
            rope=rope,
            attention_implementation=attention_implementation,
            **factory_kwargs,
        )
        self.norm1 = create_norm_fn(norm, d_model, **factory_kwargs)
        self.norm2 = create_norm_fn(norm, d_model, **factory_kwargs)

        if gating != "none":
            self.ffn = make_gating(gating, d_model, dim_feedforward, **factory_kwargs)
        else:
            up_proj = nn.Linear(d_model, dim_feedforward, bias=False, **factory_kwargs)
            activation = nn.GELU()
            down_proj = nn.Linear(dim_feedforward, d_model, bias=False, **factory_kwargs)
            self.ffn = nn.Sequential(up_proj, activation, down_proj)

        def build_layer_scale():
            # Identity when layer scale is disabled, otherwise a learnable per-channel scale.
            if layer_scale is None:
                return nn.Identity()
            scale_kwargs = cast(dict[str, object], factory_kwargs)
            return MossAudioTokenizerLayerScale(
                channels=d_model, init=layer_scale, channel_last=True, **scale_kwargs
            )

        self.layer_scale_1 = build_layer_scale()
        self.layer_scale_2 = build_layer_scale()

        self._register_load_state_dict_pre_hook(self._load_hook, with_module=True)

    @staticmethod
    def _load_hook(module, state_dict, prefix, *_):
        """Rename `linear1`/`linear2` checkpoint keys to the `ffn.0`/`ffn.2` layout, in place."""
        legacy_to_current = (
            ("linear1.weight", "ffn.0.weight"),
            ("linear2.weight", "ffn.2.weight"),
            ("linear1.bias", "ffn.0.bias"),
            ("linear2.bias", "ffn.2.bias"),
        )
        for legacy_name, current_name in legacy_to_current:
            legacy_key = prefix + legacy_name
            if legacy_key not in state_dict:
                continue
            state_dict[prefix + current_name] = state_dict.pop(legacy_key)

    def _init_streaming_state(self, batch_size: int) -> LayerState:
        """Create the streaming state on the device of this layer's first parameter."""
        first_param = next(iter(self.parameters()))
        return LayerState(batch_size, first_param.device)

    def forward(self, x: torch.Tensor, **kwargs):
        """Apply the attention and feed-forward sub-blocks; `kwargs` go to the attention module."""
        # Attention sub-block; the skip connection is cast to match the normalized tensor.
        skip = x
        normed = self.norm1(x)
        x = skip.to(normed) + self.layer_scale_1(self.self_attn(normed, **kwargs))

        # Feed-forward sub-block.
        skip = x
        normed = self.norm2(x)
        return skip.to(normed) + self.layer_scale_2(self.ffn(normed))


# =============================================================================
# Streaming Transformer
# =============================================================================


# Copy the names defined so far into the object registered for this module in `sys.modules`.
_sync_module_proxy()
@dataclass
class TransformerState(StreamingState):
    """Streaming state of the transformer: one running position offset per batch entry."""

    offsets: torch.Tensor

    def reset(self, reset_mask: torch.Tensor):
        """Reset the base state and zero the offsets wherever `reset_mask` is true."""
        super().reset(reset_mask)
        zeroed = torch.zeros_like(self.offsets)
        # Written through a slice so the existing tensor is updated in place.
        self.offsets[:] = torch.where(reset_mask, zeroed, self.offsets)


class MossAudioTokenizerTransformer(StreamingModule):
    """Stack of transformer layers with optional positional embeddings and streaming support.

    Args:
        d_model: Width of the residual stream; must be divisible by `num_heads`.
        num_heads: Number of attention heads per layer.
        num_layers: Number of stacked layers.
        dim_feedforward: Hidden width of each feed-forward network.
        causal: Forwarded to every layer.
        context: Forwarded to every layer.
        positional_embedding: `"sin"` / `"sin_rope"` add a sinusoidal embedding to the input;
            `"rope"` / `"sin_rope"` create a rotary embedding shared by all layers.
        max_period: Period used by the positional embeddings.
        positional_scale: Multiplier applied to the sinusoidal embedding.
        attention_implementation: Attention backend name forwarded to every layer.
        device: Device forwarded to every layer.
        dtype: Dtype forwarded to every layer.
        **kwargs: Extra keyword arguments forwarded to every layer.

    Raises:
        ValueError: If `d_model` is not divisible by `num_heads`.
    """

    def __init__(
        self,
        d_model: int,
        num_heads: int,
        num_layers: int,
        dim_feedforward: int = 2048,
        causal: bool = False,
        context: int | None = None,
        positional_embedding: str = "sin",
        max_period: float = 10_000,
        positional_scale: float = 1.0,
        attention_implementation: str = "sdpa",
        device=None,
        dtype=None,
        **kwargs,
    ):
        super().__init__()
        if d_model % num_heads:
            raise ValueError(f"d_model must be divisible by num_heads, got d_model={d_model}, num_heads={num_heads}")

        self.positional_embedding = positional_embedding
        self.max_period = max_period
        self.positional_scale = positional_scale

        # A single rotary embedding instance is shared by all layers.
        uses_rope = positional_embedding in {"rope", "sin_rope"}
        self.rope: MossAudioTokenizerRotaryEmbedding | None = (
            MossAudioTokenizerRotaryEmbedding(max_period=max_period) if uses_rope else None
        )

        self.layers = nn.ModuleList(
            [
                MossAudioTokenizerTransformerLayer(
                    d_model=d_model,
                    num_heads=num_heads,
                    dim_feedforward=dim_feedforward,
                    causal=causal,
                    context=context,
                    rope=self.rope,
                    attention_implementation=attention_implementation,
                    device=device,
                    dtype=dtype,
                    **kwargs,
                )
                for _ in range(num_layers)
            ]
        )

    def _init_streaming_state(self, batch_size: int) -> TransformerState:
        """Create a streaming state with all offsets at zero, on the first parameter's device."""
        device = next(self.parameters()).device
        initial_offsets = torch.zeros(batch_size, device=device, dtype=torch.long)
        return TransformerState(batch_size, device, offsets=initial_offsets)

    def resolve_attention_implementation(self, x: torch.Tensor) -> str:
        """Return the backend the first layer's attention resolves for `x` (`"sdpa"` if no layers)."""
        if not self.layers:
            return "sdpa"
        first_layer = cast(MossAudioTokenizerTransformerLayer, self.layers[0])
        streaming = self._streaming_state is not None
        return first_layer.self_attn.resolve_attention_implementation(x, is_streaming=streaming)

    def set_attention_implementation(self, attention_implementation: str) -> None:
        """Set the attention backend on every layer."""
        for layer in self.layers:
            attention = cast(MossAudioTokenizerTransformerLayer, layer).self_attn
            attention.set_attention_implementation(attention_implementation)

    def forward(self, x: torch.Tensor, **kwargs):
        """Add positional embeddings (if configured) and run all layers.

        A 3D `x` is treated as a padded batch; any other rank is treated as a packed input, for
        which sinusoidal embeddings need `position_ids` in `kwargs`. `kwargs` are forwarded to
        every layer.

        Raises:
            ValueError: If sinusoidal embeddings are enabled for a packed input without
                `position_ids`.
        """
        state = self._streaming_state
        is_padded_batch = x.dim() == 3
        num_channels = x.shape[-1]
        # Packed inputs do not advance the streaming offsets, so their step count is zero.
        num_steps = x.shape[1] if is_padded_batch else 0

        if self.positional_embedding in {"sin", "sin_rope"}:
            if is_padded_batch:
                # Streaming continues from the stored offsets; otherwise positions start at zero.
                if isinstance(state, TransformerState):
                    offsets = state.offsets
                else:
                    offsets = torch.zeros(1, dtype=torch.long, device=x.device)
                positions = torch.arange(num_steps, device=x.device).view(1, -1) + offsets.view(-1, 1)
            else:
                positions = kwargs.get("position_ids")
                if positions is None:
                    raise ValueError("Packed transformer inputs require position_ids when using sinusoidal embeddings.")
            pos_emb = create_sin_embedding(positions, num_channels, max_period=self.max_period, dtype=x.dtype)
            x = x + self.positional_scale * pos_emb

        for layer in self.layers:
            x = layer(x, **kwargs)

        if state is not None and x.dim() == 3:
            assert isinstance(state, TransformerState)
            # Only entries selected by the execution mask advance by the processed step count.
            advanced = state.offsets + num_steps
            state.offsets[:] = torch.where(state.exec_mask, advanced, state.offsets)
        return x


class MossAudioTokenizerProjectedTransformer(StreamingContainer):
    """Transformer wrapped by bias-free linear input and output projections.

    Args:
        input_dimension: Channel count of the incoming features.
        output_dimension: Channel count of the produced features.
        d_model: Width of the inner transformer.
        conv_layout: Stored on the module as `conv_layout`.
        module_type: Stored on the module as `module_type`.
        **kwargs: Forwarded to `MossAudioTokenizerTransformer`.
    """

    def __init__(
        self,
        input_dimension: int,
        output_dimension: int,
        d_model: int,
        *,
        conv_layout: bool = False,
        module_type: str,
        **kwargs,
    ):
        super().__init__()
        self.module_type = module_type
        # This module keeps the time resolution unchanged.
        self.downsample_ratio: int = 1
        self.input_dimension = input_dimension
        self.output_dimension = output_dimension

        self.input_proj = nn.Linear(input_dimension, d_model, bias=False)
        self.transformer = MossAudioTokenizerTransformer(d_model=d_model, **kwargs)
        self.conv_layout = conv_layout
        self.output_proj = nn.Linear(d_model, output_dimension, bias=False)

    def set_attention_implementation(self, attention_implementation: str) -> None:
        """Forward the attention backend choice to the inner transformer."""
        self.transformer.set_attention_implementation(attention_implementation)

    def forward(self, x, input_lengths, **kwargs):
        """Project, run the transformer, and project back.

        Args:
            x: Features laid out as (B, D, T).
            input_lengths: Valid lengths per batch entry; returned unchanged.
            **kwargs: Forwarded to the inner transformer.

        Returns:
            Tuple of the output features laid out as (B, D, T) and `input_lengths`.
        """
        hidden = self.input_proj(x.transpose(1, 2))  # (B, D, T) -> (B, T, D)

        use_packed = (
            not self.is_streaming
            and self.transformer.resolve_attention_implementation(hidden) == "flash_attention_2"
        )
        if not use_packed:
            hidden = self.transformer(hidden, input_lengths=input_lengths, **kwargs)
        else:
            batch_size, max_seqlen, _ = hidden.shape
            has_valid_frames = max_seqlen > 0 and bool(input_lengths.any().item())
            if has_valid_frames:
                # Drop padding, run on the packed frames, then restore the padded layout.
                longest_valid = int(input_lengths.max().item())
                packed, valid_mask, cu_seqlens, position_ids = pack_padded_sequence(hidden, input_lengths)
                packed = self.transformer(
                    packed,
                    cu_seqlens=cu_seqlens,
                    max_seqlen=longest_valid,
                    position_ids=position_ids,
                    input_lengths=input_lengths,
                    **kwargs,
                )
                hidden = unpack_packed_sequence(packed, valid_mask, batch_size, max_seqlen)
            else:
                # Nothing valid to attend over: the transformer output is replaced by zeros.
                hidden = hidden.new_zeros(hidden.shape)

        out = self.output_proj(hidden).transpose(1, 2)  # (B, T, D) -> (B, D, T)
        return out, input_lengths


# =============================================================================
# Patched Pretransform Module
# =============================================================================


class MossAudioTokenizerPatchedPretransform(nn.Module):
    """Parameter-free patching module that trades time resolution for channels.

    When `is_downsample` is true, `forward` folds every `patch_size` consecutive time steps
    into the channel axis (`encode`); otherwise it unfolds them again (`decode`).

    Args:
        patch_size: Number of time steps folded into (or unfolded from) the channel axis.
        is_downsample: Selects `encode` (true) or `decode` (false) in `forward`.
        module_type: Stored on the module as `module_type`.
        **kwargs: Accepted and ignored.
    """

    def __init__(self, patch_size: int, is_downsample: bool, module_type: str, **kwargs):
        super().__init__()
        self.patch_size = patch_size
        self.downsample_ratio: int = patch_size
        self.is_downsample = is_downsample
        self.module_type = module_type

    def encode(self, x, input_lengths):
        """Fold patches of `patch_size` time steps into channels.

        Args:
            x: Tensor of shape (B, D, T) with T divisible by `patch_size`.
            input_lengths: Valid lengths along T (int or integer tensor).

        Returns:
            Tuple of the tensor of shape (B, D * patch_size, T // patch_size) and the valid
            lengths in patches, rounded up.
        """
        b, d, _ = x.shape
        h = self.patch_size
        x = x.reshape(b, d, -1, h).permute(0, 1, 3, 2).reshape(b, d * h, -1)
        # We pad the input waveform to a multiple of `downsample_rate` before applying the encoder.
        # Use a ceil division to match that padding and avoid dropping the last (partially padded) frame.
        output_lengths = (input_lengths + h - 1) // h
        return x, output_lengths

    def decode(self, x, input_lengths):
        """Unfold channels back into time; the inverse layout change of `encode`.

        Args:
            x: Tensor of shape (B, D * patch_size, L).
            input_lengths: Valid lengths along L (int or integer tensor).

        Returns:
            Tuple of the tensor of shape (B, D, L * patch_size) and `input_lengths * patch_size`.
        """
        b, dh, l = x.shape
        h = self.patch_size
        d = dh // h
        x = x.reshape(b, d, h, l).permute(0, 1, 3, 2).reshape(b, d, l * h)
        output_lengths = input_lengths * self.patch_size
        return x, output_lengths

    def forward(self, x, input_lengths):
        """Dispatch to `encode` or `decode` depending on `is_downsample`."""
        if self.is_downsample:
            return self.encode(x, input_lengths)
        return self.decode(x, input_lengths)


# =============================================================================
# Vector Quantization
# =============================================================================


def WNConv1d(*args, **kwargs):
    """Weight-normalized Conv1d."""
    return nn.utils.parametrizations.weight_norm(nn.Conv1d(*args, **kwargs))


def remap_weight_norm_state_dict_keys(state_dict: dict[str, torch.Tensor], prefix: str) -> None:
    """Rename `weight_g` / `weight_v` checkpoint keys to the parametrization layout, in place.

    Keys under `prefix` that end in `.weight_g` or `.weight_v` are renamed to end in
    `.parametrizations.weight.original0` or `.parametrizations.weight.original1`.

    Only the trailing component of a key is rewritten, so unrelated names that merely contain
    the same text (for example `foo.weight_gain`) are left untouched.

    Args:
        state_dict: Mapping to update in place.
        prefix: Only keys starting with this prefix are considered.
    """
    suffix_map = (
        (".weight_g", ".parametrizations.weight.original0"),
        (".weight_v", ".parametrizations.weight.original1"),
    )
    # Iterate over a snapshot of the keys because the mapping is mutated inside the loop.
    for key in list(state_dict):
        if not key.startswith(prefix):
            continue
        for old_suffix, new_suffix in suffix_map:
            if key.endswith(old_suffix):
                renamed = key[: -len(old_suffix)] + new_suffix
                state_dict[renamed] = state_dict.pop(key)
                break


class MossAudioTokenizerVectorQuantize(nn.Module):
    """Nearest-neighbour quantizer over a single codebook (inference only).

    Args:
        input_dim: Channel count of the incoming features.
        codebook_size: Number of codebook entries.
        codebook_dim: Dimension of each codebook entry.
        **kwargs: Accepted and ignored.
    """

    def __init__(
        self,
        input_dim: int,
        codebook_size: int,
        codebook_dim: int,
        **kwargs,
    ):
        super().__init__()
        self.input_dim = input_dim
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim

        # Projections are only needed when the feature and codebook dimensions differ.
        needs_projection = input_dim != codebook_dim
        self.in_proj = WNConv1d(input_dim, codebook_dim, kernel_size=1) if needs_projection else nn.Identity()
        self.out_proj = WNConv1d(codebook_dim, input_dim, kernel_size=1) if needs_projection else nn.Identity()

        self.codebook = nn.Embedding(codebook_size, codebook_dim)

    @torch.no_grad()
    def forward(self, z: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Quantize `z` to its closest codebook entries.

        Args:
            z: Input tensor of shape (B, D, T).

        Returns:
            z_q: Quantized tensor of shape (B, D, T).
            indices: Code indices of shape (B, T).
            z_e: Projected input before quantization.
        """
        z = z.float()
        z_e = self.in_proj(z).float()

        # One row per (batch, time) position.
        flat = z_e.transpose(1, 2).reshape(-1, z_e.shape[1])

        weight = self.codebook.weight
        # Squared Euclidean distance expanded as |a|^2 - 2ab + |b|^2.
        flat_sq = flat.pow(2).sum(1, keepdim=True)
        cross = 2 * flat @ weight.float().t()
        weight_sq = weight.float().pow(2).sum(1, keepdim=True).t()
        dist = flat_sq - cross + weight_sq

        nearest = torch.max(-dist, dim=1).indices
        indices = nearest.reshape(z.size(0), -1)

        z_q = self.out_proj(self.decode_code(indices)).float()
        return z_q, indices, z_e

    def decode_code(self, embed_id: torch.Tensor) -> torch.Tensor:
        """Look up code indices and return the embeddings with channels on axis 1, as float32."""
        looked_up = self.codebook(embed_id)
        return looked_up.transpose(1, 2).float()


class MossAudioTokenizerLFQ(nn.Module):
    """Single LFQ quantizer (inference only) used by `MossAudioTokenizerResidualLFQ`.

    Args:
        input_dim: Channel count of the incoming features.
        codebook_size: Number of codebook entries.
        codebook_dim: Dimension of each codebook entry.
        **kwargs: Accepted and ignored.
    """

    def __init__(
        self,
        input_dim: int,
        codebook_size: int,
        codebook_dim: int,
        **kwargs,
    ):
        super().__init__()
        self.input_dim = input_dim
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim

        # Projections are only needed when the feature and codebook dimensions differ.
        needs_projection = self.input_dim != self.codebook_dim
        self.in_proj = (
            WNConv1d(self.input_dim, self.codebook_dim, kernel_size=1) if needs_projection else nn.Identity()
        )
        self.out_proj = (
            WNConv1d(self.codebook_dim, self.input_dim, kernel_size=1) if needs_projection else nn.Identity()
        )

        self.codebook = nn.Embedding(codebook_size, codebook_dim)

    @torch.no_grad()
    def forward(self, z: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Quantize `z`; returns the quantized tensor, the code indices and the projected input."""
        z = z.float()
        z_e = self.in_proj(z).float()
        codes, indices = self.decode_latents(z_e)
        # Same arithmetic form as a straight-through estimator; the value equals the selected codes
        # up to floating-point rounding.
        passthrough = (z_e + (codes - z_e).detach()).float()
        z_q = self.out_proj(passthrough).float()
        return z_q, indices, z_e

    def embed_code(self, embed_id: torch.Tensor) -> torch.Tensor:
        """Look up code indices in the codebook."""
        return F.embedding(embed_id, self.codebook.weight)

    def decode_code_wo_out_proj(self, embed_id: torch.Tensor) -> torch.Tensor:
        """Look up code indices and move channels to axis 1, without the output projection."""
        embedded = self.embed_code(embed_id)
        return embedded.transpose(1, 2)

    def decode_code(self, embed_id: torch.Tensor) -> torch.Tensor:
        """Look up code indices and apply the output projection, returning float32."""
        latent = self.decode_code_wo_out_proj(embed_id).float()
        return self.out_proj(latent).float()

    def decode_latents(self, latents: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Match training LFQ: L2-normalize latents and codebook, then pick the closest entry.

        Returns:
            Tuple of the selected (un-normalized) codebook vectors with channels on axis 1 and
            the code indices reshaped to one row per batch entry.
        """
        flat = latents.transpose(1, 2).reshape(-1, latents.shape[1]).float()
        weight = self.codebook.weight.float()

        flat = F.normalize(flat)
        weight = F.normalize(weight)

        # Squared Euclidean distance expanded as |a|^2 - 2ab + |b|^2.
        flat_sq = flat.pow(2).sum(1, keepdim=True)
        cross = 2 * flat @ weight.t()
        weight_sq = weight.pow(2).sum(1, keepdim=True).t()
        dist = flat_sq - cross + weight_sq

        nearest = torch.max(-dist, dim=1).indices
        indices = nearest.reshape(latents.size(0), -1)
        z_q = self.decode_code_wo_out_proj(indices).float()
        return z_q, indices


class MossAudioTokenizerResidualVQ(nn.Module):
    """Residual Vector Quantization (inference only).

    Each quantizer encodes the residual left by the previous ones; the quantized output is the
    sum of all per-quantizer outputs.

    Args:
        input_dim: Channel count of the incoming features.
        rvq_dim: Channel count the quantizers operate on; defaults to `input_dim`.
        output_dim: Channel count of the produced features; defaults to `input_dim`.
        num_quantizers: Number of residual quantizers.
        codebook_size: Number of entries per codebook.
        codebook_dim: Dimension of each codebook entry.
        **kwargs: Forwarded to every `MossAudioTokenizerVectorQuantize`.
    """

    def __init__(
        self,
        input_dim: int = 1024,
        rvq_dim: int | None = None,
        output_dim: int | None = None,
        num_quantizers: int = 32,
        codebook_size: int = 1024,
        codebook_dim: int = 8,
        **kwargs,
    ):
        super().__init__()
        self.input_dim = input_dim
        self.rvq_dim = rvq_dim or input_dim
        self.output_dim = output_dim or input_dim
        self.num_quantizers = num_quantizers
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim

        # Projections are only created when the dimensions on either side differ.
        self.input_proj = (
            WNConv1d(input_dim, self.rvq_dim, kernel_size=1) if input_dim != self.rvq_dim else nn.Identity()
        )
        self.output_proj = (
            WNConv1d(self.rvq_dim, self.output_dim, kernel_size=1)
            if self.rvq_dim != self.output_dim
            else nn.Identity()
        )

        self.quantizers = nn.ModuleList(
            [
                MossAudioTokenizerVectorQuantize(
                    input_dim=self.rvq_dim,
                    codebook_size=codebook_size,
                    codebook_dim=codebook_dim,
                    **kwargs,
                )
                for _ in range(num_quantizers)
            ]
        )
        # Checkpoints using `weight_g` / `weight_v` key names are remapped at load time.
        self._register_load_state_dict_pre_hook(self._load_hook, with_module=True)

    @staticmethod
    def _load_hook(module, state_dict, prefix, *_):
        """Rename weight-norm checkpoint keys under `prefix` before loading."""
        remap_weight_norm_state_dict_keys(state_dict, prefix)

    @torch.no_grad()
    def forward(
        self,
        z: torch.Tensor,
        input_length: torch.Tensor,
        n_quantizers: int | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Args:
            z: Input tensor of shape (B, D, T)
            input_length: Valid lengths for each sample (B,)
            n_quantizers: Number of quantizers to use; `None` (or 0) uses all of them
        Returns:
            quantized_out: Quantized output (B, D, T)
            all_indices: All code indices (N, B, T); an empty (0, B, T) long tensor when no
                quantizer ran
            output_length: Output lengths (B,)
        """
        with disable_cuda_autocast():
            z = self.input_proj(z).float()

            batch_size, _, max_time = z.shape
            mask = torch.arange(max_time, device=z.device).expand(batch_size, max_time) < input_length.unsqueeze(1)
            # Loop-invariant validity mask, broadcast over the channel axis.
            update_mask = mask.unsqueeze(1)

            quantized_out = torch.zeros_like(z, dtype=torch.float32)
            residual = z.clone().float()
            all_indices = []

            n_quantizers = n_quantizers or self.num_quantizers

            for i, quantizer in enumerate(self.quantizers):
                if i >= n_quantizers:
                    break

                # Padded positions are zeroed before quantization and never updated afterwards.
                masked_residual = residual * update_mask
                z_q_i, indices_i, _ = quantizer(masked_residual.float())

                quantized_out = quantized_out + z_q_i * update_mask
                residual = residual - z_q_i * update_mask
                all_indices.append(indices_i)

            # `torch.stack` rejects an empty list; mirror `MossAudioTokenizerResidualLFQ` and
            # return an empty index tensor when no quantizer ran.
            all_indices = (
                torch.stack(all_indices)  # (N, B, T)
                if all_indices
                else torch.empty(0, batch_size, max_time, device=z.device, dtype=torch.long)
            )
            quantized_out = self.output_proj(quantized_out.float()).float()

        return quantized_out, all_indices, input_length

    def decode_codes(self, codes: torch.Tensor) -> torch.Tensor:
        """Decode codes from multiple quantizers to embeddings.

        Args:
            codes: Code indices of shape (N, B, T); only the first N quantizers are used.

        Returns:
            Sum of the per-quantizer embeddings after the output projection, as float32.
        """
        with disable_cuda_autocast():
            nq, B, T = codes.shape
            emb = torch.zeros(B, self.rvq_dim, T, device=codes.device, dtype=torch.float32)

            for i, quantizer in enumerate(self.quantizers[:nq]):
                quantizer = cast(MossAudioTokenizerVectorQuantize, quantizer)
                quantized_i = quantizer.decode_code(codes[i]).float()
                emb += quantized_i

            emb = self.output_proj(emb.float()).float()
        return emb


class MossAudioTokenizerResidualLFQ(nn.Module):
    """Residual quantizer built from `MossAudioTokenizerLFQ` stages (inference only).

    Args:
        input_dim: Channel count of the incoming features.
        rvq_dim: Channel count the quantizers operate on; defaults to `input_dim`.
        output_dim: Channel count of the produced features; defaults to `input_dim`.
        num_quantizers: Number of residual quantizers.
        codebook_size: Number of entries per codebook.
        codebook_dim: Dimension of each codebook entry.
        **kwargs: Forwarded to every `MossAudioTokenizerLFQ`.
    """

    def __init__(
        self,
        input_dim: int = 1024,
        rvq_dim: int | None = None,
        output_dim: int | None = None,
        num_quantizers: int = 32,
        codebook_size: int = 1024,
        codebook_dim: int = 8,
        **kwargs,
    ):
        super().__init__()
        self.input_dim = input_dim
        self.rvq_dim = rvq_dim or input_dim
        self.output_dim = output_dim or input_dim
        self.num_quantizers = num_quantizers
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim

        # Projections are only created when the dimensions on either side differ.
        if input_dim != self.rvq_dim:
            self.input_proj = WNConv1d(input_dim, self.rvq_dim, kernel_size=1)
        else:
            self.input_proj = nn.Identity()
        if self.rvq_dim != self.output_dim:
            self.output_proj = WNConv1d(self.rvq_dim, self.output_dim, kernel_size=1)
        else:
            self.output_proj = nn.Identity()

        stages = [
            MossAudioTokenizerLFQ(
                input_dim=self.rvq_dim,
                codebook_size=codebook_size,
                codebook_dim=codebook_dim,
                **kwargs,
            )
            for _ in range(num_quantizers)
        ]
        self.quantizers = nn.ModuleList(stages)
        self._register_load_state_dict_pre_hook(self._load_hook, with_module=True)

    @staticmethod
    def _load_hook(module, state_dict, prefix, *_):
        """Rename weight-norm checkpoint keys under `prefix` before loading."""
        remap_weight_norm_state_dict_keys(state_dict, prefix)

    @torch.no_grad()
    def forward(
        self,
        z: torch.Tensor,
        input_length: torch.Tensor,
        n_quantizers: int | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Inference quantization.

        Args:
            z: Input tensor; unpacked as (batch, channels, time) after the input projection.
            input_length: Valid lengths per batch entry; returned unchanged.
            n_quantizers: Number of leading quantizers to use; a falsy value uses all of them.

        Returns:
            Tuple of the quantized output, the stacked code indices (one slice per quantizer
            used, or an empty long tensor when none ran) and `input_length`.
        """
        with disable_cuda_autocast():
            z = self.input_proj(z).float()

            batch_size, _, max_time = z.shape
            time_steps = torch.arange(max_time, device=z.device).expand(batch_size, max_time)
            mask = time_steps < input_length.unsqueeze(1)

            quantized_out = torch.zeros_like(z, dtype=torch.float32)
            residual = z.clone().float()
            collected = []

            n_quantizers = n_quantizers or self.num_quantizers
            # Leading quantizers only; identical to breaking out of the loop at `i >= n_quantizers`.
            active = [stage for i, stage in enumerate(self.quantizers) if i < n_quantizers]
            for stage in active:
                masked_residual = residual * mask.unsqueeze(1)
                z_q_i, indices_i, _ = stage(masked_residual.float())

                # Padded positions contribute nothing and keep their residual.
                update_mask = mask.unsqueeze(1)
                quantized_out = quantized_out + z_q_i * update_mask
                residual = residual - z_q_i * update_mask
                collected.append(indices_i)

            if collected:
                all_indices = torch.stack(collected)
            else:
                all_indices = torch.empty(0, batch_size, max_time, device=z.device, dtype=torch.long)
            quantized_out = self.output_proj(quantized_out.float()).float()
        return quantized_out, all_indices, input_length

    def decode_codes(self, codes: torch.Tensor) -> torch.Tensor:
        """Sum the embeddings of every code layer and apply the output projection (float32)."""
        with disable_cuda_autocast():
            nq, B, T = codes.shape
            emb = torch.zeros(B, self.rvq_dim, T, device=codes.device, dtype=torch.float32)
            for i, stage in enumerate(self.quantizers[:nq]):
                lfq = cast(MossAudioTokenizerLFQ, stage)
                layer_emb = lfq.decode_code(codes[i]).float()
                emb += layer_emb
            emb = self.output_proj(emb.float()).float()
        return emb


# =============================================================================
# Main Model Classes
# =============================================================================


@auto_docstring
class MossAudioTokenizerPreTrainedModel(PreTrainedAudioTokenizerBase):
    """Base class for MossAudioTokenizer models."""

    # Configuration class associated with this model family.
    config_class = MossAudioTokenizerConfig
    # Left empty. NOTE(review): presumably because checkpoint keys carry no base-model prefix — confirm.
    base_model_prefix = ""
    # Name of the primary model input.
    main_input_name = "input_values"
    # Declares audio as the input modality.
    input_modalities = "audio"
    # Gradient checkpointing is declared as unsupported.
    supports_gradient_checkpointing = False
    # Class names of sub-modules listed as "no split".
    # NOTE(review): presumably consumed by the base class when placing modules on devices — confirm.
    _no_split_modules = [
        "MossAudioTokenizerTransformerLayer",
        "MossAudioTokenizerResidualVQ",
        "MossAudioTokenizerResidualLFQ",
    ]


@auto_docstring(
    custom_intro="""
    The MossAudioTokenizer neural audio codec model for audio tokenization and synthesis.
    """
)
class MossAudioTokenizerModel(MossAudioTokenizerPreTrainedModel):
    """
    MossAudioTokenizer model for audio tokenization and synthesis.

    This model can encode audio waveforms into discrete tokens and decode
    tokens back into audio waveforms.
    """

    def __init__(self, config: MossAudioTokenizerConfig):
        # Builds, in order: the encoder stack, the residual quantizer and the decoder stack described
        # by `config`, validates that the decoder restores the encoder's input frame rate, and finally
        # initializes the bookkeeping attributes used by decode sessions / streaming `batch_decode()`.
        super().__init__(config)

        self.config = config
        # The value is discarded.
        # NOTE(review): presumably kept so that a config lacking `version` fails early — confirm.
        _ = config.version
        self.sampling_rate = config.sampling_rate
        self.downsample_rate = config.downsample_rate
        self.number_channels = config.number_channels
        # Older configs may not define this flag; channel interleaving defaults to enabled.
        self.enable_channel_interleave = getattr(config, "enable_channel_interleave", True)
        self.attention_implementation = config.attention_implementation
        self.compute_dtype_name = config.compute_dtype
        self.compute_dtype = resolve_compute_dtype(config.compute_dtype)

        # Per-Transformer context durations declared in the encoder spec; entries without an explicit
        # `context_duration` fall back to the config-wide value.
        encoder_context_durations = [
            float(module_kwargs.get("context_duration", config.causal_transformer_context_duration))
            for module_kwargs in config.encoder_kwargs
            if module_kwargs["module_type"] == "Transformer"
        ]
        # The model-level duration is the smallest encoder duration (or the config value when the
        # encoder has no Transformer entries). It is also the default for decoder Transformers below.
        self.causal_transformer_context_duration = (
            min(encoder_context_durations) if encoder_context_durations else config.causal_transformer_context_duration
        )

        # Build encoder
        # With channel interleaving, a multi-channel signal is flattened into one stream that carries
        # `number_channels` times as many samples per second.
        channel_interleave_factor = (
            self.number_channels if self.enable_channel_interleave and self.number_channels > 1 else 1
        )
        # Tracks the frame rate seen by the *next* module so Transformer contexts can be sized in frames.
        current_frame_rate: float = float(self.sampling_rate * channel_interleave_factor)
        self.encoder = nn.ModuleList()

        for encoder_kwargs_i in config.encoder_kwargs:
            encoder_kwargs_i = dict(encoder_kwargs_i)  # Make a copy
            if encoder_kwargs_i["module_type"] == "PatchedPretransform":
                self.encoder.append(MossAudioTokenizerPatchedPretransform(**encoder_kwargs_i, is_downsample=True))
            elif encoder_kwargs_i["module_type"] == "Transformer":
                # `context_duration` is popped so it is not forwarded to the module constructor.
                context_duration = float(encoder_kwargs_i.pop("context_duration", self.causal_transformer_context_duration))
                self.encoder.append(
                    MossAudioTokenizerProjectedTransformer(
                        **encoder_kwargs_i,
                        # Context length in frames = frame rate at this depth * duration in seconds.
                        context=int(round(current_frame_rate * context_duration)),
                        attention_implementation=self.attention_implementation,
                    )
                )
            # NOTE(review): an entry whose `module_type` matches neither branch appends nothing, so this
            # line re-applies the previous module's ratio (or raises IndexError on an empty list).
            current_frame_rate /= self.encoder[-1].downsample_ratio

        # Build quantizer
        quantizer_kwargs = dict(config.quantizer_kwargs)
        # Lookup order: `quantizer_kwargs["quantizer_type"]`, then `config.quantizer_type`, then "rvq".
        quantizer_type = quantizer_kwargs.get("quantizer_type", getattr(config, "quantizer_type", "rvq"))
        if quantizer_type in {"rvq", "spec_rvq"}:
            self.quantizer = MossAudioTokenizerResidualVQ(**quantizer_kwargs)
        elif quantizer_type in {"rlfq", "random_prefix_rlfq"}:
            self.quantizer = MossAudioTokenizerResidualLFQ(**quantizer_kwargs)
        else:
            raise ValueError(f"Unsupported quantizer_type: {quantizer_type}")

        # Build decoder
        decoder_kwargs_list = copy.deepcopy(config.decoder_kwargs)
        self.decoder = nn.ModuleList()

        # Mirrors the encoder loop, continuing from the encoder's output frame rate and multiplying
        # it back up after each module.
        for decoder_kwargs_i in decoder_kwargs_list:
            decoder_kwargs_i = dict(decoder_kwargs_i)
            if decoder_kwargs_i["module_type"] == "PatchedPretransform":
                self.decoder.append(MossAudioTokenizerPatchedPretransform(**decoder_kwargs_i, is_downsample=False))
            elif decoder_kwargs_i["module_type"] == "Transformer":
                context_duration = float(decoder_kwargs_i.pop("context_duration", self.causal_transformer_context_duration))
                self.decoder.append(
                    MossAudioTokenizerProjectedTransformer(
                        **decoder_kwargs_i,
                        context=int(round(current_frame_rate * context_duration)),
                        attention_implementation=self.attention_implementation,
                    )
                )
            # NOTE(review): same caveat as the encoder loop for unmatched `module_type` values.
            current_frame_rate *= self.decoder[-1].downsample_ratio

        # Sanity check: after the full decoder the frame rate must be back at the (interleaved)
        # sampling rate, compared after rounding to the nearest integer.
        expected_output_frame_rate = float(self.sampling_rate * channel_interleave_factor)
        if int(round(current_frame_rate)) != int(round(expected_output_frame_rate)):
            raise ValueError(
                "Decoder stack does not invert the encoder frame rate correctly: "
                f"got current_frame_rate={current_frame_rate}, expected={expected_output_frame_rate}."
            )

        self.post_init()
        # Decode-session bookkeeping. `_active_decode_session` is only read in the visible methods.
        # NOTE(review): presumably assigned by `MossAudioTokenizerDecodeSession` itself — confirm.
        self._active_decode_session: "MossAudioTokenizerDecodeSession | None" = None
        # State of the implicit session driven by `batch_decode(streaming=True)`.
        self._batch_decode_streaming_max_batch_size: int | None = None
        self._batch_decode_streaming_batch_size: int | None = None
        self._batch_decode_streaming_session: "MossAudioTokenizerDecodeSession | None" = None
        # Monotonic counter used to mint request ids for that session.
        self._batch_decode_streaming_next_request_id: int = 0

    def create_decode_session(
        self,
        max_batch_size: int,
        use_cuda_graph: bool = False,
    ) -> MossAudioTokenizerDecodeSession:
        """
        Create a new decode session bound to this model.

        Args:
            max_batch_size (`int`):
                Forwarded to `MossAudioTokenizerDecodeSession`.
            use_cuda_graph (`bool`, *optional*, defaults to `False`):
                Forwarded to `MossAudioTokenizerDecodeSession`.

        Returns:
            `MossAudioTokenizerDecodeSession`: the newly constructed session.

        Raises:
            RuntimeError: if an unclosed decode session is already registered on the model, or if any
                `StreamingModule` inside the model currently holds a streaming state.
        """
        current = self._active_decode_session
        if not (current is None or current._closed):
            raise RuntimeError(_ACTIVE_DECODE_SESSION_ERROR_MESSAGE)

        # A session cannot coexist with the model-level `streaming()` mode.
        if any(
            isinstance(submodule, StreamingModule) and submodule._streaming_state is not None
            for submodule in self.modules()
        ):
            raise RuntimeError(_MODEL_STREAMING_CONFLICT_ERROR_MESSAGE)

        return MossAudioTokenizerDecodeSession(self, max_batch_size, use_cuda_graph=use_cuda_graph)

    def _reset_batch_decode_streaming_state(self) -> None:
        streaming_session = self._batch_decode_streaming_session
        self._batch_decode_streaming_session = None
        self._batch_decode_streaming_max_batch_size = None
        self._batch_decode_streaming_batch_size = None
        self._batch_decode_streaming_next_request_id = 0
        if streaming_session is not None and not streaming_session._closed:
            streaming_session.close()

    def _prepare_batch_decode_streaming_state(
        self,
        batch_size: int,
        max_batch_size: int | None,
        reset_stream: bool,
    ) -> int:
        if reset_stream:
            self._reset_batch_decode_streaming_state()

        if max_batch_size is not None and max_batch_size <= 0:
            raise ValueError("`max_batch_size` must be > 0 when provided.")

        streaming_max_batch_size = self._batch_decode_streaming_max_batch_size
        if streaming_max_batch_size is None:
            streaming_max_batch_size = batch_size if max_batch_size is None else max_batch_size
        elif max_batch_size is not None and max_batch_size != streaming_max_batch_size:
            raise ValueError(
                "`max_batch_size` can only be set on the first streaming `batch_decode()` call for now. "
                f"Expected {streaming_max_batch_size}, got {max_batch_size}."
            )

        if batch_size > streaming_max_batch_size:
            raise ValueError(
                "Streaming `batch_decode()` received a batch larger than the reserved `max_batch_size`. "
                f"Got batch_size={batch_size}, max_batch_size={streaming_max_batch_size}."
            )

        return streaming_max_batch_size

    def _ensure_batch_decode_streaming_session(
        self,
        max_batch_size: int,
        use_cuda_graph: bool = False,
    ) -> MossAudioTokenizerDecodeSession:
        session = self._batch_decode_streaming_session
        if session is not None and not session._closed:
            if session._use_cuda_graph != use_cuda_graph:
                raise ValueError(
                    "`use_cuda_graph` must match the existing streaming `batch_decode()` session configuration. "
                    f"Expected {session._use_cuda_graph}, got {use_cuda_graph}."
                )
            return session

        session = self.create_decode_session(max_batch_size=max_batch_size, use_cuda_graph=use_cuda_graph)
        self._batch_decode_streaming_session = session
        self._batch_decode_streaming_max_batch_size = max_batch_size
        self._batch_decode_streaming_next_request_id = 0
        return session

    def _append_batch_decode_streaming_requests(
        self,
        session: MossAudioTokenizerDecodeSession,
        target_batch_size: int,
    ) -> None:
        requests_to_append = target_batch_size - len(session.active_request_ids)
        for _ in range(requests_to_append):
            request_id = self._batch_decode_streaming_next_request_id
            session.append(request_id)
            self._batch_decode_streaming_next_request_id += 1

    def _resolve_batch_decode_streaming_finalize_request_ids(
        self,
        request_ids: list[str | int],
        finalize_indices: list[int] | tuple[int, ...] | None,
    ) -> list[str | int]:
        normalized_finalize_indices = tuple(finalize_indices) if finalize_indices is not None else ()
        if len(set(normalized_finalize_indices)) != len(normalized_finalize_indices):
            raise ValueError(_BATCH_DECODE_STREAMING_DUPLICATE_FINALIZE_INDICES_ERROR_MESSAGE)

        batch_size = len(request_ids)
        finalize_request_ids: list[str | int] = []
        for index in normalized_finalize_indices:
            if index < 0 or index >= batch_size:
                raise ValueError(
                    _BATCH_DECODE_STREAMING_FINALIZE_INDEX_OUT_OF_RANGE_ERROR_TEMPLATE.format(
                        index=index, batch_size=batch_size
                    )
                )
            finalize_request_ids.append(request_ids[index])

        return finalize_request_ids

    def _raise_if_plain_decode_conflicts_with_active_session(self) -> None:
        active_session = self._active_decode_session
        if active_session is not None and not getattr(active_session, "_closed", False):
            raise RuntimeError(_PLAIN_DECODE_SESSION_CONFLICT_ERROR_MESSAGE)

    def _start_streaming(self, batch_size: int):
        """Start streaming mode for all modules."""
        active_session = self._active_decode_session
        if active_session is not None and not getattr(active_session, "_closed", False):
            raise RuntimeError(_MODEL_STREAMING_CONFLICT_ERROR_MESSAGE)

        def _start(module):
            if isinstance(module, StreamingModule):
                module._streaming_state = module._init_streaming_state(batch_size)

        self.apply(_start)

    def _stop_streaming(self):
        """Stop streaming mode for all modules."""
        active_session = self._active_decode_session
        if active_session is not None and not getattr(active_session, "_closed", False):
            raise RuntimeError(_MODEL_STREAMING_CONFLICT_ERROR_MESSAGE)

        def _stop(module):
            if isinstance(module, StreamingModule):
                module._streaming_state = None

        self.apply(_stop)

    @contextmanager
    def streaming(self, batch_size: int = 1):
        """Context manager for streaming mode."""
        self._start_streaming(batch_size)
        try:
            yield
        finally:
            self._stop_streaming()

    def _set_streaming_exec_mask(self, exec_mask: torch.Tensor) -> None:
        exec_mask = exec_mask.to(torch.bool)

        def _set_exec_mask(module: nn.Module):
            if isinstance(module, StreamingModule) and module._streaming_state is not None:
                module._streaming_state.set_exec_mask(exec_mask.to(module._streaming_state.device))

        self.apply(_set_exec_mask)

    def _plan_batch_stream_step(
        self,
        remaining: torch.Tensor,
        max_step_length: int,
        alignment: int,
    ) -> tuple[int, torch.Tensor]:
        positive_mask = remaining > 0
        if not bool(positive_mask.any().item()):
            raise RuntimeError("Cannot plan a streaming step when no samples remain.")

        if max_step_length > 0:
            full_step_mask = remaining >= max_step_length
            if bool(full_step_mask.any().item()):
                return max_step_length, full_step_mask

        positive_remaining = remaining[positive_mask]
        min_remaining = int(positive_remaining.min().item())

        if alignment > 1:
            aligned_step = (min_remaining // alignment) * alignment
            if aligned_step > 0:
                return aligned_step, remaining >= aligned_step
            return min_remaining, remaining == min_remaining

        step_length = min_remaining
        if max_step_length > 0:
            step_length = min(step_length, max_step_length)
        return step_length, remaining >= step_length

    def _infer_num_quantizers(self, codes_chunks: list[list[torch.Tensor]], requested_num_quantizers: int | None) -> int:
        if requested_num_quantizers is not None:
            return requested_num_quantizers
        for chunks_i in codes_chunks:
            if chunks_i:
                return int(chunks_i[0].shape[0])
        num_quantizers = getattr(self.quantizer, "num_quantizers", None)
        if num_quantizers is None:
            raise RuntimeError("Unable to infer the number of quantizers from empty streaming output.")
        return int(num_quantizers)

    def _infer_waveform_dtype(self, wav_chunks: list[list[torch.Tensor]]) -> torch.dtype:
        for chunks_i in wav_chunks:
            if chunks_i:
                return chunks_i[0].dtype
        return torch.float32

    @contextmanager
    def _codec_inference_autocast(self):
        device = next(self.parameters()).device
        if device.type == "cuda" and self.compute_dtype is not None:
            with torch.autocast(device_type="cuda", dtype=self.compute_dtype):
                yield
        else:
            yield

    def set_attention_implementation(self, attention_implementation: str) -> None:
        self.attention_implementation = attention_implementation
        for module in self.modules():
            if isinstance(module, MossAudioTokenizerProjectedTransformer):
                module.set_attention_implementation(attention_implementation)

    def set_compute_dtype(self, compute_dtype: str) -> None:
        """
        Change the compute dtype used for codec inference.

        Args:
            compute_dtype (`str`):
                Dtype name accepted by `resolve_compute_dtype`.

        Any exception raised by `resolve_compute_dtype` propagates to the caller; in that case neither
        `compute_dtype_name` nor `compute_dtype` is modified.
        """
        # Resolve before assigning anything, so a rejected name cannot leave `compute_dtype_name`
        # pointing at a dtype that `compute_dtype` does not reflect.
        resolved_dtype = resolve_compute_dtype(compute_dtype)
        self.compute_dtype_name = compute_dtype
        self.compute_dtype = resolved_dtype

    def _prepare_waveform_batch(
        self,
        wav_list: list[torch.Tensor],
    ) -> tuple[torch.Tensor, torch.Tensor]:
        if len(wav_list) == 0:
            raise ValueError("`wav_list` must contain at least one waveform.")

        device = wav_list[0].device
        dtype = wav_list[0].dtype
        batch_size = len(wav_list)
        lengths = torch.zeros(batch_size, device=device, dtype=torch.long)

        normalized_wavs: list[torch.Tensor] = []
        for i, wav in enumerate(wav_list):
            if self.number_channels == 1:
                if wav.dim() == 1:
                    wav_i = wav.unsqueeze(0)
                elif wav.dim() == 2 and wav.shape[0] == 1:
                    wav_i = wav
                else:
                    raise ValueError(
                        f"Expected wav_list[{i}] to have shape `(T,)` or `(1, T)` for a mono model, got {tuple(wav.shape)}."
                    )
            else:
                if wav.dim() != 2 or wav.shape[0] != self.number_channels:
                    raise ValueError(
                        f"Expected wav_list[{i}] to have shape `({self.number_channels}, T)`, got {tuple(wav.shape)}."
                    )
                wav_i = wav

            normalized_wavs.append(wav_i)
            lengths[i] = wav_i.shape[-1]

        max_length = int(lengths.max().item()) if batch_size > 0 else 0
        input_values = torch.zeros(batch_size, self.number_channels, max_length, device=device, dtype=dtype)
        for i, wav_i in enumerate(normalized_wavs):
            input_values[i, :, : wav_i.shape[-1]] = wav_i
        return input_values, lengths

    def _prepare_codes_batch(
        self,
        codes_list: list[torch.Tensor],
        num_quantizers: int | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor, int]:
        if len(codes_list) == 0:
            raise ValueError("`codes_list` must contain at least one code tensor.")

        batch_size = len(codes_list)
        device = codes_list[0].device
        nqs = [codes.shape[0] for codes in codes_list]
        if num_quantizers is None:
            num_quantizers = nqs[0]
            if any(nq != num_quantizers for nq in nqs):
                raise ValueError(
                    "All elements in `codes_list` must have the same number of quantizers when `num_quantizers` is None. "
                    "Pass `num_quantizers=...` to decode a common prefix."
                )
        elif min(nqs) < num_quantizers:
            raise ValueError(
                "`num_quantizers` must be <= the number of quantizers for every element in `codes_list`. "
                f"Got num_quantizers={num_quantizers}, min(codes.shape[0])={min(nqs)}."
            )

        lengths = torch.tensor([codes.shape[-1] for codes in codes_list], device=device, dtype=torch.long)
        max_length = int(lengths.max().item()) if batch_size > 0 else 0
        audio_codes = torch.zeros(num_quantizers, batch_size, max_length, device=device, dtype=torch.long)

        for i, codes in enumerate(codes_list):
            codes_i = codes[:num_quantizers]
            audio_codes[:, i, : codes_i.shape[-1]] = codes_i
        return audio_codes, lengths, num_quantizers

    def _flatten_channels_for_codec(
        self,
        input_values: torch.Tensor,
        input_lengths: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        if input_values.dim() != 3:
            raise ValueError(f"Expected `input_values` with shape `(B, C, T)`, got {tuple(input_values.shape)}.")
        if input_values.shape[1] != self.number_channels:
            raise ValueError(
                f"Expected `input_values.shape[1] == {self.number_channels}`, got {input_values.shape[1]}."
            )

        if input_values.shape[-1] % self.downsample_rate != 0:
            pad_length = self.downsample_rate - (input_values.shape[-1] % self.downsample_rate)
            input_values = F.pad(input_values, (0, pad_length))

        if self.number_channels > 1 and self.enable_channel_interleave:
            input_values = input_values.transpose(1, 2).contiguous().view(input_values.shape[0], 1, -1)
            input_lengths = input_lengths * self.number_channels
        return input_values, input_lengths

    def _restore_channels_from_codec(
        self,
        output_values: torch.Tensor,
        output_lengths: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        if self.number_channels == 1 or not self.enable_channel_interleave:
            return output_values.float(), output_lengths

        output_values = (
            output_values.squeeze(1)
            .contiguous()
            .view(output_values.shape[0], -1, self.number_channels)
            .transpose(1, 2)
            .contiguous()
            .float()
        )
        output_lengths = torch.div(output_lengths, self.number_channels, rounding_mode="floor")
        return output_values, output_lengths

    def _stack_hidden_states(
        self,
        hidden_chunks: list[list[torch.Tensor]],
        lengths: torch.Tensor,
    ) -> torch.Tensor | None:
        hidden_dim = None
        for chunks_i in hidden_chunks:
            if chunks_i:
                hidden_dim = chunks_i[0].shape[0]
                break
        if hidden_dim is None:
            return None

        batch_size = len(hidden_chunks)
        max_length = int(lengths.max().item()) if batch_size > 0 else 0
        device = lengths.device
        hidden_states = torch.zeros(batch_size, hidden_dim, max_length, device=device, dtype=torch.float32)
        for i, chunks_i in enumerate(hidden_chunks):
            if not chunks_i:
                continue
            hidden_i = torch.cat(chunks_i, dim=-1).float()
            hidden_states[i, :, : hidden_i.shape[-1]] = hidden_i
        return hidden_states

    @torch.no_grad()
    def _encode_frame(
        self,
        input_values: torch.Tensor,
        input_lengths: torch.Tensor | None = None,
        n_quantizers: int | None = None,
    ) -> MossAudioTokenizerEncoderOutput:
        """
        Encode one batch of waveforms into discrete codes.

        Args:
            input_values (`torch.Tensor`):
                Waveforms as `(B, C, T)`. A 1-D input is treated as a single mono item; a 2-D input is
                treated as `(B, T)` for a mono model and as `(C, T)` otherwise.
            input_lengths (`torch.Tensor`, *optional*):
                Valid sample count per item. Defaults to the full time dimension for every item.
            n_quantizers (`int`, *optional*):
                Forwarded to the quantizer.

        Returns:
            `MossAudioTokenizerEncoderOutput` carrying the codes and code lengths produced by the
            quantizer together with the float32 encoder output.
        """
        rank = input_values.dim()
        if rank == 1:
            input_values = input_values.view(1, 1, -1)
        elif rank == 2:
            # Mono model: (B, T) -> (B, 1, T). Multi-channel model: (C, T) -> (1, C, T).
            input_values = input_values.unsqueeze(1 if self.number_channels == 1 else 0)

        num_items, _, num_samples = input_values.shape
        if input_lengths is None:
            input_lengths = torch.full((num_items,), num_samples, device=input_values.device, dtype=torch.long)

        hidden, hidden_lengths = self._flatten_channels_for_codec(input_values, input_lengths)

        # Only the encoder stack runs under the codec autocast policy; quantization happens outside it.
        with self._codec_inference_autocast():
            for stage in self.encoder:
                hidden, hidden_lengths = stage(hidden, hidden_lengths)

        residual_quantizer = cast(MossAudioTokenizerResidualVQ | MossAudioTokenizerResidualLFQ, self.quantizer)
        _, audio_codes, audio_codes_lengths = residual_quantizer(hidden.float(), hidden_lengths, n_quantizers)

        return MossAudioTokenizerEncoderOutput(
            audio_codes=audio_codes,
            audio_codes_lengths=audio_codes_lengths,
            encoder_hidden_states=hidden.float(),
        )

    @torch.no_grad()
    def _decode_frame(
        self,
        codes: torch.Tensor,
        codes_lengths: torch.Tensor | None = None,
    ) -> MossAudioTokenizerDecoderOutput:
        """
        Decode one batch of discrete codes into waveforms.

        Args:
            codes (`torch.Tensor` of shape `(num_quantizers, batch_size, frames)`):
                Codes to decode.
            codes_lengths (`torch.Tensor`, *optional*):
                Valid frame count per item. Defaults to the full frame dimension for every item.

        Returns:
            `MossAudioTokenizerDecoderOutput` with the float32 audio and the per-item audio lengths
            after channel restoration.
        """
        _, num_items, num_frames = codes.shape
        if codes_lengths is None:
            codes_lengths = torch.full((num_items,), num_frames, device=codes.device, dtype=torch.long)

        residual_quantizer = cast(MossAudioTokenizerResidualVQ | MossAudioTokenizerResidualLFQ, self.quantizer)
        # Code lookup happens outside the codec autocast policy and is cast to float32.
        signal = residual_quantizer.decode_codes(codes).float()
        signal_lengths = codes_lengths

        with self._codec_inference_autocast():
            for stage in self.decoder:
                signal, signal_lengths = stage(signal, signal_lengths)

        audio, audio_lengths = self._restore_channels_from_codec(signal, signal_lengths)
        return MossAudioTokenizerDecoderOutput(audio=audio, audio_lengths=audio_lengths)

    @torch.no_grad()
    def batch_encode(
        self,
        wav_list: list[torch.Tensor],
        num_quantizers: int | None = None,
        chunk_duration: float | None = None,
    ) -> MossAudioTokenizerEncoderOutput:
        """
        Encode a list of variable-length waveforms, optionally chunk by chunk in streaming mode.

        Args:
            wav_list (`list[torch.Tensor]`):
                Waveforms to encode; padded into one batch by `_prepare_waveform_batch`.
            num_quantizers (`int`, *optional*):
                Forwarded to the quantizer as `n_quantizers`.
            chunk_duration (`float`, *optional*):
                Chunk size in seconds. When `None`, the whole padded batch is encoded in one call.
                Otherwise `round(chunk_duration * sampling_rate)` must be positive and divisible by
                `downsample_rate`, and encoding proceeds in steps inside `self.streaming(...)`.

        Returns:
            `MossAudioTokenizerEncoderOutput`: in chunked mode, `audio_codes` has shape
            `(num_quantizers, batch_size, max_frames)` with zero padding, `audio_codes_lengths` holds
            the per-item frame counts, and `encoder_hidden_states` is `None` if no chunk was collected.

        Raises:
            ValueError: if `wav_list` is invalid or `chunk_duration` fails the checks above.
        """
        input_values, input_lengths = self._prepare_waveform_batch(wav_list)
        batch_size = len(wav_list)
        device = input_values.device

        # Non-chunked path: a single forward pass over the padded batch.
        if chunk_duration is None:
            return self._encode_frame(input_values, input_lengths, n_quantizers=num_quantizers)

        if chunk_duration <= 0:
            raise ValueError("`chunk_duration` must be > 0 when provided.")

        chunk_length = int(round(chunk_duration * self.sampling_rate))
        if chunk_length <= 0:
            raise ValueError("`chunk_duration` is too small and results in chunk_length <= 0.")
        if chunk_length % self.downsample_rate != 0:
            raise ValueError(
                "`chunk_duration * config.sampling_rate` must be divisible by `config.downsample_rate`. "
                f"Got chunk_length={chunk_length}, downsample_rate={self.downsample_rate}."
            )

        # Per-item read position (in samples) and per-item collected outputs.
        cursors = torch.zeros_like(input_lengths)
        codes_chunks: list[list[torch.Tensor]] = [[] for _ in range(batch_size)]
        hidden_chunks: list[list[torch.Tensor]] = [[] for _ in range(batch_size)]

        with self.streaming(batch_size=batch_size):
            # Loop until every item has consumed all of its samples.
            while bool((cursors < input_lengths).any().item()):
                remaining = input_lengths - cursors
                # Pick a common step length and the subset of items that can take it.
                step_length, active_mask = self._plan_batch_stream_step(
                    remaining=remaining,
                    max_step_length=chunk_length,
                    alignment=self.downsample_rate,
                )
                # Inactive items keep an all-zero slot with length 0 for this step.
                x_step = torch.zeros(
                    batch_size,
                    self.number_channels,
                    step_length,
                    device=device,
                    dtype=input_values.dtype,
                )
                input_lengths_step = torch.zeros(batch_size, device=device, dtype=torch.long)
                active_indices = torch.nonzero(active_mask, as_tuple=False).flatten().tolist()

                for i in active_indices:
                    start = int(cursors[i].item())
                    end = start + step_length
                    x_step[i] = input_values[i, :, start:end]
                    input_lengths_step[i] = step_length

                # Tell the streaming modules which batch rows take part in this step.
                self._set_streaming_exec_mask(active_mask)
                result = self._encode_frame(x_step, input_lengths_step, n_quantizers=num_quantizers)
                assert result.audio_codes is not None
                assert result.audio_codes_lengths is not None

                for i in active_indices:
                    codes_length_i = int(result.audio_codes_lengths[i].item())
                    if codes_length_i > 0:
                        # Clone so the stored chunk does not alias the step's output tensors.
                        codes_chunks[i].append(result.audio_codes[:, i, :codes_length_i].clone())
                        if result.encoder_hidden_states is not None:
                            hidden_chunks[i].append(result.encoder_hidden_states[i, :, :codes_length_i].clone())
                    # The cursor advances even when the step produced no codes for this item.
                    cursors[i] += step_length

        # Reassemble per-item chunks; items without any chunk get an empty `(nq, 0)` code tensor.
        num_quantizers_used = self._infer_num_quantizers(codes_chunks, num_quantizers)
        empty_codes = torch.empty((num_quantizers_used, 0), device=device, dtype=torch.long)
        codes_list = [torch.cat(chunks_i, dim=-1) if chunks_i else empty_codes.clone() for chunks_i in codes_chunks]
        audio_codes, audio_codes_lengths, _ = self._prepare_codes_batch(codes_list, num_quantizers=num_quantizers_used)
        encoder_hidden_states = self._stack_hidden_states(hidden_chunks, audio_codes_lengths)
        return MossAudioTokenizerEncoderOutput(
            audio_codes=audio_codes,
            audio_codes_lengths=audio_codes_lengths,
            encoder_hidden_states=encoder_hidden_states,
        )

    @torch.no_grad()
    def batch_decode(
        self,
        codes_list: list[torch.Tensor],
        num_quantizers: int | None = None,
        chunk_duration: float | None = None,
        streaming: bool = False,
        max_batch_size: int | None = None,
        finalize_indices: list[int] | tuple[int, ...] | None = None,
        reset_stream: bool = False,
        use_cuda_graph: bool = False,
    ) -> MossAudioTokenizerDecoderOutput:
        """
        Decode a list of variable-length code tensors into waveforms.

        Three modes are supported:

        - plain (`streaming=False`, `chunk_duration=None`): one `_decode_frame` call on the padded batch;
        - chunked (`streaming=False`, `chunk_duration` set): the codes are decoded step by step inside
          `self.streaming(...)` and the resulting audio chunks are concatenated per item;
        - streaming (`streaming=True`): the call performs a single `step` on a decode session that is
          kept on the model between calls. `chunk_duration` is not consulted in this mode.

        Args:
            codes_list (`list[torch.Tensor]`):
                One code tensor per item; padded into one batch by `_prepare_codes_batch`.
            num_quantizers (`int`, *optional*):
                Number of leading quantizer rows to decode.
            chunk_duration (`float`, *optional*):
                Chunk size in seconds for chunked mode. `round(chunk_duration * sampling_rate)` must be
                positive and divisible by `downsample_rate`.
            streaming (`bool`, *optional*, defaults to `False`):
                Selects streaming mode.
            max_batch_size (`int`, *optional*):
                Streaming only: capacity of the session; validated by
                `_prepare_batch_decode_streaming_state`.
            finalize_indices (`list[int]` or `tuple[int, ...]`, *optional*):
                Streaming only: batch positions whose requests are removed from the session after this
                step. They are resolved against the requests that were active *before* this call.
            reset_stream (`bool`, *optional*, defaults to `False`):
                Discard any existing streaming state before proceeding (honored in every mode).
            use_cuda_graph (`bool`, *optional*, defaults to `False`):
                Streaming only: forwarded to the session; must match an already-open session.

        Returns:
            `MossAudioTokenizerDecoderOutput` with `audio` and `audio_lengths`.

        Raises:
            ValueError: on an empty `codes_list`, invalid batch-size / chunk / finalize arguments, or a
                streaming batch smaller than the number of already active requests.
            RuntimeError: in non-streaming modes when an unclosed decode session is registered on the model.
        """
        if len(codes_list) == 0:
            raise ValueError("`codes_list` must contain at least one code tensor.")

        streaming_max_batch_size: int | None = None
        if streaming:
            streaming_max_batch_size = self._prepare_batch_decode_streaming_state(
                batch_size=len(codes_list),
                max_batch_size=max_batch_size,
                reset_stream=reset_stream,
            )
        else:
            if reset_stream:
                self._reset_batch_decode_streaming_state()
            # Plain / chunked decoding must not run while a decode session is open on the model.
            self._raise_if_plain_decode_conflicts_with_active_session()

        audio_codes, audio_codes_lengths, num_quantizers_used = self._prepare_codes_batch(
            codes_list,
            num_quantizers=num_quantizers,
        )
        batch_size = len(codes_list)
        device = audio_codes.device

        # Plain mode: a single forward pass over the padded batch.
        if not streaming and chunk_duration is None:
            return self._decode_frame(audio_codes, audio_codes_lengths)

        if streaming:
            assert streaming_max_batch_size is not None
            # Remember whether a live session existed before this call; it decides below whether a
            # failed finalize-index validation should tear down the session that was just created.
            existing_session = self._batch_decode_streaming_session
            reusing_streaming_session = existing_session is not None and not existing_session._closed
            session = self._ensure_batch_decode_streaming_session(
                max_batch_size=streaming_max_batch_size,
                use_cuda_graph=use_cuda_graph,
            )
            pre_call_request_ids = list(session.active_request_ids)
            pre_call_batch_size = len(pre_call_request_ids)
            # The batch may grow between calls but never shrink.
            if batch_size < pre_call_batch_size:
                raise ValueError(_BATCH_DECODE_STREAMING_SHRINK_ERROR_MESSAGE)

            try:
                finalize_request_ids = self._resolve_batch_decode_streaming_finalize_request_ids(
                    request_ids=pre_call_request_ids,
                    finalize_indices=finalize_indices,
                )
            except Exception:
                # Roll back only a session created by this very call that has no requests yet.
                if not reusing_streaming_session and pre_call_batch_size == 0:
                    self._reset_batch_decode_streaming_state()
                raise

            try:
                # Register new requests for the extra batch rows, if any.
                if batch_size > pre_call_batch_size:
                    self._append_batch_decode_streaming_requests(session=session, target_batch_size=batch_size)

                request_ids = list(session.active_request_ids)
                _, audio, audio_lengths = session.step(
                    request_ids=request_ids,
                    codes=audio_codes,
                    code_lengths=audio_codes_lengths,
                )
                # Finalized requests are removed only after their last step has been decoded.
                for request_id in finalize_request_ids:
                    session.remove(request_id)
            except Exception:
                # Any failure while appending / stepping / removing drops the whole streaming state.
                self._reset_batch_decode_streaming_state()
                raise

            self._batch_decode_streaming_max_batch_size = session.max_batch_size
            self._batch_decode_streaming_batch_size = len(session.active_request_ids)
            return MossAudioTokenizerDecoderOutput(audio=audio, audio_lengths=audio_lengths)

        # Chunked mode from here on.
        assert chunk_duration is not None
        if chunk_duration <= 0:
            raise ValueError("`chunk_duration` must be > 0 when provided.")

        chunk_length = int(round(chunk_duration * self.sampling_rate))
        if chunk_length <= 0:
            raise ValueError("`chunk_duration` is too small and results in chunk_length <= 0.")
        if chunk_length % self.downsample_rate != 0:
            raise ValueError(
                "`chunk_duration * config.sampling_rate` must be divisible by `config.downsample_rate`. "
                f"Got chunk_length={chunk_length}, downsample_rate={self.downsample_rate}."
            )

        # Chunk size expressed in code frames rather than audio samples.
        chunk_frame_length = chunk_length // self.downsample_rate
        # Per-item read position (in frames) and per-item collected audio chunks.
        cursors = torch.zeros_like(audio_codes_lengths)
        wav_chunks: list[list[torch.Tensor]] = [[] for _ in range(batch_size)]

        with self.streaming(batch_size=batch_size):
            # Loop until every item has consumed all of its frames.
            while bool((cursors < audio_codes_lengths).any().item()):
                remaining = audio_codes_lengths - cursors
                step_frames, active_mask = self._plan_batch_stream_step(
                    remaining=remaining,
                    max_step_length=chunk_frame_length,
                    alignment=1,
                )
                # Inactive items keep an all-zero slot with length 0 for this step.
                codes_step = torch.zeros(
                    num_quantizers_used,
                    batch_size,
                    step_frames,
                    device=device,
                    dtype=torch.long,
                )
                codes_lengths_step = torch.zeros(batch_size, device=device, dtype=torch.long)
                active_indices = torch.nonzero(active_mask, as_tuple=False).flatten().tolist()

                for i in active_indices:
                    start = int(cursors[i].item())
                    end = start + step_frames
                    codes_step[:, i, :] = audio_codes[:, i, start:end]
                    codes_lengths_step[i] = step_frames

                # Tell the streaming modules which batch rows take part in this step.
                self._set_streaming_exec_mask(active_mask)
                result = self._decode_frame(codes_step, codes_lengths_step)
                assert result.audio is not None
                assert result.audio_lengths is not None

                for i in active_indices:
                    audio_length_i = int(result.audio_lengths[i].item())
                    if audio_length_i > 0:
                        # Clone so the stored chunk does not alias the step's output tensor.
                        wav_chunks[i].append(result.audio[i, :, :audio_length_i].clone())
                    cursors[i] += step_frames

        # Reassemble: per-item length is the sum of its chunk lengths; shorter items are zero-padded.
        wav_dtype = self._infer_waveform_dtype(wav_chunks)
        audio_lengths = torch.tensor(
            [sum(chunk.shape[-1] for chunk in chunks_i) for chunks_i in wav_chunks],
            device=device,
            dtype=torch.long,
        )
        max_audio_length = int(audio_lengths.max().item()) if batch_size > 0 else 0
        audio = torch.zeros(batch_size, self.number_channels, max_audio_length, device=device, dtype=wav_dtype)
        for i, chunks_i in enumerate(wav_chunks):
            if not chunks_i:
                continue
            wav_i = torch.cat(chunks_i, dim=-1)
            audio[i, :, : wav_i.shape[-1]] = wav_i
        return MossAudioTokenizerDecoderOutput(audio=audio, audio_lengths=audio_lengths)

    def encode(  # type: ignore[override]
        self,
        input_values: torch.Tensor,
        padding_mask: torch.Tensor | None = None,
        num_quantizers: int | None = None,
        return_dict: bool | None = None,
        chunk_duration: float | None = None,
    ):
        """
        Turn a waveform (or a batch of waveforms) into discrete audio codes.

        The input is split into a list of per-sample waveforms, each trimmed to its valid length, and the list is
        forwarded to `batch_encode`.

        Args:
            input_values (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Float values of the input audio waveform. Lower-rank inputs are accepted as well: a 1D tensor is
                passed through as a single waveform; a 2D tensor is read as `(batch_size, sequence_length)` when
                the model has exactly one channel and as a single `(channels, sequence_length)` sample otherwise.
            padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to indicate valid audio samples. Its sum over the last dimension is used as the number of
                leading samples to keep. It is ignored for 1D inputs, and for 2D inputs whose mask rank does not
                match the interpretation above (2D mask for mono batches, 1D mask for a single multi-channel
                sample).
            num_quantizers (`int`, *optional*):
                Number of quantizers to use. By default, all quantizers are used.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            chunk_duration (`float`, *optional*):
                If provided, encode the input waveform in successive chunks of `chunk_duration` seconds while
                keeping a streaming KV cache for the causal transformers.

                `chunk_duration` must be <= `config.causal_transformer_context_duration`, and
                `chunk_duration * config.sampling_rate` must be divisible by `config.downsample_rate`.

        Returns:
            `MossAudioTokenizerEncoderOutput` or tuple containing audio codes and lengths.

        Raises:
            ValueError: If `input_values` is not 1D, 2D or 3D, or if a 3D input has a channel dimension that
                differs from the model's channel count.
        """
        if return_dict is None:
            return_dict = self.config.return_dict

        rank = input_values.dim()
        if rank not in (1, 2, 3):
            raise ValueError(f"Unsupported `input_values` shape: {tuple(input_values.shape)}")

        def untrimmed_lengths() -> torch.Tensor:
            # Every batch row is considered fully valid when no usable mask is available.
            return torch.full(
                (input_values.shape[0],),
                input_values.shape[-1],
                device=input_values.device,
                dtype=torch.long,
            )

        wav_list: list[torch.Tensor]
        if rank == 1:
            # Single waveform, forwarded untouched.
            wav_list = [input_values]
        elif rank == 2 and self.number_channels != 1:
            # One multi-channel sample laid out as (channels, time).
            if padding_mask is not None and padding_mask.dim() == 1:
                valid_samples = int(padding_mask.sum().item())
            else:
                valid_samples = int(input_values.shape[-1])
            wav_list = [input_values[:, :valid_samples]]
        elif rank == 2:
            # Batch of mono waveforms laid out as (batch, time).
            if padding_mask is not None and padding_mask.dim() == 2:
                valid_per_row = padding_mask.sum(dim=-1).long()
            else:
                valid_per_row = untrimmed_lengths()
            wav_list = [
                input_values[row, : int(valid_per_row[row].item())] for row in range(input_values.shape[0])
            ]
        else:
            # Fully specified (batch, channels, time) layout.
            if input_values.shape[1] != self.number_channels:
                raise ValueError(
                    f"Expected `input_values.shape[1] == {self.number_channels}`, got {input_values.shape[1]}."
                )
            if padding_mask is not None:
                valid_per_row = padding_mask.sum(dim=-1).long()
            else:
                valid_per_row = untrimmed_lengths()
            wav_list = [
                input_values[row, :, : int(valid_per_row[row].item())] for row in range(input_values.shape[0])
            ]

        encoder_output = self.batch_encode(wav_list, num_quantizers=num_quantizers, chunk_duration=chunk_duration)
        if return_dict:
            return encoder_output

        assert encoder_output.audio_codes is not None
        assert encoder_output.audio_codes_lengths is not None
        return (
            cast(torch.Tensor, encoder_output.audio_codes),
            cast(torch.Tensor, encoder_output.audio_codes_lengths),
        )

    def decode(  # type: ignore[override]
        self,
        audio_codes: torch.Tensor,
        padding_mask: torch.Tensor | None = None,
        return_dict: bool | None = None,
        chunk_duration: float | None = None,
        num_quantizers: int | None = None,
    ):
        """
        Decodes the given codes into an output audio waveform.

        The codes are split into a list of per-sample code tensors (trimmed to their valid length and to the
        requested number of quantizers) and forwarded to `batch_decode`.

        Args:
            audio_codes (`torch.LongTensor` of shape `(num_quantizers, batch_size, sequence_length)`):
                Discrete code embeddings computed using `model.encode`. A 2D tensor of shape
                `(num_quantizers, sequence_length)` is accepted as a single unbatched sequence.
            padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to indicate valid code positions. Its sum over the last dimension is used as the number of
                leading frames to decode for each batch entry. Only used for 3D `audio_codes`.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            chunk_duration (`float`, *optional*):
                If provided, decode the input codes in successive chunks of `chunk_duration` seconds while keeping a
                streaming KV cache for the causal transformers.

                `chunk_duration` must be <= `config.causal_transformer_context_duration`, and
                `chunk_duration * config.sampling_rate` must be divisible by `config.downsample_rate`.
            num_quantizers (`int`, *optional*):
                Number of quantizers to use. By default, all quantizers in `audio_codes` are used. Must be
                <= `audio_codes.shape[0]`.

        Returns:
            `MossAudioTokenizerDecoderOutput` or tuple containing decoded audio.

        Raises:
            ValueError: If `audio_codes` is neither 2D nor 3D, or if `num_quantizers` exceeds the number of
                quantizers present in `audio_codes`.
        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        self._raise_if_plain_decode_conflicts_with_active_session()

        if audio_codes.dim() not in (2, 3):
            raise ValueError(f"Unsupported `audio_codes` shape: {tuple(audio_codes.shape)}")

        # Validate once for both layouts: slicing alone would silently clamp an oversized request.
        if num_quantizers is not None and num_quantizers > audio_codes.shape[0]:
            raise ValueError(
                f"`num_quantizers` ({num_quantizers}) must be <= audio_codes.shape[0] ({audio_codes.shape[0]})."
            )

        # `[:None]` is a full slice, so one expression covers "all quantizers" and "first N quantizers".
        if audio_codes.dim() == 2:
            codes_list = [audio_codes[:num_quantizers]]
        else:
            codes_lengths = (
                padding_mask.sum(dim=-1).long()
                if padding_mask is not None
                else torch.full(
                    (audio_codes.shape[1],),
                    audio_codes.shape[-1],
                    device=audio_codes.device,
                    dtype=torch.long,
                )
            )
            codes_list = [
                audio_codes[:num_quantizers, i, : int(codes_lengths[i].item())]
                for i in range(audio_codes.shape[1])
            ]

        decoder_output = self.batch_decode(codes_list, num_quantizers=num_quantizers, chunk_duration=chunk_duration)

        if not return_dict:
            assert decoder_output.audio is not None
            return (cast(torch.Tensor, decoder_output.audio),)
        return decoder_output

    @auto_docstring
    def forward(
        self,
        input_values: torch.FloatTensor | None = None,
        padding_mask: torch.BoolTensor | None = None,
        audio_codes: torch.Tensor | None = None,
        num_quantizers: int | None = None,
        return_dict: bool | None = None,
    ) -> tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None] | MossAudioTokenizerOutput:  # type: ignore[override]
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`, *optional*):
            Raw audio input converted to Float.
        padding_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid computing on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        audio_codes (`torch.LongTensor` of shape `(num_quantizers, batch_size, sequence_length)`, *optional*):
            Discrete code embeddings computed using `model.encode`.
        num_quantizers (`int`, *optional*):
            Number of quantizers (codebooks) to use. By default, all quantizers are used.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import MossAudioTokenizerModel

        >>> model = MossAudioTokenizerModel.from_pretrained("moss_audio_tokenizer-model")

        >>> # Create dummy audio input
        >>> audio = torch.randn(1, 1, 24000)  # 1 second of audio at 24kHz

        >>> outputs = model(input_values=audio)
        >>> audio_codes = outputs.audio_codes
        >>> audio_values = outputs.audio
        ```
        """
        if return_dict is None:
            return_dict = self.config.return_dict

        # --- Encoding phase: only runs when a waveform was supplied. ---
        encoded_codes: torch.Tensor | None = None
        encoded_codes_lengths: torch.Tensor | None = None
        if input_values is not None:
            encoded = cast(
                MossAudioTokenizerEncoderOutput,
                self.encode(input_values, padding_mask, num_quantizers, return_dict=True),
            )
            encoded_codes = encoded.audio_codes
            encoded_codes_lengths = encoded.audio_codes_lengths

        # Explicitly supplied codes take precedence; otherwise fall back to the codes just produced.
        reuse_encoded_codes = input_values is not None and audio_codes is None
        codes_to_decode = encoded_codes if reuse_encoded_codes else audio_codes

        # --- Decoding phase: only runs when some codes are available. ---
        waveform: torch.Tensor | None = None
        waveform_lengths: torch.Tensor | None = None
        if codes_to_decode is not None:
            if reuse_encoded_codes and encoded_codes_lengths is not None:
                # Freshly encoded codes come with exact lengths; decoding with them avoids decoding padding.
                decoded = self._decode_frame(codes_to_decode, encoded_codes_lengths)
            else:
                decoded = cast(
                    MossAudioTokenizerDecoderOutput,
                    self.decode(
                        codes_to_decode,
                        padding_mask=padding_mask,
                        return_dict=True,
                        num_quantizers=num_quantizers,
                    ),
                )
            waveform = decoded.audio
            waveform_lengths = decoded.audio_lengths

        if return_dict:
            return MossAudioTokenizerOutput(
                audio=waveform,
                audio_lengths=waveform_lengths,
                audio_codes=encoded_codes,
                audio_codes_lengths=encoded_codes_lengths,
            )
        return (encoded_codes, waveform, waveform_lengths)


# Public names exported by a star-import of this module.
__all__ = ["MossAudioTokenizerModel", "MossAudioTokenizerPreTrainedModel"]