Li-Ruixiao commited on
Commit
e4aa3d2
·
1 Parent(s): 87cc7fb

Add initial implementation of MossTTSDelay model, configuration, and processing utilities

Browse files
__init__.py ADDED
File without changes
assets/prompt1.wav DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2839f28ad240479cb73f5b374bed748608f2d6639e45efe3f2727c3aaecdde22
3
- size 232236
 
 
 
 
assets/prompt2.wav DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d924b8fb75e1ad7bddd207dd4f9ba71fc867422b95f569ac6aa2262b7695cc14
3
- size 174512
 
 
 
 
assets/ref1.wav DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:32e84754bbe11be6082baba99236217e238d3dbfb97ff2545a3e675e031e5fdd
3
- size 140692
 
 
 
 
assets/ref2.wav DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5112b5e2bef2a727534af85da1e56048a5ab5552de7aa7cbb5f48b0fa4f5eec
3
- size 448172
 
 
 
 
config.json ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "moss_tts_delay",
3
+ "architectures": [
4
+ "MossTTSDelayModel"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_moss_tts.MossTTSDelayConfig",
8
+ "AutoModel": "modeling_moss_tts.MossTTSDelayModel"
9
+ },
10
+ "dtype": "bfloat16",
11
+ "initializer_range": 0.02,
12
+ "language_config": {
13
+ "_name_or_path": "Qwen/Qwen3-8B",
14
+ "architectures": [
15
+ "Qwen3ForCausalLM"
16
+ ],
17
+ "attention_bias": false,
18
+ "attention_dropout": 0.0,
19
+ "bos_token_id": 151643,
20
+ "eos_token_id": 151645,
21
+ "pad_token_id": 151643,
22
+ "head_dim": 128,
23
+ "hidden_act": "silu",
24
+ "hidden_size": 4096,
25
+ "initializer_range": 0.02,
26
+ "intermediate_size": 12288,
27
+ "layer_types": [
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention",
52
+ "full_attention",
53
+ "full_attention",
54
+ "full_attention",
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention"
64
+ ],
65
+ "max_position_embeddings": 40960,
66
+ "max_window_layers": 36,
67
+ "model_type": "qwen3",
68
+ "num_attention_heads": 32,
69
+ "num_hidden_layers": 36,
70
+ "num_key_value_heads": 8,
71
+ "rms_norm_eps": 1e-06,
72
+ "rope_scaling": null,
73
+ "rope_theta": 1000000,
74
+ "sliding_window": null,
75
+ "use_cache": true,
76
+ "use_sliding_window": false,
77
+ "vocab_size": 155648
78
+ },
79
+ "n_vq": 32,
80
+ "audio_vocab_size": 1024,
81
+ "audio_user_slot_token_id": 151654,
82
+ "audio_assistant_gen_slot_token_id": 151656,
83
+ "audio_assistant_delay_slot_token_id": 151662,
84
+ "audio_start_token_id": 151652,
85
+ "audio_end_token_id": 151653,
86
+ "audio_pad_code": 1024,
87
+ "sampling_rate": 24000,
88
+ "transformers_version": "4.57.1"
89
+ }
configuration_moss_tts.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2026 OpenMOSS and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ MossTTSDelay model configuration """
16
+
17
+ from typing import Optional, Union
18
+ from transformers.configuration_utils import PretrainedConfig
19
+ from transformers.utils import logging
20
+ from transformers.models.qwen3 import Qwen3Config
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
class MossTTSDelayConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MossTTSDelayModel`]. It is used to instantiate an
    MossTTSDelay model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the MossTTSDelay [MossTTSDelay-8B](https://huggingface.co/OpenMOSS/mosstts-8b) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        language_config (`Union[Qwen3Config, dict]`, *optional*):
            Configuration for the backbone language model (Qwen3).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        n_vq (`int`, *optional*, defaults to 32):
            Number of additional VQ (Vector Quantization) heads/channels for audio.
            Determines the number of codebooks used in the audio representation.
        pad_token_id (`int`, *optional*, defaults to 151643):
            Padding token id, forwarded to [`PretrainedConfig`].
        im_start_token_id (`int`, *optional*, defaults to 151644):
            Token id marking the start of a chat turn.
        im_end_token_id (`int`, *optional*, defaults to 151645):
            Token id marking the end of a chat turn.
        audio_vocab_size (`int`, *optional*, defaults to 1024):
            Vocabulary size for the audio tokens (codebooks 1 to N).
        audio_user_slot_token_id (`int`, *optional*, defaults to 151654):
            The specific token ID used as a placeholder/slot for user-side audio inputs in the prompt.
        audio_assistant_gen_slot_token_id (`int`, *optional*, defaults to 151656):
            The specific token ID representing the generation slot for the assistant's audio output.
            Acting as the trigger for the TTS generation process.
        audio_assistant_delay_slot_token_id (`int`, *optional*, defaults to 151662):
            The token ID used in the 'Delay Pattern' paradigm to represent the delayed/offset positions
            between different VQ channels.
        audio_start_token_id (`int`, *optional*, defaults to 151652):
            Special token ID used to denote the start of an audio sequence in the stream.
        audio_end_token_id (`int`, *optional*, defaults to 151653):
            Special token ID used to denote the end of an audio sequence (EOS for audio).
        audio_pad_code (`int`, *optional*, defaults to 1024):
            The padding value used within the audio VQ codebooks. Typically equals `audio_vocab_size`.
        sampling_rate (`int`, *optional*, defaults to 24000):
            Audio sampling rate in Hz.
    """
    model_type = "moss_tts_delay"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        language_config: Optional[Union[Qwen3Config, dict]] = None,
        initializer_range: float = 0.02,
        n_vq: int = 32,
        pad_token_id: int = 151643,
        im_start_token_id: int = 151644,
        im_end_token_id: int = 151645,
        audio_vocab_size: int = 1024,
        audio_user_slot_token_id: int = 151654,
        audio_assistant_gen_slot_token_id: int = 151656,
        audio_assistant_delay_slot_token_id: int = 151662,
        audio_start_token_id: int = 151652,
        audio_end_token_id: int = 151653,
        audio_pad_code: int = 1024,
        sampling_rate: int = 24000,
        **kwargs,
    ):
        # Accept a ready Qwen3Config, a plain dict (e.g. loaded from config.json),
        # or None (fall back to Qwen3 defaults).
        if isinstance(language_config, dict):
            self.language_config = Qwen3Config(**language_config)
        elif language_config is None:
            self.language_config = Qwen3Config()
        else:
            self.language_config = language_config

        self.initializer_range = initializer_range
        self.n_vq = n_vq
        self.audio_vocab_size = audio_vocab_size
        self.audio_user_slot_token_id = audio_user_slot_token_id
        self.audio_assistant_gen_slot_token_id = audio_assistant_gen_slot_token_id
        self.audio_assistant_delay_slot_token_id = audio_assistant_delay_slot_token_id
        self.audio_start_token_id = audio_start_token_id
        self.audio_end_token_id = audio_end_token_id
        self.audio_pad_code = audio_pad_code
        self.sampling_rate = sampling_rate

        # Mirror backbone dimensions at top level for convenience.
        self.hidden_size = self.language_config.hidden_size
        self.vocab_size = self.language_config.vocab_size
        # BUGFIX: the original also assigned `self.im_start_token_id = self.language_config`
        # here (the whole config object), which was dead code immediately overwritten below.
        self.im_start_token_id = im_start_token_id
        self.im_end_token_id = im_end_token_id

        # BUGFIX: pass pad_token_id through to PretrainedConfig.__init__. The parent
        # re-reads `pad_token_id` from kwargs (None when absent), so setting the
        # attribute before calling super() silently discarded the explicit default.
        # `pad_token_id` is a named parameter above, so it can never also appear in
        # **kwargs — no duplicate-keyword risk.
        super().__init__(pad_token_id=pad_token_id, **kwargs)

    def to_dict(self):
        """Serialize to a dict, expanding the nested backbone config."""
        output = super().to_dict()
        if hasattr(self.language_config, "to_dict"):
            output["language_config"] = self.language_config.to_dict()
        else:
            # Already a plain dict (e.g. round-tripped through JSON).
            output["language_config"] = self.language_config
        return output
inference_utils.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio
3
+ import torch.nn.functional as F
4
+ from typing import Optional, List, Tuple
5
+ from tqdm import tqdm
6
+
7
+
8
def apply_top_k(logits, top_k):
    """Keep only the `top_k` highest logits per row; all others become -inf.

    Args:
        logits: [batch, vocab] tensor (not modified in place).
        top_k: number of entries to keep (clamped to the vocab size).
    """
    k = min(top_k, logits.size(-1))
    kept_values, kept_indices = torch.topk(logits, k, dim=-1)
    # Start from an all -inf tensor and scatter the surviving logits back in.
    masked = torch.full_like(logits, float("-inf"))
    return masked.scatter_(-1, kept_indices, kept_values)
16
+
17
+
18
def apply_top_p(logits, top_p):
    """Nucleus (top-p) filtering: mask tokens outside the top-p probability mass.

    Operates row by row on a [batch, vocab] tensor; the input is left untouched
    and a filtered clone is returned.
    """
    probs = F.softmax(logits, dim=-1)
    sorted_probs, sorted_idx = probs.sort(dim=-1, descending=True)
    cum_probs = sorted_probs.cumsum(dim=-1)

    # Tokens whose cumulative mass exceeds top_p are dropped; shift right so
    # the first token crossing the threshold is always kept.
    drop = cum_probs > top_p
    drop[..., 1:] = drop[..., :-1].clone()
    drop[..., 0] = False

    filtered = logits.clone()
    for row in range(logits.shape[0]):
        filtered[row, sorted_idx[row][drop[row]]] = float("-inf")
    return filtered
31
+
32
+
33
def apply_top_p_optimized(logits, top_p):
    """Vectorized nucleus (top-p) filtering.

    Same semantics as `apply_top_p` but without the per-row Python loop.

    BUGFIX: the original wrote -inf into `logits` in place, silently mutating the
    caller's tensor (inconsistent with `apply_top_p`, which clones). We now use
    `masked_fill`, which returns a new tensor and leaves the input untouched.

    Args:
        logits: [batch, vocab] tensor (not modified).
        top_p: cumulative probability threshold in (0, 1].
    Returns:
        Filtered logits with out-of-nucleus entries set to -inf.
    """
    probs = F.softmax(logits, dim=-1)
    sorted_probs, sorted_indices = torch.sort(probs, descending=True, dim=-1)
    cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

    # Shift right so the first token crossing the threshold is always kept.
    sorted_indices_to_remove = cumulative_probs > top_p
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = False

    # Map the removal mask from sorted order back to vocabulary order.
    indices_to_remove = torch.zeros_like(logits, dtype=torch.bool).scatter_(
        dim=-1, index=sorted_indices, src=sorted_indices_to_remove
    )

    return logits.masked_fill(indices_to_remove, float("-inf"))
49
+
50
+
51
def apply_repetition_penalty_delay_pattern(
    logits: torch.Tensor,
    prev_tokens: torch.LongTensor,
    penalty: float,
):
    """
    logits: [B, H, V] or [N, V]
    prev_tokens: [B, T, H] or [N, T] or [B, H]

    Apply the repetition penalty independently for each H (VQ head): logits of
    previously emitted tokens are divided by `penalty` when positive and
    multiplied by it when non-positive (the standard CTRL-style rule).

    NOTE: modifies `logits` in place and also returns it.
    """
    # No-op fast path.
    if penalty == 1.0 or prev_tokens is None:
        return logits

    # Case 1: regular [N, V] (text layer)
    if logits.dim() == 2:
        unique_tokens = torch.unique(prev_tokens.reshape(-1))

        # Advanced indexing yields a copy: penalize the copy, then write it back.
        token_logits = logits[:, unique_tokens]
        pos_mask = token_logits > 0
        token_logits[pos_mask] /= penalty
        token_logits[~pos_mask] *= penalty
        logits[:, unique_tokens] = token_logits
        return logits

    # Case 2: Delay Pattern audio [B, H, V]
    assert logits.dim() == 3, "Delay Pattern audio logits must be [B, H, V]"
    n_heads = logits.shape[1]

    for h in range(n_heads):
        # History for head h only; works for [B, T, H] and [B, H] alike.
        unique_tokens = torch.unique(prev_tokens[..., h].reshape(-1))

        if unique_tokens.numel() == 0:
            continue

        token_logits = logits[:, h, unique_tokens]
        pos_mask = token_logits > 0
        token_logits[pos_mask] /= penalty
        token_logits[~pos_mask] *= penalty
        logits[:, h, unique_tokens] = token_logits

    return logits
98
+
99
+
100
def sample_token(
    logits,
    prev_tokens: Optional[torch.LongTensor] = None,
    repetition_penalty: float = 1.0,
    top_p=None,
    top_k=None,
    do_sample=True,
):
    """Pick next tokens from (possibly multi-head) logits.

    `logits` may be [N, V] or [B, H, V]; the returned index tensor has the same
    shape minus the vocab dimension. Repetition penalty is applied before any
    flattening so each Delay-Pattern head is penalized independently; top-k /
    top-p filtering and multinomial sampling operate on the flattened rows.
    """
    # Penalize repeats first, while the head structure is still visible.
    if prev_tokens is not None and repetition_penalty != 1.0:
        logits = apply_repetition_penalty_delay_pattern(logits, prev_tokens, repetition_penalty)

    # Greedy path: no filtering required.
    if not do_sample:
        return torch.argmax(logits, dim=-1)

    # Flatten to [rows, vocab] for filtering and sampling.
    lead_shape = logits.shape[:-1]
    flat = logits.view(-1, logits.size(-1))

    if top_k is not None and top_k > 0:
        flat = apply_top_k(flat, top_k)
    if top_p is not None and top_p < 1.0:
        flat = apply_top_p_optimized(flat, top_p)

    picks = torch.multinomial(F.softmax(flat, dim=-1), num_samples=1)
    return picks.view(lead_shape)
135
+
136
+
137
def find_last_equal_C(tensor, C):
    """Per row, return the index of the last element equal to C (-1 if absent).

    Args:
        tensor: [batch_size, seq_len] tensor.
        C: scalar value to search for.
    Returns:
        [batch_size] LongTensor of last-match indices, -1 where C never occurs.
    """
    batch_size, seq_len = tensor.shape

    # Cast the match mask to int so argmax is well-defined on it.
    hits = (tensor == C).int()
    # First match in the reversed sequence == last match in the original.
    first_from_right = hits.flip(dims=[1]).argmax(dim=1)
    last_indices = (seq_len - 1) - first_from_right

    # argmax returns 0 on all-zero rows, so verify the hit and mark misses with -1.
    rows = torch.arange(batch_size)
    last_indices[tensor[rows, last_indices] != C] = -1

    return last_indices
modeling_moss_tts.py ADDED
@@ -0,0 +1,532 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2026 OpenMOSS and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Modeling classes for MossTTSDelay. """
16
+
17
+ from dataclasses import dataclass
18
+ from typing import List, Optional, Tuple, Union
19
+ from tqdm import tqdm
20
+
21
+ import torch
22
+ import torch.nn as nn
23
+ from torch.nn import CrossEntropyLoss
24
+
25
+ from transformers.modeling_utils import PreTrainedModel
26
+ from transformers.modeling_outputs import ModelOutput
27
+ from transformers.utils import (
28
+ add_start_docstrings,
29
+ add_start_docstrings_to_model_forward,
30
+ logging,
31
+ replace_return_docstrings,
32
+ )
33
+ from transformers.cache_utils import Cache
34
+ from transformers.models.qwen3 import Qwen3Model
35
+ from transformers import initialization as init
36
+
37
+ from .configuration_moss_tts import MossTTSDelayConfig
38
+ from .inference_utils import sample_token, find_last_equal_C
39
+
40
+ try:
41
+ from .processing_moss_tts import UserMessage, AssistantMessage, MossTTSDelayProcessor
42
+ except Exception:
43
+ UserMessage = None
44
+ AssistantMessage = None
45
+ MossTTSDelayProcessor = None
46
+
47
+ logger = logging.get_logger(__name__)
48
+
49
+ _CONFIG_FOR_DOC = "MossTTSDelayConfig"
50
+
51
+
52
@dataclass
class MossTTSDelayOutputWithPast(ModelOutput):
    """
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Weighted sum of channel losses (scalar training objective).
        all_sum_losses (`torch.FloatTensor` of shape `(batch_size, n_vq + 1)`, *optional*):
            Sum of losses for each sample and each channel before averaging.
        all_token_nums (`torch.LongTensor` of shape `(batch_size, n_vq + 1)`, *optional*):
            Number of non-masked (label != -100) tokens per sample and channel.
        sample_losses (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
            Loss per sample; only populated when `channelwise_loss_weight` is given.
        channel_losses (`torch.FloatTensor` of shape `(n_vq + 1,)`, *optional*):
            Loss per channel (text head + vq heads).
        logits (`List[torch.FloatTensor]`, *optional*):
            List of prediction scores, one tensor per head (text head first, then audio heads).
        past_key_values (`Cache`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed):
            Tuple of torch.FloatTensor (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer).
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed):
            Tuple of torch.FloatTensor (one for each layer) of the attention weights.
    """
    loss: Optional[torch.FloatTensor] = None
    all_sum_losses: Optional[torch.FloatTensor] = None
    all_token_nums: Optional[torch.LongTensor] = None
    sample_losses: Optional[torch.FloatTensor] = None
    channel_losses: Optional[torch.FloatTensor] = None
    logits: Optional[List[torch.FloatTensor]] = None
    past_key_values: Optional[Cache] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
87
+
88
+
89
+
90
+
91
class MossTTSDelayPreTrainedModel(PreTrainedModel):
    """Base class hooking MossTTSDelay into the HF loading / init machinery."""

    config_class = MossTTSDelayConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Qwen3DecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True

    def _init_weights(self, module):
        """
        Transformers 5.0+ safe init:
        - MUST use transformers.initialization helpers
        - MUST respect param._is_hf_initialized to avoid overwriting ckpt-loaded params
        """
        # Let HF handle its standard modules first (LayerNorm, Linear, Embedding, etc.).
        super()._init_weights(module)

        # Resolve std per HF convention: model-level initializer_range, then the
        # backbone's, then 0.02. (getattr chain replaces the original hasattr
        # ladder, whose first branch was always taken anyway.)
        std = getattr(self.config, "initializer_range", None)
        if std is None:
            std = getattr(
                getattr(self.config, "language_config", None), "initializer_range", 0.02
            )

        # Re-initialize only our extra audio embeddings, identified by their
        # table size of audio_vocab_size + 1 (avoids double-touching the LM's
        # own embeddings).
        if isinstance(module, nn.Embedding):
            if getattr(module, "num_embeddings", None) == self.config.audio_vocab_size + 1:
                init.normal_(module.weight, mean=0.0, std=std)
                # If a padding_idx is ever set, its row must be zeroed explicitly
                # (and param._is_hf_initialized respected when slicing).

        # NOTE: the original had a dead `isinstance(module, nn.Linear): pass`
        # branch here; `super()._init_weights` already covers the lm_heads
        # Linear layers, so it was removed.
135
+
136
+
137
+
138
# Shared docstring header injected into the model classes below via the
# `add_start_docstrings` / `add_start_docstrings_to_model_forward` decorators.
MOSSTTS_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`MossTTSDelayConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
153
+
154
+
155
+ @add_start_docstrings(
156
+ "The MossTTSDelay Model architecture tailored for Text-to-Speech generation with multi-head VQ prediction.",
157
+ MOSSTTS_START_DOCSTRING,
158
+ )
159
+ class MossTTSDelayModel(MossTTSDelayPreTrainedModel):
160
+ UserMessage = UserMessage
161
+ AssistantMessage = AssistantMessage
162
+ Processor = MossTTSDelayProcessor
163
+
164
+ def __init__(self, config: MossTTSDelayConfig):
165
+ super().__init__(config)
166
+ self.config = config
167
+
168
+ config.language_config.torch_dtype = config.torch_dtype
169
+
170
+ self.language_model = Qwen3Model(config.language_config)
171
+
172
+ # Audio VQ Embeddings (Extra channels)
173
+ # Note: input_ids[..., 0] uses Qwen's embedding.
174
+ # input_ids[..., 1:] use these extensions.
175
+ self.emb_ext = nn.ModuleList()
176
+ for vq_idx in range(self.config.n_vq):
177
+ # Add +1 for potential padding/special tokens logic if strictly required by upstream data prep
178
+ self.emb_ext.append(
179
+ nn.Embedding(self.config.audio_vocab_size + 1, config.language_config.hidden_size, padding_idx=None)
180
+ )
181
+
182
+ # Multi-Head Prediction Layers
183
+ # Head 0: Main language head
184
+ # Head 1..N: Audio VQ heads
185
+ self.lm_heads = nn.ModuleList([
186
+ nn.Linear(config.language_config.hidden_size, config.language_config.vocab_size, bias=False)
187
+ ])
188
+ for vq_idx in range(self.config.n_vq):
189
+ self.lm_heads.append(
190
+ nn.Linear(config.language_config.hidden_size, self.config.audio_vocab_size + 1, bias=False)
191
+ )
192
+
193
+ # Initialize weights and apply final processing
194
+ self.post_init()
195
+
196
+ def get_input_embeddings(self, input_ids: torch.LongTensor) -> torch.Tensor:
197
+ """
198
+ Computes the combined embeddings from text and multiple audio VQ channels.
199
+
200
+ Args:
201
+ input_ids: Shape (Batch, Seq_Len, 1 + n_vq)
202
+ """
203
+ # Base Text/Content Embedding
204
+ # input_ids[..., 0] is standard text or semantic tokens
205
+ inputs_embeds = self.language_model.get_input_embeddings()(input_ids[..., 0])
206
+
207
+ # Add VQ Embeddings
208
+ for i, embed_layer in enumerate(self.emb_ext):
209
+ # i corresponds to channel i+1 in input_ids
210
+ # We assume the data pipeline ensures indices are within range
211
+ inputs_embeds = inputs_embeds + embed_layer(input_ids[..., i + 1])
212
+
213
+ return inputs_embeds
214
+
215
    def set_input_embeddings(self, value):
        # Replace the backbone's text embedding table (used by HF utilities
        # such as resize_token_embeddings).
        self.language_model.embed_tokens = value
217
+
218
    def get_output_embeddings(self):
        """Return the full ModuleList of heads (text head + n_vq audio heads)."""
        # Returning a list of heads might break some HF utilities expecting a single head.
        # However, for custom models, this is acceptable.
        return self.lm_heads
222
+
223
    @add_start_docstrings_to_model_forward(MOSSTTS_START_DOCSTRING)
    @replace_return_docstrings(output_type=MossTTSDelayOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        hidden_out_layers: Optional[List[int]] = None,
        channelwise_loss_weight: Optional[List[float]] = None,
        **kwargs,
    ) -> Union[Tuple, MossTTSDelayOutputWithPast]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, 1 + n_vq)`):
                Indices of input sequence tokens in the vocabulary.
                Dimension 2 contains: [Text/Semantics, VQ_0, VQ_1, ..., VQ_N].
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length, 1 + n_vq)`, *optional*):
                Labels for computing the masked language modeling loss. Positions
                with value -100 are ignored (standard CrossEntropyLoss masking).
            hidden_out_layers (`List[int]`, *optional*):
                Indices into `outputs.hidden_states` to feed each head; defaults
                to the last layer for all heads.
            channelwise_loss_weight (`List[float]`, *optional*):
                Manual weights for summing losses across different heads (Text vs Audio channels).

        Returns:
        """

        if len(input_ids.shape) != 3 or input_ids.shape[-1] != self.config.n_vq + 1:
            raise ValueError("`Input_ids`'s shape should be exactly (batch_size, sequence_length, 1 + n_vq).")

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        # 1. Prepare Embeddings (sum of text-channel and VQ-channel embeddings)
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings(input_ids)

        # 2. Backbone Forward
        # Qwen3Model outputs standard CausalLMOutputWithPast or similar
        outputs = self.language_model(
            input_ids=None,  # Passed via inputs_embeds
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=True,  # Always need hidden states for multi-head projection
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        # 3. Handle specific layer outputs if requested (Delay Pattern often requires features from specific layers)
        last_hidden_state = outputs.last_hidden_state
        if hidden_out_layers is None:
            # Default to using the last layer for all heads
            # In some architectures (like MusicGen), different codebooks come from different transformer layers.
            # Here we default to the final layer as per original code behavior [-1] * (n + 1).
            hidden_states_for_heads = [last_hidden_state] * (len(self.lm_heads))
        else:
            # If hidden_out_layers is provided (e.g. [-1, -2, -3...]), fetch them from all_hidden_states
            # Note: outputs.hidden_states includes embedding output at index 0 usually.
            all_hs = outputs.hidden_states
            hidden_states_for_heads = [all_hs[idx] for idx in hidden_out_layers]

        # 4. Project to Logits (Multi-Head)
        layer_logits = []
        for i, (hs, head) in enumerate(zip(hidden_states_for_heads, self.lm_heads)):
            logits = head(hs)
            # Original code logic: Mask the last token index for audio heads (indices > 0)
            # This implies the vocab size is (N+1) but the model shouldn't predict the (N+1)-th token
            # (perhaps reserved for padding in the input but invalid for prediction).
            if i > 0:
                logits[..., -1] = float("-inf")
            layer_logits.append(logits)

        # 5. Loss Calculation
        loss = None
        all_sum_losses = None
        all_token_nums = None
        sample_losses = None
        channel_losses = None

        if labels is not None:
            # Ensure labels match input shape rank (B, S, C)
            if labels.dim() != 3:
                raise ValueError(f"Labels must have rank 3 (B, S, C), got {labels.shape}")

            batch_size = labels.size(0)
            n_heads = len(layer_logits)

            # Container for per-sample, per-channel losses
            # Shape: [Batch, n_heads]
            all_sum_losses_list = []

            # Count valid tokens (not -100) per sample.
            # Note: Assuming mask is consistent across channels or we take sum over dim 1 (seq)
            # Usually strict masking means checking one channel or all.
            # Original code: torch.sum(labels != -100, dim=1) -> [B, C]
            all_token_nums = torch.sum(labels != -100, dim=1)

            for i, logits in enumerate(layer_logits):
                # logits: [B, S, V]
                # cur_labels: [B, S]
                cur_labels = labels[..., i]

                # Flatten for CrossEntropy
                # logits: [B*S, V], labels: [B*S]
                # reduction='none' keeps per-token losses so we can aggregate
                # per-sample and per-channel separately below.
                loss_fct = CrossEntropyLoss(reduction='none')
                vocab_size = logits.size(-1)

                reshaped_logits = logits.view(-1, vocab_size)
                reshaped_labels = cur_labels.contiguous().view(-1)

                # Calculate loss per token (masked positions contribute 0)
                per_token_loss = loss_fct(reshaped_logits, reshaped_labels)

                # Reshape back to [B, S] and sum over Sequence dimension to get per-sample loss
                per_token_loss = per_token_loss.view(batch_size, -1)
                per_sample_loss = torch.sum(per_token_loss, dim=-1)  # [B]

                all_sum_losses_list.append(per_sample_loss)

            # Stack to [B, n_heads]
            all_sum_losses = torch.stack(all_sum_losses_list, dim=1)

            # Weighted Loss Aggregation
            if channelwise_loss_weight is not None:
                if len(channelwise_loss_weight) != n_heads:
                    raise ValueError(f"channelwise_loss_weight length {len(channelwise_loss_weight)} != {n_heads}")

                w_tensor = torch.tensor(channelwise_loss_weight, device=all_sum_losses.device, dtype=all_sum_losses.dtype)

                # Sample losses: Weighted sum over channels per sample / Total weight
                # Normalize by token count per channel
                # Avoid division by zero with epsilon or mask
                token_counts_safe = all_token_nums.float().clamp(min=1.0)

                normalized_losses = all_sum_losses / token_counts_safe
                sample_losses = (normalized_losses * w_tensor).sum(dim=1) / w_tensor.sum()

                # Channel losses: Sum over batch / Sum tokens over batch
                total_loss_per_channel = all_sum_losses.sum(dim=0)
                total_tokens_per_channel = all_token_nums.sum(dim=0).float().clamp(min=1.0)
                channel_losses = total_loss_per_channel / total_tokens_per_channel

                # Final scalar loss
                loss = (channel_losses * w_tensor).sum() / w_tensor.sum()
            else:
                # Default average if no weights provided
                # NOTE(review): sample_losses stays None on this path — confirm intended.
                total_tokens = all_token_nums.sum().float().clamp(min=1.0)
                loss = all_sum_losses.sum() / total_tokens
                channel_losses = all_sum_losses.sum(dim=0) / all_token_nums.sum(dim=0).clamp(min=1.0)

        return MossTTSDelayOutputWithPast(
            loss=loss,
            all_sum_losses=all_sum_losses,
            all_token_nums=all_token_nums,
            sample_losses=sample_losses,
            channel_losses=channel_losses,
            logits=layer_logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
391
+
392
    @torch.inference_mode()
    def generate(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        max_new_tokens: int = 1000,
        text_temperature: float = 1.5,
        text_top_p: float = 1.0,
        text_top_k: int = 50,
        audio_temperature: float = 1.5,
        audio_top_p: float = 0.8,
        audio_top_k: int = 50,
        audio_repetition_penalty: float = 1.0,
    ):
        """Autoregressively generate interleaved text/audio tokens with the delay pattern.

        Args:
            input_ids: [batch, seq, n_vq + 1] unified codes; channel 0 is the text
                channel, channels 1..n_vq are delayed audio codebooks.
            attention_mask: [batch, seq] mask matching `input_ids`.
            max_new_tokens: maximum number of decoding steps.
            text_temperature / text_top_p / text_top_k: sampling controls for the
                text channel; a temperature <= 0 switches to greedy decoding.
            audio_temperature / audio_top_p / audio_top_k / audio_repetition_penalty:
                sampling controls for the audio channels.

        Returns:
            list of (start_length, generation_ids) per sample: `generation_ids`
            starts 3 tokens after the last `<|im_start|>` (presumably skipping the
            role header — confirm against the chat template), `start_length` is
            how many of those positions came from the prompt.
        """
        # Temperature <= 0 means greedy decoding; reset to 1 so the division
        # below becomes a no-op.
        if text_temperature > 0:
            text_do_sample = True
        else:
            text_temperature = 1
            text_do_sample = False
        if audio_temperature > 0:
            audio_do_sample = True
        else:
            audio_temperature = 1
            audio_do_sample = False

        past_key_values = None
        device = input_ids.device
        current_input_ids = input_ids
        current_attention_mask = attention_mask
        batch_size, seq_len, n_vq = input_ids.shape
        # Channel 0 is text; only the remaining channels are audio codebooks.
        n_vq -= 1

        generation_ids = input_ids[:]
        is_stopping = torch.zeros(batch_size, dtype=torch.bool, device=device)

        # Three per-sample phases: 1. not inside audio; 2. inside audio, delay
        # flush not started; 3. flushing the delay tail.
        audio_lengths = torch.zeros(batch_size, dtype=torch.int64, device=device)  # 0 marks phase 1
        torch_int64_max = torch.iinfo(torch.int64).max
        delayed_lengths = torch.full((batch_size,), torch_int64_max, dtype=torch.int64, device=device)  # int64 max marks "delay flush not started" (phase 2)

        # Handle continuation where <audio_start> already sits inside input_ids.
        # NOTE: inputs that have already begun the delay flush are not supported.
        # Both continuation and from-scratch generation are handled here.
        is_continuation = (input_ids[:, -1, 0] == self.config.audio_start_token_id) | (input_ids[:, -1, 0] == self.config.audio_assistant_gen_slot_token_id)
        audio_start_indices = find_last_equal_C(input_ids[..., 0], self.config.audio_start_token_id)
        audio_start_mask = is_continuation & (audio_start_indices != -1)
        audio_lengths[audio_start_mask] = seq_len - audio_start_indices[audio_start_mask]

        is_audio = audio_start_mask.clone()

        # Outside an audio span: ban pad / slot / audio_end tokens on the text
        # channel. Inside: only the two assistant slot tokens are allowed.
        pre_exclude_mask0 = torch.tensor([self.config.pad_token_id, self.config.audio_assistant_gen_slot_token_id, self.config.audio_assistant_delay_slot_token_id, self.config.audio_end_token_id], device=device)
        pre_exclude_mask1 = torch.ones(self.config.language_config.vocab_size, device=device).bool()
        pre_exclude_mask1[[self.config.audio_assistant_gen_slot_token_id, self.config.audio_assistant_delay_slot_token_id]] = False


        # `time_step` is not necessarily the output position in the dialogue,
        # because generation may be continuing an existing prefix.
        for time_step in tqdm(range(max_new_tokens), desc=f"Generating bs{batch_size} ..."):
            outputs = self(
                input_ids=current_input_ids,
                attention_mask=current_attention_mask,
                past_key_values=past_key_values,
                use_cache=True,
            )
            past_key_values = outputs.past_key_values

            next_token_logits = [logit[:, -1, :] / text_temperature if logit_idx == 0 else logit[:, -1, :] / audio_temperature for logit_idx, logit in enumerate(outputs.logits)]  # List, len=n_vq+1, [batch_size, 1, vocab_size];
            next_token_logits[0] = next_token_logits[0].clone()
            # 1. Handle the text channel first.
            next_text_token = torch.full((batch_size,), self.config.pad_token_id, device=device)
            # audio_start, each gen-slot token and the FIRST delay-slot token are
            # sampled; subsequent delay-slot tokens and audio_end are forced.
            next_text_token[~is_stopping & (delayed_lengths < n_vq)] = self.config.audio_assistant_delay_slot_token_id
            is_audio_eos = ~is_stopping & (delayed_lengths == n_vq)
            next_text_token[is_audio_eos] = self.config.audio_end_token_id
            is_audio[is_audio_eos] = False
            sampling_text_mask = ~is_stopping & (delayed_lengths > n_vq)
            next_token_logits[0][~is_audio] = next_token_logits[0][~is_audio].index_fill(-1, pre_exclude_mask0, float('-inf'))
            next_token_logits[0][is_audio] = next_token_logits[0][is_audio].masked_fill(pre_exclude_mask1, float('-inf'))
            if time_step == 0:
                # NOTE(review): 151662 is assumed to be a tokenizer-specific special
                # token that must not open the reply — confirm against the vocab.
                next_token_logits[0][..., 151662] = float('-inf')
            if time_step <= n_vq:
                # Too early to close the turn: the delay tail alone takes n_vq steps.
                next_token_logits[0][..., self.config.im_end_token_id] = float('-inf')

            # No repetition penalty is applied on the text channel.
            next_text_token[sampling_text_mask] = sample_token(
                logits=next_token_logits[0][sampling_text_mask],
                top_p=text_top_p,
                top_k=text_top_k,
                do_sample=text_do_sample
            )
            is_audio[next_text_token == self.config.audio_start_token_id] = True
            # The only stop condition is next_text_token == <|im_end|>.
            is_stopping[next_text_token == self.config.im_end_token_id] = True

            # 2. Then handle the audio channels.
            # Everything outside audio_start..audio_end stays padded; only the
            # positions that carry real audio content are filled below.
            next_audio_tokens = torch.full((batch_size, n_vq), self.config.audio_pad_code, device=device)

            # The gate depends on the distance from audio_start (delay pattern).
            # True in the mask means the channel carries a real value this step.
            pre_audio_mask = audio_lengths.unsqueeze(1) > torch.arange(n_vq, dtype=int, device=device).expand(batch_size, n_vq)
            post_audio_mask = torch.arange(n_vq, dtype=int, device=device).expand(batch_size, n_vq) > delayed_lengths.unsqueeze(1) - 1
            post_audio_mask[delayed_lengths == torch_int64_max] = True
            sampling_audio_mask = pre_audio_mask & post_audio_mask
            next_audio_tokens[~sampling_audio_mask] = self.config.audio_pad_code

            if sampling_audio_mask.sum() > 0:
                audio_logits = torch.stack(next_token_logits[1:], dim=1)[sampling_audio_mask]  # torch.stack -> [batch_size, n_vq - 1, vocab_size]
                audio_logits[..., self.config.audio_pad_code] = float('-inf')
                next_audio_tokens[sampling_audio_mask] = sample_token(
                    logits=audio_logits,
                    prev_tokens=generation_ids[:, :, 1:],
                    repetition_penalty=audio_repetition_penalty,
                    top_p=audio_top_p,
                    top_k=audio_top_k,
                    do_sample=audio_do_sample
                )

            # The updates below produce the audio_lengths / delayed_lengths state
            # that is valid at the NEXT time step.
            audio_lengths[(next_text_token == self.config.audio_start_token_id) | (next_text_token == self.config.audio_assistant_gen_slot_token_id) | (next_text_token == self.config.audio_assistant_delay_slot_token_id)] += 1
            audio_lengths[next_text_token == self.config.audio_end_token_id] = 0
            delayed_lengths[(delayed_lengths == torch_int64_max) & (next_text_token == self.config.audio_assistant_delay_slot_token_id)] = 0
            delayed_lengths[delayed_lengths != torch_int64_max] += 1
            delayed_lengths[delayed_lengths > n_vq] = torch_int64_max

            current_input_ids = torch.cat([next_text_token[:, None, None], next_audio_tokens[:, None, :]], dim=2)  # [batch_size, 1, n_vq + 1]
            current_attention_mask = torch.cat([current_attention_mask, (~is_stopping).unsqueeze(-1)], dim=-1)
            generation_ids = torch.cat([generation_ids, current_input_ids], dim=1)  # [batch_size, seq_len, n_vq + 1]

            if is_stopping.sum() == batch_size:
                break

        # Slice each sample from 3 tokens past its last <|im_start|> and report
        # how much of the slice was prompt (start_length) vs generated.
        start_indices = find_last_equal_C(input_ids[..., 0], self.config.im_start_token_id) + 3
        start_lengths = seq_len - start_indices

        output = []
        for start_idx, start_length, cur_generation_ids in zip(start_indices, start_lengths, generation_ids):
            output.append((start_length, cur_generation_ids[start_idx:]))

        return output
processing_moss_tts.py ADDED
@@ -0,0 +1,629 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2026 OpenMOSS and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ from typing import Dict, List, Optional, Tuple, Type, Union, Literal, Final
18
+ from dataclasses import dataclass
19
+ from pathlib import Path
20
+ import re
21
+ import torchaudio
22
+
23
+ import torch
24
+ from transformers import PreTrainedTokenizerBase, BatchFeature, ProcessorMixin, logging, AutoConfig, AutoModel, AutoTokenizer
25
+
26
+ from .configuration_moss_tts import MossTTSDelayConfig
27
+
28
+
29
+ logger = logging.get_logger(__name__)
30
+
31
+
32
AUDIO_PLACEHOLDER = "<|audio|>"


@dataclass
class Message:
    """Marker base class for chat messages consumed by the processor."""
    pass



@dataclass
class UserMessage(Message):
    """User turn holding TTS control fields, rendered into a <user_inst> block.

    Fields left as None are rendered literally as the string "None".
    `reference`, when given, is a per-speaker list whose entries may be wav
    paths or pre-encoded code tensors; a None entry means "no reference for
    that speaker index".
    """
    text: Optional[str] = None
    reference: Optional[List[Optional[Union[str, torch.Tensor]]]] = None
    instruction: Optional[str] = None
    tokens: Optional[int] = None
    quality: Optional[str] = None
    sound_event: Optional[str] = None
    ambient_sound: Optional[str] = None
    language: Optional[str] = None

    def __post_init__(self):
        template = """<user_inst>
- Reference(s):
{reference}
- Instruction:
{instruction}
- Tokens:
{tokens}
- Quality:
{quality}
- Sound Event:
{sound_event}
- Ambient Sound:
{ambient_sound}
- Language:
{language}
- Text:
{text}
</user_inst>"""

        referenced_audio = []
        if self.reference is None:
            reference_text = "None"
        elif isinstance(self.reference, List):
            speaker_lines = []
            for speaker_idx, speaker_reference in enumerate(self.reference):
                if speaker_reference is not None:
                    speaker_lines.append(f"[S{speaker_idx}]:\n{AUDIO_PLACEHOLDER}")
            reference_text = "\n".join(speaker_lines)
            referenced_audio = [entry for entry in self.reference if entry is not None]
        else:
            raise TypeError("`reference` should be exactly a list when it is not None.")

        # Substitute placeholders in template order. Plain str.replace (not
        # str.format) is used so braces inside user-supplied text stay intact.
        substitutions = (
            ("{reference}", str(reference_text)),
            ("{instruction}", str(self.instruction)),
            ("{tokens}", str(self.tokens)),
            ("{quality}", str(self.quality)),
            ("{sound_event}", str(self.sound_event)),
            ("{ambient_sound}", str(self.ambient_sound)),
            ("{language}", str(self.language)),
            ("{text}", str(self.text)),
        )
        content = template
        for placeholder, value in substitutions:
            content = content.replace(placeholder, value)

        self._content = content
        self._audio_codes_list = referenced_audio

    def to_dict(self):
        """Return the canonical message-dict form consumed by the processor."""
        return {
            "role": "user",
            "content": self._content,
            "audio_codes_list": self._audio_codes_list
        }
106
+
107
+
108
@dataclass
class AssistantMessage(Message):
    """Assistant turn: audio codes plus text content with <|audio|> placeholders.

    `content` defaults to the bare placeholder, i.e. a pure-audio reply.
    """
    # Entries are wav paths (str) or pre-encoded code tensors — one per
    # <|audio|> placeholder occurring in `content`.
    audio_codes_list: List[Union[str, torch.Tensor]]
    content: str = AUDIO_PLACEHOLDER

    def to_dict(self):
        """Return the canonical message-dict form consumed by the processor."""
        return {
            "role": "assistant",
            "content": self.content,
            "audio_codes_list": self.audio_codes_list
        }
119
+
120
# Keys copied from a plain user-message dict when reconstructing a UserMessage
# in MossTTSDelayProcessor._normalize_message.
USER_MESSAGE_FIELDS = (
    "text",
    "reference",
    "instruction",
    "tokens",
    "quality",
    "sound_event",
    "ambient_sound",
    "language",
)
130
+
131
+
132
+
133
+
134
+
135
+
136
+
137
+ class MossTTSDelayProcessor(ProcessorMixin):
138
+ tokenizer_class = "AutoTokenizer"
139
+ audio_tokenizer_class = "AutoModel"
140
+
141
    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        audio_tokenizer: AutoModel = None,
        model_config: Optional[MossTTSDelayConfig] = None,
        **kwargs
    ):
        """Couple the text tokenizer, the audio codec model, and the model config.

        Args:
            tokenizer: text tokenizer providing the chat/special tokens.
            audio_tokenizer: codec model exposing `encode`/`decode` for waveforms.
            model_config: source of the special-token ids and audio constants;
                a default `MossTTSDelayConfig` is created when omitted.
        """
        super().__init__(
            tokenizer=tokenizer,
            audio_tokenizer=audio_tokenizer,
            **kwargs
        )
        if model_config is None:
            model_config = MossTTSDelayConfig()
        self.model_config = model_config

        # Cache frequently used special-token ids.
        self.imstart_token_id = tokenizer.convert_tokens_to_ids("<|im_start|>")
        self.imend_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
        # NOTE(review): 198 is assumed to be the "\n" token id of this tokenizer — confirm.
        self.newline_token_id = 198

        # String forms of the audio control tokens, used to build prompt text.
        self.audio_user_slot_token = tokenizer.convert_ids_to_tokens(self.model_config.audio_user_slot_token_id)
        self.audio_assistant_gen_slot_token = tokenizer.convert_ids_to_tokens(self.model_config.audio_assistant_gen_slot_token_id)
        self.audio_assistant_delay_slot_token = tokenizer.convert_ids_to_tokens(self.model_config.audio_assistant_delay_slot_token_id)
        self.audio_start_token = tokenizer.convert_ids_to_tokens(self.model_config.audio_start_token_id)
        self.audio_end_token = tokenizer.convert_ids_to_tokens(self.model_config.audio_end_token_id)
165
+ self.audio_end_token = tokenizer.convert_ids_to_tokens(self.model_config.audio_end_token_id)
166
+
167
+ @classmethod
168
+ def from_pretrained(cls, pretrained_model_name_or_path, trust_remote_code=True, **kwargs):
169
+ kwargs.pop("_from_auto")
170
+ pretrained_model_name_or_path = Path(pretrained_model_name_or_path)
171
+ model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs)
172
+ tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs)
173
+
174
+ audio_tokenizer_name_or_path = kwargs.pop("codec_path", "OpenMOSS-Team/MOSS-Audio-Tokenizer")
175
+ assert isinstance(audio_tokenizer_name_or_path, str), f"Unsupported audio_tokenizer_path input format: {type(audio_tokenizer_name_or_path)}"
176
+ audio_tokenizer = AutoModel.from_pretrained(audio_tokenizer_name_or_path, trust_remote_code=trust_remote_code, **kwargs)
177
+
178
+ return cls(
179
+ tokenizer=tokenizer,
180
+ audio_tokenizer=audio_tokenizer,
181
+ model_config=model_config,
182
+ **kwargs
183
+ )
184
+
185
+ def __call__(
186
+ self,
187
+ conversations: Union[Message, Dict, List[Message], List[Dict], List[List[Message]], List[List[Dict]]],
188
+ mode: str = "generation",
189
+ apply_chat_template: bool = True,
190
+ n_vq: Optional[int] = None
191
+ ) -> BatchFeature:
192
+
193
+ """
194
+ mode 只会在将 Message 转换为 to_dict 时起作用;
195
+ """
196
+
197
+ if mode not in {"generation", "continuation"}:
198
+ raise RuntimeError
199
+
200
+ if isinstance(conversations, (Message, Dict)):
201
+ conversations = [conversations]
202
+
203
+ truncation = False
204
+ if mode == "continuation":
205
+ truncation = True
206
+
207
+ input_ids_list = []
208
+ for conversation in conversations:
209
+ if isinstance(conversation, (Message, Dict)):
210
+ conversation = [conversation]
211
+
212
+ if (mode == "generation") ^ (len(conversation) % 2 != 0):
213
+ raise ValueError
214
+
215
+ if (mode == "generation") ^ (conversation[-1]['role'] == "user"):
216
+ raise ValueError
217
+
218
+ unified_codes = []
219
+ for message_idx, message in enumerate(conversation):
220
+ message = self._normalize_message(message)
221
+ if apply_chat_template:
222
+ add_generation_prompt = mode == "generation" and message_idx == len(conversation) - 1
223
+ try:
224
+ content = self.tokenizer.apply_chat_template(
225
+ [{"role": message["role"], "content": message["content"]}],
226
+ add_generation_prompt=add_generation_prompt,
227
+ tokenize=False,
228
+ )
229
+ except TypeError:
230
+ try:
231
+ content = self.tokenizer.apply_chat_template(
232
+ [{"role": message["role"], "content": message["content"]}],
233
+ add_generation_prompt=add_generation_prompt,
234
+ )
235
+ except Exception:
236
+ logger.warning("apply_chat_template failed; fallback to raw content.")
237
+ content = message["content"]
238
+ else:
239
+ content = message['content']
240
+
241
+ audio_codes_list = []
242
+ for audio_codes in message["audio_codes_list"]:
243
+ if isinstance(audio_codes, torch.Tensor):
244
+ if n_vq is not None and audio_codes.shape[1] != n_vq:
245
+ raise RuntimeError("audio_codes's n_vq is not equal to the parameter `n_vq`. Your can set the parameter `n_vq` as None if you have already tokenzied the wavs.")
246
+ else:
247
+ audio_codes = self.encode_audios_from_path(audio_codes, n_vq)[0]
248
+ audio_codes_list.append(audio_codes)
249
+ unified_codes.append(self._get_unified_codes(message['role'], content, audio_codes_list, truncation))
250
+
251
+ unified_codes = torch.cat(unified_codes)
252
+ input_ids_list.append(unified_codes)
253
+
254
+ return self._pad(input_ids_list)
255
+
256
+ @staticmethod
257
+ def build_user_message(
258
+ text: Optional[str] = None,
259
+ reference: Optional[List[Optional[Union[str, torch.Tensor]]]] = None,
260
+ instruction: Optional[str] = None,
261
+ tokens: Optional[int] = None,
262
+ quality: Optional[str] = None,
263
+ sound_event: Optional[str] = None,
264
+ ambient_sound: Optional[str] = None,
265
+ language: Optional[str] = None,
266
+ ) -> Dict:
267
+ if reference is not None and not isinstance(reference, list):
268
+ reference = [reference]
269
+ return UserMessage(
270
+ text=text,
271
+ reference=reference,
272
+ instruction=instruction,
273
+ tokens=tokens,
274
+ quality=quality,
275
+ sound_event=sound_event,
276
+ ambient_sound=ambient_sound,
277
+ language=language,
278
+ ).to_dict()
279
+
280
    @staticmethod
    def build_assistant_message(
        audio_codes_list: List[Union[str, torch.Tensor]],
        content: str = AUDIO_PLACEHOLDER,
    ) -> Dict:
        """Build the canonical dict form of an assistant turn.

        Args:
            audio_codes_list: wav paths or pre-encoded code tensors, one entry
                per <|audio|> placeholder in `content`.
            content: message text; defaults to a single bare audio placeholder.
        """
        return AssistantMessage(
            audio_codes_list=audio_codes_list,
            content=content,
        ).to_dict()
289
+
290
+ def _normalize_message(self, message: Union[Message, Dict]) -> Dict:
291
+ if isinstance(message, Message):
292
+ return message.to_dict()
293
+ if not isinstance(message, dict):
294
+ raise TypeError("Each message must be a Message or dict.")
295
+ if "role" not in message:
296
+ raise ValueError("Message dict must include a 'role' field.")
297
+ if "content" in message and "audio_codes_list" in message:
298
+ return message
299
+ role = message["role"]
300
+ if role == "user":
301
+ kwargs = {key: message.get(key) for key in USER_MESSAGE_FIELDS}
302
+ return self.build_user_message(**kwargs)
303
+ if role == "assistant":
304
+ return self.build_assistant_message(
305
+ audio_codes_list=message.get("audio_codes_list", []),
306
+ content=message.get("content", AUDIO_PLACEHOLDER),
307
+ )
308
+ raise ValueError(f"Unsupported role: {role}")
309
+
310
+ def _pad(self, input_ids_list: List[torch.Tensor]):
311
+ device = input_ids_list[0].device
312
+ lengths = torch.tensor([w.shape[0] for w in input_ids_list], device=device)
313
+ pad_input_ids = torch.nn.utils.rnn.pad_sequence(input_ids_list, batch_first=True, padding_value=self.model_config.audio_pad_code, padding_side="left")
314
+ other_channel_mask = (pad_input_ids.shape[1] - lengths).unsqueeze(1) > torch.arange(pad_input_ids.shape[1], device=device).unsqueeze(0)
315
+ pad_input_ids[..., 0][other_channel_mask] = self.model_config.pad_token_id
316
+ attention_mask = torch.zeros(pad_input_ids.shape[0], pad_input_ids.shape[1], device=device)
317
+ attention_mask[~other_channel_mask] = 1
318
+ attention_mask = attention_mask.bool()
319
+ return {
320
+ "input_ids": pad_input_ids, # [batch_size, seqlen, n_vq]
321
+ "attention_mask": attention_mask,
322
+ }
323
+
324
+ @staticmethod
325
+ def _replace_audio_placeholders(
326
+ content: str,
327
+ lengths: List[int],
328
+ n_vq: int,
329
+ gen_slot_token: str,
330
+ delay_slot_token: str,
331
+ audio_start_token: str,
332
+ audio_end_token: str
333
+ ) -> str:
334
+ if n_vq < 1:
335
+ raise ValueError(f"n_vq must be >= 1, got {n_vq}")
336
+
337
+ num_placeholders = content.count(AUDIO_PLACEHOLDER)
338
+ if num_placeholders != len(lengths):
339
+ raise ValueError(
340
+ f"Number of {AUDIO_PLACEHOLDER} ({num_placeholders}) "
341
+ f"does not match lengths ({len(lengths)})"
342
+ )
343
+
344
+ def build_audio_block(length: int) -> str:
345
+ if length < 0:
346
+ raise ValueError(f"length must be >= 0, got {length}")
347
+
348
+ if length == 0:
349
+ return f"{audio_start_token}{audio_end_token}"
350
+
351
+ step_tokens = gen_slot_token * length + (delay_slot_token * (n_vq - 1))
352
+ return f"{audio_start_token}{step_tokens}{audio_end_token}"
353
+
354
+ lengths_iter = iter(lengths)
355
+
356
+ def replacer(match: re.Match) -> str:
357
+ length = next(lengths_iter)
358
+ return build_audio_block(length)
359
+
360
+ result = re.sub(re.escape(AUDIO_PLACEHOLDER), replacer, content)
361
+
362
+ return result
363
+
364
    @staticmethod
    def _merge_consecutive_audio_placeholders(
        content: str,
        audio_codes_list: List[torch.Tensor],
    ) -> Tuple[str, List[torch.Tensor]]:
        """Collapse whitespace-separated runs of <|audio|> into one placeholder.

        The code tensors belonging to a collapsed run are concatenated along the
        time dimension (dim 0), so the placeholder count still matches the list
        afterwards.

        Raises:
            ValueError: if the placeholder count differs from len(audio_codes_list).
        """
        matches = list(re.finditer(re.escape(AUDIO_PLACEHOLDER), content))
        if len(matches) <= 1:
            return content, audio_codes_list

        if len(matches) != len(audio_codes_list):
            raise ValueError("Audio placeholders do not match the provided audio codes list.")

        new_audio_codes_list = []
        new_parts = []
        last_pos = 0
        i = 0
        while i < len(matches):
            # Grow j while the next placeholder is separated only by whitespace.
            j = i
            while (
                j + 1 < len(matches)
                and content[matches[j].end():matches[j + 1].start()].strip() == ""
            ):
                j += 1

            new_parts.append(content[last_pos:matches[i].start()])
            new_parts.append(AUDIO_PLACEHOLDER)
            last_pos = matches[j].end()

            if j == i:
                new_audio_codes_list.append(audio_codes_list[i])
            else:
                # Merge all codes of the run along time.
                new_audio_codes_list.append(torch.cat(audio_codes_list[i:j + 1], dim=0))

            i = j + 1

        new_parts.append(content[last_pos:])
        return "".join(new_parts), new_audio_codes_list
401
+
402
+ @staticmethod
403
+ def apply_delay_pattern(codes: torch.Tensor, pad_code: int) -> torch.Tensor:
404
+ delayed_tokens = torch.full(
405
+ (codes.shape[0] + codes.shape[1] - 1, codes.shape[1]),
406
+ pad_code,
407
+ device=codes.device,
408
+ dtype=codes.dtype,
409
+ )
410
+ for i in range(codes.shape[1]):
411
+ delayed_tokens[i: i + codes.shape[0], i] = codes[:, i]
412
+ return delayed_tokens
413
+
414
+ @staticmethod
415
+ def apply_de_delay_pattern(delay_codes: torch.Tensor) -> torch.Tensor:
416
+ tokens = torch.full(
417
+ (delay_codes.shape[0] - delay_codes.shape[1] + 1, delay_codes.shape[1]),
418
+ 0,
419
+ device=delay_codes.device,
420
+ dtype=delay_codes.dtype,
421
+ )
422
+ for i in range(delay_codes.shape[1]):
423
+ tokens[:, i] = delay_codes[i: i + tokens.shape[0], i]
424
+ return tokens
425
+
426
+
427
    def _get_unified_codes(self, role: str, content: str, audio_codes_list: List[Union[str, torch.Tensor]], truncation: bool) -> torch.Tensor:
        """Align text codes with delayed audio codes into unified [T, n_vq + 1] codes.

        `content` already carries the chat-template formatting at this point.

        Args:
            role: "user" or assistant; selects which slot tokens fill the audio span.
            content: templated text still containing <|audio|> placeholders.
            audio_codes_list: one [T_i, n_vq] tensor per placeholder.
            truncation: drop the trailing n_vq - 1 delay rows of the last audio
                block (continuation mode, generation resumes mid-stream).
        """
        if role == "user":
            audio_gen_slot_token = audio_delay_slot_token = self.audio_user_slot_token
        else:
            audio_gen_slot_token = self.audio_assistant_gen_slot_token
            audio_delay_slot_token = self.audio_assistant_delay_slot_token

        # Infer the codebook count from the provided codes; fall back to config.
        if len(audio_codes_list):
            n_vq = audio_codes_list[0].shape[1]
        else:
            n_vq = self.model_config.n_vq

        if len(audio_codes_list) > 1 and AUDIO_PLACEHOLDER in content:
            content, audio_codes_list = self._merge_consecutive_audio_placeholders(
                content, audio_codes_list
            )
        content = self._replace_audio_placeholders(
            content=content,
            lengths=[len(audio_codes) for audio_codes in audio_codes_list],
            n_vq=n_vq,
            gen_slot_token=audio_gen_slot_token,
            delay_slot_token=audio_delay_slot_token,
            audio_start_token=self.audio_start_token,
            audio_end_token=self.audio_end_token,
        )
        text_codes = torch.tensor(self.tokenizer.encode(content), device=audio_codes_list[0].device if audio_codes_list else None)

        # Locate each audio block's boundaries in the encoded text channel.
        audio_start_indices = torch.where(text_codes == self.model_config.audio_start_token_id)[0]
        audio_end_indices = torch.where(text_codes == self.model_config.audio_end_token_id)[0]
        if len(audio_start_indices) != len(audio_codes_list) or len(audio_end_indices) != len(audio_codes_list):
            raise ValueError("Audio placeholders do not match the provided audio codes list.")

        delay_audio_codes_list = []
        if len(audio_codes_list) == 0:
            # Pure-text message: every audio channel is padding.
            delay_audio_codes_list = torch.full(
                (len(text_codes), n_vq),
                self.model_config.audio_pad_code,
                device=text_codes.device,
                dtype=text_codes.dtype,
            )
        else:
            prefix_idx = 0
            for audio_start_idx, audio_end_idx, audio_codes in zip(audio_start_indices, audio_end_indices, audio_codes_list):
                delay_audio_codes = self.apply_delay_pattern(audio_codes, self.model_config.audio_pad_code)
                # Padding rows covering the text between the previous block's end
                # and this block's <audio_start> (inclusive).
                pad_codes = torch.full(
                    (audio_start_idx - prefix_idx + 1, n_vq),
                    self.model_config.audio_pad_code,
                    device=audio_codes.device,
                    dtype=audio_codes.dtype,
                )
                delay_audio_codes_list.extend([pad_codes, delay_audio_codes])
                prefix_idx = audio_end_idx

            if truncation:
                # Continuation: cut the delay tail of the final block so
                # generation can resume inside the audio stream.
                delay_audio_codes_list[-1] = delay_audio_codes_list[-1][:-(n_vq - 1), :]
            else:
                # Padding rows from the last <audio_end> to the end of the text.
                pad_codes = torch.full(
                    (len(text_codes) - audio_end_indices[-1], n_vq),
                    self.model_config.audio_pad_code,
                    device=audio_codes_list[0].device,
                    dtype=audio_codes_list[0].dtype,
                )
                delay_audio_codes_list.append(pad_codes)

            delay_audio_codes_list = torch.cat(delay_audio_codes_list)

        # In truncation mode the audio rows end early; trim the text channel to match.
        if text_codes.shape[0] != delay_audio_codes_list.shape[0]:
            text_codes = text_codes[:delay_audio_codes_list.shape[0]]

        unified_codes = torch.cat([text_codes.unsqueeze(1), delay_audio_codes_list], dim=1)
        return unified_codes
501
+
502
+ def _parse_text_codes(self, start_length, text_codes):
503
+ text = self.tokenizer.decode(text_codes)
504
+ prefix = self.tokenizer.decode(text_codes[:start_length])
505
+ text = text[len(prefix):]
506
+
507
+ AUDIO_PATTERN = re.compile(
508
+ rf'(?:{self.audio_start_token})?'
509
+ rf'(?:{self.audio_assistant_gen_slot_token})*'
510
+ rf'(?:{self.audio_assistant_delay_slot_token})*'
511
+ rf'{self.audio_end_token}'
512
+ )
513
+
514
+ def normalize_audio_segments(text: str) -> str:
515
+ def repl(match: re.Match) -> str:
516
+ seg = match.group(0)
517
+ # 如果片段内包含至少一个 gen_slot,则替换为 <|audio|>
518
+ if self.audio_assistant_gen_slot_token in seg:
519
+ return AUDIO_PLACEHOLDER
520
+ # 否则直接删除
521
+ return ""
522
+
523
+ return AUDIO_PATTERN.sub(repl, text)
524
+
525
+ return normalize_audio_segments(text)
526
+
527
    def _parse_audio_codes(self, start_length, audio_codes):
        """Recover decoded audio segments from the delayed audio channels.

        Args:
            start_length: prompt length (in token steps) used to trim the prompt's
                share out of the first decoded segment.
            audio_codes: [T, n_vq] delayed audio codes (text channel removed).

        Returns:
            A list of decoded waveforms, one per contiguous non-pad segment
            (possibly empty).
        """
        # De-delay back to [T', n_vq]
        audio_codes = self.apply_de_delay_pattern(audio_codes)

        # Rows that are all pad are separators between real audio segments.
        is_pad = (audio_codes == self.model_config.audio_pad_code).all(dim=1)
        non_pad = ~is_pad
        if not non_pad.any():
            return []

        # Split the non-pad row indices wherever they stop being consecutive.
        idx = torch.nonzero(non_pad).squeeze(1)
        breaks = torch.where(idx[1:] != idx[:-1] + 1)[0] + 1
        if breaks.numel() == 0:
            segments_idx = [idx]
        else:
            segments_idx = torch.split(idx, breaks.tolist())

        audio_codes_list = [audio_codes[s] for s in segments_idx]

        decoded_audio_list = []
        for segment_codes in audio_codes_list:
            decoded_segment = self.decode_audio_codes([segment_codes])
            if len(decoded_segment) > 0:
                decoded_audio_list.append(decoded_segment[0])

        # Keep codec causal context by decoding the whole first segment first,
        # then trim at waveform level according to start_length ratio.
        if start_length > 0 and len(audio_codes_list) > 0 and len(decoded_audio_list) > 0:
            first_codes_length = audio_codes_list[0].shape[0]
            if first_codes_length > 0:
                trim_ratio = max(0.0, min(float(start_length) / float(first_codes_length), 1.0))
                first_audio = decoded_audio_list[0]
                if trim_ratio >= 1.0:
                    # Entire first segment belonged to the prompt: drop it.
                    decoded_audio_list = decoded_audio_list[1:]
                elif trim_ratio > 0.0:
                    trim_samples = int(first_audio.shape[-1] * trim_ratio)
                    decoded_audio_list[0] = first_audio[..., trim_samples:]

        return decoded_audio_list
566
+
567
+
568
+ def decode(self, output: List[Tuple[int, torch.Tensor]]):
569
+ """
570
+ 1. 这里不管怎样,都需要一个完整的 assistant generation ids;
571
+ 2. 支持从任意位置进行截断;
572
+ """
573
+
574
+ genearted_messages = []
575
+ for start_length, generation_ids in output:
576
+ content = self._parse_text_codes(start_length, generation_ids[:, 0])
577
+ audio_codes_list = self._parse_audio_codes(start_length, generation_ids[:, 1:])
578
+ if content == "":
579
+ message = None
580
+ else:
581
+ message = AssistantMessage(
582
+ content=content,
583
+ audio_codes_list=audio_codes_list
584
+ )
585
+ genearted_messages.append(message)
586
+ return genearted_messages
587
+
588
+ @staticmethod
589
+ def loudness_normalize(wav: torch.Tensor, target_dbfs: float = -20, gain_range: tuple[float, float] = (-3.0, 3.0)) -> torch.Tensor:
590
+ wav = wav.to(torch.float32)
591
+ if wav.numel() == 0: return wav
592
+ rms = torch.sqrt(torch.mean(wav ** 2))
593
+ current_dbfs = 20.0 * torch.log10(rms + 1e-9)
594
+ gain = float(target_dbfs - current_dbfs)
595
+ gain = max(gain_range[0], min(gain, gain_range[1]))
596
+ factor = 10.0 ** (gain / 20.0)
597
+ return wav * factor
598
+
599
+ def encode_audios_from_wav(self, wav_list: List[torch.Tensor], sampling_rate: int, n_vq: int = None):
600
+ if isinstance(wav_list, torch.Tensor):
601
+ wav_list = [wav_list]
602
+ wav_list_ = []
603
+ resample = False
604
+ if sampling_rate != self.model_config.sampling_rate:
605
+ resample = True
606
+ for wav in wav_list:
607
+ if wav.shape[0] > 1:
608
+ wav = torch.mean(wav, dim=0, keepdim=True)
609
+ if resample:
610
+ wav = torchaudio.functional.resample(waveform=wav, orig_freq=sampling_rate, new_freq=self.model_config.sampling_rate)
611
+ wav_list_.append(self.loudness_normalize(wav.squeeze(0)))
612
+ return self.audio_tokenizer.encode(wav_list_, n_vq)
613
+
614
+ def encode_audios_from_path(self, wav_path_list: List[str], n_vq: int = None):
615
+ if isinstance(wav_path_list, str):
616
+ wav_path_list = [wav_path_list]
617
+ wav_list = []
618
+ sampling_rate = None
619
+ for wav_path in wav_path_list:
620
+ wav, sr = torchaudio.load(wav_path)
621
+ if sampling_rate is None:
622
+ sampling_rate = sr
623
+ elif sampling_rate != sr:
624
+ raise ValueError("sampling_rate of audios in the same batch should be the same.")
625
+ wav_list.append(wav)
626
+ return self.encode_audios_from_wav(wav_list, sampling_rate, n_vq)
627
+
628
    def decode_audio_codes(self, audio_tokens_list: List[torch.Tensor]):
        """Decode a list of de-delayed [T, n_vq] code tensors to waveforms via the codec."""
        return self.audio_tokenizer.decode(audio_tokens_list)
processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "MossTTSDelayProcessor",
3
+ "auto_map": {
4
+ "AutoProcessor": "processing_moss_tts.MossTTSDelayProcessor"
5
+ }
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|audio_start|>",
12
+ "<|audio_end|>",
13
+ "<|audio_user_slot|>",
14
+ "<|image_pad|>",
15
+ "<|audio_assistant_gen_slot|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }