Update modeling

Files changed (4) hide show

modeling_mossttsrealtime.py +13 -15
modeling_mossttsrealtime_local.py +36 -15
processing_mossttsrealtime.py +14 -7
streaming_mossttsrealtime.py +31 -20

modeling_mossttsrealtime.py CHANGED Viewed

@@ -11,8 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""MossTTSRealtime backbone model."""
 from __future__ import annotations
@@ -23,12 +22,12 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from transformers.cache_utils import Cache
 from transformers.modeling_outputs import ModelOutput
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.qwen3 import Qwen3Model
 from transformers.models.qwen3.modeling_qwen3 import Qwen3Attention, Qwen3DecoderLayer
 from .configuration_mossttsrealtime import MossTTSRealtimeConfig
 from .modeling_mossttsrealtime_local import MossTTSRealtimeLocalTransformerForCausalLM
@@ -51,21 +50,14 @@ class MossTTSRealtimePretrainedModel(PreTrainedModel):
     }
     def _init_weights(self, module):
-        from transformers import initialization as init
         std = self.config.initializer_range
         if isinstance(module, nn.Linear):
-            # module.weight.data.normal_(mean=0.0, std=std)
             init.normal_(module.weight, mean=0.0, std=std)
             if module.bias is not None:
-                # module.bias.data.zero_()
                 init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
-            # module.weight.data.normal_(mean=0.0, std=std)
             init.normal_(module.weight, mean=0.0, std=std)
             if module.padding_idx is not None:
-                # module.weight.data[module.padding_idx].zero_()
                 init.zeros_(module.weight[module.padding_idx])
@@ -145,7 +137,9 @@ class MossTTSRealtime(MossTTSRealtimePretrainedModel):
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
-            output_hidden_states=True,
             cache_position=cache_position,
             **kwargs,
         )
@@ -156,11 +150,12 @@ class MossTTSRealtime(MossTTSRealtimePretrainedModel):
             audio_labels = labels[:, :, 1:]
             train_mask = ~(audio_labels == -100).all(dim=-1)
             local_input_ids = audio_labels[train_mask][..., : self.config.rvq - 1]
-            local_input_ids[local_input_ids == -100] = 1024
             local_input_ids = F.pad(local_input_ids, (1, 0), value=0)
             train_idx = train_mask.nonzero(as_tuple=True)
-            local_hidden_states = outputs[0][train_idx[0], train_idx[1] - 1, :].reshape(
                 -1, 1, self.config.local_config.hidden_size
             )
             local_labels = audio_labels[train_mask]
@@ -175,7 +170,7 @@ class MossTTSRealtime(MossTTSRealtimePretrainedModel):
             )
             loss = local_outputs.loss
-        return MossTTSRealtimeOutputWithPast(
             loss=loss,
             logits=None,
             past_key_values=outputs.past_key_values,
@@ -187,6 +182,9 @@ class MossTTSRealtime(MossTTSRealtimePretrainedModel):
             local_hidden_states=local_outputs.hidden_states if local_outputs is not None else None,
             local_attentions=local_outputs.attentions if local_outputs is not None else None,
         )
-__all__ = ["MossTTSRealtime", "MossTTSRealtimeConfig", "MossTTSRealtimeOutputWithPast", "MossTTSRealtimePretrainedModel", "Qwen3Model"]

 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""MossTTSRealtime model."""
 from __future__ import annotations
 import torch.nn as nn
 import torch.nn.functional as F
+from transformers import initialization as init
 from transformers.cache_utils import Cache
 from transformers.modeling_outputs import ModelOutput
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.qwen3 import Qwen3Model
 from transformers.models.qwen3.modeling_qwen3 import Qwen3Attention, Qwen3DecoderLayer
 from .configuration_mossttsrealtime import MossTTSRealtimeConfig
 from .modeling_mossttsrealtime_local import MossTTSRealtimeLocalTransformerForCausalLM
     }
     def _init_weights(self, module):
         std = self.config.initializer_range
         if isinstance(module, nn.Linear):
             init.normal_(module.weight, mean=0.0, std=std)
             if module.bias is not None:
                 init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
             init.normal_(module.weight, mean=0.0, std=std)
             if module.padding_idx is not None:
                 init.zeros_(module.weight[module.padding_idx])
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
             cache_position=cache_position,
             **kwargs,
         )
             audio_labels = labels[:, :, 1:]
             train_mask = ~(audio_labels == -100).all(dim=-1)
             local_input_ids = audio_labels[train_mask][..., : self.config.rvq - 1]
+            local_input_ids[local_input_ids == -100] = self.config.audio_pad_token
             local_input_ids = F.pad(local_input_ids, (1, 0), value=0)
             train_idx = train_mask.nonzero(as_tuple=True)
+            hidden_positions = torch.clamp(train_idx[1] - 1, min=0)
+            local_hidden_states = outputs.last_hidden_state[train_idx[0], hidden_positions, :].reshape(
                 -1, 1, self.config.local_config.hidden_size
             )
             local_labels = audio_labels[train_mask]
             )
             loss = local_outputs.loss
+        output = MossTTSRealtimeOutputWithPast(
             loss=loss,
             logits=None,
             past_key_values=outputs.past_key_values,
             local_hidden_states=local_outputs.hidden_states if local_outputs is not None else None,
             local_attentions=local_outputs.attentions if local_outputs is not None else None,
         )
+        if not return_dict:
+            return output.to_tuple()
+        return output
+__all__ = ["MossTTSRealtime", "MossTTSRealtimeConfig", "MossTTSRealtimeOutputWithPast", "MossTTSRealtimePretrainedModel"]

modeling_mossttsrealtime_local.py CHANGED Viewed

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Local transformer used by MossTTSRealtime for RVQ codebook decoding."""
 from __future__ import annotations
@@ -22,7 +21,7 @@ import torch
 import torch.nn as nn
 from transformers.activations import ACT2FN
-from transformers.cache_utils import Cache, DynamicCache, StaticCache
 from transformers.generation import GenerationMixin
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
 from transformers.modeling_layers import GradientCheckpointingLayer
@@ -31,9 +30,8 @@ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_u
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.masking_utils import create_causal_mask
 from transformers.processing_utils import Unpack
-from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
 from transformers.loss.loss_utils import ForCausalLMLoss
 from .configuration_mossttsrealtime import MossTTSRealtimeLocalTransformerConfig
 logger = logging.get_logger(__name__)
@@ -221,7 +219,10 @@ class MossTTSRealtimeLocalTransformerDecoderLayer(GradientCheckpointingLayer):
 class MossTTSRealtimeLocalTransformerPreTrainedModel(PreTrainedModel):
     config: MossTTSRealtimeLocalTransformerConfig
     base_model_prefix = "local_transformer"
     supports_gradient_checkpointing = True
     _no_split_modules = ["MossTTSRealtimeLocalTransformerDecoderLayer"]
@@ -231,6 +232,7 @@ class MossTTSRealtimeLocalTransformerPreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _can_compile_fullgraph = True
     _supports_attention_backend = True
     _can_record_outputs = {
         "hidden_states": MossTTSRealtimeLocalTransformerDecoderLayer,
         "attentions": MossTTSRealtimeLocalTransformerAttention,
@@ -297,11 +299,12 @@ class MossTTSRealtimeLocalTransformer(MossTTSRealtimeLocalTransformerPreTrainedM
         if position_ids is not None and not torch.compiler.is_compiling():
             position_ids = None
-        if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds.")
         if use_cache and past_key_values is None:
-            past_key_values = StaticCache(config=self.config, max_cache_len=16, device=inputs_embeds.device)
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
@@ -311,17 +314,32 @@ class MossTTSRealtimeLocalTransformer(MossTTSRealtimeLocalTransformerPreTrainedM
         if inputs_embeds is None:
             if codebook_idx is not None:
                 if input_ids.ndim == 1:
                     input_ids = input_ids.unsqueeze(1)
                 token_emb = self.embed_tokens[codebook_idx - 1](input_ids[:, 0]).unsqueeze(1)  # [B,1,H]
                 inputs_embeds = token_emb
             else:
-                codebook_idxs = torch.clamp(cache_position - 1, min=0)
-                inputs_embeds = self.embed_tokens[codebook_idxs - 1](input_ids)
-                input_ids_are_first_codebook = cache_position[0] == 0
                 if backbone_last_hidden_state is not None:
-                    inputs_embeds[:, 0] = backbone_last_hidden_state
                 else:
                     if not torch.compiler.is_compiling() and input_ids_are_first_codebook:
                         logger.warning(
@@ -414,8 +432,14 @@ class MossTTSRealtimeLocalTransformerForCausalLM(MossTTSRealtimeLocalTransformer
         hs = hidden_states[:, slice_indices, :]
         if cache_position is not None:
             logits = self.local_lm_heads[codebook_idx](hs[:, 0, :]).unsqueeze(1)
         else:
             logits_list = []
             for i in range(hs.shape[1]):
                 logits_list.append(self.local_lm_heads[i](hs[:, i, :]))
@@ -434,9 +458,6 @@ class MossTTSRealtimeLocalTransformerForCausalLM(MossTTSRealtimeLocalTransformer
             attentions=outputs.attentions,
         )
 __all__ = [
     "MossTTSRealtimeLocalTransformer",
     "MossTTSRealtimeLocalTransformerAttention",
@@ -446,4 +467,4 @@ __all__ = [
     "MossTTSRealtimeLocalTransformerPreTrainedModel",
     "MossTTSRealtimeLocalTransformerRMSNorm",
     "MossTTSRealtimeLocalTransformerRotaryEmbedding",
-]

 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Local transformer used by MossTTSRealtime for RVQ codebook decoding."""
 from __future__ import annotations
 import torch.nn as nn
 from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, StaticCache
 from transformers.generation import GenerationMixin
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
 from transformers.modeling_layers import GradientCheckpointingLayer
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.masking_utils import create_causal_mask
 from transformers.processing_utils import Unpack
 from transformers.loss.loss_utils import ForCausalLMLoss
+from transformers.utils import TransformersKwargs, logging
 from .configuration_mossttsrealtime import MossTTSRealtimeLocalTransformerConfig
 logger = logging.get_logger(__name__)
 class MossTTSRealtimeLocalTransformerPreTrainedModel(PreTrainedModel):
+    config_class = MossTTSRealtimeLocalTransformerConfig
     config: MossTTSRealtimeLocalTransformerConfig
     base_model_prefix = "local_transformer"
     supports_gradient_checkpointing = True
     _no_split_modules = ["MossTTSRealtimeLocalTransformerDecoderLayer"]
     _supports_flash_attn = True
     _can_compile_fullgraph = True
     _supports_attention_backend = True
     _can_record_outputs = {
         "hidden_states": MossTTSRealtimeLocalTransformerDecoderLayer,
         "attentions": MossTTSRealtimeLocalTransformerAttention,
         if position_ids is not None and not torch.compiler.is_compiling():
             position_ids = None
+        if (input_ids is None) == (inputs_embeds is None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds.")
         if use_cache and past_key_values is None:
+            device = inputs_embeds.device if inputs_embeds is not None else input_ids.device
+            past_key_values = StaticCache(config=self.config, max_cache_len=self.config.rvq, device=device)
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
         if inputs_embeds is None:
             if codebook_idx is not None:
+                if codebook_idx <= 0:
+                    raise ValueError(f"`codebook_idx` must be in [1, {len(self.embed_tokens)}], got {codebook_idx}.")
+                if codebook_idx > len(self.embed_tokens):
+                    raise ValueError(f"`codebook_idx` must be in [1, {len(self.embed_tokens)}], got {codebook_idx}.")
                 if input_ids.ndim == 1:
                     input_ids = input_ids.unsqueeze(1)
                 token_emb = self.embed_tokens[codebook_idx - 1](input_ids[:, 0]).unsqueeze(1)  # [B,1,H]
                 inputs_embeds = token_emb
             else:
+                if input_ids.shape[1] != cache_position.shape[0]:
+                    raise ValueError(
+                        "`input_ids` and `cache_position` must align in sequence length: "
+                        f"got {input_ids.shape[1]} and {cache_position.shape[0]}."
+                    )
+                codebook_idxs = torch.clamp(cache_position - 1, min=0, max=len(self.embed_tokens) - 1)
+                inputs_embeds = torch.stack(
+                    [
+                        self.embed_tokens[codebook_idx](input_ids[:, seq_idx])
+                        for seq_idx, codebook_idx in enumerate(codebook_idxs.tolist())
+                    ],
+                    dim=1,
+                )
+                input_ids_are_first_codebook = bool(cache_position[0] == 0)
                 if backbone_last_hidden_state is not None:
+                    inputs_embeds[:, 0, :] = backbone_last_hidden_state[:, 0, :]
                 else:
                     if not torch.compiler.is_compiling() and input_ids_are_first_codebook:
                         logger.warning(
         hs = hidden_states[:, slice_indices, :]
         if cache_position is not None:
+            if codebook_idx is None:
+                raise ValueError("`codebook_idx` must be provided when `cache_position` is provided.")
             logits = self.local_lm_heads[codebook_idx](hs[:, 0, :]).unsqueeze(1)
         else:
+            if hs.shape[1] > len(self.local_lm_heads):
+                raise ValueError(
+                    f"Cannot project {hs.shape[1]} codebooks with only {len(self.local_lm_heads)} LM heads."
+                )
             logits_list = []
             for i in range(hs.shape[1]):
                 logits_list.append(self.local_lm_heads[i](hs[:, i, :]))
             attentions=outputs.attentions,
         )
 __all__ = [
     "MossTTSRealtimeLocalTransformer",
     "MossTTSRealtimeLocalTransformerAttention",
     "MossTTSRealtimeLocalTransformerPreTrainedModel",
     "MossTTSRealtimeLocalTransformerRMSNorm",
     "MossTTSRealtimeLocalTransformerRotaryEmbedding",
+]

processing_mossttsrealtime.py CHANGED Viewed

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Processing utilities for MossTTSRealtime."""
 from __future__ import annotations
@@ -20,14 +19,19 @@ from typing import Iterable, Optional
 import numpy as np
-class MossTTSRealtimeProcessor:
     """Builds MossTTSRealtime prompt inputs with text and audio codebooks.
     This processor focuses on preparing the mixed text/audio token layout expected by MossTTSRealtime.
     It does not perform audio encoding/decoding by itself.
     """
     def __init__(
         self,
         tokenizer,
@@ -40,7 +44,9 @@ class MossTTSRealtimeProcessor:
         audio_eos_token: int = 1026,
         delay_tokens_len: int = 12,
     ):
-        self.tokenizer = tokenizer
         self.channels = channels
         self.audio_channel_pad = audio_channel_pad
         self.audio_bos_token = audio_bos_token
@@ -58,7 +64,7 @@ class MossTTSRealtimeProcessor:
                 "capabilities, allowing you to generate the corresponding speech based on the text given in the assistant."
                 "<|im_end|>\n"
             )
-        self.ttsbase_system_prompt = tts_system_prompt
     def _convert_token_to_id(self, token: str) -> int:
         if hasattr(self.tokenizer, "convert_tokens_to_ids"):
@@ -73,7 +79,7 @@ class MossTTSRealtimeProcessor:
         return int(token_ids[0])
     def make_voice_clone_prompt(self, prompt_audio_tokens_len: int) -> str:
-        padded_audio_prompt = f"{'<|audio_pad|>' * prompt_audio_tokens_len}"
         voice_clone = (
             "<|im_start|>context\n"
             "The assistant section should be synthesized using the following voice timbre:"
@@ -85,6 +91,7 @@ class MossTTSRealtimeProcessor:
         tokens = np.array(audio_tokens)
         if tokens.ndim != 2:
             raise ValueError(f"Expected 2D audio tokens, got shape {tokens.shape}")
         if tokens.shape[0] == self.channels:
             tokens = tokens.T
         elif tokens.shape[1] == self.channels:
@@ -101,9 +108,9 @@ class MossTTSRealtimeProcessor:
         if prompt_audio_tokens is not None:
             prompt_audio_tokens = self._normalize_audio_tokens(prompt_audio_tokens)
             prompt_audio_tokens = prompt_audio_tokens[:, : self.channels]
-            system_prompt_text = f"{self.ttsbase_system_prompt}" + f"{self.make_voice_clone_prompt(prompt_audio_tokens.shape[0])}"
         else:
-            system_prompt_text = f"{self.ttsbase_system_prompt}"
         system_prompt_tokens = self.tokenizer(system_prompt_text)["input_ids"]
         system_prompt_tokens_full = np.full(

 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Processing utilities for MossTTSRealtime."""
 from __future__ import annotations
 import numpy as np
+from transformers.processing_utils import ProcessorMixin
+class MossTTSRealtimeProcessor(ProcessorMixin):
     """Builds MossTTSRealtime prompt inputs with text and audio codebooks.
     This processor focuses on preparing the mixed text/audio token layout expected by MossTTSRealtime.
     It does not perform audio encoding/decoding by itself.
     """
+    attributes = ["tokenizer"]
+    tokenizer_class = "AutoTokenizer"
     def __init__(
         self,
         tokenizer,
         audio_eos_token: int = 1026,
         delay_tokens_len: int = 12,
     ):
+        super().__init__(tokenizer=tokenizer)
+        self.audio_pad_token = audio_pad_token
+        self.text_pad_token = text_pad_token
         self.channels = channels
         self.audio_channel_pad = audio_channel_pad
         self.audio_bos_token = audio_bos_token
                 "capabilities, allowing you to generate the corresponding speech based on the text given in the assistant."
                 "<|im_end|>\n"
             )
+        self.tts_system_prompt = tts_system_prompt
     def _convert_token_to_id(self, token: str) -> int:
         if hasattr(self.tokenizer, "convert_tokens_to_ids"):
         return int(token_ids[0])
     def make_voice_clone_prompt(self, prompt_audio_tokens_len: int) -> str:
+        padded_audio_prompt = f"{self.audio_pad_token * prompt_audio_tokens_len}"
         voice_clone = (
             "<|im_start|>context\n"
             "The assistant section should be synthesized using the following voice timbre:"
         tokens = np.array(audio_tokens)
         if tokens.ndim != 2:
             raise ValueError(f"Expected 2D audio tokens, got shape {tokens.shape}")
+        # Accept [channels, T] or [T, channels], and slice to expected channels if needed.
         if tokens.shape[0] == self.channels:
             tokens = tokens.T
         elif tokens.shape[1] == self.channels:
         if prompt_audio_tokens is not None:
             prompt_audio_tokens = self._normalize_audio_tokens(prompt_audio_tokens)
             prompt_audio_tokens = prompt_audio_tokens[:, : self.channels]
+            system_prompt_text = f"{self.tts_system_prompt}" + f"{self.make_voice_clone_prompt(prompt_audio_tokens.shape[0])}"
         else:
+            system_prompt_text = f"{self.tts_system_prompt}"
         system_prompt_tokens = self.tokenizer(system_prompt_text)["input_ids"]
         system_prompt_tokens_full = np.full(

streaming_mossttsrealtime.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright 2026 OpenMOSS and the HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,22 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Streaming inference utilities for MossTTSRealtime."""
 from __future__ import annotations
-import re
-import numpy as np
 import contextlib
 import torch
 import torch.nn.functional as F
-import torchaudio
 from transformers.cache_utils import StaticCache
 from transformers.utils.import_utils import requires
-from typing import Iterable, Iterator, List, Optional, Sequence
 @requires(backends=("torch",))
 class MossTTSRealtimeInference:
@@ -74,11 +77,11 @@ class MossTTSRealtimeInference:
         return self._is_stopping is not None and bool(self._is_stopping.all())
     def reset_generation_state(self, keep_cache: bool = True):
-        # When keep_cache=True, retain the attention_mask so that its length matches past_key_values.
-        # This is used for concatenation in the next prefill step.
         if not keep_cache:
             self.past_key_values = None
             self.attention_mask = None
         self._generated_tokens = []
         self._is_stopping = None
         self._last_audio_tokens = None
@@ -172,6 +175,7 @@ class MossTTSRealtimeInference:
         current_input_ids = torch.from_numpy(np.stack(padded_input_ids)).to(device)
         current_attention_mask = torch.from_numpy(np.stack(padded_attns)).to(device)
         if self.attention_mask is not None and self.past_key_values is not None:
             current_attention_mask = torch.cat([self.attention_mask, current_attention_mask], dim=-1)
@@ -321,7 +325,7 @@ class MossTTSRealtimeInference:
         for i in range(self.channels):
             cache_pos_t.fill_(i)
-            local_outputs =  self.model.local_transformer(
                 input_ids=local_token,
                 inputs_embeds=local_inputs,
                 past_key_values=past_key_values,
@@ -335,7 +339,7 @@ class MossTTSRealtimeInference:
             if repetition_penalty and repetition_penalty != 1.0 and generated_tokens is not None:
                 logits = self.apply_repetition_penalty(
                     scores=logits,
-                    history_tokens=generated_tokens[:, :gen_step, i],
                     penalty=float(repetition_penalty),
                     repetition_window=repetition_window,
                 )
@@ -355,22 +359,22 @@ class MossTTSRealtimeInference:
     def apply_repetition_penalty(
         self,
-        scores: torch.Tensor,
         history_tokens: torch.Tensor,
         penalty: float = 1.1,
         repetition_window: Optional[int] = None,
     ):
         scores_ = scores[:, 0, :]
-        B, V = scores_.shape
         ht = history_tokens
         if repetition_window is not None and repetition_window > 0:
-            ht = ht[:, -repetition_window:]
         ht_sorted, _ = torch.sort(ht, dim=1)
         uniq = torch.unique_consecutive(ht_sorted, dim=1)
-        b_idx = torch.arange(B, device=uniq.device).unsqueeze(1).expand_as(uniq)
         b_flat = b_idx.reshape(-1)
         t_flat = uniq.reshape(-1)
@@ -430,9 +434,9 @@ class MossTTSRealtimeStreamingSession:
     """Manage text-to-audio streaming for a single conversation."""
     _split_pattern = re.compile(
-        r"[。！？!?\.\u2026]\s*"
-        r"|[,，;；:：\u2014\u2013\-]\s*"
-        r"|\)\s*|\]\s*"
         r"|\n"
     )
@@ -504,6 +508,7 @@ class MossTTSRealtimeStreamingSession:
         waveform = audio
         if isinstance(audio, (str, bytes)):
             wav, sr = torchaudio.load(audio)
             if wav.shape[0] > 1:
                 wav = wav.mean(dim=0, keepdim=True)
@@ -516,6 +521,7 @@ class MossTTSRealtimeStreamingSession:
             raise ValueError("Unsupported audio type for voice prompt.")
         if sample_rate is not None and sample_rate != self.codec_sample_rate:
             waveform = torchaudio.functional.resample(waveform, sample_rate, self.codec_sample_rate)
         waveform = waveform.to(self.inferencer.device)
@@ -839,17 +845,19 @@ class TextDeltaTokenizer:
         return list(self._all_ids)
     def push_delta(self, delta: str) -> list[int]:
         if not delta:
             return []
         self._text += str(delta)
         self._all_ids = self.tokenizer.encode(self._text, add_special_tokens=False)
-        # 留 hold_back 个 token 不输出（尾部可能随后续 delta 而改变）
         stable_count = max(self._emitted_count, len(self._all_ids) - self.hold_back)
         new_ids = self._all_ids[self._emitted_count : stable_count]
         self._emitted_count = stable_count
         return new_ids
     def flush(self) -> list[int]:
         self._all_ids = self.tokenizer.encode(self._text, add_special_tokens=False)
         remaining = self._all_ids[self._emitted_count :]
         self._emitted_count = len(self._all_ids)
@@ -862,6 +870,7 @@ def _sanitize_audio_tokens(
     codebook_size: int,
     audio_eos_token: int,
 ) -> tuple[torch.Tensor, bool]:
     if tokens.dim() == 1:
         tokens = tokens.unsqueeze(0)
     if tokens.numel() == 0:
@@ -935,12 +944,14 @@ class MossTTSRealtimeTextStreamBridge:
         yield from self._decode_audio_frames(audio_frames)
     def push_text_tokens(self, token_ids: Sequence[int]) -> Iterator[torch.Tensor]:
         if not token_ids:
             return
         audio_frames = self.session.push_text_tokens(token_ids)
         yield from self._decode_audio_frames(audio_frames)
     def finish(self, *, drain_step: int = 1) -> Iterator[torch.Tensor]:
         audio_frames = self.session.end_text()
         yield from self._decode_audio_frames(audio_frames)
@@ -957,7 +968,7 @@ class MossTTSRealtimeTextStreamBridge:
             yield final.detach().cpu()
     def stream_from_text_deltas(self, deltas: Iterable[str], *, drain_step: int = 1) -> Iterator[torch.Tensor]:
-        """一口气消费一个 delta 迭代器，并持续 yield wav chunk。"""
         with _maybe_codec_streaming(getattr(self.session, "codec", None), batch_size=self.batch_size):
             for delta in deltas:
                 yield from self.push_text_delta(delta)

+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Streaming inference utilities for MossTTSRealtime."""
 from __future__ import annotations
 import contextlib
+import re
+from typing import Iterable, Iterator, List, Optional, Sequence
+import numpy as np
 import torch
 import torch.nn.functional as F
 from transformers.cache_utils import StaticCache
+from transformers.utils import is_torchaudio_available, requires_backends
 from transformers.utils.import_utils import requires
+if is_torchaudio_available():
+    import torchaudio
 @requires(backends=("torch",))
 class MossTTSRealtimeInference:
         return self._is_stopping is not None and bool(self._is_stopping.all())
     def reset_generation_state(self, keep_cache: bool = True):
         if not keep_cache:
             self.past_key_values = None
             self.attention_mask = None
+        # Keep the mask when reusing cache so it stays aligned with past_key_values.
+        # This allows concatenation with the next turn prefill mask.
         self._generated_tokens = []
         self._is_stopping = None
         self._last_audio_tokens = None
         current_input_ids = torch.from_numpy(np.stack(padded_input_ids)).to(device)
         current_attention_mask = torch.from_numpy(np.stack(padded_attns)).to(device)
+        # For multi-turn continuation, concatenate the cached mask and the current prefill mask.
         if self.attention_mask is not None and self.past_key_values is not None:
             current_attention_mask = torch.cat([self.attention_mask, current_attention_mask], dim=-1)
         for i in range(self.channels):
             cache_pos_t.fill_(i)
+            local_outputs = self.model.local_transformer(
                 input_ids=local_token,
                 inputs_embeds=local_inputs,
                 past_key_values=past_key_values,
             if repetition_penalty and repetition_penalty != 1.0 and generated_tokens is not None:
                 logits = self.apply_repetition_penalty(
                     scores=logits,
+                    history_tokens=generated_tokens[:, :gen_step, i],
                     penalty=float(repetition_penalty),
                     repetition_window=repetition_window,
                 )
     def apply_repetition_penalty(
         self,
+        scores: torch.Tensor,
         history_tokens: torch.Tensor,
         penalty: float = 1.1,
         repetition_window: Optional[int] = None,
     ):
         scores_ = scores[:, 0, :]
+        batch_size = scores_.shape[0]
         ht = history_tokens
         if repetition_window is not None and repetition_window > 0:
+            ht = ht[:, -repetition_window:]
         ht_sorted, _ = torch.sort(ht, dim=1)
         uniq = torch.unique_consecutive(ht_sorted, dim=1)
+        b_idx = torch.arange(batch_size, device=uniq.device).unsqueeze(1).expand_as(uniq)
         b_flat = b_idx.reshape(-1)
         t_flat = uniq.reshape(-1)
     """Manage text-to-audio streaming for a single conversation."""
     _split_pattern = re.compile(
+        r"[。！？!?\.\u2026]\s*"  # sentence boundaries: 。！？ ! ? . …
+        r"|[,，;；:：\u2014\u2013\-]\s*"  # short pauses: , ， ; ； : ： — – -
+        r"|\)\s*|\]\s*"  # closing brackets: ) ]
         r"|\n"
     )
         waveform = audio
         if isinstance(audio, (str, bytes)):
+            requires_backends(self, ["torchaudio"])
             wav, sr = torchaudio.load(audio)
             if wav.shape[0] > 1:
                 wav = wav.mean(dim=0, keepdim=True)
             raise ValueError("Unsupported audio type for voice prompt.")
         if sample_rate is not None and sample_rate != self.codec_sample_rate:
+            requires_backends(self, ["torchaudio"])
             waveform = torchaudio.functional.resample(waveform, sample_rate, self.codec_sample_rate)
         waveform = waveform.to(self.inferencer.device)
         return list(self._all_ids)
     def push_delta(self, delta: str) -> list[int]:
+        """Append a text delta and return newly stable token ids (may be empty)."""
         if not delta:
             return []
         self._text += str(delta)
         self._all_ids = self.tokenizer.encode(self._text, add_special_tokens=False)
+        # Keep the tail un-emitted because the latest tokens can still change.
         stable_count = max(self._emitted_count, len(self._all_ids) - self.hold_back)
         new_ids = self._all_ids[self._emitted_count : stable_count]
         self._emitted_count = stable_count
         return new_ids
     def flush(self) -> list[int]:
+        """Emit all remaining token ids at end of stream."""
         self._all_ids = self.tokenizer.encode(self._text, add_special_tokens=False)
         remaining = self._all_ids[self._emitted_count :]
         self._emitted_count = len(self._all_ids)
     codebook_size: int,
     audio_eos_token: int,
 ) -> tuple[torch.Tensor, bool]:
+    """Trim rows after EOS/invalid tokens and return whether decoding should stop."""
     if tokens.dim() == 1:
         tokens = tokens.unsqueeze(0)
     if tokens.numel() == 0:
         yield from self._decode_audio_frames(audio_frames)
     def push_text_tokens(self, token_ids: Sequence[int]) -> Iterator[torch.Tensor]:
+        """Push token ids directly (for sources that stream token ids)."""
         if not token_ids:
             return
         audio_frames = self.session.push_text_tokens(token_ids)
         yield from self._decode_audio_frames(audio_frames)
     def finish(self, *, drain_step: int = 1) -> Iterator[torch.Tensor]:
+        """Mark text stream end and emit all remaining audio chunks (including flush)."""
         audio_frames = self.session.end_text()
         yield from self._decode_audio_frames(audio_frames)
             yield final.detach().cpu()
     def stream_from_text_deltas(self, deltas: Iterable[str], *, drain_step: int = 1) -> Iterator[torch.Tensor]:
+        """Consume a full delta iterator and continuously yield waveform chunks."""
         with _maybe_codec_streaming(getattr(self.session, "codec", None), batch_size=self.batch_size):
             for delta in deltas:
                 yield from self.push_text_delta(delta)