Spaces:

OpenMOSS-Team
/

MOSS-TTS-Realtime

Paused

App Files Files Community

Zhyw committed 22 days ago

Commit

fe61555

verified ·

1 Parent(s): a3a48d4

Update mossttsrealtime/streaming_mossttsrealtime.py

Browse files

Files changed (1) hide show

mossttsrealtime/streaming_mossttsrealtime.py +30 -76

mossttsrealtime/streaming_mossttsrealtime.py CHANGED Viewed

@@ -23,7 +23,7 @@ import numpy as np
 import torch
 import torch.nn.functional as F
-from transformers.cache_utils import DynamicCache, StaticCache
 from transformers.utils import is_torchaudio_available, requires_backends
 from transformers.utils.import_utils import requires
@@ -34,6 +34,7 @@ if is_torchaudio_available():
 @requires(backends=("torch",))
 class MossTTSRealtimeInference:
     """Step-wise inference wrapper for MossTTSRealtime.
     This class mirrors the non-streaming inference logic but exposes a
     prefill/step/finish API for streaming usage.
     """
@@ -66,24 +67,6 @@ class MossTTSRealtimeInference:
         self._is_stopping = None
         self._last_audio_tokens = None
         self._step_idx = 0
-        attn_impl = ""
-        for cfg in (
-            getattr(getattr(self.model, "local_transformer", None), "config", None),
-            getattr(getattr(self.model, "config", None), "local_config", None),
-            getattr(self.model, "config", None),
-        ):
-            if cfg is None:
-                continue
-            for name in ("_attn_implementation", "attn_implementation"):
-                candidate = getattr(cfg, name, None)
-                if isinstance(candidate, str) and candidate.strip():
-                    attn_impl = candidate.strip().lower()
-                    break
-            if attn_impl:
-                break
-        self._use_dynamic_local_cache = attn_impl == "flash_attention_2"
-        self._should_compile_local_transformer = not self._use_dynamic_local_cache
-        self._compiled_local_transformer = None
     @property
     def device(self):
@@ -93,18 +76,6 @@ class MossTTSRealtimeInference:
     def is_finished(self) -> bool:
         return self._is_stopping is not None and bool(self._is_stopping.all())
-    def _build_local_past_key_values(self):
-        if self._use_dynamic_local_cache:
-            return DynamicCache()
-        return StaticCache(config=self.model.local_transformer.config, max_cache_len=self.channels)
-    def _get_local_transformer_runner(self):
-        if not self._should_compile_local_transformer:
-            return self._generate_local_transformer_impl
-        if self._compiled_local_transformer is None:
-            self._compiled_local_transformer = torch.compile(self._generate_local_transformer_impl, fullgraph=False)
-        return self._compiled_local_transformer
     def reset_generation_state(self, keep_cache: bool = True):
         if not keep_cache:
             self.past_key_values = None
@@ -328,6 +299,7 @@ class MossTTSRealtimeInference:
             steps_left -= 1
         return outputs
     def generate_local_transformer(
         self,
         hidden_states: torch.Tensor,
@@ -339,40 +311,16 @@ class MossTTSRealtimeInference:
         repetition_window: Optional[int],
         generated_tokens: Optional[torch.Tensor],
         gen_step: int,
-    ) -> torch.Tensor:
-        runner = self._get_local_transformer_runner()
-        return runner(
-            hidden_states=hidden_states,
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            do_sample=do_sample,
-            repetition_penalty=repetition_penalty,
-            repetition_window=repetition_window,
-            generated_tokens=generated_tokens,
-            gen_step=gen_step,
-        )
-    def _generate_local_transformer_impl(
-        self,
-        hidden_states: torch.Tensor,
-        temperature: float,
-        top_p: float,
-        top_k: int,
-        do_sample: bool,
-        repetition_penalty: Optional[float],
-        repetition_window: Optional[int],
-        generated_tokens: Optional[torch.Tensor],
-        gen_step: int,
     ) -> torch.Tensor:
         batch_size = hidden_states.shape[0]
         local_inputs = hidden_states.reshape(-1, 1, self.model.config.local_config.hidden_size)
-        output_token = torch.empty(batch_size, self.channels, dtype=torch.long)
-        past_key_values = self._build_local_past_key_values()
         local_token = None
-        cache_pos_t = torch.zeros(1, dtype=torch.long)
         for i in range(self.channels):
             cache_pos_t.fill_(i)
@@ -531,6 +479,7 @@ class MossTTSRealtimeStreamingSession:
     def set_voice_prompt(self, audio, sample_rate: Optional[int] = None):
         """Set voice prompt from either audio tokens or waveform.
         If `audio` is a 2D array whose shape matches the codebook channels, it is
         treated as audio tokens. Otherwise a codec is required to encode waveform
         prompts into tokens.
@@ -737,23 +686,18 @@ class AudioStreamDecoder:
         codec,
         chunk_frames: int = 40,
         overlap_frames: int = 4,
-        initial_chunk_frames: Optional[int] = None,
-        decode_chunk_duration: Optional[float] = None,
         decode_kwargs: Optional[dict] = None,
         device: Optional[torch.device] = None,
     ):
         self.codec = codec
         self.chunk_frames = chunk_frames
         self.overlap_frames = overlap_frames
-        self.initial_chunk_frames = initial_chunk_frames
-        self.decode_chunk_duration = decode_chunk_duration
         self.decode_kwargs = decode_kwargs or {}
         self.device = device
         self._buffer: list[torch.Tensor] = []
         self._buffer_len = 0
         self._prev_tail: Optional[torch.Tensor] = None
-        self._chunks_emitted = 0
     def push_tokens(self, audio_tokens: np.ndarray | torch.Tensor):
         if isinstance(audio_tokens, np.ndarray):
@@ -763,17 +707,10 @@ class AudioStreamDecoder:
         self._buffer.append(audio_tokens)
         self._buffer_len += audio_tokens.shape[0]
-    @property
-    def _active_chunk_frames(self) -> int:
-        if self.initial_chunk_frames is not None:
-            return min(self.initial_chunk_frames + self._chunks_emitted, self.chunk_frames)
-        return self.chunk_frames
     def audio_chunks(self) -> Iterable[torch.Tensor]:
-        while self._buffer_len >= self._active_chunk_frames:
-            chunk_tokens = self._consume_frames(self._active_chunk_frames)
-            wav = self._decode(chunk_tokens)
-            self._chunks_emitted += 1
             yield self._apply_crossfade(wav)
     def flush(self) -> Optional[torch.Tensor]:
@@ -799,7 +736,7 @@ class AudioStreamDecoder:
         self._buffer_len -= num_frames - remaining
         return torch.cat(frames, dim=0)
-    def _decode(self, tokens: torch.Tensor) -> torch.Tensor:
         device = self.device
         if device is None:
             if hasattr(self.codec, "device"):
@@ -812,8 +749,22 @@ class AudioStreamDecoder:
         if device is not None:
             tokens = tokens.to(device)
         tokens_t = tokens.permute(1, 0)
         decode_kwargs = dict(self.decode_kwargs) if self.decode_kwargs else {}
-        decoded = self.codec.decode(tokens_t, chunk_duration=self.decode_chunk_duration, **decode_kwargs)
         if isinstance(decoded, dict):
             wav = decoded["audio"][0]
         else:
@@ -858,6 +809,7 @@ class AudioStreamDecoder:
 class TextDeltaTokenizer:
     """
     Convert LLM streaming text (delta) into “incremental token IDs”.
     Notes:
     - The input is a delta that is progressively appended to the same string
     (consistent with the common delta output behavior in vLLM).
@@ -939,6 +891,7 @@ def _maybe_codec_streaming(codec, *, batch_size: int):
 class MossTTSRealtimeTextStreamBridge:
     """
     Bridge: external LLM streaming text (delta) -> TTS streaming audio chunks.
     Usage overview:
     - First configure `MossTTSRealtimeStreamingSession` (especially `prefill_text_len=12`).
     - Provide an `AudioStreamDecoder`, then continuously feed the LLM delta text via
@@ -972,6 +925,7 @@ class MossTTSRealtimeTextStreamBridge:
     def push_text_delta(self, delta: str) -> Iterator[torch.Tensor]:
         """
         Push a chunk of incremental text output from the LLM and return newly generated WAV chunks.
         Internally, this directly calls `session.push_text()`, which segments the text
         based on punctuation/length and then tokenizes the *entire segment* at once,
         avoiding the prefix instability issues of incremental BPE tokenization.

 import torch
 import torch.nn.functional as F
+from transformers.cache_utils import StaticCache
 from transformers.utils import is_torchaudio_available, requires_backends
 from transformers.utils.import_utils import requires
 @requires(backends=("torch",))
 class MossTTSRealtimeInference:
     """Step-wise inference wrapper for MossTTSRealtime.
     This class mirrors the non-streaming inference logic but exposes a
     prefill/step/finish API for streaming usage.
     """
         self._is_stopping = None
         self._last_audio_tokens = None
         self._step_idx = 0
     @property
     def device(self):
     def is_finished(self) -> bool:
         return self._is_stopping is not None and bool(self._is_stopping.all())
     def reset_generation_state(self, keep_cache: bool = True):
         if not keep_cache:
             self.past_key_values = None
             steps_left -= 1
         return outputs
+    @torch.compile(fullgraph=True)
     def generate_local_transformer(
         self,
         hidden_states: torch.Tensor,
         repetition_window: Optional[int],
         generated_tokens: Optional[torch.Tensor],
         gen_step: int,
     ) -> torch.Tensor:
         batch_size = hidden_states.shape[0]
+        device = hidden_states.device
         local_inputs = hidden_states.reshape(-1, 1, self.model.config.local_config.hidden_size)
+        output_token = torch.empty(batch_size, self.channels, dtype=torch.long, device=device)
+        past_key_values = StaticCache(config=self.model.local_transformer.config, max_cache_len=self.channels)
         local_token = None
+        cache_pos_t = torch.zeros(1, dtype=torch.long, device=device)
         for i in range(self.channels):
             cache_pos_t.fill_(i)
     def set_voice_prompt(self, audio, sample_rate: Optional[int] = None):
         """Set voice prompt from either audio tokens or waveform.
         If `audio` is a 2D array whose shape matches the codebook channels, it is
         treated as audio tokens. Otherwise a codec is required to encode waveform
         prompts into tokens.
         codec,
         chunk_frames: int = 40,
         overlap_frames: int = 4,
         decode_kwargs: Optional[dict] = None,
         device: Optional[torch.device] = None,
     ):
         self.codec = codec
         self.chunk_frames = chunk_frames
         self.overlap_frames = overlap_frames
         self.decode_kwargs = decode_kwargs or {}
         self.device = device
         self._buffer: list[torch.Tensor] = []
         self._buffer_len = 0
         self._prev_tail: Optional[torch.Tensor] = None
     def push_tokens(self, audio_tokens: np.ndarray | torch.Tensor):
         if isinstance(audio_tokens, np.ndarray):
         self._buffer.append(audio_tokens)
         self._buffer_len += audio_tokens.shape[0]
     def audio_chunks(self) -> Iterable[torch.Tensor]:
+        while self._buffer_len >= self.chunk_frames:
+            chunk_tokens = self._consume_frames(self.chunk_frames)
+            wav = self._decode(chunk_tokens, chunk_duration=0.32)
             yield self._apply_crossfade(wav)
     def flush(self) -> Optional[torch.Tensor]:
         self._buffer_len -= num_frames - remaining
         return torch.cat(frames, dim=0)
+    def _decode(self, tokens: torch.Tensor, chunk_duration: float = 0.32) -> torch.Tensor:
         device = self.device
         if device is None:
             if hasattr(self.codec, "device"):
         if device is not None:
             tokens = tokens.to(device)
         tokens_t = tokens.permute(1, 0)
+        # allow callers to override decode settings (e.g. chunk_duration=-1 to disable internal streaming)
         decode_kwargs = dict(self.decode_kwargs) if self.decode_kwargs else {}
+        if "chunk_duration" in decode_kwargs:
+            override = decode_kwargs.pop("chunk_duration")
+            if override is None:
+                chunk_duration_arg = None
+            else:
+                try:
+                    override_f = float(override)
+                except Exception:
+                    override_f = None
+                chunk_duration_arg = None if override_f is None or override_f <= 0 else override_f
+        else:
+            chunk_duration_arg = chunk_duration
+        decoded = self.codec.decode(tokens_t, chunk_duration=chunk_duration_arg, **decode_kwargs)
         if isinstance(decoded, dict):
             wav = decoded["audio"][0]
         else:
 class TextDeltaTokenizer:
     """
     Convert LLM streaming text (delta) into “incremental token IDs”.
     Notes:
     - The input is a delta that is progressively appended to the same string
     (consistent with the common delta output behavior in vLLM).
 class MossTTSRealtimeTextStreamBridge:
     """
     Bridge: external LLM streaming text (delta) -> TTS streaming audio chunks.
     Usage overview:
     - First configure `MossTTSRealtimeStreamingSession` (especially `prefill_text_len=12`).
     - Provide an `AudioStreamDecoder`, then continuously feed the LLM delta text via
     def push_text_delta(self, delta: str) -> Iterator[torch.Tensor]:
         """
         Push a chunk of incremental text output from the LLM and return newly generated WAV chunks.
         Internally, this directly calls `session.push_text()`, which segments the text
         based on punctuation/length and then tokenizes the *entire segment* at once,
         avoiding the prefix instability issues of incremental BPE tokenization.