Qwen3-TTS

Runtime error

App Files Community

littlebird13

multimodalart HF Staff committed on Jan 29

Commit

8a13284

verified ·

1 Parent(s): bb80b9a

flash-attention-3 (#16)

Browse files

- Update app.py (04a9ea9b96182c6852b961b3d43de4386cbf3c39)
- Upload 18 files (850bacaf6cc310d95a68f832b6c0e446d936a049)
- Update requirements.txt (6dc80d8c9808285dcc9ba2425e73901988ea0cc7)

Co-authored-by: Apolinário from multimodal AI art <multimodalart@users.noreply.huggingface.co>

Files changed (6) hide show

app.py +82 -38
qwen_tts/__init__.py +1 -2
qwen_tts/cli/demo.py +6 -5
qwen_tts/core/models/modeling_qwen3_tts.py +73 -20
qwen_tts/inference/qwen3_tts_model.py +7 -4
requirements.txt +3 -1

app.py CHANGED Viewed

@@ -8,39 +8,94 @@ import spaces
 import gradio as gr
 import numpy as np
 import torch
-from huggingface_hub import snapshot_download
-from huggingface_hub import login
 HF_TOKEN = os.environ.get('HF_TOKEN')
 login(token=HF_TOKEN)
-# Global model holders - keyed by (model_type, model_size)
-loaded_models = {}
 # Model size options
 MODEL_SIZES = ["0.6B", "1.7B"]
 def get_model_path(model_type: str, model_size: str) -> str:
     """Get model path based on type and size."""
     return snapshot_download(f"Qwen/Qwen3-TTS-12Hz-{model_size}-{model_type}")
-def get_model(model_type: str, model_size: str):
-    """Get or load a model by type and size."""
-    global loaded_models
-    key = (model_type, model_size)
-    if key not in loaded_models:
-        from qwen_tts import Qwen3TTSModel
-        model_path = get_model_path(model_type, model_size)
-        loaded_models[key] = Qwen3TTSModel.from_pretrained(
-            model_path,
-            device_map="cuda",
-            dtype=torch.bfloat16,
-            token=HF_TOKEN,
-#           attn_implementation="flash_attention_2",
-        )
-    return loaded_models[key]
 def _normalize_audio(wav, eps=1e-12, clip=True):
@@ -89,15 +144,8 @@ def _audio_to_tuple(audio):
     return None
-# Speaker and language choices for CustomVoice model
-SPEAKERS = [
-    "Aiden", "Dylan", "Eric", "Ono_anna", "Ryan", "Serena", "Sohee", "Uncle_fu", "Vivian"
-]
-LANGUAGES = ["Auto", "Chinese", "English", "Japanese", "Korean", "French", "German", "Spanish", "Portuguese", "Russian"]
 @spaces.GPU(duration=60)
-def generate_voice_design(text, language, voice_description):
     """Generate speech using Voice Design model (1.7B only)."""
     if not text or not text.strip():
         return None, "Error: Text is required."
@@ -105,8 +153,7 @@ def generate_voice_design(text, language, voice_description):
         return None, "Error: Voice description is required."
     try:
-        tts = get_model("VoiceDesign", "1.7B")
-        wavs, sr = tts.generate_voice_design(
             text=text.strip(),
             language=language,
             instruct=voice_description.strip(),
@@ -119,7 +166,7 @@ def generate_voice_design(text, language, voice_description):
 @spaces.GPU(duration=60)
-def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector_only, model_size):
     """Generate speech using Base (Voice Clone) model."""
     if not target_text or not target_text.strip():
         return None, "Error: Target text is required."
@@ -132,7 +179,7 @@ def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector
         return None, "Error: Reference text is required when 'Use x-vector only' is not enabled."
     try:
-        tts = get_model("Base", model_size)
         wavs, sr = tts.generate_voice_clone(
             text=target_text.strip(),
             language=language,
@@ -147,7 +194,7 @@ def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector
 @spaces.GPU(duration=60)
-def generate_custom_voice(text, language, speaker, instruct, model_size):
     """Generate speech using CustomVoice model."""
     if not text or not text.strip():
         return None, "Error: Text is required."
@@ -155,7 +202,7 @@ def generate_custom_voice(text, language, speaker, instruct, model_size):
         return None, "Error: Speaker is required."
     try:
-        tts = get_model("CustomVoice", model_size)
         wavs, sr = tts.generate_custom_voice(
             text=text.strip(),
             language=language,
@@ -184,12 +231,10 @@ def build_ui():
         gr.Markdown(
             """
 # Qwen3-TTS Demo
 A unified Text-to-Speech demo featuring three powerful modes:
 - **Voice Design**: Create custom voices using natural language descriptions
 - **Voice Clone (Base)**: Clone any voice from a reference audio
 - **TTS (CustomVoice)**: Generate speech with predefined speakers and optional style instructions
 Built with [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS) by Alibaba Qwen Team.
 """
         )
@@ -331,7 +376,6 @@ Built with [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS) by Alibaba Qwen Team
         gr.Markdown(
             """
 ---
 **Note**: This demo uses HuggingFace Spaces Zero GPU. Each generation has a time limit.
 For longer texts, please split them into smaller segments.
 """
@@ -342,4 +386,4 @@ For longer texts, please split them into smaller segments.
 if __name__ == "__main__":
     demo = build_ui()
-    demo.launch()

 import gradio as gr
 import numpy as np
 import torch
+from huggingface_hub import snapshot_download, login
+from qwen_tts import Qwen3TTSModel
 HF_TOKEN = os.environ.get('HF_TOKEN')
 login(token=HF_TOKEN)
 # Model size options
 MODEL_SIZES = ["0.6B", "1.7B"]
+# Speaker and language choices for CustomVoice model
+SPEAKERS = [
+    "Aiden", "Dylan", "Eric", "Ono_anna", "Ryan", "Serena", "Sohee", "Uncle_fu", "Vivian"
+]
+LANGUAGES = ["Auto", "Chinese", "English", "Japanese", "Korean", "French", "German", "Spanish", "Portuguese", "Russian"]
 def get_model_path(model_type: str, model_size: str) -> str:
     """Get model path based on type and size."""
     return snapshot_download(f"Qwen/Qwen3-TTS-12Hz-{model_size}-{model_type}")
+# ============================================================================
+# GLOBAL MODEL LOADING - Load all models at startup
+# ============================================================================
+print("Loading all models to CUDA...")
+# Voice Design model (1.7B only)
+print("Loading VoiceDesign 1.7B model...")
+voice_design_model = Qwen3TTSModel.from_pretrained(
+    get_model_path("VoiceDesign", "1.7B"),
+    device_map="cuda",
+    dtype=torch.bfloat16,
+    token=HF_TOKEN,
+    attn_implementation="kernels-community/flash-attn3",
+)
+# Base (Voice Clone) models - both sizes
+print("Loading Base 0.6B model...")
+base_model_0_6b = Qwen3TTSModel.from_pretrained(
+    get_model_path("Base", "0.6B"),
+    device_map="cuda",
+    dtype=torch.bfloat16,
+    token=HF_TOKEN,
+    attn_implementation="kernels-community/flash-attn3",
+)
+print("Loading Base 1.7B model...")
+base_model_1_7b = Qwen3TTSModel.from_pretrained(
+    get_model_path("Base", "1.7B"),
+    device_map="cuda",
+    dtype=torch.bfloat16,
+    token=HF_TOKEN,
+    attn_implementation="kernels-community/flash-attn3",
+)
+# CustomVoice models - both sizes
+print("Loading CustomVoice 0.6B model...")
+custom_voice_model_0_6b = Qwen3TTSModel.from_pretrained(
+    get_model_path("CustomVoice", "0.6B"),
+    device_map="cuda",
+    dtype=torch.bfloat16,
+    token=HF_TOKEN,
+    attn_implementation="kernels-community/flash-attn3",
+)
+print("Loading CustomVoice 1.7B model...")
+custom_voice_model_1_7b = Qwen3TTSModel.from_pretrained(
+    get_model_path("CustomVoice", "1.7B"),
+    device_map="cuda",
+    dtype=torch.bfloat16,
+    token=HF_TOKEN,
+    attn_implementation="kernels-community/flash-attn3",
+)
+print("All models loaded successfully!")
+# Model lookup dictionaries for easy access
+BASE_MODELS = {
+    "0.6B": base_model_0_6b,
+    "1.7B": base_model_1_7b,
+}
+CUSTOM_VOICE_MODELS = {
+    "0.6B": custom_voice_model_0_6b,
+    "1.7B": custom_voice_model_1_7b,
+}
+# ============================================================================
 def _normalize_audio(wav, eps=1e-12, clip=True):
     return None
 @spaces.GPU(duration=60)
+def generate_voice_design(text, language, voice_description, progress=gr.Progress(track_tqdm=True)):
     """Generate speech using Voice Design model (1.7B only)."""
     if not text or not text.strip():
         return None, "Error: Text is required."
         return None, "Error: Voice description is required."
     try:
+        wavs, sr = voice_design_model.generate_voice_design(
             text=text.strip(),
             language=language,
             instruct=voice_description.strip(),
 @spaces.GPU(duration=60)
+def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector_only, model_size, progress=gr.Progress(track_tqdm=True)):
     """Generate speech using Base (Voice Clone) model."""
     if not target_text or not target_text.strip():
         return None, "Error: Target text is required."
         return None, "Error: Reference text is required when 'Use x-vector only' is not enabled."
     try:
+        tts = BASE_MODELS[model_size]
         wavs, sr = tts.generate_voice_clone(
             text=target_text.strip(),
             language=language,
 @spaces.GPU(duration=60)
+def generate_custom_voice(text, language, speaker, instruct, model_size, progress=gr.Progress(track_tqdm=True)):
     """Generate speech using CustomVoice model."""
     if not text or not text.strip():
         return None, "Error: Text is required."
         return None, "Error: Speaker is required."
     try:
+        tts = CUSTOM_VOICE_MODELS[model_size]
         wavs, sr = tts.generate_custom_voice(
             text=text.strip(),
             language=language,
         gr.Markdown(
             """
 # Qwen3-TTS Demo
 A unified Text-to-Speech demo featuring three powerful modes:
 - **Voice Design**: Create custom voices using natural language descriptions
 - **Voice Clone (Base)**: Clone any voice from a reference audio
 - **TTS (CustomVoice)**: Generate speech with predefined speakers and optional style instructions
 Built with [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS) by Alibaba Qwen Team.
 """
         )
         gr.Markdown(
             """
 ---
 **Note**: This demo uses HuggingFace Spaces Zero GPU. Each generation has a time limit.
 For longer texts, please split them into smaller segments.
 """
 if __name__ == "__main__":
     demo = build_ui()
+    demo.launch()

qwen_tts/__init__.py CHANGED Viewed

@@ -21,5 +21,4 @@ qwen_tts: Qwen-TTS package.
 from .inference.qwen3_tts_model import Qwen3TTSModel, VoiceClonePromptItem
 from .inference.qwen3_tts_tokenizer import Qwen3TTSTokenizer
-__all__ = ["__version__"]
-__version__ = "0.0.1"

 from .inference.qwen3_tts_model import Qwen3TTSModel, VoiceClonePromptItem
 from .inference.qwen3_tts_tokenizer import Qwen3TTSTokenizer
+__all__ = ["__version__"]

qwen_tts/cli/demo.py CHANGED Viewed

@@ -146,9 +146,11 @@ def build_parser() -> argparse.ArgumentParser:
         help="Path to SSL key file for HTTPS (optional).",
     )
     parser.add_argument(
-        "--ssl-verify",
-        default=None,
-        help="SSL verify setting for Gradio (optional).",
     )
     # Optional generation args
@@ -617,13 +619,12 @@ def main(argv=None) -> int:
         server_name=args.ip,
         server_port=args.port,
         share=args.share,
     )
     if args.ssl_certfile is not None:
         launch_kwargs["ssl_certfile"] = args.ssl_certfile
     if args.ssl_keyfile is not None:
         launch_kwargs["ssl_keyfile"] = args.ssl_keyfile
-    if args.ssl_verify is not None:
-        launch_kwargs["ssl_verify"] = args.ssl_verify
     demo.queue(default_concurrency_limit=int(args.concurrency)).launch(**launch_kwargs)
     return 0

         help="Path to SSL key file for HTTPS (optional).",
     )
     parser.add_argument(
+        "--ssl-verify/--no-ssl-verify",
+        dest="ssl_verify",
+        default=True,
+        action=argparse.BooleanOptionalAction,
+        help="Whether to verify SSL certificate (default: enabled).",
     )
     # Optional generation args
         server_name=args.ip,
         server_port=args.port,
         share=args.share,
+        ssl_verify=True if args.ssl_verify else False,
     )
     if args.ssl_certfile is not None:
         launch_kwargs["ssl_certfile"] = args.ssl_certfile
     if args.ssl_keyfile is not None:
         launch_kwargs["ssl_keyfile"] = args.ssl_keyfile
     demo.queue(default_concurrency_limit=int(args.concurrency)).launch(**launch_kwargs)
     return 0

qwen_tts/core/models/modeling_qwen3_tts.py CHANGED Viewed

@@ -19,7 +19,9 @@ import os
 from dataclasses import dataclass
 from typing import Callable, Optional
 import torch
 from librosa.filters import mel as librosa_mel_fn
 from torch import nn
 from torch.nn import functional as F
@@ -27,34 +29,69 @@ from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
 from transformers.generation import GenerationMixin
 from transformers.integrations import use_kernel_forward_from_hub
-from transformers.masking_utils import (
-    create_causal_mask,
-    create_sliding_window_causal_mask,
-)
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
 from transformers.modeling_layers import GradientCheckpointingLayer
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPast,
-    CausalLMOutputWithPast,
-    ModelOutput,
-)
-from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
-from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
 from transformers.utils import can_return_tuple, logging
 from transformers.utils.hub import cached_file
 from ...inference.qwen3_tts_tokenizer import Qwen3TTSTokenizer
-from .configuration_qwen3_tts import (
-    Qwen3TTSConfig,
-    Qwen3TTSSpeakerEncoderConfig,
-    Qwen3TTSTalkerCodePredictorConfig,
-    Qwen3TTSTalkerConfig,
-)
 logger = logging.get_logger(__name__)
 class Res2NetBlock(torch.nn.Module):
     def __init__(self, in_channels, out_channels, scale=8, kernel_size=3, dilation=1):
         super().__init__()
@@ -433,7 +470,7 @@ class Qwen3TTSPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ["Qwen3TTSDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn_2 = True
     _supports_sdpa = True
     _supports_cache_class = True
     _supports_static_cache = False
@@ -464,8 +501,7 @@ class Qwen3TTSTalkerTextPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = []
     _skip_keys_device_placement = ["past_key_values"]
-    _supports_flash_attn_3 = True
-    _supports_flash_attn_2 = True
     _supports_sdpa = True
     _supports_flex_attn = True
     _supports_cache_class = True
@@ -1178,6 +1214,8 @@ class Qwen3TTSTalkerCodePredictorModelForConditionalGeneration(Qwen3TTSPreTraine
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs: BaseModelOutputWithPast = self.model(
             input_ids=None,
@@ -1830,6 +1868,11 @@ class Qwen3TTSForConditionalGeneration(Qwen3TTSPreTrainedModel, GenerationMixin)
         weights_only=True,
         **kwargs,
     ):
         model = super().from_pretrained(
             pretrained_model_name_or_path,
             *model_args,
@@ -1842,8 +1885,18 @@ class Qwen3TTSForConditionalGeneration(Qwen3TTSPreTrainedModel, GenerationMixin)
             revision=revision,
             use_safetensors=use_safetensors,
             weights_only=weights_only,
             **kwargs,
         )
         speech_tokenizer_path = cached_file(
             pretrained_model_name_or_path,
             "speech_tokenizer/config.json",

 from dataclasses import dataclass
 from typing import Callable, Optional
+import huggingface_hub
 import torch
+from huggingface_hub import snapshot_download
 from librosa.filters import mel as librosa_mel_fn
 from torch import nn
 from torch.nn import functional as F
 from transformers.cache_utils import Cache, DynamicCache
 from transformers.generation import GenerationMixin
 from transformers.integrations import use_kernel_forward_from_hub
+from transformers.masking_utils import (create_causal_mask,
+                                        create_sliding_window_causal_mask)
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
 from transformers.modeling_layers import GradientCheckpointingLayer
+from transformers.modeling_outputs import (BaseModelOutputWithPast,
+                                           CausalLMOutputWithPast, ModelOutput)
+from transformers.modeling_rope_utils import (ROPE_INIT_FUNCTIONS,
+                                              dynamic_rope_update)
+from transformers.modeling_utils import (ALL_ATTENTION_FUNCTIONS,
+                                         PreTrainedModel)
 from transformers.processing_utils import Unpack
 from transformers.utils import can_return_tuple, logging
 from transformers.utils.hub import cached_file
 from ...inference.qwen3_tts_tokenizer import Qwen3TTSTokenizer
+from .configuration_qwen3_tts import (Qwen3TTSConfig,
+                                      Qwen3TTSSpeakerEncoderConfig,
+                                      Qwen3TTSTalkerCodePredictorConfig,
+                                      Qwen3TTSTalkerConfig)
 logger = logging.get_logger(__name__)
+def download_weights_from_hf_specific(
+    model_name_or_path: str,
+    cache_dir: str | None,
+    allow_patterns: list[str],
+    revision: str | None = None,
+    ignore_patterns: str | list[str] | None = None,
+) -> str:
+    """Download selected files of a Hugging Face Hub repository.
+
+    ``snapshot_download`` is called once per entry of ``allow_patterns``,
+    each time with that single pattern, so callers can fetch only the files
+    they need instead of the whole repository.
+
+    Args:
+        model_name_or_path (str): The model name or path, passed to
+            ``snapshot_download`` as its first positional argument.
+        cache_dir (Optional[str]): The cache directory to store the model
+            weights. If None, will use HF defaults.
+        allow_patterns (list[str]): The allowed patterns for the
+            weight files. Files matched by any of the patterns will be
+            downloaded. Must contain at least one pattern.
+        revision (Optional[str]): The revision of the model.
+        ignore_patterns (Optional[Union[str, list[str]]]): The patterns to
+            filter out the weight files. Files matched by any of the patterns
+            will be ignored. The same value is passed to every call.
+
+    Returns:
+        str: The value returned by the ``snapshot_download`` call made for
+        the last pattern in ``allow_patterns``.
+
+    Raises:
+        AssertionError: If ``allow_patterns`` is empty.
+    """
+    # At least one pattern is required: `hf_folder` is only bound inside the
+    # loop below, so with an empty list the final `return` would have nothing
+    # to return.
+    # NOTE(review): `assert` statements are removed under `python -O`; in that
+    # mode an empty list would reach the `return` with `hf_folder` unbound.
+    assert len(allow_patterns) > 0
+    # Forward the hub library's offline flag as `local_files_only`.
+    local_only = huggingface_hub.constants.HF_HUB_OFFLINE
+    for allow_pattern in allow_patterns:
+        # One call per pattern; each iteration overwrites `hf_folder`, so only
+        # the last call's return value survives the loop.
+        # NOTE(review): presumably every call resolves to the same snapshot
+        # folder for a given repo/revision -- confirm against huggingface_hub.
+        hf_folder = snapshot_download(
+            model_name_or_path,
+            allow_patterns=allow_pattern,
+            ignore_patterns=ignore_patterns,
+            cache_dir=cache_dir,
+            revision=revision,
+            local_files_only=local_only,
+        )
+    return hf_folder
 class Res2NetBlock(torch.nn.Module):
     def __init__(self, in_channels, out_channels, scale=8, kernel_size=3, dilation=1):
         super().__init__()
     supports_gradient_checkpointing = True
     _no_split_modules = ["Qwen3TTSDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
     _supports_sdpa = True
     _supports_cache_class = True
     _supports_static_cache = False
     supports_gradient_checkpointing = True
     _no_split_modules = []
     _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
     _supports_cache_class = True
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
+        inputs_embeds = self.small_to_mtp_projection(inputs_embeds)
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs: BaseModelOutputWithPast = self.model(
             input_ids=None,
         weights_only=True,
         **kwargs,
     ):
+        # Hotfix to enable passing the correct attn implementation which is stored in the config but not in kwargs
+        requested_attn_implementation = kwargs.pop("attn_implementation", None)
+        if requested_attn_implementation is None and config and config._attn_implementation:
+            requested_attn_implementation = config._attn_implementation
         model = super().from_pretrained(
             pretrained_model_name_or_path,
             *model_args,
             revision=revision,
             use_safetensors=use_safetensors,
             weights_only=weights_only,
+            attn_implementation=requested_attn_implementation,
             **kwargs,
         )
+        if not local_files_only and not os.path.isdir(pretrained_model_name_or_path):
+            download_cache_dir = kwargs.get("cache_dir", cache_dir)
+            download_revision = kwargs.get("revision", revision)
+            download_weights_from_hf_specific(
+                pretrained_model_name_or_path,
+                cache_dir=download_cache_dir,
+                allow_patterns=["speech_tokenizer/*"],
+                revision=download_revision,
+            )
         speech_tokenizer_path = cached_file(
             pretrained_model_name_or_path,
             "speech_tokenizer/config.json",

qwen_tts/inference/qwen3_tts_model.py CHANGED Viewed

@@ -286,7 +286,6 @@ class Qwen3TTSModel:
     def _merge_generate_kwargs(
         self,
-        non_streaming_mode: Optional[bool] = None,
         do_sample: Optional[bool] = None,
         top_k: Optional[int] = None,
         top_p: Optional[float] = None,
@@ -308,7 +307,7 @@ class Qwen3TTSModel:
           - Otherwise, fall back to the hard defaults.
         Args:
-            non_streaming_mode, do_sample, top_k, top_p, temperature, repetition_penalty,
             subtalker_dosample, subtalker_top_k, subtalker_top_p, subtalker_temperature, max_new_tokens:
                 Common generation parameters.
             **kwargs:
@@ -318,7 +317,6 @@ class Qwen3TTSModel:
             Dict[str, Any]: Final kwargs to pass into model.generate().
         """
         hard_defaults = dict(
-            non_streaming_mode=False,
             do_sample=True,
             top_k=50,
             top_p=1.0,
@@ -340,7 +338,6 @@ class Qwen3TTSModel:
         merged = dict(kwargs)
         merged.update(
-            non_streaming_mode=pick("non_streaming_mode", non_streaming_mode),
             do_sample=pick("do_sample", do_sample),
             top_k=pick("top_k", top_k),
             top_p=pick("top_p", top_p),
@@ -478,6 +475,7 @@ class Qwen3TTSModel:
         ref_text: Optional[Union[str, List[Optional[str]]]] = None,
         x_vector_only_mode: Union[bool, List[bool]] = False,
         voice_clone_prompt: Optional[Union[Dict[str, Any], List[VoiceClonePromptItem]]] = None,
         **kwargs,
     ) -> Tuple[List[np.ndarray], int]:
         """
@@ -607,6 +605,7 @@ class Qwen3TTSModel:
             ref_ids=ref_ids,
             voice_clone_prompt=voice_clone_prompt_dict,
             languages=languages,
             **gen_kwargs,
         )
@@ -640,6 +639,7 @@ class Qwen3TTSModel:
         text: Union[str, List[str]],
         instruct: Union[str, List[str]],
         language: Union[str, List[str]] = None,
         **kwargs,
     ) -> Tuple[List[np.ndarray], int]:
         """
@@ -720,6 +720,7 @@ class Qwen3TTSModel:
             input_ids=input_ids,
             instruct_ids=instruct_ids,
             languages=languages,
             **gen_kwargs,
         )
@@ -734,6 +735,7 @@ class Qwen3TTSModel:
         speaker: Union[str, List[str]],
         language: Union[str, List[str]] = None,
         instruct: Optional[Union[str, List[str]]] = None,
         **kwargs,
     ) -> Tuple[List[np.ndarray], int]:
         """
@@ -829,6 +831,7 @@ class Qwen3TTSModel:
             instruct_ids=instruct_ids,
             languages=languages,
             speakers=speakers,
             **gen_kwargs,
         )

     def _merge_generate_kwargs(
         self,
         do_sample: Optional[bool] = None,
         top_k: Optional[int] = None,
         top_p: Optional[float] = None,
           - Otherwise, fall back to the hard defaults.
         Args:
+            do_sample, top_k, top_p, temperature, repetition_penalty,
             subtalker_dosample, subtalker_top_k, subtalker_top_p, subtalker_temperature, max_new_tokens:
                 Common generation parameters.
             **kwargs:
             Dict[str, Any]: Final kwargs to pass into model.generate().
         """
         hard_defaults = dict(
             do_sample=True,
             top_k=50,
             top_p=1.0,
         merged = dict(kwargs)
         merged.update(
             do_sample=pick("do_sample", do_sample),
             top_k=pick("top_k", top_k),
             top_p=pick("top_p", top_p),
         ref_text: Optional[Union[str, List[Optional[str]]]] = None,
         x_vector_only_mode: Union[bool, List[bool]] = False,
         voice_clone_prompt: Optional[Union[Dict[str, Any], List[VoiceClonePromptItem]]] = None,
+        non_streaming_mode: bool = False,
         **kwargs,
     ) -> Tuple[List[np.ndarray], int]:
         """
             ref_ids=ref_ids,
             voice_clone_prompt=voice_clone_prompt_dict,
             languages=languages,
+            non_streaming_mode=non_streaming_mode,
             **gen_kwargs,
         )
         text: Union[str, List[str]],
         instruct: Union[str, List[str]],
         language: Union[str, List[str]] = None,
+        non_streaming_mode: bool = True,
         **kwargs,
     ) -> Tuple[List[np.ndarray], int]:
         """
             input_ids=input_ids,
             instruct_ids=instruct_ids,
             languages=languages,
+            non_streaming_mode=non_streaming_mode,
             **gen_kwargs,
         )
         speaker: Union[str, List[str]],
         language: Union[str, List[str]] = None,
         instruct: Optional[Union[str, List[str]]] = None,
+        non_streaming_mode: bool = True,
         **kwargs,
     ) -> Tuple[List[np.ndarray], int]:
         """
             instruct_ids=instruct_ids,
             languages=languages,
             speakers=speakers,
+            non_streaming_mode=non_streaming_mode,
             **gen_kwargs,
         )

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 # Qwen3-TTS Dependencies for HuggingFace Spaces
 transformers==4.57.3
 accelerate==1.12.0
 einops
@@ -10,4 +11,5 @@ sox
 onnxruntime
 spaces
 torch
-numpy

 # Qwen3-TTS Dependencies for HuggingFace Spaces
+torch==2.8.0
 transformers==4.57.3
 accelerate==1.12.0
 einops
 onnxruntime
 spaces
 torch
+numpy
+kernels