Upload voiceplus_qwen3_1.7B_tp8_rvq32_all_data_tacv3_max_lr_2e-4_min_2e-4_enhanced_lm_head_add_layer_norm_wd_0.1_from_pretrained_seqlen_14336_decay iter_0015000 model snapshot
a724b39
| # coding=utf-8 | |
| # Copyright 2026 OpenMOSS and the HuggingFace Inc. team. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """ MossTTSDelay model configuration """ | |
| from typing import Optional, Union | |
| from transformers.configuration_utils import PretrainedConfig | |
| from transformers.utils import logging | |
| from transformers.models.qwen3 import Qwen3Config | |
| logger = logging.get_logger(__name__) | |
class MossTTSDelayConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MossTTSDelayModel`]. It is used to instantiate an
    MossTTSDelay model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the MossTTSDelay [MossTTSDelay-8B](https://huggingface.co/OpenMOSS/mosstts-8b) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        language_config (`Union[Qwen3Config, dict]`, *optional*):
            Configuration for the backbone language model (Qwen3). A `dict` is expanded into a
            [`Qwen3Config`]; `None` yields a default [`Qwen3Config`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        n_vq (`int`, *optional*, defaults to 32):
            Number of additional VQ (Vector Quantization) heads/channels for audio.
            Determines the number of codebooks used in the audio representation.
        pad_token_id (`int`, *optional*, defaults to 151643):
            Token ID used for padding. It is also forwarded to [`PretrainedConfig`] so that the base
            class does not reset it.
        im_start_token_id (`int`, *optional*, defaults to 151644):
            Token ID stored as `im_start_token_id` (presumably the chat-template message start marker;
            confirm against the tokenizer).
        im_end_token_id (`int`, *optional*, defaults to 151645):
            Token ID stored as `im_end_token_id` (presumably the chat-template message end marker;
            confirm against the tokenizer).
        audio_vocab_size (`int`, *optional*, defaults to 1024):
            Vocabulary size for the audio tokens (codebooks 1 to N).
        audio_user_slot_token_id (`int`, *optional*, defaults to 151654):
            The specific token ID used as a placeholder/slot for user-side audio inputs in the prompt.
        audio_assistant_gen_slot_token_id (`int`, *optional*, defaults to 151656):
            The specific token ID representing the generation slot for the assistant's audio output.
            Acting as the trigger for the TTS generation process.
        audio_assistant_delay_slot_token_id (`int`, *optional*, defaults to 151662):
            The token ID used in the 'Delay Pattern' paradigm to represent the delayed/offset positions
            between different VQ channels.
        audio_start_token_id (`int`, *optional*, defaults to 151652):
            Special token ID used to denote the start of an audio sequence in the stream.
        audio_end_token_id (`int`, *optional*, defaults to 151653):
            Special token ID used to denote the end of an audio sequence (EOS for audio).
        audio_pad_code (`int`, *optional*, defaults to 1024):
            The padding value used within the audio VQ codebooks. Typically equals `audio_vocab_size`.
        sampling_rate (`int`, *optional*, defaults to 24000):
            Audio sampling rate stored on the config (presumably in Hz; confirm against the audio codec).
        additional_mlp_ffn_hidden_size (`int`, *optional*, defaults to 2048):
            Stored as-is on the config; consumed by the modeling code, which is not part of this file.
        local_ffn_hidden_size (`int`, *optional*, defaults to 8960):
            Stored as-is on the config; consumed by the modeling code, which is not part of this file.
        local_hidden_size (`int`, *optional*, defaults to 1536):
            Stored as-is on the config; consumed by the modeling code, which is not part of this file.
        local_num_layers (`int`, *optional*, defaults to 4):
            Stored as-is on the config; consumed by the modeling code, which is not part of this file.
        kwargs:
            Remaining keyword arguments, passed through to [`PretrainedConfig`].
    """

    model_type = "moss_tts_delay"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        language_config: Optional[Union[Qwen3Config, dict]] = None,
        initializer_range: float = 0.02,
        n_vq: int = 32,
        pad_token_id: int = 151643,
        im_start_token_id: int = 151644,
        im_end_token_id: int = 151645,
        audio_vocab_size: int = 1024,
        audio_user_slot_token_id: int = 151654,
        audio_assistant_gen_slot_token_id: int = 151656,
        audio_assistant_delay_slot_token_id: int = 151662,
        audio_start_token_id: int = 151652,
        audio_end_token_id: int = 151653,
        audio_pad_code: int = 1024,
        sampling_rate: int = 24000,
        additional_mlp_ffn_hidden_size: int = 2048,
        local_ffn_hidden_size: int = 8960,
        local_hidden_size: int = 1536,
        local_num_layers: int = 4,
        **kwargs,
    ) -> None:
        # Normalize the backbone config: accept a ready-made config object,
        # a plain dict (e.g. as read back from config.json), or nothing.
        if isinstance(language_config, dict):
            self.language_config = Qwen3Config(**language_config)
        elif language_config is None:
            self.language_config = Qwen3Config()
        else:
            self.language_config = language_config

        self.initializer_range = initializer_range
        self.n_vq = n_vq

        # Audio codebook layout and the special token IDs used in the stream.
        self.audio_vocab_size = audio_vocab_size
        self.audio_user_slot_token_id = audio_user_slot_token_id
        self.audio_assistant_gen_slot_token_id = audio_assistant_gen_slot_token_id
        self.audio_assistant_delay_slot_token_id = audio_assistant_delay_slot_token_id
        self.audio_start_token_id = audio_start_token_id
        self.audio_end_token_id = audio_end_token_id
        self.audio_pad_code = audio_pad_code
        self.sampling_rate = sampling_rate

        # Mirror the backbone's dimensions at the top level of this config.
        self.hidden_size = self.language_config.hidden_size
        self.vocab_size = self.language_config.vocab_size

        self.pad_token_id = pad_token_id
        self.im_start_token_id = im_start_token_id
        self.im_end_token_id = im_end_token_id

        self.additional_mlp_ffn_hidden_size = additional_mlp_ffn_hidden_size
        self.local_ffn_hidden_size = local_ffn_hidden_size
        self.local_hidden_size = local_hidden_size
        self.local_num_layers = local_num_layers

        # `pad_token_id` is a named parameter here, so it is never present in
        # `kwargs`. Forward it explicitly: otherwise a base class that pops
        # `pad_token_id` from its kwargs with a `None` default would overwrite
        # the value assigned above.
        super().__init__(pad_token_id=pad_token_id, **kwargs)

    def to_dict(self) -> dict:
        """
        Serialize this configuration to a Python dictionary.

        Returns:
            `dict`: The dictionary produced by the parent class, with the `"language_config"` entry
            replaced by `self.language_config.to_dict()` when that object exposes a `to_dict` method,
            or by the object itself otherwise.
        """
        output = super().to_dict()
        if hasattr(self.language_config, "to_dict"):
            output["language_config"] = self.language_config.to_dict()
        else:
            output["language_config"] = self.language_config
        return output