from transformers import PretrainedConfig, Qwen3Config


class MossAudioConfig(PretrainedConfig):
    """Composite configuration pairing an audio config with a Qwen3 language config.

    The language side is always materialised as a config object when a dict
    or ``None`` is supplied; the audio side is stored exactly as received.
    A handful of language-model attributes are also forwarded to the
    ``PretrainedConfig`` constructor so they are set on this top-level config.
    """

    model_type = "moss_audio"
    is_composition = True

    def __init__(
        self,
        audio_config=None,
        language_config=None,
        adapter_hidden_size=8192,
        ignore_index=-100,
        deepstack_num_inject_layers=None,
        **kwargs,
    ):
        """Build the composite config.

        Args:
            audio_config: Audio sub-configuration; stored unchanged (no
                conversion or validation happens here).
            language_config: A ``dict`` of ``Qwen3Config`` keyword arguments,
                ``None`` (a default ``Qwen3Config()`` is created), or any
                other object, which is kept as given.
            adapter_hidden_size: Stored unchanged on the instance.
            ignore_index: Stored unchanged on the instance.
            deepstack_num_inject_layers: Stored unchanged on the instance.
            **kwargs: Forwarded to ``PretrainedConfig.__init__`` after the
                language-model fallbacks below have been filled in.
        """
        # Turn the two "not yet a config object" cases into a Qwen3Config;
        # anything else is trusted to already be a usable config.
        if language_config is None:
            language_config = Qwen3Config()
        elif isinstance(language_config, dict):
            language_config = Qwen3Config(**language_config)

        self.audio_config = audio_config
        self.language_config = language_config
        self.adapter_hidden_size = adapter_hidden_size
        self.ignore_index = ignore_index
        self.deepstack_num_inject_layers = deepstack_num_inject_layers

        # Mirror selected language-model attributes into kwargs, but only
        # where the caller did not pass the key explicitly. A key that is
        # present with value None counts as "passed" and is left alone.
        inherited_names = (
            "num_hidden_layers",
            "eos_token_id",
            "bos_token_id",
            "vocab_size",
        )
        for name in inherited_names:
            fallback = getattr(language_config, name, None)
            if name not in kwargs:
                kwargs[name] = fallback

        super().__init__(**kwargs)