{
  "model_type": "semantic_vocoder",
  "auto_map": {
    "AutoConfig": "model.SemanticVocoderConfig",
    "AutoModel": "model.SemanticVocoder"
  },
  "model": {
    "autoencoder": {
      "_target_": "models.autoencoder.waveform.semanticVocoder.semanticVocoder.SemanticVocoder",
      "encoder_name": "dasheng_base",
      "n_timesteps": 200,
      "sample_rate": 24000,
      "clamp_pred": true,
      "downsampling_ratio": 960,
      "encoder_sampling_rate": 16000,
      "vocoder": {
        "_target_": "models.autoencoder.waveform.semanticVocoder.flow2gan.models.generator.MaeAudioGenerator",
        "latent_dim": 768,
        "hop_length": 960,
        "n_ffts": [512, 256, 128],
        "hop_lengths": [320, 160, 80],
        "channels": [768, 512, 384],
        "time_embed_channels": 512,
        "hidden_factor": 3,
        "conv_kernel_sizes": [7, 7, 7],
        "num_layers": [8, 8, 8],
        "use_cond_encoder": true,
        "cond_enc_channels": 512,
        "cond_enc_hidden_factor": 3,
        "cond_enc_conv_kernel_size": 7,
        "cond_enc_num_layers": 4,
        "residual_scale": 1.0,
        "init_noise_scale": 0.1,
        "pred_x1": true,
        "branch_reduction": "mean",
        "spec_scaling_loss": true,
        "loss_n_filters": 256,
        "loss_n_fft": 1024,
        "loss_hop_length": 256,
        "loss_power": 0.5,
        "loss_eps": 1e-07,
        "loss_scale_min": 0.01,
        "loss_scale_max": 100.0,
        "branch_dropout": 0.05,
        "max_add_noise_scale": 0.0
      }
    },
    "backbone": {
      "_target_": "models.dit.mask_dit.UDiT",
      "img_size": 250,
      "patch_size": 1,
      "in_chans": 768,
      "out_chans": 768,
      "input_type": "1d",
      "embed_dim": 1024,
      "depth": 24,
      "num_heads": 16,
      "mlp_ratio": 4.0,
      "qkv_bias": false,
      "qk_scale": null,
      "qk_norm": "layernorm",
      "norm_layer": "layernorm",
      "act_layer": "geglu",
      "context_norm": true,
      "use_checkpoint": true,
      "time_fusion": "ada_sola_bias",
      "ada_sola_rank": 32,
      "ada_sola_alpha": 32,
      "cls_dim": null,
      "context_dim": 1024,
      "context_fusion": "cross",
      "context_max_length": null,
      "context_pe_method": "none",
      "pe_method": "none",
      "rope_mode": "shared",
      "use_conv": true,
      "skip": true,
      "skip_norm": true
    },
    "cfg_drop_ratio": 0.2,
    "sample_strategy": "uniform",
    "_target_": "models.flow_matching.SingleTaskCrossAttentionAudioFlowMatching",
    "content_encoder": {
      "_target_": "models.content_encoder.content_encoder.ContentEncoder",
      "embed_dim": 1024,
      "text_encoder": {
        "_target_": "models.content_encoder.text_encoder.T5TextEncoder",
        "model_name": "google/flan-t5-large",
        "embed_dim": 1024
      }
    }
  }
}