{
  "model_type": "semantic_vocoder",
  "auto_map": {
    "AutoConfig": "model.SemanticVocoderConfig",
    "AutoModel": "model.SemanticVocoder"
  },
  "model_config": {
    "autoencoder": {
      "_target_": "models.autoencoder.waveform.semanticVocoder.semanticVocoder.SemanticVocoder",
      "encoder_name": "none",
      "n_timesteps": 200,
      "sample_rate": 24000,
      "clamp_pred": true,
      "downsampling_ratio": 960,
      "encoder_sampling_rate": 16000,
      "vocoder": {
        "_target_": "models.autoencoder.waveform.semanticVocoder.flow2gan.models.generator.MaeAudioGenerator",
        "latent_dim": 768,
        "hop_length": 960,
        "n_ffts": [512, 256, 128],
        "hop_lengths": [320, 160, 80],
        "channels": [768, 512, 384],
        "time_embed_channels": 512,
        "hidden_factor": 3,
        "conv_kernel_sizes": [7, 7, 7],
        "num_layers": [8, 8, 8],
        "use_cond_encoder": true,
        "cond_enc_channels": 512,
        "cond_enc_hidden_factor": 3,
        "cond_enc_conv_kernel_size": 7,
        "cond_enc_num_layers": 4,
        "residual_scale": 1.0,
        "init_noise_scale": 0.1,
        "pred_x1": true,
        "branch_reduction": "mean",
        "spec_scaling_loss": true,
        "loss_n_filters": 256,
        "loss_n_fft": 1024,
        "loss_hop_length": 256,
        "loss_power": 0.5,
        "loss_eps": 1e-07,
        "loss_scale_min": 0.01,
        "loss_scale_max": 100.0,
        "branch_dropout": 0.05,
        "max_add_noise_scale": 0.0
      }
    },
    "backbone": {
      "_target_": "models.dit.mask_dit.UDiT",
      "img_size": 250,
      "patch_size": 1,
      "in_chans": 768,
      "out_chans": 768,
      "input_type": "1d",
      "embed_dim": 1024,
      "depth": 24,
      "num_heads": 16,
      "mlp_ratio": 4.0,
      "qkv_bias": false,
      "qk_scale": null,
      "qk_norm": "layernorm",
      "norm_layer": "layernorm",
      "act_layer": "geglu",
      "context_norm": true,
      "use_checkpoint": true,
      "time_fusion": "ada_sola_bias",
      "ada_sola_rank": 32,
      "ada_sola_alpha": 32,
      "cls_dim": null,
      "context_dim": 1024,
      "context_fusion": "cross",
      "context_max_length": null,
      "context_pe_method": "none",
      "pe_method": "none",
      "rope_mode": "shared",
      "use_conv": true,
      "skip": true,
      "skip_norm": true
    },
    "cfg_drop_ratio": 0.2,
    "sample_strategy": "uniform",
    "_target_": "models.flow_matching.SingleTaskCrossAttentionAudioFlowMatching",
    "content_encoder": {
      "_target_": "models.content_encoder.content_encoder.ContentEncoder",
      "embed_dim": 1024,
      "text_encoder": {
        "_target_": "models.content_encoder.text_encoder.T5TextEncoder",
        "model_name": "google/flan-t5-large",
        "embed_dim": 1024
      }
    }
  }
}