| { | |
| "model_type": "semantic_vocoder", | |
| "auto_map": { | |
| "AutoConfig": "model.SemanticVocoderConfig", | |
| "AutoModel": "model.SemanticVocoder" | |
| }, | |
| "model_config": { | |
| "autoencoder": { | |
| "_target_": "models.autoencoder.waveform.semanticVocoder.semanticVocoder.SemanticVocoder", | |
| "encoder_name": "none", | |
| "n_timesteps": 200, | |
| "sample_rate": 24000, | |
| "clamp_pred": true, | |
| "downsampling_ratio": 960, | |
| "encoder_sampling_rate": 16000, | |
| "vocoder": { | |
| "_target_": "models.autoencoder.waveform.semanticVocoder.flow2gan.models.generator.MaeAudioGenerator", | |
| "latent_dim": 768, | |
| "hop_length": 960, | |
| "n_ffts": [ | |
| 512, | |
| 256, | |
| 128 | |
| ], | |
| "hop_lengths": [ | |
| 320, | |
| 160, | |
| 80 | |
| ], | |
| "channels": [ | |
| 768, | |
| 512, | |
| 384 | |
| ], | |
| "time_embed_channels": 512, | |
| "hidden_factor": 3, | |
| "conv_kernel_sizes": [ | |
| 7, | |
| 7, | |
| 7 | |
| ], | |
| "num_layers": [ | |
| 8, | |
| 8, | |
| 8 | |
| ], | |
| "use_cond_encoder": true, | |
| "cond_enc_channels": 512, | |
| "cond_enc_hidden_factor": 3, | |
| "cond_enc_conv_kernel_size": 7, | |
| "cond_enc_num_layers": 4, | |
| "residual_scale": 1.0, | |
| "init_noise_scale": 0.1, | |
| "pred_x1": true, | |
| "branch_reduction": "mean", | |
| "spec_scaling_loss": true, | |
| "loss_n_filters": 256, | |
| "loss_n_fft": 1024, | |
| "loss_hop_length": 256, | |
| "loss_power": 0.5, | |
| "loss_eps": 1e-07, | |
| "loss_scale_min": 0.01, | |
| "loss_scale_max": 100.0, | |
| "branch_dropout": 0.05, | |
| "max_add_noise_scale": 0.0 | |
| } | |
| }, | |
| "backbone": { | |
| "_target_": "models.dit.mask_dit.UDiT", | |
| "img_size": 250, | |
| "patch_size": 1, | |
| "in_chans": 768, | |
| "out_chans": 768, | |
| "input_type": "1d", | |
| "embed_dim": 1024, | |
| "depth": 24, | |
| "num_heads": 16, | |
| "mlp_ratio": 4.0, | |
| "qkv_bias": false, | |
| "qk_scale": null, | |
| "qk_norm": "layernorm", | |
| "norm_layer": "layernorm", | |
| "act_layer": "geglu", | |
| "context_norm": true, | |
| "use_checkpoint": true, | |
| "time_fusion": "ada_sola_bias", | |
| "ada_sola_rank": 32, | |
| "ada_sola_alpha": 32, | |
| "cls_dim": null, | |
| "context_dim": 1024, | |
| "context_fusion": "cross", | |
| "context_max_length": null, | |
| "context_pe_method": "none", | |
| "pe_method": "none", | |
| "rope_mode": "shared", | |
| "use_conv": true, | |
| "skip": true, | |
| "skip_norm": true | |
| }, | |
| "cfg_drop_ratio": 0.2, | |
| "sample_strategy": "uniform", | |
| "_target_": "models.flow_matching.SingleTaskCrossAttentionAudioFlowMatching", | |
| "content_encoder": { | |
| "_target_": "models.content_encoder.content_encoder.ContentEncoder", | |
| "embed_dim": 1024, | |
| "text_encoder": { | |
| "_target_": "models.content_encoder.text_encoder.T5TextEncoder", | |
| "model_name": "google/flan-t5-large", | |
| "embed_dim": 1024 | |
| } | |
| } | |
| } | |
| } |