# Hyper-parameters for the audio generator: feature extraction, acoustic
# encoder, downsample -> quantizer -> upsample bottleneck, acoustic decoder
# and the Vocos stage.
#
# NOTE(review): the previous text had every line wrapped in table pipes
# ("| key: value | |") with all indentation stripped, which is not valid
# YAML. The nesting below is reconstructed from section order and key names;
# confirm it against the code that loads this file.
generator_params:
  input_sample_rate: 16000
  output_sample_rate: 16000
  mel_hop_length: 160
  # 1280 = 160 * 2 * 4, i.e. hop_length * stride_size * stack_factor as set
  # in the sections below. Presumably these must stay in sync - TODO confirm.
  encoder_downsample_rate: 1280
  decoder_upsample_rate: 1280

  feature_extractor:
    chunk_length: 30
    feature_size: 80
    sampling_rate: 16000
    hop_length: 160
    n_fft: 400
    # 480000 = chunk_length (30) * sampling_rate (16000).
    n_samples: 480000
    # 3000 = n_samples (480000) / hop_length (160).
    nb_max_frames: 3000
    padding_side: "right"
    padding_value: 0.0
    return_attention_mask: false

  acoustic_encoder:
    # Same values as feature_extractor.feature_size / sampling_rate /
    # hop_length above.
    num_mel_bins: 80
    sampling_rate: 16000
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    encoder_layers: 12
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072
    is_acoustic: true
    freeze: true

  # Downsampling
  downsample:
    # Equals acoustic_encoder.d_model (768).
    in_dim: 768
    latent_dim: 32
    stack_factor: 4
    hidden_dim: 512

  # GroupFSQ quantizer
  quantizer:
    # 8 groups * 4 level entries = 32, which equals latent_dim above -
    # presumably each group quantizes 4 latent dimensions; verify.
    num_groups: 8
    num_levels_per_group: [8, 7, 6, 6]
    eps: 0.001

  # Upsampling
  upsample:
    # Mirrors the downsample section: latent_dim -> out_dim with the same
    # stack_factor and hidden_dim values.
    latent_dim: 32
    out_dim: 768
    stack_factor: 4
    hidden_dim: 512

  acoustic_decoder:
    num_mel_bins: 80
    sampling_rate: 16000
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    decoder_layers: 12
    decoder_attention_heads: 12
    decoder_ffn_dim: 3072
    activation_function: "gelu"

  vocos:
    # Equals acoustic_decoder.num_mel_bins (80).
    input_channels: 80
    dim: 512
    intermediate_dim: 4096
    num_layers: 24
    n_fft: 640
    # Same value as mel_hop_length (160).
    hop_size: 160
    padding: "same"