# Generator (audio codec) hyperparameters.
#
# NOTE(review): this document was reconstructed into block style from a copy
# whose line breaks and indentation had been lost. Every key and value is
# unchanged; the nesting below (all sub-sections as children of
# `generator_params`) is inferred from key order -- confirm against the
# consuming code.
generator_params:
  # Global audio rates.
  input_sample_rate: 16000
  output_sample_rate: 16000
  mel_hop_length: 160
  # NOTE(review): 1280 == 160 * 2 * 4, i.e. presumably
  # hop_length * stride_size * stack_factor -- confirm before changing any of
  # those values independently.
  encoder_downsample_rate: 1280
  decoder_upsample_rate: 1280

  # Feature extraction settings.
  feature_extractor:
    chunk_length: 30
    feature_size: 80
    sampling_rate: 16000
    hop_length: 160
    n_fft: 400
    # 480000 == 30 * 16000 and 3000 == 480000 / 160; presumably derived from
    # chunk_length, sampling_rate and hop_length -- keep in sync (TODO confirm).
    n_samples: 480000
    nb_max_frames: 3000
    padding_side: "right"
    padding_value: 0.0
    return_attention_mask: false

  # Acoustic encoder.
  acoustic_encoder:
    num_mel_bins: 80
    sampling_rate: 16000
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    encoder_layers: 12
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072
    is_acoustic: true
    freeze: true

  # Downsampling
  downsample:
    in_dim: 768
    latent_dim: 32
    stack_factor: 4
    hidden_dim: 512

  # GroupFSQ quantizer
  quantizer:
    num_groups: 8
    # NOTE(review): 8 groups * 4 levels entries == 32, which matches
    # latent_dim above -- presumably a required invariant; confirm.
    num_levels_per_group: [8, 7, 6, 6]
    eps: 0.001

  # Upsampling
  upsample:
    latent_dim: 32
    out_dim: 768
    stack_factor: 4
    hidden_dim: 512

  # Acoustic decoder.
  acoustic_decoder:
    num_mel_bins: 80
    sampling_rate: 16000
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    decoder_layers: 12
    decoder_attention_heads: 12
    decoder_ffn_dim: 3072
    activation_function: "gelu"

  # Vocos settings.
  vocos:
    input_channels: 80
    dim: 512
    intermediate_dim: 4096
    num_layers: 24
    n_fft: 640
    hop_size: 160
    padding: "same"