# Hyperparameters for the generator, grouped by pipeline stage.
# NOTE(review): the code that consumes these keys is not visible from this
# file; relationships noted in comments are arithmetic on the values only.
generator_params:
  # Input and output sample rates are identical (presumably Hz -- confirm).
  input_sample_rate: 16000
  output_sample_rate: 16000
  # Same value as hop_length / hop_size (160) in the sections below.
  mel_hop_length: 160
  # 1280 = 160 (hop_length) * 2 (stride_size) * 4 (stack_factor).
  # Presumably input samples per latent frame -- TODO confirm with consumer.
  encoder_downsample_rate: 1280
  # Same factor as encoder_downsample_rate.
  decoder_upsample_rate: 1280

  # Feature extractor settings.
  # NOTE(review): key names resemble the Hugging Face Whisper feature
  # extractor config -- confirm against the loader.
  feature_extractor:
    # Presumably seconds per chunk: 30 * 16000 = 480000 = n_samples.
    chunk_length: 30
    # Same value as num_mel_bins (80) in acoustic_encoder / acoustic_decoder.
    feature_size: 80
    sampling_rate: 16000
    hop_length: 160
    n_fft: 400
    # 480000 = chunk_length (30) * sampling_rate (16000).
    n_samples: 480000
    # 3000 = n_samples (480000) / hop_length (160).
    nb_max_frames: 3000
    padding_side: "right"
    padding_value: 0.0
    return_attention_mask: false

  # Acoustic encoder settings. The shared keys (num_mel_bins, sampling_rate,
  # hop_length, stride_size, kernel_size, d_model, max_audio_seconds) carry
  # the same values as in acoustic_decoder.
  acoustic_encoder:
    # Same value as feature_extractor.feature_size (80).
    num_mel_bins: 80
    sampling_rate: 16000
    hop_length: 160
    # One of the three factors in encoder_downsample_rate (160 * 2 * 4).
    stride_size: 2
    kernel_size: 3
    # Same value as downsample.in_dim (768).
    d_model: 768
    scale_embedding: false
    # Same value as feature_extractor.chunk_length (30).
    max_audio_seconds: 30
    encoder_layers: 12
    # 768 / 12 = 64; presumably the per-head dimension -- TODO confirm.
    encoder_attention_heads: 12
    # 3072 = 4 * d_model (768).
    encoder_ffn_dim: 3072
    is_acoustic: true
    # NOTE(review): presumably keeps encoder weights fixed during training --
    # confirm against the training code.
    freeze: true

  # Downsampling
  downsample:
    # Same value as acoustic_encoder.d_model (768).
    in_dim: 768
    # Same value as upsample.latent_dim; also 32 = quantizer.num_groups (8)
    # * 4 entries in quantizer.num_levels_per_group.
    latent_dim: 32
    # One of the three factors in encoder_downsample_rate (160 * 2 * 4).
    # Presumably the number of consecutive frames stacked -- TODO confirm.
    stack_factor: 4
    hidden_dim: 512

  # GroupFSQ quantizer
  quantizer:
    # 8 groups * 4 level entries = 32, the latent_dim used by downsample and
    # upsample. Presumably each group quantizes 4 latent dimensions -- TODO
    # confirm against the quantizer implementation.
    num_groups: 8
    # Product of the levels is 8 * 7 * 6 * 6 = 2016; presumably the number of
    # distinct codes per group -- TODO confirm.
    num_levels_per_group: [8, 7, 6, 6]
    eps: 0.001

  # Upsampling
  # Values mirror the downsample section: latent_dim / out_dim here equal
  # downsample's latent_dim / in_dim, with the same stack_factor and
  # hidden_dim.
  upsample:
    latent_dim: 32
    # Same value as acoustic_decoder.d_model (768).
    out_dim: 768
    # One of the three factors in decoder_upsample_rate (160 * 2 * 4).
    stack_factor: 4
    hidden_dim: 512

  # Acoustic decoder settings. The shared keys (num_mel_bins, sampling_rate,
  # hop_length, stride_size, kernel_size, d_model, max_audio_seconds) carry
  # the same values as in acoustic_encoder.
  acoustic_decoder:
    # Same value as vocos.input_channels (80).
    num_mel_bins: 80
    sampling_rate: 16000
    hop_length: 160
    # One of the three factors in decoder_upsample_rate (160 * 2 * 4).
    stride_size: 2
    kernel_size: 3
    # Same value as upsample.out_dim (768).
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    decoder_layers: 12
    # 768 / 12 = 64; presumably the per-head dimension -- TODO confirm.
    decoder_attention_heads: 12
    # 3072 = 4 * d_model (768).
    decoder_ffn_dim: 3072
    activation_function: "gelu"

  # Settings under the "vocos" key.
  # NOTE(review): presumably configures a Vocos-style vocoder that turns mel
  # features into a waveform -- confirm against the model code.
  vocos:
    # Same value as acoustic_decoder.num_mel_bins (80).
    input_channels: 80
    dim: 512
    # 4096 = 8 * dim (512).
    intermediate_dim: 4096
    num_layers: 24
    # 640 = 4 * hop_size (160). Note this differs from
    # feature_extractor.n_fft (400).
    n_fft: 640
    # Same value as mel_hop_length and the hop_length keys (160).
    hop_size: 160
    padding: "same"