---
# Generator-level settings: audio sample rates and the overall
# encoder/decoder rate factors.
#
# NOTE(review): indentation was lost in the copy this file was restored
# from. The sections that follow are kept at top level, but they may
# originally have been nested under generator_params -- confirm against
# the code that loads this config.
generator_params:
  input_sample_rate: 16000
  output_sample_rate: 16000
  mel_hop_length: 160
  # 1280 equals mel_hop_length (160) * stride_size (2) * stack_factor (4)
  # as configured in the sections below; presumably the number of audio
  # samples per latent frame -- confirm against the model code.
  encoder_downsample_rate: 1280
  decoder_upsample_rate: 1280

# Feature-extraction settings for the input audio.
# NOTE(review): the key names look like a Whisper-style log-mel feature
# extractor -- confirm against the class that consumes this section.
feature_extractor:
  chunk_length: 30
  feature_size: 80
  sampling_rate: 16000
  hop_length: 160
  n_fft: 400
  # Equals chunk_length * sampling_rate (30 * 16000).
  n_samples: 480000
  # Equals n_samples / hop_length (480000 / 160).
  nb_max_frames: 3000
  padding_side: "right"
  padding_value: 0.0
  return_attention_mask: false

# Acoustic encoder hyperparameters. sampling_rate and hop_length carry the
# same values as in the feature_extractor section, and num_mel_bins equals
# its feature_size (80).
acoustic_encoder:
  num_mel_bins: 80
  sampling_rate: 16000
  hop_length: 160
  stride_size: 2
  kernel_size: 3
  d_model: 768
  scale_embedding: false
  max_audio_seconds: 30
  encoder_layers: 12
  encoder_attention_heads: 12
  # Equals 4 * d_model.
  encoder_ffn_dim: 3072
  is_acoustic: true
  # NOTE(review): presumably keeps the encoder weights fixed during
  # training -- confirm against the training code.
  freeze: true

# Downsampling
# in_dim equals the acoustic_encoder d_model (768); latent_dim equals the
# upsample section's latent_dim (32).
downsample:
  in_dim: 768
  latent_dim: 32
  stack_factor: 4
  hidden_dim: 512

# GroupFSQ quantizer
# NOTE(review): 8 groups * 4 level entries = 32, which equals latent_dim in
# the downsample/upsample sections; presumably each group quantizes 4
# latent dimensions -- confirm against the quantizer implementation.
quantizer:
  num_groups: 8
  num_levels_per_group: [8, 7, 6, 6]
  eps: 0.001

# Upsampling
# Uses the same latent_dim, stack_factor and hidden_dim values as the
# downsample section; out_dim equals the acoustic_decoder d_model (768).
upsample:
  latent_dim: 32
  out_dim: 768
  stack_factor: 4
  hidden_dim: 512

# Acoustic decoder hyperparameters. The input/shape settings carry the same
# values as the acoustic_encoder section (80 mel bins, 16 kHz, hop 160,
# stride 2, kernel 3, d_model 768, 12 layers, 12 heads).
acoustic_decoder:
  num_mel_bins: 80
  sampling_rate: 16000
  hop_length: 160
  stride_size: 2
  kernel_size: 3
  d_model: 768
  scale_embedding: false
  max_audio_seconds: 30
  decoder_layers: 12
  decoder_attention_heads: 12
  # Equals 4 * d_model.
  decoder_ffn_dim: 3072
  activation_function: "gelu"

# Settings for the "vocos" component.
# NOTE(review): presumably the vocoder that turns mel features back into a
# waveform -- confirm against the model code. input_channels equals
# num_mel_bins (80) and hop_size equals hop_length (160) from the sections
# above.
vocos:
  input_channels: 80
  dim: 512
  intermediate_dim: 4096
  num_layers: 24
  n_fft: 640
  hop_size: 160
  padding: "same"