# SimWhisper_Codec / SimWhisperCodec.yaml
# Uploaded by xxx123456 ("Upload SimWhisperCodec.yaml", commit 8aad9db, verified)
# Generator configuration for the SimWhisper codec.
#
# NOTE(review): this nesting was reconstructed from a copy in which every key
# sat at column 0, which made the section headers null and turned keys such as
# `sampling_rate`, `hop_length` and `latent_dim` into duplicate top-level keys.
# Placing the sub-sections under `generator_params` is inferred from key order;
# confirm against the code that loads this file.
generator_params:
  # Global rates (presumably Hz for sample rates and samples for the rest;
  # confirm with the loader).
  input_sample_rate: 16000
  output_sample_rate: 16000
  mel_hop_length: 160
  # 1280 equals mel_hop_length (160) * stride_size (2) * stack_factor (4);
  # presumably these must stay in sync. TODO confirm.
  encoder_downsample_rate: 1280
  decoder_upsample_rate: 1280

  # Log-mel front end. The field names match a Whisper-style feature
  # extractor; verify against the consuming class.
  feature_extractor:
    chunk_length: 30
    feature_size: 80
    sampling_rate: 16000
    hop_length: 160
    n_fft: 400
    # 480000 equals chunk_length (30) * sampling_rate (16000).
    n_samples: 480000
    # 3000 equals n_samples (480000) / hop_length (160).
    nb_max_frames: 3000
    padding_side: "right"
    padding_value: 0.0
    return_attention_mask: false

  acoustic_encoder:
    num_mel_bins: 80
    sampling_rate: 16000
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    encoder_layers: 12
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072
    is_acoustic: true
    freeze: true

  # Downsampling
  downsample:
    # in_dim equals acoustic_encoder.d_model (768).
    in_dim: 768
    latent_dim: 32
    stack_factor: 4
    hidden_dim: 512

  # GroupFSQ quantizer
  quantizer:
    # 8 groups * 4 level entries = 32, which equals latent_dim above;
    # presumably one latent dimension per level entry. TODO confirm.
    num_groups: 8
    num_levels_per_group: [8, 7, 6, 6]
    eps: 0.001

  # Upsampling
  upsample:
    latent_dim: 32
    # out_dim equals acoustic_decoder.d_model (768).
    out_dim: 768
    stack_factor: 4
    hidden_dim: 512

  acoustic_decoder:
    num_mel_bins: 80
    sampling_rate: 16000
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    decoder_layers: 12
    decoder_attention_heads: 12
    decoder_ffn_dim: 3072
    activation_function: "gelu"

  vocos:
    # input_channels equals num_mel_bins (80) used by the encoder/decoder.
    input_channels: 80
    dim: 512
    intermediate_dim: 4096
    num_layers: 24
    n_fft: 640
    # hop_size equals mel_hop_length (160).
    hop_size: 160
    padding: "same"