StyleTTS2-LibriTTS / config.yml
patriotyk's picture
Update config.yml
7950060 verified
plbert_params:
vocab_size: 178
hidden_size: 768
num_attention_heads: 12
intermediate_size: 2048
max_position_embeddings: 512
num_hidden_layers: 12
dropout: 0.1
model_params:
multispeaker: true
vocab: "$;:,.!?¡¿—…\"«»“” ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
dim_in: 64
hidden_dim: 512
max_conv_dim: 512
n_layer: 3
n_mels: 80
n_token: 178 # number of phoneme tokens
max_dur: 50 # maximum duration of a single phoneme
style_dim: 128 # style vector size
dropout: 0.0
# config for decoder
decoder:
type: 'hifigan' # either hifigan or istftnet
resblock_kernel_sizes: [3,7,11]
upsample_rates : [10,5,3,2]
upsample_initial_channel: 512
resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
upsample_kernel_sizes: [20,10,6,4]
# style diffusion model config
diffusion:
embedding_mask_proba: 0.1
# transformer config
transformer:
num_layers: 3
num_heads: 8
head_features: 64
multiplier: 2
# diffusion distribution config
dist:
sigma_data: 0.19988229232390187
mean: -4.0
std: 4.0