kitten-tts-mini / config.json
roboalchemist's picture
Upload folder using huggingface_hub
d1f3a2c verified
{
"activation_quant_modules": [
"bert.encoder.albert_layer_groups.0.albert_layers.0.attention.dense",
"bert.encoder.albert_layer_groups.0.albert_layers.0.attention.key",
"bert.encoder.albert_layer_groups.0.albert_layers.0.attention.query",
"bert.encoder.albert_layer_groups.0.albert_layers.0.attention.value",
"bert.encoder.albert_layer_groups.0.albert_layers.0.ffn",
"bert.encoder.albert_layer_groups.0.albert_layers.0.ffn_output",
"bert.encoder.embedding_hidden_mapping_in",
"bert_encoder",
"decoder.asr_res.0",
"decoder.decode.0.conv1",
"decoder.decode.0.conv1x1",
"decoder.decode.0.conv2",
"decoder.decode.0.norm1.fc",
"decoder.decode.0.norm2.fc",
"decoder.decode.1.conv1",
"decoder.decode.1.conv1x1",
"decoder.decode.1.conv2",
"decoder.decode.1.norm1.fc",
"decoder.decode.1.norm2.fc",
"decoder.decode.2.conv1",
"decoder.decode.2.conv1x1",
"decoder.decode.2.conv2",
"decoder.decode.2.norm1.fc",
"decoder.decode.2.norm2.fc",
"decoder.decode.3.conv1",
"decoder.decode.3.conv1x1",
"decoder.decode.3.conv2",
"decoder.decode.3.norm1.fc",
"decoder.decode.3.norm2.fc",
"decoder.encode.conv1",
"decoder.encode.conv1x1",
"decoder.encode.conv2",
"decoder.encode.norm1.fc",
"decoder.encode.norm2.fc",
"decoder.generator.conv_post",
"decoder.generator.noise_convs.0",
"decoder.generator.noise_convs.1",
"decoder.generator.noise_res.0.adain1.0.fc",
"decoder.generator.noise_res.0.adain1.1.fc",
"decoder.generator.noise_res.0.adain1.2.fc",
"decoder.generator.noise_res.0.adain2.0.fc",
"decoder.generator.noise_res.0.adain2.1.fc",
"decoder.generator.noise_res.0.adain2.2.fc",
"decoder.generator.noise_res.0.convs1.0",
"decoder.generator.noise_res.0.convs1.1",
"decoder.generator.noise_res.0.convs1.2",
"decoder.generator.noise_res.0.convs2.0",
"decoder.generator.noise_res.0.convs2.1",
"decoder.generator.noise_res.0.convs2.2",
"decoder.generator.noise_res.1.adain1.0.fc",
"decoder.generator.noise_res.1.adain1.1.fc",
"decoder.generator.noise_res.1.adain1.2.fc",
"decoder.generator.noise_res.1.adain2.0.fc",
"decoder.generator.noise_res.1.adain2.1.fc",
"decoder.generator.noise_res.1.adain2.2.fc",
"decoder.generator.noise_res.1.convs1.0",
"decoder.generator.noise_res.1.convs1.1",
"decoder.generator.noise_res.1.convs1.2",
"decoder.generator.noise_res.1.convs2.0",
"decoder.generator.noise_res.1.convs2.1",
"decoder.generator.noise_res.1.convs2.2",
"decoder.generator.resblocks.0.adain1.0.fc",
"decoder.generator.resblocks.0.adain1.1.fc",
"decoder.generator.resblocks.0.adain1.2.fc",
"decoder.generator.resblocks.0.adain2.0.fc",
"decoder.generator.resblocks.0.adain2.1.fc",
"decoder.generator.resblocks.0.adain2.2.fc",
"decoder.generator.resblocks.0.convs1.0",
"decoder.generator.resblocks.0.convs1.1",
"decoder.generator.resblocks.0.convs1.2",
"decoder.generator.resblocks.0.convs2.0",
"decoder.generator.resblocks.0.convs2.1",
"decoder.generator.resblocks.0.convs2.2",
"decoder.generator.resblocks.1.adain1.0.fc",
"decoder.generator.resblocks.1.adain1.1.fc",
"decoder.generator.resblocks.1.adain1.2.fc",
"decoder.generator.resblocks.1.adain2.0.fc",
"decoder.generator.resblocks.1.adain2.1.fc",
"decoder.generator.resblocks.1.adain2.2.fc",
"decoder.generator.resblocks.1.convs1.0",
"decoder.generator.resblocks.1.convs1.1",
"decoder.generator.resblocks.1.convs1.2",
"decoder.generator.resblocks.1.convs2.0",
"decoder.generator.resblocks.1.convs2.1",
"decoder.generator.resblocks.1.convs2.2",
"decoder.generator.resblocks.2.adain1.0.fc",
"decoder.generator.resblocks.2.adain1.1.fc",
"decoder.generator.resblocks.2.adain1.2.fc",
"decoder.generator.resblocks.2.adain2.0.fc",
"decoder.generator.resblocks.2.adain2.1.fc",
"decoder.generator.resblocks.2.adain2.2.fc",
"decoder.generator.resblocks.2.convs1.0",
"decoder.generator.resblocks.2.convs1.1",
"decoder.generator.resblocks.2.convs1.2",
"decoder.generator.resblocks.2.convs2.0",
"decoder.generator.resblocks.2.convs2.1",
"decoder.generator.resblocks.2.convs2.2",
"decoder.generator.resblocks.3.adain1.0.fc",
"decoder.generator.resblocks.3.adain1.1.fc",
"decoder.generator.resblocks.3.adain1.2.fc",
"decoder.generator.resblocks.3.adain2.0.fc",
"decoder.generator.resblocks.3.adain2.1.fc",
"decoder.generator.resblocks.3.adain2.2.fc",
"decoder.generator.resblocks.3.convs1.0",
"decoder.generator.resblocks.3.convs1.1",
"decoder.generator.resblocks.3.convs1.2",
"decoder.generator.resblocks.3.convs2.0",
"decoder.generator.resblocks.3.convs2.1",
"decoder.generator.resblocks.3.convs2.2",
"predictor.F0.0.conv1",
"predictor.F0.0.conv2",
"predictor.F0.0.norm1.fc",
"predictor.F0.0.norm2.fc",
"predictor.F0.1.conv1",
"predictor.F0.1.conv1x1",
"predictor.F0.1.conv2",
"predictor.F0.1.norm1.fc",
"predictor.F0.1.norm2.fc",
"predictor.F0.2.conv1",
"predictor.F0.2.conv2",
"predictor.F0.2.norm1.fc",
"predictor.F0.2.norm2.fc",
"predictor.F0_proj",
"predictor.N.0.conv1",
"predictor.N.0.conv2",
"predictor.N.0.norm1.fc",
"predictor.N.0.norm2.fc",
"predictor.N.1.conv1",
"predictor.N.1.conv1x1",
"predictor.N.1.conv2",
"predictor.N.1.norm1.fc",
"predictor.N.1.norm2.fc",
"predictor.N.2.conv1",
"predictor.N.2.conv2",
"predictor.N.2.norm1.fc",
"predictor.N.2.norm2.fc",
"predictor.N_proj",
"predictor.lstm",
"predictor.shared",
"predictor.text_encoder.lstms.0",
"predictor.text_encoder.lstms.1.fc",
"predictor.text_encoder.lstms.2",
"predictor.text_encoder.lstms.3.fc",
"predictor.text_encoder.lstms.4",
"predictor.text_encoder.lstms.5.fc",
"text_encoder.cnn.0.0",
"text_encoder.cnn.1.0",
"text_encoder.cnn.2.0",
"text_encoder.lstm"
],
"asr_res_dim": 64,
"decoder_out_dim": 512,
"hidden_dim": 512,
"istftnet": {
"resblock_kernel_sizes": [
3,
3
],
"upsample_rates": [
10,
6
],
"upsample_initial_channel": 512,
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_kernel_sizes": [
20,
12
],
"gen_istft_n_fft": 20,
"gen_istft_hop_size": 5
},
"max_conv_dim": 1024,
"max_dur": 50,
"model_type": "kitten_tts",
"n_layer": 3,
"n_mels": 80,
"n_token": 178,
"plbert": {
"num_hidden_layers": 12,
"num_attention_heads": 12,
"hidden_size": 768,
"intermediate_size": 2048,
"max_position_embeddings": 512,
"embedding_size": 128,
"inner_group_num": 1,
"num_hidden_groups": 1,
"hidden_dropout_prob": 0.0,
"attention_probs_dropout_prob": 0.0,
"type_vocab_size": 2,
"layer_norm_eps": 1e-12
},
"sample_rate": 24000,
"speed_priors": {},
"style_dim": 128,
"text_encoder_kernel_size": 5,
"voice_aliases": {
"Bella": "expr-voice-2-f",
"Jasper": "expr-voice-2-m",
"Luna": "expr-voice-3-f",
"Bruno": "expr-voice-3-m",
"Rosie": "expr-voice-4-f",
"Hugo": "expr-voice-4-m",
"Kiki": "expr-voice-5-f",
"Leo": "expr-voice-5-m"
},
"voices_path": "voices.npz"
}