{
"architectures": [
"ZipVoice"
],
"model_type": "zipvoice",
"library_name": "pytorch",
"pipeline_tag": "text-to-speech",
"checkpoint": "checkpoint-1860000.pt",
"dataset_hours": 7000,
"tokenizer": {
"type": "SimpleTokenizer",
"level": "character",
"vocab_size": 244,
"token_file": "tokens.txt"
},
"text_normalizer": {
"package": "soe-vinorm",
"enabled_by_default": true,
"postprocess": "remove extra spaces around punctuation"
},
"reference_audio": {
"directory": "audio",
"count": 30,
"text_format": "sidecar_txt_same_basename"
},
"demo": {
"directory": "demo",
"sample_count": 3
},
"model": {
"fm_decoder_downsampling_factor": [
1,
2,
4,
2,
1
],
"fm_decoder_num_layers": [
2,
2,
4,
4,
4
],
"fm_decoder_cnn_module_kernel": [
31,
15,
7,
15,
31
],
"fm_decoder_feedforward_dim": 1536,
"fm_decoder_num_heads": 4,
"fm_decoder_dim": 512,
"text_encoder_num_layers": 4,
"text_encoder_feedforward_dim": 512,
"text_encoder_cnn_module_kernel": 9,
"text_encoder_num_heads": 4,
"text_encoder_dim": 192,
"query_head_dim": 32,
"value_head_dim": 12,
"pos_head_dim": 4,
"pos_dim": 48,
"time_embed_dim": 192,
"text_embed_dim": 192,
"feat_dim": 100
},
"feature": {
"sampling_rate": 24000,
"type": "vocos"
}
}