feat: あみたろ 6종 화자 모델 추가
Browse files
あみたろの声素材工房 ITAコーパス読み上げ音声 기반 파인튜닝 모델 6종
- amitaro_normal, amitaro_runrun, amitaro_yofukashi, amitaro_punsuka, amitaro_sasayaki_a, amitaro_sasayaki_b
- PyTorch (safetensors) + ONNX (synthesizer + duration_predictor)
- License: CC-BY-SA-4.0
- Credit: あみたろの声素材工房 https://amitaro.net/
- onnx/speakers/amitaro_normal/config.json +118 -0
- onnx/speakers/amitaro_normal/duration_predictor.onnx +3 -0
- onnx/speakers/amitaro_normal/style_vectors.npy +3 -0
- onnx/speakers/amitaro_normal/synthesizer.onnx +3 -0
- onnx/speakers/amitaro_punsuka/config.json +118 -0
- onnx/speakers/amitaro_punsuka/duration_predictor.onnx +3 -0
- onnx/speakers/amitaro_punsuka/style_vectors.npy +3 -0
- onnx/speakers/amitaro_punsuka/synthesizer.onnx +3 -0
- onnx/speakers/amitaro_runrun/config.json +118 -0
- onnx/speakers/amitaro_runrun/duration_predictor.onnx +3 -0
- onnx/speakers/amitaro_runrun/style_vectors.npy +3 -0
- onnx/speakers/amitaro_runrun/synthesizer.onnx +3 -0
- onnx/speakers/amitaro_sasayaki_a/config.json +118 -0
- onnx/speakers/amitaro_sasayaki_a/duration_predictor.onnx +3 -0
- onnx/speakers/amitaro_sasayaki_a/style_vectors.npy +3 -0
- onnx/speakers/amitaro_sasayaki_a/synthesizer.onnx +3 -0
- onnx/speakers/amitaro_sasayaki_b/config.json +118 -0
- onnx/speakers/amitaro_sasayaki_b/duration_predictor.onnx +3 -0
- onnx/speakers/amitaro_sasayaki_b/style_vectors.npy +3 -0
- onnx/speakers/amitaro_sasayaki_b/synthesizer.onnx +3 -0
- onnx/speakers/amitaro_yofukashi/config.json +118 -0
- onnx/speakers/amitaro_yofukashi/duration_predictor.onnx +3 -0
- onnx/speakers/amitaro_yofukashi/style_vectors.npy +3 -0
- onnx/speakers/amitaro_yofukashi/synthesizer.onnx +3 -0
- pytorch/speakers/amitaro_normal/config.json +118 -0
- pytorch/speakers/amitaro_normal/hayakoe_amitaro_normal_e11_s2000.safetensors +3 -0
- pytorch/speakers/amitaro_normal/style_vectors.npy +3 -0
- pytorch/speakers/amitaro_punsuka/config.json +118 -0
- pytorch/speakers/amitaro_punsuka/hayakoe_amitaro_punsuka_e11_s2000.safetensors +3 -0
- pytorch/speakers/amitaro_punsuka/style_vectors.npy +3 -0
- pytorch/speakers/amitaro_runrun/config.json +118 -0
- pytorch/speakers/amitaro_runrun/hayakoe_amitaro_runrun_e11_s2000.safetensors +3 -0
- pytorch/speakers/amitaro_runrun/style_vectors.npy +3 -0
- pytorch/speakers/amitaro_sasayaki_a/config.json +118 -0
- pytorch/speakers/amitaro_sasayaki_a/hayakoe_amitaro_sasayaki_a_e87_s4000.safetensors +3 -0
- pytorch/speakers/amitaro_sasayaki_a/style_vectors.npy +3 -0
- pytorch/speakers/amitaro_sasayaki_b/config.json +118 -0
- pytorch/speakers/amitaro_sasayaki_b/hayakoe_amitaro_sasayaki_b_e98_s4500.safetensors +3 -0
- pytorch/speakers/amitaro_sasayaki_b/style_vectors.npy +3 -0
- pytorch/speakers/amitaro_yofukashi/config.json +118 -0
- pytorch/speakers/amitaro_yofukashi/hayakoe_amitaro_yofukashi_e11_s2000.safetensors +3 -0
- pytorch/speakers/amitaro_yofukashi/style_vectors.npy +3 -0
onnx/speakers/amitaro_normal/config.json
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "hayakoe_amitaro_normal",
|
| 3 |
+
"train": {
|
| 4 |
+
"log_interval": 200,
|
| 5 |
+
"eval_interval": 500,
|
| 6 |
+
"seed": 42,
|
| 7 |
+
"epochs": 21,
|
| 8 |
+
"learning_rate": 0.0001,
|
| 9 |
+
"betas": [
|
| 10 |
+
0.8,
|
| 11 |
+
0.99
|
| 12 |
+
],
|
| 13 |
+
"eps": 1e-09,
|
| 14 |
+
"batch_size": 2,
|
| 15 |
+
"bf16_run": false,
|
| 16 |
+
"fp16_run": false,
|
| 17 |
+
"lr_decay": 0.99996,
|
| 18 |
+
"segment_size": 16384,
|
| 19 |
+
"init_lr_ratio": 1,
|
| 20 |
+
"warmup_epochs": 0,
|
| 21 |
+
"c_mel": 45,
|
| 22 |
+
"c_kl": 1.0,
|
| 23 |
+
"c_commit": 100,
|
| 24 |
+
"skip_optimizer": false,
|
| 25 |
+
"freeze_ZH_bert": false,
|
| 26 |
+
"freeze_JP_bert": false,
|
| 27 |
+
"freeze_EN_bert": false,
|
| 28 |
+
"freeze_emo": false,
|
| 29 |
+
"freeze_style": false,
|
| 30 |
+
"freeze_decoder": false
|
| 31 |
+
},
|
| 32 |
+
"data": {
|
| 33 |
+
"use_jp_extra": true,
|
| 34 |
+
"training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_normal/train.list",
|
| 35 |
+
"validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_normal/val.list",
|
| 36 |
+
"max_wav_value": 32768.0,
|
| 37 |
+
"sampling_rate": 44100,
|
| 38 |
+
"filter_length": 2048,
|
| 39 |
+
"hop_length": 512,
|
| 40 |
+
"win_length": 2048,
|
| 41 |
+
"n_mel_channels": 128,
|
| 42 |
+
"mel_fmin": 0.0,
|
| 43 |
+
"mel_fmax": null,
|
| 44 |
+
"add_blank": true,
|
| 45 |
+
"n_speakers": 1,
|
| 46 |
+
"cleaned_text": true,
|
| 47 |
+
"spk2id": {
|
| 48 |
+
"amitaro_normal": 0
|
| 49 |
+
},
|
| 50 |
+
"num_styles": 1,
|
| 51 |
+
"style2id": {
|
| 52 |
+
"Neutral": 0
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"model": {
|
| 56 |
+
"use_spk_conditioned_encoder": true,
|
| 57 |
+
"use_noise_scaled_mas": true,
|
| 58 |
+
"use_mel_posterior_encoder": false,
|
| 59 |
+
"use_duration_discriminator": false,
|
| 60 |
+
"use_wavlm_discriminator": true,
|
| 61 |
+
"inter_channels": 192,
|
| 62 |
+
"hidden_channels": 192,
|
| 63 |
+
"filter_channels": 768,
|
| 64 |
+
"n_heads": 2,
|
| 65 |
+
"n_layers": 6,
|
| 66 |
+
"kernel_size": 3,
|
| 67 |
+
"p_dropout": 0.1,
|
| 68 |
+
"resblock": "1",
|
| 69 |
+
"resblock_kernel_sizes": [
|
| 70 |
+
3,
|
| 71 |
+
7,
|
| 72 |
+
11
|
| 73 |
+
],
|
| 74 |
+
"resblock_dilation_sizes": [
|
| 75 |
+
[
|
| 76 |
+
1,
|
| 77 |
+
3,
|
| 78 |
+
5
|
| 79 |
+
],
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
]
|
| 90 |
+
],
|
| 91 |
+
"upsample_rates": [
|
| 92 |
+
8,
|
| 93 |
+
8,
|
| 94 |
+
2,
|
| 95 |
+
2,
|
| 96 |
+
2
|
| 97 |
+
],
|
| 98 |
+
"upsample_initial_channel": 512,
|
| 99 |
+
"upsample_kernel_sizes": [
|
| 100 |
+
16,
|
| 101 |
+
16,
|
| 102 |
+
8,
|
| 103 |
+
2,
|
| 104 |
+
2
|
| 105 |
+
],
|
| 106 |
+
"n_layers_q": 3,
|
| 107 |
+
"use_spectral_norm": false,
|
| 108 |
+
"gin_channels": 512,
|
| 109 |
+
"slm": {
|
| 110 |
+
"model": "microsoft/wavlm-base-plus",
|
| 111 |
+
"sr": 16000,
|
| 112 |
+
"hidden": 768,
|
| 113 |
+
"nlayers": 13,
|
| 114 |
+
"initial_channel": 64
|
| 115 |
+
}
|
| 116 |
+
},
|
| 117 |
+
"version": "2.7.0-JP-Extra"
|
| 118 |
+
}
|
onnx/speakers/amitaro_normal/duration_predictor.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ea72d3b807b7120a84cc533e04da80374352fa124adae13577ac36953b170e41
|
| 3 |
+
size 31503027
|
onnx/speakers/amitaro_normal/style_vectors.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28f9059b9da6c20dd891a2f22df2bfe77311a51d337623bd88e701de2f942643
|
| 3 |
+
size 1152
|
onnx/speakers/amitaro_normal/synthesizer.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b4840a47ff05b207e226bd936a143b5985ad24d0983be23a5b14705f6849965
|
| 3 |
+
size 250644697
|
onnx/speakers/amitaro_punsuka/config.json
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "hayakoe_amitaro_punsuka",
|
| 3 |
+
"train": {
|
| 4 |
+
"log_interval": 200,
|
| 5 |
+
"eval_interval": 500,
|
| 6 |
+
"seed": 42,
|
| 7 |
+
"epochs": 11,
|
| 8 |
+
"learning_rate": 0.0001,
|
| 9 |
+
"betas": [
|
| 10 |
+
0.8,
|
| 11 |
+
0.99
|
| 12 |
+
],
|
| 13 |
+
"eps": 1e-09,
|
| 14 |
+
"batch_size": 2,
|
| 15 |
+
"bf16_run": false,
|
| 16 |
+
"fp16_run": false,
|
| 17 |
+
"lr_decay": 0.99996,
|
| 18 |
+
"segment_size": 16384,
|
| 19 |
+
"init_lr_ratio": 1,
|
| 20 |
+
"warmup_epochs": 0,
|
| 21 |
+
"c_mel": 45,
|
| 22 |
+
"c_kl": 1.0,
|
| 23 |
+
"c_commit": 100,
|
| 24 |
+
"skip_optimizer": false,
|
| 25 |
+
"freeze_ZH_bert": false,
|
| 26 |
+
"freeze_JP_bert": false,
|
| 27 |
+
"freeze_EN_bert": false,
|
| 28 |
+
"freeze_emo": false,
|
| 29 |
+
"freeze_style": false,
|
| 30 |
+
"freeze_decoder": false
|
| 31 |
+
},
|
| 32 |
+
"data": {
|
| 33 |
+
"use_jp_extra": true,
|
| 34 |
+
"training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_punsuka/train.list",
|
| 35 |
+
"validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_punsuka/val.list",
|
| 36 |
+
"max_wav_value": 32768.0,
|
| 37 |
+
"sampling_rate": 44100,
|
| 38 |
+
"filter_length": 2048,
|
| 39 |
+
"hop_length": 512,
|
| 40 |
+
"win_length": 2048,
|
| 41 |
+
"n_mel_channels": 128,
|
| 42 |
+
"mel_fmin": 0.0,
|
| 43 |
+
"mel_fmax": null,
|
| 44 |
+
"add_blank": true,
|
| 45 |
+
"n_speakers": 1,
|
| 46 |
+
"cleaned_text": true,
|
| 47 |
+
"spk2id": {
|
| 48 |
+
"amitaro_punsuka": 0
|
| 49 |
+
},
|
| 50 |
+
"num_styles": 1,
|
| 51 |
+
"style2id": {
|
| 52 |
+
"Neutral": 0
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"model": {
|
| 56 |
+
"use_spk_conditioned_encoder": true,
|
| 57 |
+
"use_noise_scaled_mas": true,
|
| 58 |
+
"use_mel_posterior_encoder": false,
|
| 59 |
+
"use_duration_discriminator": false,
|
| 60 |
+
"use_wavlm_discriminator": true,
|
| 61 |
+
"inter_channels": 192,
|
| 62 |
+
"hidden_channels": 192,
|
| 63 |
+
"filter_channels": 768,
|
| 64 |
+
"n_heads": 2,
|
| 65 |
+
"n_layers": 6,
|
| 66 |
+
"kernel_size": 3,
|
| 67 |
+
"p_dropout": 0.1,
|
| 68 |
+
"resblock": "1",
|
| 69 |
+
"resblock_kernel_sizes": [
|
| 70 |
+
3,
|
| 71 |
+
7,
|
| 72 |
+
11
|
| 73 |
+
],
|
| 74 |
+
"resblock_dilation_sizes": [
|
| 75 |
+
[
|
| 76 |
+
1,
|
| 77 |
+
3,
|
| 78 |
+
5
|
| 79 |
+
],
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
]
|
| 90 |
+
],
|
| 91 |
+
"upsample_rates": [
|
| 92 |
+
8,
|
| 93 |
+
8,
|
| 94 |
+
2,
|
| 95 |
+
2,
|
| 96 |
+
2
|
| 97 |
+
],
|
| 98 |
+
"upsample_initial_channel": 512,
|
| 99 |
+
"upsample_kernel_sizes": [
|
| 100 |
+
16,
|
| 101 |
+
16,
|
| 102 |
+
8,
|
| 103 |
+
2,
|
| 104 |
+
2
|
| 105 |
+
],
|
| 106 |
+
"n_layers_q": 3,
|
| 107 |
+
"use_spectral_norm": false,
|
| 108 |
+
"gin_channels": 512,
|
| 109 |
+
"slm": {
|
| 110 |
+
"model": "microsoft/wavlm-base-plus",
|
| 111 |
+
"sr": 16000,
|
| 112 |
+
"hidden": 768,
|
| 113 |
+
"nlayers": 13,
|
| 114 |
+
"initial_channel": 64
|
| 115 |
+
}
|
| 116 |
+
},
|
| 117 |
+
"version": "2.7.0-JP-Extra"
|
| 118 |
+
}
|
onnx/speakers/amitaro_punsuka/duration_predictor.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7b6accd699eecf9b44c92a0fd497154aa5c6063e99937ee58d0c78d58f60b166
|
| 3 |
+
size 31503027
|
onnx/speakers/amitaro_punsuka/style_vectors.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0cdbc3e625231b57adb16c89499bd0f01fb74acac42abbd1c0aceb96346bfa68
|
| 3 |
+
size 1152
|
onnx/speakers/amitaro_punsuka/synthesizer.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:241d74d32ea37a0c1cb3d37b0569563c9712d08514fe68136cd2f1a2743971bc
|
| 3 |
+
size 250644697
|
onnx/speakers/amitaro_runrun/config.json
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "hayakoe_amitaro_runrun",
|
| 3 |
+
"train": {
|
| 4 |
+
"log_interval": 200,
|
| 5 |
+
"eval_interval": 500,
|
| 6 |
+
"seed": 42,
|
| 7 |
+
"epochs": 21,
|
| 8 |
+
"learning_rate": 0.0001,
|
| 9 |
+
"betas": [
|
| 10 |
+
0.8,
|
| 11 |
+
0.99
|
| 12 |
+
],
|
| 13 |
+
"eps": 1e-09,
|
| 14 |
+
"batch_size": 2,
|
| 15 |
+
"bf16_run": false,
|
| 16 |
+
"fp16_run": false,
|
| 17 |
+
"lr_decay": 0.99996,
|
| 18 |
+
"segment_size": 16384,
|
| 19 |
+
"init_lr_ratio": 1,
|
| 20 |
+
"warmup_epochs": 0,
|
| 21 |
+
"c_mel": 45,
|
| 22 |
+
"c_kl": 1.0,
|
| 23 |
+
"c_commit": 100,
|
| 24 |
+
"skip_optimizer": false,
|
| 25 |
+
"freeze_ZH_bert": false,
|
| 26 |
+
"freeze_JP_bert": false,
|
| 27 |
+
"freeze_EN_bert": false,
|
| 28 |
+
"freeze_emo": false,
|
| 29 |
+
"freeze_style": false,
|
| 30 |
+
"freeze_decoder": false
|
| 31 |
+
},
|
| 32 |
+
"data": {
|
| 33 |
+
"use_jp_extra": true,
|
| 34 |
+
"training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_runrun/train.list",
|
| 35 |
+
"validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_runrun/val.list",
|
| 36 |
+
"max_wav_value": 32768.0,
|
| 37 |
+
"sampling_rate": 44100,
|
| 38 |
+
"filter_length": 2048,
|
| 39 |
+
"hop_length": 512,
|
| 40 |
+
"win_length": 2048,
|
| 41 |
+
"n_mel_channels": 128,
|
| 42 |
+
"mel_fmin": 0.0,
|
| 43 |
+
"mel_fmax": null,
|
| 44 |
+
"add_blank": true,
|
| 45 |
+
"n_speakers": 1,
|
| 46 |
+
"cleaned_text": true,
|
| 47 |
+
"spk2id": {
|
| 48 |
+
"amitaro_runrun": 0
|
| 49 |
+
},
|
| 50 |
+
"num_styles": 1,
|
| 51 |
+
"style2id": {
|
| 52 |
+
"Neutral": 0
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"model": {
|
| 56 |
+
"use_spk_conditioned_encoder": true,
|
| 57 |
+
"use_noise_scaled_mas": true,
|
| 58 |
+
"use_mel_posterior_encoder": false,
|
| 59 |
+
"use_duration_discriminator": false,
|
| 60 |
+
"use_wavlm_discriminator": true,
|
| 61 |
+
"inter_channels": 192,
|
| 62 |
+
"hidden_channels": 192,
|
| 63 |
+
"filter_channels": 768,
|
| 64 |
+
"n_heads": 2,
|
| 65 |
+
"n_layers": 6,
|
| 66 |
+
"kernel_size": 3,
|
| 67 |
+
"p_dropout": 0.1,
|
| 68 |
+
"resblock": "1",
|
| 69 |
+
"resblock_kernel_sizes": [
|
| 70 |
+
3,
|
| 71 |
+
7,
|
| 72 |
+
11
|
| 73 |
+
],
|
| 74 |
+
"resblock_dilation_sizes": [
|
| 75 |
+
[
|
| 76 |
+
1,
|
| 77 |
+
3,
|
| 78 |
+
5
|
| 79 |
+
],
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
]
|
| 90 |
+
],
|
| 91 |
+
"upsample_rates": [
|
| 92 |
+
8,
|
| 93 |
+
8,
|
| 94 |
+
2,
|
| 95 |
+
2,
|
| 96 |
+
2
|
| 97 |
+
],
|
| 98 |
+
"upsample_initial_channel": 512,
|
| 99 |
+
"upsample_kernel_sizes": [
|
| 100 |
+
16,
|
| 101 |
+
16,
|
| 102 |
+
8,
|
| 103 |
+
2,
|
| 104 |
+
2
|
| 105 |
+
],
|
| 106 |
+
"n_layers_q": 3,
|
| 107 |
+
"use_spectral_norm": false,
|
| 108 |
+
"gin_channels": 512,
|
| 109 |
+
"slm": {
|
| 110 |
+
"model": "microsoft/wavlm-base-plus",
|
| 111 |
+
"sr": 16000,
|
| 112 |
+
"hidden": 768,
|
| 113 |
+
"nlayers": 13,
|
| 114 |
+
"initial_channel": 64
|
| 115 |
+
}
|
| 116 |
+
},
|
| 117 |
+
"version": "2.7.0-JP-Extra"
|
| 118 |
+
}
|
onnx/speakers/amitaro_runrun/duration_predictor.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:97809075bf8aad4a482fe20bd2722a234d4a8ad916ad5e88fa70105d817430d5
|
| 3 |
+
size 31503027
|
onnx/speakers/amitaro_runrun/style_vectors.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b59a7286e5ea0c549acbdb086aa52fd227b6e270ec20918bc8cc63b61cbf64b8
|
| 3 |
+
size 1152
|
onnx/speakers/amitaro_runrun/synthesizer.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a183206be69b208edd3a229e884c85770304ca21ae8ee0b3b0e930700fd2ce44
|
| 3 |
+
size 250644697
|
onnx/speakers/amitaro_sasayaki_a/config.json
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "hayakoe_amitaro_sasayaki_a",
|
| 3 |
+
"train": {
|
| 4 |
+
"log_interval": 200,
|
| 5 |
+
"eval_interval": 500,
|
| 6 |
+
"seed": 42,
|
| 7 |
+
"epochs": 100,
|
| 8 |
+
"learning_rate": 0.0001,
|
| 9 |
+
"betas": [
|
| 10 |
+
0.8,
|
| 11 |
+
0.99
|
| 12 |
+
],
|
| 13 |
+
"eps": 1e-09,
|
| 14 |
+
"batch_size": 2,
|
| 15 |
+
"bf16_run": false,
|
| 16 |
+
"fp16_run": false,
|
| 17 |
+
"lr_decay": 0.99996,
|
| 18 |
+
"segment_size": 16384,
|
| 19 |
+
"init_lr_ratio": 1,
|
| 20 |
+
"warmup_epochs": 0,
|
| 21 |
+
"c_mel": 45,
|
| 22 |
+
"c_kl": 1.0,
|
| 23 |
+
"c_commit": 100,
|
| 24 |
+
"skip_optimizer": false,
|
| 25 |
+
"freeze_ZH_bert": false,
|
| 26 |
+
"freeze_JP_bert": false,
|
| 27 |
+
"freeze_EN_bert": false,
|
| 28 |
+
"freeze_emo": false,
|
| 29 |
+
"freeze_style": false,
|
| 30 |
+
"freeze_decoder": false
|
| 31 |
+
},
|
| 32 |
+
"data": {
|
| 33 |
+
"use_jp_extra": true,
|
| 34 |
+
"training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_sasayaki_a/train.list",
|
| 35 |
+
"validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_sasayaki_a/val.list",
|
| 36 |
+
"max_wav_value": 32768.0,
|
| 37 |
+
"sampling_rate": 44100,
|
| 38 |
+
"filter_length": 2048,
|
| 39 |
+
"hop_length": 512,
|
| 40 |
+
"win_length": 2048,
|
| 41 |
+
"n_mel_channels": 128,
|
| 42 |
+
"mel_fmin": 0.0,
|
| 43 |
+
"mel_fmax": null,
|
| 44 |
+
"add_blank": true,
|
| 45 |
+
"n_speakers": 1,
|
| 46 |
+
"cleaned_text": true,
|
| 47 |
+
"spk2id": {
|
| 48 |
+
"amitaro_sasayaki_a": 0
|
| 49 |
+
},
|
| 50 |
+
"num_styles": 1,
|
| 51 |
+
"style2id": {
|
| 52 |
+
"Neutral": 0
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"model": {
|
| 56 |
+
"use_spk_conditioned_encoder": true,
|
| 57 |
+
"use_noise_scaled_mas": true,
|
| 58 |
+
"use_mel_posterior_encoder": false,
|
| 59 |
+
"use_duration_discriminator": false,
|
| 60 |
+
"use_wavlm_discriminator": true,
|
| 61 |
+
"inter_channels": 192,
|
| 62 |
+
"hidden_channels": 192,
|
| 63 |
+
"filter_channels": 768,
|
| 64 |
+
"n_heads": 2,
|
| 65 |
+
"n_layers": 6,
|
| 66 |
+
"kernel_size": 3,
|
| 67 |
+
"p_dropout": 0.1,
|
| 68 |
+
"resblock": "1",
|
| 69 |
+
"resblock_kernel_sizes": [
|
| 70 |
+
3,
|
| 71 |
+
7,
|
| 72 |
+
11
|
| 73 |
+
],
|
| 74 |
+
"resblock_dilation_sizes": [
|
| 75 |
+
[
|
| 76 |
+
1,
|
| 77 |
+
3,
|
| 78 |
+
5
|
| 79 |
+
],
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
]
|
| 90 |
+
],
|
| 91 |
+
"upsample_rates": [
|
| 92 |
+
8,
|
| 93 |
+
8,
|
| 94 |
+
2,
|
| 95 |
+
2,
|
| 96 |
+
2
|
| 97 |
+
],
|
| 98 |
+
"upsample_initial_channel": 512,
|
| 99 |
+
"upsample_kernel_sizes": [
|
| 100 |
+
16,
|
| 101 |
+
16,
|
| 102 |
+
8,
|
| 103 |
+
2,
|
| 104 |
+
2
|
| 105 |
+
],
|
| 106 |
+
"n_layers_q": 3,
|
| 107 |
+
"use_spectral_norm": false,
|
| 108 |
+
"gin_channels": 512,
|
| 109 |
+
"slm": {
|
| 110 |
+
"model": "microsoft/wavlm-base-plus",
|
| 111 |
+
"sr": 16000,
|
| 112 |
+
"hidden": 768,
|
| 113 |
+
"nlayers": 13,
|
| 114 |
+
"initial_channel": 64
|
| 115 |
+
}
|
| 116 |
+
},
|
| 117 |
+
"version": "2.7.0-JP-Extra"
|
| 118 |
+
}
|
onnx/speakers/amitaro_sasayaki_a/duration_predictor.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3b1331719136678ce093301b88364ebfdca6cf70287cbb3b97e1e07d641cb29a
|
| 3 |
+
size 31503027
|
onnx/speakers/amitaro_sasayaki_a/style_vectors.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ab461895545a48c61e6d96ba8e9dc717d9e7edc06e42a549b934ed14f9c84c86
|
| 3 |
+
size 1152
|
onnx/speakers/amitaro_sasayaki_a/synthesizer.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66b43fcf188b152cd9011d6c2d1ac3e16eb9e043589b28c7fe2bb729379afaa0
|
| 3 |
+
size 250644697
|
onnx/speakers/amitaro_sasayaki_b/config.json
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "hayakoe_amitaro_sasayaki_b",
|
| 3 |
+
"train": {
|
| 4 |
+
"log_interval": 200,
|
| 5 |
+
"eval_interval": 500,
|
| 6 |
+
"seed": 42,
|
| 7 |
+
"epochs": 100,
|
| 8 |
+
"learning_rate": 0.0001,
|
| 9 |
+
"betas": [
|
| 10 |
+
0.8,
|
| 11 |
+
0.99
|
| 12 |
+
],
|
| 13 |
+
"eps": 1e-09,
|
| 14 |
+
"batch_size": 2,
|
| 15 |
+
"bf16_run": false,
|
| 16 |
+
"fp16_run": false,
|
| 17 |
+
"lr_decay": 0.99996,
|
| 18 |
+
"segment_size": 16384,
|
| 19 |
+
"init_lr_ratio": 1,
|
| 20 |
+
"warmup_epochs": 0,
|
| 21 |
+
"c_mel": 45,
|
| 22 |
+
"c_kl": 1.0,
|
| 23 |
+
"c_commit": 100,
|
| 24 |
+
"skip_optimizer": false,
|
| 25 |
+
"freeze_ZH_bert": false,
|
| 26 |
+
"freeze_JP_bert": false,
|
| 27 |
+
"freeze_EN_bert": false,
|
| 28 |
+
"freeze_emo": false,
|
| 29 |
+
"freeze_style": false,
|
| 30 |
+
"freeze_decoder": false
|
| 31 |
+
},
|
| 32 |
+
"data": {
|
| 33 |
+
"use_jp_extra": true,
|
| 34 |
+
"training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_sasayaki_b/train.list",
|
| 35 |
+
"validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_sasayaki_b/val.list",
|
| 36 |
+
"max_wav_value": 32768.0,
|
| 37 |
+
"sampling_rate": 44100,
|
| 38 |
+
"filter_length": 2048,
|
| 39 |
+
"hop_length": 512,
|
| 40 |
+
"win_length": 2048,
|
| 41 |
+
"n_mel_channels": 128,
|
| 42 |
+
"mel_fmin": 0.0,
|
| 43 |
+
"mel_fmax": null,
|
| 44 |
+
"add_blank": true,
|
| 45 |
+
"n_speakers": 1,
|
| 46 |
+
"cleaned_text": true,
|
| 47 |
+
"spk2id": {
|
| 48 |
+
"amitaro_sasayaki_b": 0
|
| 49 |
+
},
|
| 50 |
+
"num_styles": 1,
|
| 51 |
+
"style2id": {
|
| 52 |
+
"Neutral": 0
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"model": {
|
| 56 |
+
"use_spk_conditioned_encoder": true,
|
| 57 |
+
"use_noise_scaled_mas": true,
|
| 58 |
+
"use_mel_posterior_encoder": false,
|
| 59 |
+
"use_duration_discriminator": false,
|
| 60 |
+
"use_wavlm_discriminator": true,
|
| 61 |
+
"inter_channels": 192,
|
| 62 |
+
"hidden_channels": 192,
|
| 63 |
+
"filter_channels": 768,
|
| 64 |
+
"n_heads": 2,
|
| 65 |
+
"n_layers": 6,
|
| 66 |
+
"kernel_size": 3,
|
| 67 |
+
"p_dropout": 0.1,
|
| 68 |
+
"resblock": "1",
|
| 69 |
+
"resblock_kernel_sizes": [
|
| 70 |
+
3,
|
| 71 |
+
7,
|
| 72 |
+
11
|
| 73 |
+
],
|
| 74 |
+
"resblock_dilation_sizes": [
|
| 75 |
+
[
|
| 76 |
+
1,
|
| 77 |
+
3,
|
| 78 |
+
5
|
| 79 |
+
],
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
]
|
| 90 |
+
],
|
| 91 |
+
"upsample_rates": [
|
| 92 |
+
8,
|
| 93 |
+
8,
|
| 94 |
+
2,
|
| 95 |
+
2,
|
| 96 |
+
2
|
| 97 |
+
],
|
| 98 |
+
"upsample_initial_channel": 512,
|
| 99 |
+
"upsample_kernel_sizes": [
|
| 100 |
+
16,
|
| 101 |
+
16,
|
| 102 |
+
8,
|
| 103 |
+
2,
|
| 104 |
+
2
|
| 105 |
+
],
|
| 106 |
+
"n_layers_q": 3,
|
| 107 |
+
"use_spectral_norm": false,
|
| 108 |
+
"gin_channels": 512,
|
| 109 |
+
"slm": {
|
| 110 |
+
"model": "microsoft/wavlm-base-plus",
|
| 111 |
+
"sr": 16000,
|
| 112 |
+
"hidden": 768,
|
| 113 |
+
"nlayers": 13,
|
| 114 |
+
"initial_channel": 64
|
| 115 |
+
}
|
| 116 |
+
},
|
| 117 |
+
"version": "2.7.0-JP-Extra"
|
| 118 |
+
}
|
onnx/speakers/amitaro_sasayaki_b/duration_predictor.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7273327640ddb5345963ae5fea5a5517c65d7efa57c35db3811634f727180975
|
| 3 |
+
size 31503027
|
onnx/speakers/amitaro_sasayaki_b/style_vectors.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3fac4742f009abb2d8a3fb007319887ee664634c6eed7816a9b72b8c90bdcde3
|
| 3 |
+
size 1152
|
onnx/speakers/amitaro_sasayaki_b/synthesizer.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:742e7ff7a39b7218a19ba3120c1281dd6eb8726b89c872ad3eeee1068006bd1b
|
| 3 |
+
size 250644697
|
onnx/speakers/amitaro_yofukashi/config.json
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "hayakoe_amitaro_yofukashi",
|
| 3 |
+
"train": {
|
| 4 |
+
"log_interval": 200,
|
| 5 |
+
"eval_interval": 500,
|
| 6 |
+
"seed": 42,
|
| 7 |
+
"epochs": 21,
|
| 8 |
+
"learning_rate": 0.0001,
|
| 9 |
+
"betas": [
|
| 10 |
+
0.8,
|
| 11 |
+
0.99
|
| 12 |
+
],
|
| 13 |
+
"eps": 1e-09,
|
| 14 |
+
"batch_size": 2,
|
| 15 |
+
"bf16_run": false,
|
| 16 |
+
"fp16_run": false,
|
| 17 |
+
"lr_decay": 0.99996,
|
| 18 |
+
"segment_size": 16384,
|
| 19 |
+
"init_lr_ratio": 1,
|
| 20 |
+
"warmup_epochs": 0,
|
| 21 |
+
"c_mel": 45,
|
| 22 |
+
"c_kl": 1.0,
|
| 23 |
+
"c_commit": 100,
|
| 24 |
+
"skip_optimizer": false,
|
| 25 |
+
"freeze_ZH_bert": false,
|
| 26 |
+
"freeze_JP_bert": false,
|
| 27 |
+
"freeze_EN_bert": false,
|
| 28 |
+
"freeze_emo": false,
|
| 29 |
+
"freeze_style": false,
|
| 30 |
+
"freeze_decoder": false
|
| 31 |
+
},
|
| 32 |
+
"data": {
|
| 33 |
+
"use_jp_extra": true,
|
| 34 |
+
"training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_yofukashi/train.list",
|
| 35 |
+
"validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_yofukashi/val.list",
|
| 36 |
+
"max_wav_value": 32768.0,
|
| 37 |
+
"sampling_rate": 44100,
|
| 38 |
+
"filter_length": 2048,
|
| 39 |
+
"hop_length": 512,
|
| 40 |
+
"win_length": 2048,
|
| 41 |
+
"n_mel_channels": 128,
|
| 42 |
+
"mel_fmin": 0.0,
|
| 43 |
+
"mel_fmax": null,
|
| 44 |
+
"add_blank": true,
|
| 45 |
+
"n_speakers": 1,
|
| 46 |
+
"cleaned_text": true,
|
| 47 |
+
"spk2id": {
|
| 48 |
+
"amitaro_yofukashi": 0
|
| 49 |
+
},
|
| 50 |
+
"num_styles": 1,
|
| 51 |
+
"style2id": {
|
| 52 |
+
"Neutral": 0
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"model": {
|
| 56 |
+
"use_spk_conditioned_encoder": true,
|
| 57 |
+
"use_noise_scaled_mas": true,
|
| 58 |
+
"use_mel_posterior_encoder": false,
|
| 59 |
+
"use_duration_discriminator": false,
|
| 60 |
+
"use_wavlm_discriminator": true,
|
| 61 |
+
"inter_channels": 192,
|
| 62 |
+
"hidden_channels": 192,
|
| 63 |
+
"filter_channels": 768,
|
| 64 |
+
"n_heads": 2,
|
| 65 |
+
"n_layers": 6,
|
| 66 |
+
"kernel_size": 3,
|
| 67 |
+
"p_dropout": 0.1,
|
| 68 |
+
"resblock": "1",
|
| 69 |
+
"resblock_kernel_sizes": [
|
| 70 |
+
3,
|
| 71 |
+
7,
|
| 72 |
+
11
|
| 73 |
+
],
|
| 74 |
+
"resblock_dilation_sizes": [
|
| 75 |
+
[
|
| 76 |
+
1,
|
| 77 |
+
3,
|
| 78 |
+
5
|
| 79 |
+
],
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
]
|
| 90 |
+
],
|
| 91 |
+
"upsample_rates": [
|
| 92 |
+
8,
|
| 93 |
+
8,
|
| 94 |
+
2,
|
| 95 |
+
2,
|
| 96 |
+
2
|
| 97 |
+
],
|
| 98 |
+
"upsample_initial_channel": 512,
|
| 99 |
+
"upsample_kernel_sizes": [
|
| 100 |
+
16,
|
| 101 |
+
16,
|
| 102 |
+
8,
|
| 103 |
+
2,
|
| 104 |
+
2
|
| 105 |
+
],
|
| 106 |
+
"n_layers_q": 3,
|
| 107 |
+
"use_spectral_norm": false,
|
| 108 |
+
"gin_channels": 512,
|
| 109 |
+
"slm": {
|
| 110 |
+
"model": "microsoft/wavlm-base-plus",
|
| 111 |
+
"sr": 16000,
|
| 112 |
+
"hidden": 768,
|
| 113 |
+
"nlayers": 13,
|
| 114 |
+
"initial_channel": 64
|
| 115 |
+
}
|
| 116 |
+
},
|
| 117 |
+
"version": "2.7.0-JP-Extra"
|
| 118 |
+
}
|
onnx/speakers/amitaro_yofukashi/duration_predictor.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8899e93605fc44ce3e058882a3fd09110973416fa9510104a58a1d5c5a5777b1
|
| 3 |
+
size 31503027
|
onnx/speakers/amitaro_yofukashi/style_vectors.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c249725ac0de81417da52cf0b3211bb7a14610f534df933f0d4a307fcaa659a0
|
| 3 |
+
size 1152
|
onnx/speakers/amitaro_yofukashi/synthesizer.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:999faade6b925fb9008c8a97fd87e95e6e20266b4d142b106edc0f9fc4b18ba0
|
| 3 |
+
size 250644697
|
pytorch/speakers/amitaro_normal/config.json
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "hayakoe_amitaro_normal",
|
| 3 |
+
"train": {
|
| 4 |
+
"log_interval": 200,
|
| 5 |
+
"eval_interval": 500,
|
| 6 |
+
"seed": 42,
|
| 7 |
+
"epochs": 21,
|
| 8 |
+
"learning_rate": 0.0001,
|
| 9 |
+
"betas": [
|
| 10 |
+
0.8,
|
| 11 |
+
0.99
|
| 12 |
+
],
|
| 13 |
+
"eps": 1e-09,
|
| 14 |
+
"batch_size": 2,
|
| 15 |
+
"bf16_run": false,
|
| 16 |
+
"fp16_run": false,
|
| 17 |
+
"lr_decay": 0.99996,
|
| 18 |
+
"segment_size": 16384,
|
| 19 |
+
"init_lr_ratio": 1,
|
| 20 |
+
"warmup_epochs": 0,
|
| 21 |
+
"c_mel": 45,
|
| 22 |
+
"c_kl": 1.0,
|
| 23 |
+
"c_commit": 100,
|
| 24 |
+
"skip_optimizer": false,
|
| 25 |
+
"freeze_ZH_bert": false,
|
| 26 |
+
"freeze_JP_bert": false,
|
| 27 |
+
"freeze_EN_bert": false,
|
| 28 |
+
"freeze_emo": false,
|
| 29 |
+
"freeze_style": false,
|
| 30 |
+
"freeze_decoder": false
|
| 31 |
+
},
|
| 32 |
+
"data": {
|
| 33 |
+
"use_jp_extra": true,
|
| 34 |
+
"training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_normal/train.list",
|
| 35 |
+
"validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_normal/val.list",
|
| 36 |
+
"max_wav_value": 32768.0,
|
| 37 |
+
"sampling_rate": 44100,
|
| 38 |
+
"filter_length": 2048,
|
| 39 |
+
"hop_length": 512,
|
| 40 |
+
"win_length": 2048,
|
| 41 |
+
"n_mel_channels": 128,
|
| 42 |
+
"mel_fmin": 0.0,
|
| 43 |
+
"mel_fmax": null,
|
| 44 |
+
"add_blank": true,
|
| 45 |
+
"n_speakers": 1,
|
| 46 |
+
"cleaned_text": true,
|
| 47 |
+
"spk2id": {
|
| 48 |
+
"amitaro_normal": 0
|
| 49 |
+
},
|
| 50 |
+
"num_styles": 1,
|
| 51 |
+
"style2id": {
|
| 52 |
+
"Neutral": 0
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"model": {
|
| 56 |
+
"use_spk_conditioned_encoder": true,
|
| 57 |
+
"use_noise_scaled_mas": true,
|
| 58 |
+
"use_mel_posterior_encoder": false,
|
| 59 |
+
"use_duration_discriminator": false,
|
| 60 |
+
"use_wavlm_discriminator": true,
|
| 61 |
+
"inter_channels": 192,
|
| 62 |
+
"hidden_channels": 192,
|
| 63 |
+
"filter_channels": 768,
|
| 64 |
+
"n_heads": 2,
|
| 65 |
+
"n_layers": 6,
|
| 66 |
+
"kernel_size": 3,
|
| 67 |
+
"p_dropout": 0.1,
|
| 68 |
+
"resblock": "1",
|
| 69 |
+
"resblock_kernel_sizes": [
|
| 70 |
+
3,
|
| 71 |
+
7,
|
| 72 |
+
11
|
| 73 |
+
],
|
| 74 |
+
"resblock_dilation_sizes": [
|
| 75 |
+
[
|
| 76 |
+
1,
|
| 77 |
+
3,
|
| 78 |
+
5
|
| 79 |
+
],
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
]
|
| 90 |
+
],
|
| 91 |
+
"upsample_rates": [
|
| 92 |
+
8,
|
| 93 |
+
8,
|
| 94 |
+
2,
|
| 95 |
+
2,
|
| 96 |
+
2
|
| 97 |
+
],
|
| 98 |
+
"upsample_initial_channel": 512,
|
| 99 |
+
"upsample_kernel_sizes": [
|
| 100 |
+
16,
|
| 101 |
+
16,
|
| 102 |
+
8,
|
| 103 |
+
2,
|
| 104 |
+
2
|
| 105 |
+
],
|
| 106 |
+
"n_layers_q": 3,
|
| 107 |
+
"use_spectral_norm": false,
|
| 108 |
+
"gin_channels": 512,
|
| 109 |
+
"slm": {
|
| 110 |
+
"model": "microsoft/wavlm-base-plus",
|
| 111 |
+
"sr": 16000,
|
| 112 |
+
"hidden": 768,
|
| 113 |
+
"nlayers": 13,
|
| 114 |
+
"initial_channel": 64
|
| 115 |
+
}
|
| 116 |
+
},
|
| 117 |
+
"version": "2.7.0-JP-Extra"
|
| 118 |
+
}
|
pytorch/speakers/amitaro_normal/hayakoe_amitaro_normal_e11_s2000.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:243d24c78d418b1b83001b5eb067e87372c398c25d36f65a5e47043dbf3484c5
|
| 3 |
+
size 251155732
|
pytorch/speakers/amitaro_normal/style_vectors.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28f9059b9da6c20dd891a2f22df2bfe77311a51d337623bd88e701de2f942643
|
| 3 |
+
size 1152
|
pytorch/speakers/amitaro_punsuka/config.json
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "hayakoe_amitaro_punsuka",
|
| 3 |
+
"train": {
|
| 4 |
+
"log_interval": 200,
|
| 5 |
+
"eval_interval": 500,
|
| 6 |
+
"seed": 42,
|
| 7 |
+
"epochs": 11,
|
| 8 |
+
"learning_rate": 0.0001,
|
| 9 |
+
"betas": [
|
| 10 |
+
0.8,
|
| 11 |
+
0.99
|
| 12 |
+
],
|
| 13 |
+
"eps": 1e-09,
|
| 14 |
+
"batch_size": 2,
|
| 15 |
+
"bf16_run": false,
|
| 16 |
+
"fp16_run": false,
|
| 17 |
+
"lr_decay": 0.99996,
|
| 18 |
+
"segment_size": 16384,
|
| 19 |
+
"init_lr_ratio": 1,
|
| 20 |
+
"warmup_epochs": 0,
|
| 21 |
+
"c_mel": 45,
|
| 22 |
+
"c_kl": 1.0,
|
| 23 |
+
"c_commit": 100,
|
| 24 |
+
"skip_optimizer": false,
|
| 25 |
+
"freeze_ZH_bert": false,
|
| 26 |
+
"freeze_JP_bert": false,
|
| 27 |
+
"freeze_EN_bert": false,
|
| 28 |
+
"freeze_emo": false,
|
| 29 |
+
"freeze_style": false,
|
| 30 |
+
"freeze_decoder": false
|
| 31 |
+
},
|
| 32 |
+
"data": {
|
| 33 |
+
"use_jp_extra": true,
|
| 34 |
+
"training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_punsuka/train.list",
|
| 35 |
+
"validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_punsuka/val.list",
|
| 36 |
+
"max_wav_value": 32768.0,
|
| 37 |
+
"sampling_rate": 44100,
|
| 38 |
+
"filter_length": 2048,
|
| 39 |
+
"hop_length": 512,
|
| 40 |
+
"win_length": 2048,
|
| 41 |
+
"n_mel_channels": 128,
|
| 42 |
+
"mel_fmin": 0.0,
|
| 43 |
+
"mel_fmax": null,
|
| 44 |
+
"add_blank": true,
|
| 45 |
+
"n_speakers": 1,
|
| 46 |
+
"cleaned_text": true,
|
| 47 |
+
"spk2id": {
|
| 48 |
+
"amitaro_punsuka": 0
|
| 49 |
+
},
|
| 50 |
+
"num_styles": 1,
|
| 51 |
+
"style2id": {
|
| 52 |
+
"Neutral": 0
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"model": {
|
| 56 |
+
"use_spk_conditioned_encoder": true,
|
| 57 |
+
"use_noise_scaled_mas": true,
|
| 58 |
+
"use_mel_posterior_encoder": false,
|
| 59 |
+
"use_duration_discriminator": false,
|
| 60 |
+
"use_wavlm_discriminator": true,
|
| 61 |
+
"inter_channels": 192,
|
| 62 |
+
"hidden_channels": 192,
|
| 63 |
+
"filter_channels": 768,
|
| 64 |
+
"n_heads": 2,
|
| 65 |
+
"n_layers": 6,
|
| 66 |
+
"kernel_size": 3,
|
| 67 |
+
"p_dropout": 0.1,
|
| 68 |
+
"resblock": "1",
|
| 69 |
+
"resblock_kernel_sizes": [
|
| 70 |
+
3,
|
| 71 |
+
7,
|
| 72 |
+
11
|
| 73 |
+
],
|
| 74 |
+
"resblock_dilation_sizes": [
|
| 75 |
+
[
|
| 76 |
+
1,
|
| 77 |
+
3,
|
| 78 |
+
5
|
| 79 |
+
],
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
]
|
| 90 |
+
],
|
| 91 |
+
"upsample_rates": [
|
| 92 |
+
8,
|
| 93 |
+
8,
|
| 94 |
+
2,
|
| 95 |
+
2,
|
| 96 |
+
2
|
| 97 |
+
],
|
| 98 |
+
"upsample_initial_channel": 512,
|
| 99 |
+
"upsample_kernel_sizes": [
|
| 100 |
+
16,
|
| 101 |
+
16,
|
| 102 |
+
8,
|
| 103 |
+
2,
|
| 104 |
+
2
|
| 105 |
+
],
|
| 106 |
+
"n_layers_q": 3,
|
| 107 |
+
"use_spectral_norm": false,
|
| 108 |
+
"gin_channels": 512,
|
| 109 |
+
"slm": {
|
| 110 |
+
"model": "microsoft/wavlm-base-plus",
|
| 111 |
+
"sr": 16000,
|
| 112 |
+
"hidden": 768,
|
| 113 |
+
"nlayers": 13,
|
| 114 |
+
"initial_channel": 64
|
| 115 |
+
}
|
| 116 |
+
},
|
| 117 |
+
"version": "2.7.0-JP-Extra"
|
| 118 |
+
}
|
pytorch/speakers/amitaro_punsuka/hayakoe_amitaro_punsuka_e11_s2000.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:caa8a3a78795cab1b9d4c7507afab5b4dcf7f594c08fa49e826b917588db3cbe
|
| 3 |
+
size 251155732
|
pytorch/speakers/amitaro_punsuka/style_vectors.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0cdbc3e625231b57adb16c89499bd0f01fb74acac42abbd1c0aceb96346bfa68
|
| 3 |
+
size 1152
|
pytorch/speakers/amitaro_runrun/config.json
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "hayakoe_amitaro_runrun",
|
| 3 |
+
"train": {
|
| 4 |
+
"log_interval": 200,
|
| 5 |
+
"eval_interval": 500,
|
| 6 |
+
"seed": 42,
|
| 7 |
+
"epochs": 21,
|
| 8 |
+
"learning_rate": 0.0001,
|
| 9 |
+
"betas": [
|
| 10 |
+
0.8,
|
| 11 |
+
0.99
|
| 12 |
+
],
|
| 13 |
+
"eps": 1e-09,
|
| 14 |
+
"batch_size": 2,
|
| 15 |
+
"bf16_run": false,
|
| 16 |
+
"fp16_run": false,
|
| 17 |
+
"lr_decay": 0.99996,
|
| 18 |
+
"segment_size": 16384,
|
| 19 |
+
"init_lr_ratio": 1,
|
| 20 |
+
"warmup_epochs": 0,
|
| 21 |
+
"c_mel": 45,
|
| 22 |
+
"c_kl": 1.0,
|
| 23 |
+
"c_commit": 100,
|
| 24 |
+
"skip_optimizer": false,
|
| 25 |
+
"freeze_ZH_bert": false,
|
| 26 |
+
"freeze_JP_bert": false,
|
| 27 |
+
"freeze_EN_bert": false,
|
| 28 |
+
"freeze_emo": false,
|
| 29 |
+
"freeze_style": false,
|
| 30 |
+
"freeze_decoder": false
|
| 31 |
+
},
|
| 32 |
+
"data": {
|
| 33 |
+
"use_jp_extra": true,
|
| 34 |
+
"training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_runrun/train.list",
|
| 35 |
+
"validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_runrun/val.list",
|
| 36 |
+
"max_wav_value": 32768.0,
|
| 37 |
+
"sampling_rate": 44100,
|
| 38 |
+
"filter_length": 2048,
|
| 39 |
+
"hop_length": 512,
|
| 40 |
+
"win_length": 2048,
|
| 41 |
+
"n_mel_channels": 128,
|
| 42 |
+
"mel_fmin": 0.0,
|
| 43 |
+
"mel_fmax": null,
|
| 44 |
+
"add_blank": true,
|
| 45 |
+
"n_speakers": 1,
|
| 46 |
+
"cleaned_text": true,
|
| 47 |
+
"spk2id": {
|
| 48 |
+
"amitaro_runrun": 0
|
| 49 |
+
},
|
| 50 |
+
"num_styles": 1,
|
| 51 |
+
"style2id": {
|
| 52 |
+
"Neutral": 0
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"model": {
|
| 56 |
+
"use_spk_conditioned_encoder": true,
|
| 57 |
+
"use_noise_scaled_mas": true,
|
| 58 |
+
"use_mel_posterior_encoder": false,
|
| 59 |
+
"use_duration_discriminator": false,
|
| 60 |
+
"use_wavlm_discriminator": true,
|
| 61 |
+
"inter_channels": 192,
|
| 62 |
+
"hidden_channels": 192,
|
| 63 |
+
"filter_channels": 768,
|
| 64 |
+
"n_heads": 2,
|
| 65 |
+
"n_layers": 6,
|
| 66 |
+
"kernel_size": 3,
|
| 67 |
+
"p_dropout": 0.1,
|
| 68 |
+
"resblock": "1",
|
| 69 |
+
"resblock_kernel_sizes": [
|
| 70 |
+
3,
|
| 71 |
+
7,
|
| 72 |
+
11
|
| 73 |
+
],
|
| 74 |
+
"resblock_dilation_sizes": [
|
| 75 |
+
[
|
| 76 |
+
1,
|
| 77 |
+
3,
|
| 78 |
+
5
|
| 79 |
+
],
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
]
|
| 90 |
+
],
|
| 91 |
+
"upsample_rates": [
|
| 92 |
+
8,
|
| 93 |
+
8,
|
| 94 |
+
2,
|
| 95 |
+
2,
|
| 96 |
+
2
|
| 97 |
+
],
|
| 98 |
+
"upsample_initial_channel": 512,
|
| 99 |
+
"upsample_kernel_sizes": [
|
| 100 |
+
16,
|
| 101 |
+
16,
|
| 102 |
+
8,
|
| 103 |
+
2,
|
| 104 |
+
2
|
| 105 |
+
],
|
| 106 |
+
"n_layers_q": 3,
|
| 107 |
+
"use_spectral_norm": false,
|
| 108 |
+
"gin_channels": 512,
|
| 109 |
+
"slm": {
|
| 110 |
+
"model": "microsoft/wavlm-base-plus",
|
| 111 |
+
"sr": 16000,
|
| 112 |
+
"hidden": 768,
|
| 113 |
+
"nlayers": 13,
|
| 114 |
+
"initial_channel": 64
|
| 115 |
+
}
|
| 116 |
+
},
|
| 117 |
+
"version": "2.7.0-JP-Extra"
|
| 118 |
+
}
|
pytorch/speakers/amitaro_runrun/hayakoe_amitaro_runrun_e11_s2000.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:00ac68eff452b4c2a3d1301ed46de128aa2a1a36e401954d7bd1c08408408473
|
| 3 |
+
size 251155732
|
pytorch/speakers/amitaro_runrun/style_vectors.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b59a7286e5ea0c549acbdb086aa52fd227b6e270ec20918bc8cc63b61cbf64b8
|
| 3 |
+
size 1152
|
pytorch/speakers/amitaro_sasayaki_a/config.json
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "hayakoe_amitaro_sasayaki_a",
|
| 3 |
+
"train": {
|
| 4 |
+
"log_interval": 200,
|
| 5 |
+
"eval_interval": 500,
|
| 6 |
+
"seed": 42,
|
| 7 |
+
"epochs": 100,
|
| 8 |
+
"learning_rate": 0.0001,
|
| 9 |
+
"betas": [
|
| 10 |
+
0.8,
|
| 11 |
+
0.99
|
| 12 |
+
],
|
| 13 |
+
"eps": 1e-09,
|
| 14 |
+
"batch_size": 2,
|
| 15 |
+
"bf16_run": false,
|
| 16 |
+
"fp16_run": false,
|
| 17 |
+
"lr_decay": 0.99996,
|
| 18 |
+
"segment_size": 16384,
|
| 19 |
+
"init_lr_ratio": 1,
|
| 20 |
+
"warmup_epochs": 0,
|
| 21 |
+
"c_mel": 45,
|
| 22 |
+
"c_kl": 1.0,
|
| 23 |
+
"c_commit": 100,
|
| 24 |
+
"skip_optimizer": false,
|
| 25 |
+
"freeze_ZH_bert": false,
|
| 26 |
+
"freeze_JP_bert": false,
|
| 27 |
+
"freeze_EN_bert": false,
|
| 28 |
+
"freeze_emo": false,
|
| 29 |
+
"freeze_style": false,
|
| 30 |
+
"freeze_decoder": false
|
| 31 |
+
},
|
| 32 |
+
"data": {
|
| 33 |
+
"use_jp_extra": true,
|
| 34 |
+
"training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_sasayaki_a/train.list",
|
| 35 |
+
"validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_sasayaki_a/val.list",
|
| 36 |
+
"max_wav_value": 32768.0,
|
| 37 |
+
"sampling_rate": 44100,
|
| 38 |
+
"filter_length": 2048,
|
| 39 |
+
"hop_length": 512,
|
| 40 |
+
"win_length": 2048,
|
| 41 |
+
"n_mel_channels": 128,
|
| 42 |
+
"mel_fmin": 0.0,
|
| 43 |
+
"mel_fmax": null,
|
| 44 |
+
"add_blank": true,
|
| 45 |
+
"n_speakers": 1,
|
| 46 |
+
"cleaned_text": true,
|
| 47 |
+
"spk2id": {
|
| 48 |
+
"amitaro_sasayaki_a": 0
|
| 49 |
+
},
|
| 50 |
+
"num_styles": 1,
|
| 51 |
+
"style2id": {
|
| 52 |
+
"Neutral": 0
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"model": {
|
| 56 |
+
"use_spk_conditioned_encoder": true,
|
| 57 |
+
"use_noise_scaled_mas": true,
|
| 58 |
+
"use_mel_posterior_encoder": false,
|
| 59 |
+
"use_duration_discriminator": false,
|
| 60 |
+
"use_wavlm_discriminator": true,
|
| 61 |
+
"inter_channels": 192,
|
| 62 |
+
"hidden_channels": 192,
|
| 63 |
+
"filter_channels": 768,
|
| 64 |
+
"n_heads": 2,
|
| 65 |
+
"n_layers": 6,
|
| 66 |
+
"kernel_size": 3,
|
| 67 |
+
"p_dropout": 0.1,
|
| 68 |
+
"resblock": "1",
|
| 69 |
+
"resblock_kernel_sizes": [
|
| 70 |
+
3,
|
| 71 |
+
7,
|
| 72 |
+
11
|
| 73 |
+
],
|
| 74 |
+
"resblock_dilation_sizes": [
|
| 75 |
+
[
|
| 76 |
+
1,
|
| 77 |
+
3,
|
| 78 |
+
5
|
| 79 |
+
],
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
]
|
| 90 |
+
],
|
| 91 |
+
"upsample_rates": [
|
| 92 |
+
8,
|
| 93 |
+
8,
|
| 94 |
+
2,
|
| 95 |
+
2,
|
| 96 |
+
2
|
| 97 |
+
],
|
| 98 |
+
"upsample_initial_channel": 512,
|
| 99 |
+
"upsample_kernel_sizes": [
|
| 100 |
+
16,
|
| 101 |
+
16,
|
| 102 |
+
8,
|
| 103 |
+
2,
|
| 104 |
+
2
|
| 105 |
+
],
|
| 106 |
+
"n_layers_q": 3,
|
| 107 |
+
"use_spectral_norm": false,
|
| 108 |
+
"gin_channels": 512,
|
| 109 |
+
"slm": {
|
| 110 |
+
"model": "microsoft/wavlm-base-plus",
|
| 111 |
+
"sr": 16000,
|
| 112 |
+
"hidden": 768,
|
| 113 |
+
"nlayers": 13,
|
| 114 |
+
"initial_channel": 64
|
| 115 |
+
}
|
| 116 |
+
},
|
| 117 |
+
"version": "2.7.0-JP-Extra"
|
| 118 |
+
}
|
pytorch/speakers/amitaro_sasayaki_a/hayakoe_amitaro_sasayaki_a_e87_s4000.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8bdef1fb086b368918a67cbc098ccf07f12d21f5db56df697c74056ebe18e967
|
| 3 |
+
size 251155732
|
pytorch/speakers/amitaro_sasayaki_a/style_vectors.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ab461895545a48c61e6d96ba8e9dc717d9e7edc06e42a549b934ed14f9c84c86
|
| 3 |
+
size 1152
|
pytorch/speakers/amitaro_sasayaki_b/config.json
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "hayakoe_amitaro_sasayaki_b",
|
| 3 |
+
"train": {
|
| 4 |
+
"log_interval": 200,
|
| 5 |
+
"eval_interval": 500,
|
| 6 |
+
"seed": 42,
|
| 7 |
+
"epochs": 100,
|
| 8 |
+
"learning_rate": 0.0001,
|
| 9 |
+
"betas": [
|
| 10 |
+
0.8,
|
| 11 |
+
0.99
|
| 12 |
+
],
|
| 13 |
+
"eps": 1e-09,
|
| 14 |
+
"batch_size": 2,
|
| 15 |
+
"bf16_run": false,
|
| 16 |
+
"fp16_run": false,
|
| 17 |
+
"lr_decay": 0.99996,
|
| 18 |
+
"segment_size": 16384,
|
| 19 |
+
"init_lr_ratio": 1,
|
| 20 |
+
"warmup_epochs": 0,
|
| 21 |
+
"c_mel": 45,
|
| 22 |
+
"c_kl": 1.0,
|
| 23 |
+
"c_commit": 100,
|
| 24 |
+
"skip_optimizer": false,
|
| 25 |
+
"freeze_ZH_bert": false,
|
| 26 |
+
"freeze_JP_bert": false,
|
| 27 |
+
"freeze_EN_bert": false,
|
| 28 |
+
"freeze_emo": false,
|
| 29 |
+
"freeze_style": false,
|
| 30 |
+
"freeze_decoder": false
|
| 31 |
+
},
|
| 32 |
+
"data": {
|
| 33 |
+
"use_jp_extra": true,
|
| 34 |
+
"training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_sasayaki_b/train.list",
|
| 35 |
+
"validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_sasayaki_b/val.list",
|
| 36 |
+
"max_wav_value": 32768.0,
|
| 37 |
+
"sampling_rate": 44100,
|
| 38 |
+
"filter_length": 2048,
|
| 39 |
+
"hop_length": 512,
|
| 40 |
+
"win_length": 2048,
|
| 41 |
+
"n_mel_channels": 128,
|
| 42 |
+
"mel_fmin": 0.0,
|
| 43 |
+
"mel_fmax": null,
|
| 44 |
+
"add_blank": true,
|
| 45 |
+
"n_speakers": 1,
|
| 46 |
+
"cleaned_text": true,
|
| 47 |
+
"spk2id": {
|
| 48 |
+
"amitaro_sasayaki_b": 0
|
| 49 |
+
},
|
| 50 |
+
"num_styles": 1,
|
| 51 |
+
"style2id": {
|
| 52 |
+
"Neutral": 0
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"model": {
|
| 56 |
+
"use_spk_conditioned_encoder": true,
|
| 57 |
+
"use_noise_scaled_mas": true,
|
| 58 |
+
"use_mel_posterior_encoder": false,
|
| 59 |
+
"use_duration_discriminator": false,
|
| 60 |
+
"use_wavlm_discriminator": true,
|
| 61 |
+
"inter_channels": 192,
|
| 62 |
+
"hidden_channels": 192,
|
| 63 |
+
"filter_channels": 768,
|
| 64 |
+
"n_heads": 2,
|
| 65 |
+
"n_layers": 6,
|
| 66 |
+
"kernel_size": 3,
|
| 67 |
+
"p_dropout": 0.1,
|
| 68 |
+
"resblock": "1",
|
| 69 |
+
"resblock_kernel_sizes": [
|
| 70 |
+
3,
|
| 71 |
+
7,
|
| 72 |
+
11
|
| 73 |
+
],
|
| 74 |
+
"resblock_dilation_sizes": [
|
| 75 |
+
[
|
| 76 |
+
1,
|
| 77 |
+
3,
|
| 78 |
+
5
|
| 79 |
+
],
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
]
|
| 90 |
+
],
|
| 91 |
+
"upsample_rates": [
|
| 92 |
+
8,
|
| 93 |
+
8,
|
| 94 |
+
2,
|
| 95 |
+
2,
|
| 96 |
+
2
|
| 97 |
+
],
|
| 98 |
+
"upsample_initial_channel": 512,
|
| 99 |
+
"upsample_kernel_sizes": [
|
| 100 |
+
16,
|
| 101 |
+
16,
|
| 102 |
+
8,
|
| 103 |
+
2,
|
| 104 |
+
2
|
| 105 |
+
],
|
| 106 |
+
"n_layers_q": 3,
|
| 107 |
+
"use_spectral_norm": false,
|
| 108 |
+
"gin_channels": 512,
|
| 109 |
+
"slm": {
|
| 110 |
+
"model": "microsoft/wavlm-base-plus",
|
| 111 |
+
"sr": 16000,
|
| 112 |
+
"hidden": 768,
|
| 113 |
+
"nlayers": 13,
|
| 114 |
+
"initial_channel": 64
|
| 115 |
+
}
|
| 116 |
+
},
|
| 117 |
+
"version": "2.7.0-JP-Extra"
|
| 118 |
+
}
|
pytorch/speakers/amitaro_sasayaki_b/hayakoe_amitaro_sasayaki_b_e98_s4500.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7b1212c0d916a57cc9f1b9b09c93576aabe1fd0fd5e3dc7be0fdfa114bfaffd
|
| 3 |
+
size 251155732
|
pytorch/speakers/amitaro_sasayaki_b/style_vectors.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3fac4742f009abb2d8a3fb007319887ee664634c6eed7816a9b72b8c90bdcde3
|
| 3 |
+
size 1152
|
pytorch/speakers/amitaro_yofukashi/config.json
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "hayakoe_amitaro_yofukashi",
|
| 3 |
+
"train": {
|
| 4 |
+
"log_interval": 200,
|
| 5 |
+
"eval_interval": 500,
|
| 6 |
+
"seed": 42,
|
| 7 |
+
"epochs": 21,
|
| 8 |
+
"learning_rate": 0.0001,
|
| 9 |
+
"betas": [
|
| 10 |
+
0.8,
|
| 11 |
+
0.99
|
| 12 |
+
],
|
| 13 |
+
"eps": 1e-09,
|
| 14 |
+
"batch_size": 2,
|
| 15 |
+
"bf16_run": false,
|
| 16 |
+
"fp16_run": false,
|
| 17 |
+
"lr_decay": 0.99996,
|
| 18 |
+
"segment_size": 16384,
|
| 19 |
+
"init_lr_ratio": 1,
|
| 20 |
+
"warmup_epochs": 0,
|
| 21 |
+
"c_mel": 45,
|
| 22 |
+
"c_kl": 1.0,
|
| 23 |
+
"c_commit": 100,
|
| 24 |
+
"skip_optimizer": false,
|
| 25 |
+
"freeze_ZH_bert": false,
|
| 26 |
+
"freeze_JP_bert": false,
|
| 27 |
+
"freeze_EN_bert": false,
|
| 28 |
+
"freeze_emo": false,
|
| 29 |
+
"freeze_style": false,
|
| 30 |
+
"freeze_decoder": false
|
| 31 |
+
},
|
| 32 |
+
"data": {
|
| 33 |
+
"use_jp_extra": true,
|
| 34 |
+
"training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_yofukashi/train.list",
|
| 35 |
+
"validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_yofukashi/val.list",
|
| 36 |
+
"max_wav_value": 32768.0,
|
| 37 |
+
"sampling_rate": 44100,
|
| 38 |
+
"filter_length": 2048,
|
| 39 |
+
"hop_length": 512,
|
| 40 |
+
"win_length": 2048,
|
| 41 |
+
"n_mel_channels": 128,
|
| 42 |
+
"mel_fmin": 0.0,
|
| 43 |
+
"mel_fmax": null,
|
| 44 |
+
"add_blank": true,
|
| 45 |
+
"n_speakers": 1,
|
| 46 |
+
"cleaned_text": true,
|
| 47 |
+
"spk2id": {
|
| 48 |
+
"amitaro_yofukashi": 0
|
| 49 |
+
},
|
| 50 |
+
"num_styles": 1,
|
| 51 |
+
"style2id": {
|
| 52 |
+
"Neutral": 0
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"model": {
|
| 56 |
+
"use_spk_conditioned_encoder": true,
|
| 57 |
+
"use_noise_scaled_mas": true,
|
| 58 |
+
"use_mel_posterior_encoder": false,
|
| 59 |
+
"use_duration_discriminator": false,
|
| 60 |
+
"use_wavlm_discriminator": true,
|
| 61 |
+
"inter_channels": 192,
|
| 62 |
+
"hidden_channels": 192,
|
| 63 |
+
"filter_channels": 768,
|
| 64 |
+
"n_heads": 2,
|
| 65 |
+
"n_layers": 6,
|
| 66 |
+
"kernel_size": 3,
|
| 67 |
+
"p_dropout": 0.1,
|
| 68 |
+
"resblock": "1",
|
| 69 |
+
"resblock_kernel_sizes": [
|
| 70 |
+
3,
|
| 71 |
+
7,
|
| 72 |
+
11
|
| 73 |
+
],
|
| 74 |
+
"resblock_dilation_sizes": [
|
| 75 |
+
[
|
| 76 |
+
1,
|
| 77 |
+
3,
|
| 78 |
+
5
|
| 79 |
+
],
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
]
|
| 90 |
+
],
|
| 91 |
+
"upsample_rates": [
|
| 92 |
+
8,
|
| 93 |
+
8,
|
| 94 |
+
2,
|
| 95 |
+
2,
|
| 96 |
+
2
|
| 97 |
+
],
|
| 98 |
+
"upsample_initial_channel": 512,
|
| 99 |
+
"upsample_kernel_sizes": [
|
| 100 |
+
16,
|
| 101 |
+
16,
|
| 102 |
+
8,
|
| 103 |
+
2,
|
| 104 |
+
2
|
| 105 |
+
],
|
| 106 |
+
"n_layers_q": 3,
|
| 107 |
+
"use_spectral_norm": false,
|
| 108 |
+
"gin_channels": 512,
|
| 109 |
+
"slm": {
|
| 110 |
+
"model": "microsoft/wavlm-base-plus",
|
| 111 |
+
"sr": 16000,
|
| 112 |
+
"hidden": 768,
|
| 113 |
+
"nlayers": 13,
|
| 114 |
+
"initial_channel": 64
|
| 115 |
+
}
|
| 116 |
+
},
|
| 117 |
+
"version": "2.7.0-JP-Extra"
|
| 118 |
+
}
|
pytorch/speakers/amitaro_yofukashi/hayakoe_amitaro_yofukashi_e11_s2000.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:836dfc7745d9616a26afcad054b6140143dd2157b45cac0b71fa4f86cb10b425
|
| 3 |
+
size 251155732
|
pytorch/speakers/amitaro_yofukashi/style_vectors.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c249725ac0de81417da52cf0b3211bb7a14610f534df933f0d4a307fcaa659a0
|
| 3 |
+
size 1152
|