lemondouble committed on
Commit
2cf2964
·
verified ·
1 Parent(s): d108854

feat: あみたろ 6종 화자 모델 추가

Browse files

あみたろの声素材工房 ITAコーパス読み上げ音声 기반 파인튜닝 모델 6종
- amitaro_normal, amitaro_runrun, amitaro_yofukashi, amitaro_punsuka, amitaro_sasayaki_a, amitaro_sasayaki_b
- PyTorch (safetensors) + ONNX (synthesizer + duration_predictor)
- License: CC-BY-SA-4.0
- Credit: あみたろの声素材工房 https://amitaro.net/

Files changed (42) hide show
  1. onnx/speakers/amitaro_normal/config.json +118 -0
  2. onnx/speakers/amitaro_normal/duration_predictor.onnx +3 -0
  3. onnx/speakers/amitaro_normal/style_vectors.npy +3 -0
  4. onnx/speakers/amitaro_normal/synthesizer.onnx +3 -0
  5. onnx/speakers/amitaro_punsuka/config.json +118 -0
  6. onnx/speakers/amitaro_punsuka/duration_predictor.onnx +3 -0
  7. onnx/speakers/amitaro_punsuka/style_vectors.npy +3 -0
  8. onnx/speakers/amitaro_punsuka/synthesizer.onnx +3 -0
  9. onnx/speakers/amitaro_runrun/config.json +118 -0
  10. onnx/speakers/amitaro_runrun/duration_predictor.onnx +3 -0
  11. onnx/speakers/amitaro_runrun/style_vectors.npy +3 -0
  12. onnx/speakers/amitaro_runrun/synthesizer.onnx +3 -0
  13. onnx/speakers/amitaro_sasayaki_a/config.json +118 -0
  14. onnx/speakers/amitaro_sasayaki_a/duration_predictor.onnx +3 -0
  15. onnx/speakers/amitaro_sasayaki_a/style_vectors.npy +3 -0
  16. onnx/speakers/amitaro_sasayaki_a/synthesizer.onnx +3 -0
  17. onnx/speakers/amitaro_sasayaki_b/config.json +118 -0
  18. onnx/speakers/amitaro_sasayaki_b/duration_predictor.onnx +3 -0
  19. onnx/speakers/amitaro_sasayaki_b/style_vectors.npy +3 -0
  20. onnx/speakers/amitaro_sasayaki_b/synthesizer.onnx +3 -0
  21. onnx/speakers/amitaro_yofukashi/config.json +118 -0
  22. onnx/speakers/amitaro_yofukashi/duration_predictor.onnx +3 -0
  23. onnx/speakers/amitaro_yofukashi/style_vectors.npy +3 -0
  24. onnx/speakers/amitaro_yofukashi/synthesizer.onnx +3 -0
  25. pytorch/speakers/amitaro_normal/config.json +118 -0
  26. pytorch/speakers/amitaro_normal/hayakoe_amitaro_normal_e11_s2000.safetensors +3 -0
  27. pytorch/speakers/amitaro_normal/style_vectors.npy +3 -0
  28. pytorch/speakers/amitaro_punsuka/config.json +118 -0
  29. pytorch/speakers/amitaro_punsuka/hayakoe_amitaro_punsuka_e11_s2000.safetensors +3 -0
  30. pytorch/speakers/amitaro_punsuka/style_vectors.npy +3 -0
  31. pytorch/speakers/amitaro_runrun/config.json +118 -0
  32. pytorch/speakers/amitaro_runrun/hayakoe_amitaro_runrun_e11_s2000.safetensors +3 -0
  33. pytorch/speakers/amitaro_runrun/style_vectors.npy +3 -0
  34. pytorch/speakers/amitaro_sasayaki_a/config.json +118 -0
  35. pytorch/speakers/amitaro_sasayaki_a/hayakoe_amitaro_sasayaki_a_e87_s4000.safetensors +3 -0
  36. pytorch/speakers/amitaro_sasayaki_a/style_vectors.npy +3 -0
  37. pytorch/speakers/amitaro_sasayaki_b/config.json +118 -0
  38. pytorch/speakers/amitaro_sasayaki_b/hayakoe_amitaro_sasayaki_b_e98_s4500.safetensors +3 -0
  39. pytorch/speakers/amitaro_sasayaki_b/style_vectors.npy +3 -0
  40. pytorch/speakers/amitaro_yofukashi/config.json +118 -0
  41. pytorch/speakers/amitaro_yofukashi/hayakoe_amitaro_yofukashi_e11_s2000.safetensors +3 -0
  42. pytorch/speakers/amitaro_yofukashi/style_vectors.npy +3 -0
onnx/speakers/amitaro_normal/config.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "hayakoe_amitaro_normal",
3
+ "train": {
4
+ "log_interval": 200,
5
+ "eval_interval": 500,
6
+ "seed": 42,
7
+ "epochs": 21,
8
+ "learning_rate": 0.0001,
9
+ "betas": [
10
+ 0.8,
11
+ 0.99
12
+ ],
13
+ "eps": 1e-09,
14
+ "batch_size": 2,
15
+ "bf16_run": false,
16
+ "fp16_run": false,
17
+ "lr_decay": 0.99996,
18
+ "segment_size": 16384,
19
+ "init_lr_ratio": 1,
20
+ "warmup_epochs": 0,
21
+ "c_mel": 45,
22
+ "c_kl": 1.0,
23
+ "c_commit": 100,
24
+ "skip_optimizer": false,
25
+ "freeze_ZH_bert": false,
26
+ "freeze_JP_bert": false,
27
+ "freeze_EN_bert": false,
28
+ "freeze_emo": false,
29
+ "freeze_style": false,
30
+ "freeze_decoder": false
31
+ },
32
+ "data": {
33
+ "use_jp_extra": true,
34
+ "training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_normal/train.list",
35
+ "validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_normal/val.list",
36
+ "max_wav_value": 32768.0,
37
+ "sampling_rate": 44100,
38
+ "filter_length": 2048,
39
+ "hop_length": 512,
40
+ "win_length": 2048,
41
+ "n_mel_channels": 128,
42
+ "mel_fmin": 0.0,
43
+ "mel_fmax": null,
44
+ "add_blank": true,
45
+ "n_speakers": 1,
46
+ "cleaned_text": true,
47
+ "spk2id": {
48
+ "amitaro_normal": 0
49
+ },
50
+ "num_styles": 1,
51
+ "style2id": {
52
+ "Neutral": 0
53
+ }
54
+ },
55
+ "model": {
56
+ "use_spk_conditioned_encoder": true,
57
+ "use_noise_scaled_mas": true,
58
+ "use_mel_posterior_encoder": false,
59
+ "use_duration_discriminator": false,
60
+ "use_wavlm_discriminator": true,
61
+ "inter_channels": 192,
62
+ "hidden_channels": 192,
63
+ "filter_channels": 768,
64
+ "n_heads": 2,
65
+ "n_layers": 6,
66
+ "kernel_size": 3,
67
+ "p_dropout": 0.1,
68
+ "resblock": "1",
69
+ "resblock_kernel_sizes": [
70
+ 3,
71
+ 7,
72
+ 11
73
+ ],
74
+ "resblock_dilation_sizes": [
75
+ [
76
+ 1,
77
+ 3,
78
+ 5
79
+ ],
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ]
90
+ ],
91
+ "upsample_rates": [
92
+ 8,
93
+ 8,
94
+ 2,
95
+ 2,
96
+ 2
97
+ ],
98
+ "upsample_initial_channel": 512,
99
+ "upsample_kernel_sizes": [
100
+ 16,
101
+ 16,
102
+ 8,
103
+ 2,
104
+ 2
105
+ ],
106
+ "n_layers_q": 3,
107
+ "use_spectral_norm": false,
108
+ "gin_channels": 512,
109
+ "slm": {
110
+ "model": "microsoft/wavlm-base-plus",
111
+ "sr": 16000,
112
+ "hidden": 768,
113
+ "nlayers": 13,
114
+ "initial_channel": 64
115
+ }
116
+ },
117
+ "version": "2.7.0-JP-Extra"
118
+ }
onnx/speakers/amitaro_normal/duration_predictor.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea72d3b807b7120a84cc533e04da80374352fa124adae13577ac36953b170e41
3
+ size 31503027
onnx/speakers/amitaro_normal/style_vectors.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28f9059b9da6c20dd891a2f22df2bfe77311a51d337623bd88e701de2f942643
3
+ size 1152
onnx/speakers/amitaro_normal/synthesizer.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b4840a47ff05b207e226bd936a143b5985ad24d0983be23a5b14705f6849965
3
+ size 250644697
onnx/speakers/amitaro_punsuka/config.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "hayakoe_amitaro_punsuka",
3
+ "train": {
4
+ "log_interval": 200,
5
+ "eval_interval": 500,
6
+ "seed": 42,
7
+ "epochs": 11,
8
+ "learning_rate": 0.0001,
9
+ "betas": [
10
+ 0.8,
11
+ 0.99
12
+ ],
13
+ "eps": 1e-09,
14
+ "batch_size": 2,
15
+ "bf16_run": false,
16
+ "fp16_run": false,
17
+ "lr_decay": 0.99996,
18
+ "segment_size": 16384,
19
+ "init_lr_ratio": 1,
20
+ "warmup_epochs": 0,
21
+ "c_mel": 45,
22
+ "c_kl": 1.0,
23
+ "c_commit": 100,
24
+ "skip_optimizer": false,
25
+ "freeze_ZH_bert": false,
26
+ "freeze_JP_bert": false,
27
+ "freeze_EN_bert": false,
28
+ "freeze_emo": false,
29
+ "freeze_style": false,
30
+ "freeze_decoder": false
31
+ },
32
+ "data": {
33
+ "use_jp_extra": true,
34
+ "training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_punsuka/train.list",
35
+ "validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_punsuka/val.list",
36
+ "max_wav_value": 32768.0,
37
+ "sampling_rate": 44100,
38
+ "filter_length": 2048,
39
+ "hop_length": 512,
40
+ "win_length": 2048,
41
+ "n_mel_channels": 128,
42
+ "mel_fmin": 0.0,
43
+ "mel_fmax": null,
44
+ "add_blank": true,
45
+ "n_speakers": 1,
46
+ "cleaned_text": true,
47
+ "spk2id": {
48
+ "amitaro_punsuka": 0
49
+ },
50
+ "num_styles": 1,
51
+ "style2id": {
52
+ "Neutral": 0
53
+ }
54
+ },
55
+ "model": {
56
+ "use_spk_conditioned_encoder": true,
57
+ "use_noise_scaled_mas": true,
58
+ "use_mel_posterior_encoder": false,
59
+ "use_duration_discriminator": false,
60
+ "use_wavlm_discriminator": true,
61
+ "inter_channels": 192,
62
+ "hidden_channels": 192,
63
+ "filter_channels": 768,
64
+ "n_heads": 2,
65
+ "n_layers": 6,
66
+ "kernel_size": 3,
67
+ "p_dropout": 0.1,
68
+ "resblock": "1",
69
+ "resblock_kernel_sizes": [
70
+ 3,
71
+ 7,
72
+ 11
73
+ ],
74
+ "resblock_dilation_sizes": [
75
+ [
76
+ 1,
77
+ 3,
78
+ 5
79
+ ],
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ]
90
+ ],
91
+ "upsample_rates": [
92
+ 8,
93
+ 8,
94
+ 2,
95
+ 2,
96
+ 2
97
+ ],
98
+ "upsample_initial_channel": 512,
99
+ "upsample_kernel_sizes": [
100
+ 16,
101
+ 16,
102
+ 8,
103
+ 2,
104
+ 2
105
+ ],
106
+ "n_layers_q": 3,
107
+ "use_spectral_norm": false,
108
+ "gin_channels": 512,
109
+ "slm": {
110
+ "model": "microsoft/wavlm-base-plus",
111
+ "sr": 16000,
112
+ "hidden": 768,
113
+ "nlayers": 13,
114
+ "initial_channel": 64
115
+ }
116
+ },
117
+ "version": "2.7.0-JP-Extra"
118
+ }
onnx/speakers/amitaro_punsuka/duration_predictor.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b6accd699eecf9b44c92a0fd497154aa5c6063e99937ee58d0c78d58f60b166
3
+ size 31503027
onnx/speakers/amitaro_punsuka/style_vectors.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cdbc3e625231b57adb16c89499bd0f01fb74acac42abbd1c0aceb96346bfa68
3
+ size 1152
onnx/speakers/amitaro_punsuka/synthesizer.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:241d74d32ea37a0c1cb3d37b0569563c9712d08514fe68136cd2f1a2743971bc
3
+ size 250644697
onnx/speakers/amitaro_runrun/config.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "hayakoe_amitaro_runrun",
3
+ "train": {
4
+ "log_interval": 200,
5
+ "eval_interval": 500,
6
+ "seed": 42,
7
+ "epochs": 21,
8
+ "learning_rate": 0.0001,
9
+ "betas": [
10
+ 0.8,
11
+ 0.99
12
+ ],
13
+ "eps": 1e-09,
14
+ "batch_size": 2,
15
+ "bf16_run": false,
16
+ "fp16_run": false,
17
+ "lr_decay": 0.99996,
18
+ "segment_size": 16384,
19
+ "init_lr_ratio": 1,
20
+ "warmup_epochs": 0,
21
+ "c_mel": 45,
22
+ "c_kl": 1.0,
23
+ "c_commit": 100,
24
+ "skip_optimizer": false,
25
+ "freeze_ZH_bert": false,
26
+ "freeze_JP_bert": false,
27
+ "freeze_EN_bert": false,
28
+ "freeze_emo": false,
29
+ "freeze_style": false,
30
+ "freeze_decoder": false
31
+ },
32
+ "data": {
33
+ "use_jp_extra": true,
34
+ "training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_runrun/train.list",
35
+ "validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_runrun/val.list",
36
+ "max_wav_value": 32768.0,
37
+ "sampling_rate": 44100,
38
+ "filter_length": 2048,
39
+ "hop_length": 512,
40
+ "win_length": 2048,
41
+ "n_mel_channels": 128,
42
+ "mel_fmin": 0.0,
43
+ "mel_fmax": null,
44
+ "add_blank": true,
45
+ "n_speakers": 1,
46
+ "cleaned_text": true,
47
+ "spk2id": {
48
+ "amitaro_runrun": 0
49
+ },
50
+ "num_styles": 1,
51
+ "style2id": {
52
+ "Neutral": 0
53
+ }
54
+ },
55
+ "model": {
56
+ "use_spk_conditioned_encoder": true,
57
+ "use_noise_scaled_mas": true,
58
+ "use_mel_posterior_encoder": false,
59
+ "use_duration_discriminator": false,
60
+ "use_wavlm_discriminator": true,
61
+ "inter_channels": 192,
62
+ "hidden_channels": 192,
63
+ "filter_channels": 768,
64
+ "n_heads": 2,
65
+ "n_layers": 6,
66
+ "kernel_size": 3,
67
+ "p_dropout": 0.1,
68
+ "resblock": "1",
69
+ "resblock_kernel_sizes": [
70
+ 3,
71
+ 7,
72
+ 11
73
+ ],
74
+ "resblock_dilation_sizes": [
75
+ [
76
+ 1,
77
+ 3,
78
+ 5
79
+ ],
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ]
90
+ ],
91
+ "upsample_rates": [
92
+ 8,
93
+ 8,
94
+ 2,
95
+ 2,
96
+ 2
97
+ ],
98
+ "upsample_initial_channel": 512,
99
+ "upsample_kernel_sizes": [
100
+ 16,
101
+ 16,
102
+ 8,
103
+ 2,
104
+ 2
105
+ ],
106
+ "n_layers_q": 3,
107
+ "use_spectral_norm": false,
108
+ "gin_channels": 512,
109
+ "slm": {
110
+ "model": "microsoft/wavlm-base-plus",
111
+ "sr": 16000,
112
+ "hidden": 768,
113
+ "nlayers": 13,
114
+ "initial_channel": 64
115
+ }
116
+ },
117
+ "version": "2.7.0-JP-Extra"
118
+ }
onnx/speakers/amitaro_runrun/duration_predictor.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97809075bf8aad4a482fe20bd2722a234d4a8ad916ad5e88fa70105d817430d5
3
+ size 31503027
onnx/speakers/amitaro_runrun/style_vectors.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b59a7286e5ea0c549acbdb086aa52fd227b6e270ec20918bc8cc63b61cbf64b8
3
+ size 1152
onnx/speakers/amitaro_runrun/synthesizer.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a183206be69b208edd3a229e884c85770304ca21ae8ee0b3b0e930700fd2ce44
3
+ size 250644697
onnx/speakers/amitaro_sasayaki_a/config.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "hayakoe_amitaro_sasayaki_a",
3
+ "train": {
4
+ "log_interval": 200,
5
+ "eval_interval": 500,
6
+ "seed": 42,
7
+ "epochs": 100,
8
+ "learning_rate": 0.0001,
9
+ "betas": [
10
+ 0.8,
11
+ 0.99
12
+ ],
13
+ "eps": 1e-09,
14
+ "batch_size": 2,
15
+ "bf16_run": false,
16
+ "fp16_run": false,
17
+ "lr_decay": 0.99996,
18
+ "segment_size": 16384,
19
+ "init_lr_ratio": 1,
20
+ "warmup_epochs": 0,
21
+ "c_mel": 45,
22
+ "c_kl": 1.0,
23
+ "c_commit": 100,
24
+ "skip_optimizer": false,
25
+ "freeze_ZH_bert": false,
26
+ "freeze_JP_bert": false,
27
+ "freeze_EN_bert": false,
28
+ "freeze_emo": false,
29
+ "freeze_style": false,
30
+ "freeze_decoder": false
31
+ },
32
+ "data": {
33
+ "use_jp_extra": true,
34
+ "training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_sasayaki_a/train.list",
35
+ "validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_sasayaki_a/val.list",
36
+ "max_wav_value": 32768.0,
37
+ "sampling_rate": 44100,
38
+ "filter_length": 2048,
39
+ "hop_length": 512,
40
+ "win_length": 2048,
41
+ "n_mel_channels": 128,
42
+ "mel_fmin": 0.0,
43
+ "mel_fmax": null,
44
+ "add_blank": true,
45
+ "n_speakers": 1,
46
+ "cleaned_text": true,
47
+ "spk2id": {
48
+ "amitaro_sasayaki_a": 0
49
+ },
50
+ "num_styles": 1,
51
+ "style2id": {
52
+ "Neutral": 0
53
+ }
54
+ },
55
+ "model": {
56
+ "use_spk_conditioned_encoder": true,
57
+ "use_noise_scaled_mas": true,
58
+ "use_mel_posterior_encoder": false,
59
+ "use_duration_discriminator": false,
60
+ "use_wavlm_discriminator": true,
61
+ "inter_channels": 192,
62
+ "hidden_channels": 192,
63
+ "filter_channels": 768,
64
+ "n_heads": 2,
65
+ "n_layers": 6,
66
+ "kernel_size": 3,
67
+ "p_dropout": 0.1,
68
+ "resblock": "1",
69
+ "resblock_kernel_sizes": [
70
+ 3,
71
+ 7,
72
+ 11
73
+ ],
74
+ "resblock_dilation_sizes": [
75
+ [
76
+ 1,
77
+ 3,
78
+ 5
79
+ ],
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ]
90
+ ],
91
+ "upsample_rates": [
92
+ 8,
93
+ 8,
94
+ 2,
95
+ 2,
96
+ 2
97
+ ],
98
+ "upsample_initial_channel": 512,
99
+ "upsample_kernel_sizes": [
100
+ 16,
101
+ 16,
102
+ 8,
103
+ 2,
104
+ 2
105
+ ],
106
+ "n_layers_q": 3,
107
+ "use_spectral_norm": false,
108
+ "gin_channels": 512,
109
+ "slm": {
110
+ "model": "microsoft/wavlm-base-plus",
111
+ "sr": 16000,
112
+ "hidden": 768,
113
+ "nlayers": 13,
114
+ "initial_channel": 64
115
+ }
116
+ },
117
+ "version": "2.7.0-JP-Extra"
118
+ }
onnx/speakers/amitaro_sasayaki_a/duration_predictor.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b1331719136678ce093301b88364ebfdca6cf70287cbb3b97e1e07d641cb29a
3
+ size 31503027
onnx/speakers/amitaro_sasayaki_a/style_vectors.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab461895545a48c61e6d96ba8e9dc717d9e7edc06e42a549b934ed14f9c84c86
3
+ size 1152
onnx/speakers/amitaro_sasayaki_a/synthesizer.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66b43fcf188b152cd9011d6c2d1ac3e16eb9e043589b28c7fe2bb729379afaa0
3
+ size 250644697
onnx/speakers/amitaro_sasayaki_b/config.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "hayakoe_amitaro_sasayaki_b",
3
+ "train": {
4
+ "log_interval": 200,
5
+ "eval_interval": 500,
6
+ "seed": 42,
7
+ "epochs": 100,
8
+ "learning_rate": 0.0001,
9
+ "betas": [
10
+ 0.8,
11
+ 0.99
12
+ ],
13
+ "eps": 1e-09,
14
+ "batch_size": 2,
15
+ "bf16_run": false,
16
+ "fp16_run": false,
17
+ "lr_decay": 0.99996,
18
+ "segment_size": 16384,
19
+ "init_lr_ratio": 1,
20
+ "warmup_epochs": 0,
21
+ "c_mel": 45,
22
+ "c_kl": 1.0,
23
+ "c_commit": 100,
24
+ "skip_optimizer": false,
25
+ "freeze_ZH_bert": false,
26
+ "freeze_JP_bert": false,
27
+ "freeze_EN_bert": false,
28
+ "freeze_emo": false,
29
+ "freeze_style": false,
30
+ "freeze_decoder": false
31
+ },
32
+ "data": {
33
+ "use_jp_extra": true,
34
+ "training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_sasayaki_b/train.list",
35
+ "validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_sasayaki_b/val.list",
36
+ "max_wav_value": 32768.0,
37
+ "sampling_rate": 44100,
38
+ "filter_length": 2048,
39
+ "hop_length": 512,
40
+ "win_length": 2048,
41
+ "n_mel_channels": 128,
42
+ "mel_fmin": 0.0,
43
+ "mel_fmax": null,
44
+ "add_blank": true,
45
+ "n_speakers": 1,
46
+ "cleaned_text": true,
47
+ "spk2id": {
48
+ "amitaro_sasayaki_b": 0
49
+ },
50
+ "num_styles": 1,
51
+ "style2id": {
52
+ "Neutral": 0
53
+ }
54
+ },
55
+ "model": {
56
+ "use_spk_conditioned_encoder": true,
57
+ "use_noise_scaled_mas": true,
58
+ "use_mel_posterior_encoder": false,
59
+ "use_duration_discriminator": false,
60
+ "use_wavlm_discriminator": true,
61
+ "inter_channels": 192,
62
+ "hidden_channels": 192,
63
+ "filter_channels": 768,
64
+ "n_heads": 2,
65
+ "n_layers": 6,
66
+ "kernel_size": 3,
67
+ "p_dropout": 0.1,
68
+ "resblock": "1",
69
+ "resblock_kernel_sizes": [
70
+ 3,
71
+ 7,
72
+ 11
73
+ ],
74
+ "resblock_dilation_sizes": [
75
+ [
76
+ 1,
77
+ 3,
78
+ 5
79
+ ],
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ]
90
+ ],
91
+ "upsample_rates": [
92
+ 8,
93
+ 8,
94
+ 2,
95
+ 2,
96
+ 2
97
+ ],
98
+ "upsample_initial_channel": 512,
99
+ "upsample_kernel_sizes": [
100
+ 16,
101
+ 16,
102
+ 8,
103
+ 2,
104
+ 2
105
+ ],
106
+ "n_layers_q": 3,
107
+ "use_spectral_norm": false,
108
+ "gin_channels": 512,
109
+ "slm": {
110
+ "model": "microsoft/wavlm-base-plus",
111
+ "sr": 16000,
112
+ "hidden": 768,
113
+ "nlayers": 13,
114
+ "initial_channel": 64
115
+ }
116
+ },
117
+ "version": "2.7.0-JP-Extra"
118
+ }
onnx/speakers/amitaro_sasayaki_b/duration_predictor.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7273327640ddb5345963ae5fea5a5517c65d7efa57c35db3811634f727180975
3
+ size 31503027
onnx/speakers/amitaro_sasayaki_b/style_vectors.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fac4742f009abb2d8a3fb007319887ee664634c6eed7816a9b72b8c90bdcde3
3
+ size 1152
onnx/speakers/amitaro_sasayaki_b/synthesizer.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:742e7ff7a39b7218a19ba3120c1281dd6eb8726b89c872ad3eeee1068006bd1b
3
+ size 250644697
onnx/speakers/amitaro_yofukashi/config.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "hayakoe_amitaro_yofukashi",
3
+ "train": {
4
+ "log_interval": 200,
5
+ "eval_interval": 500,
6
+ "seed": 42,
7
+ "epochs": 21,
8
+ "learning_rate": 0.0001,
9
+ "betas": [
10
+ 0.8,
11
+ 0.99
12
+ ],
13
+ "eps": 1e-09,
14
+ "batch_size": 2,
15
+ "bf16_run": false,
16
+ "fp16_run": false,
17
+ "lr_decay": 0.99996,
18
+ "segment_size": 16384,
19
+ "init_lr_ratio": 1,
20
+ "warmup_epochs": 0,
21
+ "c_mel": 45,
22
+ "c_kl": 1.0,
23
+ "c_commit": 100,
24
+ "skip_optimizer": false,
25
+ "freeze_ZH_bert": false,
26
+ "freeze_JP_bert": false,
27
+ "freeze_EN_bert": false,
28
+ "freeze_emo": false,
29
+ "freeze_style": false,
30
+ "freeze_decoder": false
31
+ },
32
+ "data": {
33
+ "use_jp_extra": true,
34
+ "training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_yofukashi/train.list",
35
+ "validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_yofukashi/val.list",
36
+ "max_wav_value": 32768.0,
37
+ "sampling_rate": 44100,
38
+ "filter_length": 2048,
39
+ "hop_length": 512,
40
+ "win_length": 2048,
41
+ "n_mel_channels": 128,
42
+ "mel_fmin": 0.0,
43
+ "mel_fmax": null,
44
+ "add_blank": true,
45
+ "n_speakers": 1,
46
+ "cleaned_text": true,
47
+ "spk2id": {
48
+ "amitaro_yofukashi": 0
49
+ },
50
+ "num_styles": 1,
51
+ "style2id": {
52
+ "Neutral": 0
53
+ }
54
+ },
55
+ "model": {
56
+ "use_spk_conditioned_encoder": true,
57
+ "use_noise_scaled_mas": true,
58
+ "use_mel_posterior_encoder": false,
59
+ "use_duration_discriminator": false,
60
+ "use_wavlm_discriminator": true,
61
+ "inter_channels": 192,
62
+ "hidden_channels": 192,
63
+ "filter_channels": 768,
64
+ "n_heads": 2,
65
+ "n_layers": 6,
66
+ "kernel_size": 3,
67
+ "p_dropout": 0.1,
68
+ "resblock": "1",
69
+ "resblock_kernel_sizes": [
70
+ 3,
71
+ 7,
72
+ 11
73
+ ],
74
+ "resblock_dilation_sizes": [
75
+ [
76
+ 1,
77
+ 3,
78
+ 5
79
+ ],
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ]
90
+ ],
91
+ "upsample_rates": [
92
+ 8,
93
+ 8,
94
+ 2,
95
+ 2,
96
+ 2
97
+ ],
98
+ "upsample_initial_channel": 512,
99
+ "upsample_kernel_sizes": [
100
+ 16,
101
+ 16,
102
+ 8,
103
+ 2,
104
+ 2
105
+ ],
106
+ "n_layers_q": 3,
107
+ "use_spectral_norm": false,
108
+ "gin_channels": 512,
109
+ "slm": {
110
+ "model": "microsoft/wavlm-base-plus",
111
+ "sr": 16000,
112
+ "hidden": 768,
113
+ "nlayers": 13,
114
+ "initial_channel": 64
115
+ }
116
+ },
117
+ "version": "2.7.0-JP-Extra"
118
+ }
onnx/speakers/amitaro_yofukashi/duration_predictor.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8899e93605fc44ce3e058882a3fd09110973416fa9510104a58a1d5c5a5777b1
3
+ size 31503027
onnx/speakers/amitaro_yofukashi/style_vectors.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c249725ac0de81417da52cf0b3211bb7a14610f534df933f0d4a307fcaa659a0
3
+ size 1152
onnx/speakers/amitaro_yofukashi/synthesizer.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:999faade6b925fb9008c8a97fd87e95e6e20266b4d142b106edc0f9fc4b18ba0
3
+ size 250644697
pytorch/speakers/amitaro_normal/config.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "hayakoe_amitaro_normal",
3
+ "train": {
4
+ "log_interval": 200,
5
+ "eval_interval": 500,
6
+ "seed": 42,
7
+ "epochs": 21,
8
+ "learning_rate": 0.0001,
9
+ "betas": [
10
+ 0.8,
11
+ 0.99
12
+ ],
13
+ "eps": 1e-09,
14
+ "batch_size": 2,
15
+ "bf16_run": false,
16
+ "fp16_run": false,
17
+ "lr_decay": 0.99996,
18
+ "segment_size": 16384,
19
+ "init_lr_ratio": 1,
20
+ "warmup_epochs": 0,
21
+ "c_mel": 45,
22
+ "c_kl": 1.0,
23
+ "c_commit": 100,
24
+ "skip_optimizer": false,
25
+ "freeze_ZH_bert": false,
26
+ "freeze_JP_bert": false,
27
+ "freeze_EN_bert": false,
28
+ "freeze_emo": false,
29
+ "freeze_style": false,
30
+ "freeze_decoder": false
31
+ },
32
+ "data": {
33
+ "use_jp_extra": true,
34
+ "training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_normal/train.list",
35
+ "validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_normal/val.list",
36
+ "max_wav_value": 32768.0,
37
+ "sampling_rate": 44100,
38
+ "filter_length": 2048,
39
+ "hop_length": 512,
40
+ "win_length": 2048,
41
+ "n_mel_channels": 128,
42
+ "mel_fmin": 0.0,
43
+ "mel_fmax": null,
44
+ "add_blank": true,
45
+ "n_speakers": 1,
46
+ "cleaned_text": true,
47
+ "spk2id": {
48
+ "amitaro_normal": 0
49
+ },
50
+ "num_styles": 1,
51
+ "style2id": {
52
+ "Neutral": 0
53
+ }
54
+ },
55
+ "model": {
56
+ "use_spk_conditioned_encoder": true,
57
+ "use_noise_scaled_mas": true,
58
+ "use_mel_posterior_encoder": false,
59
+ "use_duration_discriminator": false,
60
+ "use_wavlm_discriminator": true,
61
+ "inter_channels": 192,
62
+ "hidden_channels": 192,
63
+ "filter_channels": 768,
64
+ "n_heads": 2,
65
+ "n_layers": 6,
66
+ "kernel_size": 3,
67
+ "p_dropout": 0.1,
68
+ "resblock": "1",
69
+ "resblock_kernel_sizes": [
70
+ 3,
71
+ 7,
72
+ 11
73
+ ],
74
+ "resblock_dilation_sizes": [
75
+ [
76
+ 1,
77
+ 3,
78
+ 5
79
+ ],
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ]
90
+ ],
91
+ "upsample_rates": [
92
+ 8,
93
+ 8,
94
+ 2,
95
+ 2,
96
+ 2
97
+ ],
98
+ "upsample_initial_channel": 512,
99
+ "upsample_kernel_sizes": [
100
+ 16,
101
+ 16,
102
+ 8,
103
+ 2,
104
+ 2
105
+ ],
106
+ "n_layers_q": 3,
107
+ "use_spectral_norm": false,
108
+ "gin_channels": 512,
109
+ "slm": {
110
+ "model": "microsoft/wavlm-base-plus",
111
+ "sr": 16000,
112
+ "hidden": 768,
113
+ "nlayers": 13,
114
+ "initial_channel": 64
115
+ }
116
+ },
117
+ "version": "2.7.0-JP-Extra"
118
+ }
pytorch/speakers/amitaro_normal/hayakoe_amitaro_normal_e11_s2000.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:243d24c78d418b1b83001b5eb067e87372c398c25d36f65a5e47043dbf3484c5
3
+ size 251155732
pytorch/speakers/amitaro_normal/style_vectors.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28f9059b9da6c20dd891a2f22df2bfe77311a51d337623bd88e701de2f942643
3
+ size 1152
pytorch/speakers/amitaro_punsuka/config.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "hayakoe_amitaro_punsuka",
3
+ "train": {
4
+ "log_interval": 200,
5
+ "eval_interval": 500,
6
+ "seed": 42,
7
+ "epochs": 11,
8
+ "learning_rate": 0.0001,
9
+ "betas": [
10
+ 0.8,
11
+ 0.99
12
+ ],
13
+ "eps": 1e-09,
14
+ "batch_size": 2,
15
+ "bf16_run": false,
16
+ "fp16_run": false,
17
+ "lr_decay": 0.99996,
18
+ "segment_size": 16384,
19
+ "init_lr_ratio": 1,
20
+ "warmup_epochs": 0,
21
+ "c_mel": 45,
22
+ "c_kl": 1.0,
23
+ "c_commit": 100,
24
+ "skip_optimizer": false,
25
+ "freeze_ZH_bert": false,
26
+ "freeze_JP_bert": false,
27
+ "freeze_EN_bert": false,
28
+ "freeze_emo": false,
29
+ "freeze_style": false,
30
+ "freeze_decoder": false
31
+ },
32
+ "data": {
33
+ "use_jp_extra": true,
34
+ "training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_punsuka/train.list",
35
+ "validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_punsuka/val.list",
36
+ "max_wav_value": 32768.0,
37
+ "sampling_rate": 44100,
38
+ "filter_length": 2048,
39
+ "hop_length": 512,
40
+ "win_length": 2048,
41
+ "n_mel_channels": 128,
42
+ "mel_fmin": 0.0,
43
+ "mel_fmax": null,
44
+ "add_blank": true,
45
+ "n_speakers": 1,
46
+ "cleaned_text": true,
47
+ "spk2id": {
48
+ "amitaro_punsuka": 0
49
+ },
50
+ "num_styles": 1,
51
+ "style2id": {
52
+ "Neutral": 0
53
+ }
54
+ },
55
+ "model": {
56
+ "use_spk_conditioned_encoder": true,
57
+ "use_noise_scaled_mas": true,
58
+ "use_mel_posterior_encoder": false,
59
+ "use_duration_discriminator": false,
60
+ "use_wavlm_discriminator": true,
61
+ "inter_channels": 192,
62
+ "hidden_channels": 192,
63
+ "filter_channels": 768,
64
+ "n_heads": 2,
65
+ "n_layers": 6,
66
+ "kernel_size": 3,
67
+ "p_dropout": 0.1,
68
+ "resblock": "1",
69
+ "resblock_kernel_sizes": [
70
+ 3,
71
+ 7,
72
+ 11
73
+ ],
74
+ "resblock_dilation_sizes": [
75
+ [
76
+ 1,
77
+ 3,
78
+ 5
79
+ ],
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ]
90
+ ],
91
+ "upsample_rates": [
92
+ 8,
93
+ 8,
94
+ 2,
95
+ 2,
96
+ 2
97
+ ],
98
+ "upsample_initial_channel": 512,
99
+ "upsample_kernel_sizes": [
100
+ 16,
101
+ 16,
102
+ 8,
103
+ 2,
104
+ 2
105
+ ],
106
+ "n_layers_q": 3,
107
+ "use_spectral_norm": false,
108
+ "gin_channels": 512,
109
+ "slm": {
110
+ "model": "microsoft/wavlm-base-plus",
111
+ "sr": 16000,
112
+ "hidden": 768,
113
+ "nlayers": 13,
114
+ "initial_channel": 64
115
+ }
116
+ },
117
+ "version": "2.7.0-JP-Extra"
118
+ }
pytorch/speakers/amitaro_punsuka/hayakoe_amitaro_punsuka_e11_s2000.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:caa8a3a78795cab1b9d4c7507afab5b4dcf7f594c08fa49e826b917588db3cbe
3
+ size 251155732
pytorch/speakers/amitaro_punsuka/style_vectors.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cdbc3e625231b57adb16c89499bd0f01fb74acac42abbd1c0aceb96346bfa68
3
+ size 1152
pytorch/speakers/amitaro_runrun/config.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "hayakoe_amitaro_runrun",
3
+ "train": {
4
+ "log_interval": 200,
5
+ "eval_interval": 500,
6
+ "seed": 42,
7
+ "epochs": 21,
8
+ "learning_rate": 0.0001,
9
+ "betas": [
10
+ 0.8,
11
+ 0.99
12
+ ],
13
+ "eps": 1e-09,
14
+ "batch_size": 2,
15
+ "bf16_run": false,
16
+ "fp16_run": false,
17
+ "lr_decay": 0.99996,
18
+ "segment_size": 16384,
19
+ "init_lr_ratio": 1,
20
+ "warmup_epochs": 0,
21
+ "c_mel": 45,
22
+ "c_kl": 1.0,
23
+ "c_commit": 100,
24
+ "skip_optimizer": false,
25
+ "freeze_ZH_bert": false,
26
+ "freeze_JP_bert": false,
27
+ "freeze_EN_bert": false,
28
+ "freeze_emo": false,
29
+ "freeze_style": false,
30
+ "freeze_decoder": false
31
+ },
32
+ "data": {
33
+ "use_jp_extra": true,
34
+ "training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_runrun/train.list",
35
+ "validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_runrun/val.list",
36
+ "max_wav_value": 32768.0,
37
+ "sampling_rate": 44100,
38
+ "filter_length": 2048,
39
+ "hop_length": 512,
40
+ "win_length": 2048,
41
+ "n_mel_channels": 128,
42
+ "mel_fmin": 0.0,
43
+ "mel_fmax": null,
44
+ "add_blank": true,
45
+ "n_speakers": 1,
46
+ "cleaned_text": true,
47
+ "spk2id": {
48
+ "amitaro_runrun": 0
49
+ },
50
+ "num_styles": 1,
51
+ "style2id": {
52
+ "Neutral": 0
53
+ }
54
+ },
55
+ "model": {
56
+ "use_spk_conditioned_encoder": true,
57
+ "use_noise_scaled_mas": true,
58
+ "use_mel_posterior_encoder": false,
59
+ "use_duration_discriminator": false,
60
+ "use_wavlm_discriminator": true,
61
+ "inter_channels": 192,
62
+ "hidden_channels": 192,
63
+ "filter_channels": 768,
64
+ "n_heads": 2,
65
+ "n_layers": 6,
66
+ "kernel_size": 3,
67
+ "p_dropout": 0.1,
68
+ "resblock": "1",
69
+ "resblock_kernel_sizes": [
70
+ 3,
71
+ 7,
72
+ 11
73
+ ],
74
+ "resblock_dilation_sizes": [
75
+ [
76
+ 1,
77
+ 3,
78
+ 5
79
+ ],
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ]
90
+ ],
91
+ "upsample_rates": [
92
+ 8,
93
+ 8,
94
+ 2,
95
+ 2,
96
+ 2
97
+ ],
98
+ "upsample_initial_channel": 512,
99
+ "upsample_kernel_sizes": [
100
+ 16,
101
+ 16,
102
+ 8,
103
+ 2,
104
+ 2
105
+ ],
106
+ "n_layers_q": 3,
107
+ "use_spectral_norm": false,
108
+ "gin_channels": 512,
109
+ "slm": {
110
+ "model": "microsoft/wavlm-base-plus",
111
+ "sr": 16000,
112
+ "hidden": 768,
113
+ "nlayers": 13,
114
+ "initial_channel": 64
115
+ }
116
+ },
117
+ "version": "2.7.0-JP-Extra"
118
+ }
pytorch/speakers/amitaro_runrun/hayakoe_amitaro_runrun_e11_s2000.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00ac68eff452b4c2a3d1301ed46de128aa2a1a36e401954d7bd1c08408408473
3
+ size 251155732
pytorch/speakers/amitaro_runrun/style_vectors.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b59a7286e5ea0c549acbdb086aa52fd227b6e270ec20918bc8cc63b61cbf64b8
3
+ size 1152
pytorch/speakers/amitaro_sasayaki_a/config.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "hayakoe_amitaro_sasayaki_a",
3
+ "train": {
4
+ "log_interval": 200,
5
+ "eval_interval": 500,
6
+ "seed": 42,
7
+ "epochs": 100,
8
+ "learning_rate": 0.0001,
9
+ "betas": [
10
+ 0.8,
11
+ 0.99
12
+ ],
13
+ "eps": 1e-09,
14
+ "batch_size": 2,
15
+ "bf16_run": false,
16
+ "fp16_run": false,
17
+ "lr_decay": 0.99996,
18
+ "segment_size": 16384,
19
+ "init_lr_ratio": 1,
20
+ "warmup_epochs": 0,
21
+ "c_mel": 45,
22
+ "c_kl": 1.0,
23
+ "c_commit": 100,
24
+ "skip_optimizer": false,
25
+ "freeze_ZH_bert": false,
26
+ "freeze_JP_bert": false,
27
+ "freeze_EN_bert": false,
28
+ "freeze_emo": false,
29
+ "freeze_style": false,
30
+ "freeze_decoder": false
31
+ },
32
+ "data": {
33
+ "use_jp_extra": true,
34
+ "training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_sasayaki_a/train.list",
35
+ "validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_sasayaki_a/val.list",
36
+ "max_wav_value": 32768.0,
37
+ "sampling_rate": 44100,
38
+ "filter_length": 2048,
39
+ "hop_length": 512,
40
+ "win_length": 2048,
41
+ "n_mel_channels": 128,
42
+ "mel_fmin": 0.0,
43
+ "mel_fmax": null,
44
+ "add_blank": true,
45
+ "n_speakers": 1,
46
+ "cleaned_text": true,
47
+ "spk2id": {
48
+ "amitaro_sasayaki_a": 0
49
+ },
50
+ "num_styles": 1,
51
+ "style2id": {
52
+ "Neutral": 0
53
+ }
54
+ },
55
+ "model": {
56
+ "use_spk_conditioned_encoder": true,
57
+ "use_noise_scaled_mas": true,
58
+ "use_mel_posterior_encoder": false,
59
+ "use_duration_discriminator": false,
60
+ "use_wavlm_discriminator": true,
61
+ "inter_channels": 192,
62
+ "hidden_channels": 192,
63
+ "filter_channels": 768,
64
+ "n_heads": 2,
65
+ "n_layers": 6,
66
+ "kernel_size": 3,
67
+ "p_dropout": 0.1,
68
+ "resblock": "1",
69
+ "resblock_kernel_sizes": [
70
+ 3,
71
+ 7,
72
+ 11
73
+ ],
74
+ "resblock_dilation_sizes": [
75
+ [
76
+ 1,
77
+ 3,
78
+ 5
79
+ ],
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ]
90
+ ],
91
+ "upsample_rates": [
92
+ 8,
93
+ 8,
94
+ 2,
95
+ 2,
96
+ 2
97
+ ],
98
+ "upsample_initial_channel": 512,
99
+ "upsample_kernel_sizes": [
100
+ 16,
101
+ 16,
102
+ 8,
103
+ 2,
104
+ 2
105
+ ],
106
+ "n_layers_q": 3,
107
+ "use_spectral_norm": false,
108
+ "gin_channels": 512,
109
+ "slm": {
110
+ "model": "microsoft/wavlm-base-plus",
111
+ "sr": 16000,
112
+ "hidden": 768,
113
+ "nlayers": 13,
114
+ "initial_channel": 64
115
+ }
116
+ },
117
+ "version": "2.7.0-JP-Extra"
118
+ }
pytorch/speakers/amitaro_sasayaki_a/hayakoe_amitaro_sasayaki_a_e87_s4000.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bdef1fb086b368918a67cbc098ccf07f12d21f5db56df697c74056ebe18e967
3
+ size 251155732
pytorch/speakers/amitaro_sasayaki_a/style_vectors.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab461895545a48c61e6d96ba8e9dc717d9e7edc06e42a549b934ed14f9c84c86
3
+ size 1152
pytorch/speakers/amitaro_sasayaki_b/config.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "hayakoe_amitaro_sasayaki_b",
3
+ "train": {
4
+ "log_interval": 200,
5
+ "eval_interval": 500,
6
+ "seed": 42,
7
+ "epochs": 100,
8
+ "learning_rate": 0.0001,
9
+ "betas": [
10
+ 0.8,
11
+ 0.99
12
+ ],
13
+ "eps": 1e-09,
14
+ "batch_size": 2,
15
+ "bf16_run": false,
16
+ "fp16_run": false,
17
+ "lr_decay": 0.99996,
18
+ "segment_size": 16384,
19
+ "init_lr_ratio": 1,
20
+ "warmup_epochs": 0,
21
+ "c_mel": 45,
22
+ "c_kl": 1.0,
23
+ "c_commit": 100,
24
+ "skip_optimizer": false,
25
+ "freeze_ZH_bert": false,
26
+ "freeze_JP_bert": false,
27
+ "freeze_EN_bert": false,
28
+ "freeze_emo": false,
29
+ "freeze_style": false,
30
+ "freeze_decoder": false
31
+ },
32
+ "data": {
33
+ "use_jp_extra": true,
34
+ "training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_sasayaki_b/train.list",
35
+ "validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_sasayaki_b/val.list",
36
+ "max_wav_value": 32768.0,
37
+ "sampling_rate": 44100,
38
+ "filter_length": 2048,
39
+ "hop_length": 512,
40
+ "win_length": 2048,
41
+ "n_mel_channels": 128,
42
+ "mel_fmin": 0.0,
43
+ "mel_fmax": null,
44
+ "add_blank": true,
45
+ "n_speakers": 1,
46
+ "cleaned_text": true,
47
+ "spk2id": {
48
+ "amitaro_sasayaki_b": 0
49
+ },
50
+ "num_styles": 1,
51
+ "style2id": {
52
+ "Neutral": 0
53
+ }
54
+ },
55
+ "model": {
56
+ "use_spk_conditioned_encoder": true,
57
+ "use_noise_scaled_mas": true,
58
+ "use_mel_posterior_encoder": false,
59
+ "use_duration_discriminator": false,
60
+ "use_wavlm_discriminator": true,
61
+ "inter_channels": 192,
62
+ "hidden_channels": 192,
63
+ "filter_channels": 768,
64
+ "n_heads": 2,
65
+ "n_layers": 6,
66
+ "kernel_size": 3,
67
+ "p_dropout": 0.1,
68
+ "resblock": "1",
69
+ "resblock_kernel_sizes": [
70
+ 3,
71
+ 7,
72
+ 11
73
+ ],
74
+ "resblock_dilation_sizes": [
75
+ [
76
+ 1,
77
+ 3,
78
+ 5
79
+ ],
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ]
90
+ ],
91
+ "upsample_rates": [
92
+ 8,
93
+ 8,
94
+ 2,
95
+ 2,
96
+ 2
97
+ ],
98
+ "upsample_initial_channel": 512,
99
+ "upsample_kernel_sizes": [
100
+ 16,
101
+ 16,
102
+ 8,
103
+ 2,
104
+ 2
105
+ ],
106
+ "n_layers_q": 3,
107
+ "use_spectral_norm": false,
108
+ "gin_channels": 512,
109
+ "slm": {
110
+ "model": "microsoft/wavlm-base-plus",
111
+ "sr": 16000,
112
+ "hidden": 768,
113
+ "nlayers": 13,
114
+ "initial_channel": 64
115
+ }
116
+ },
117
+ "version": "2.7.0-JP-Extra"
118
+ }
pytorch/speakers/amitaro_sasayaki_b/hayakoe_amitaro_sasayaki_b_e98_s4500.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7b1212c0d916a57cc9f1b9b09c93576aabe1fd0fd5e3dc7be0fdfa114bfaffd
3
+ size 251155732
pytorch/speakers/amitaro_sasayaki_b/style_vectors.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fac4742f009abb2d8a3fb007319887ee664634c6eed7816a9b72b8c90bdcde3
3
+ size 1152
pytorch/speakers/amitaro_yofukashi/config.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "hayakoe_amitaro_yofukashi",
3
+ "train": {
4
+ "log_interval": 200,
5
+ "eval_interval": 500,
6
+ "seed": 42,
7
+ "epochs": 21,
8
+ "learning_rate": 0.0001,
9
+ "betas": [
10
+ 0.8,
11
+ 0.99
12
+ ],
13
+ "eps": 1e-09,
14
+ "batch_size": 2,
15
+ "bf16_run": false,
16
+ "fp16_run": false,
17
+ "lr_decay": 0.99996,
18
+ "segment_size": 16384,
19
+ "init_lr_ratio": 1,
20
+ "warmup_epochs": 0,
21
+ "c_mel": 45,
22
+ "c_kl": 1.0,
23
+ "c_commit": 100,
24
+ "skip_optimizer": false,
25
+ "freeze_ZH_bert": false,
26
+ "freeze_JP_bert": false,
27
+ "freeze_EN_bert": false,
28
+ "freeze_emo": false,
29
+ "freeze_style": false,
30
+ "freeze_decoder": false
31
+ },
32
+ "data": {
33
+ "use_jp_extra": true,
34
+ "training_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_yofukashi/train.list",
35
+ "validation_files": "/home/lemon/claude-projects/vits-rnd/hayakoe/data/dataset/amitaro_yofukashi/val.list",
36
+ "max_wav_value": 32768.0,
37
+ "sampling_rate": 44100,
38
+ "filter_length": 2048,
39
+ "hop_length": 512,
40
+ "win_length": 2048,
41
+ "n_mel_channels": 128,
42
+ "mel_fmin": 0.0,
43
+ "mel_fmax": null,
44
+ "add_blank": true,
45
+ "n_speakers": 1,
46
+ "cleaned_text": true,
47
+ "spk2id": {
48
+ "amitaro_yofukashi": 0
49
+ },
50
+ "num_styles": 1,
51
+ "style2id": {
52
+ "Neutral": 0
53
+ }
54
+ },
55
+ "model": {
56
+ "use_spk_conditioned_encoder": true,
57
+ "use_noise_scaled_mas": true,
58
+ "use_mel_posterior_encoder": false,
59
+ "use_duration_discriminator": false,
60
+ "use_wavlm_discriminator": true,
61
+ "inter_channels": 192,
62
+ "hidden_channels": 192,
63
+ "filter_channels": 768,
64
+ "n_heads": 2,
65
+ "n_layers": 6,
66
+ "kernel_size": 3,
67
+ "p_dropout": 0.1,
68
+ "resblock": "1",
69
+ "resblock_kernel_sizes": [
70
+ 3,
71
+ 7,
72
+ 11
73
+ ],
74
+ "resblock_dilation_sizes": [
75
+ [
76
+ 1,
77
+ 3,
78
+ 5
79
+ ],
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ]
90
+ ],
91
+ "upsample_rates": [
92
+ 8,
93
+ 8,
94
+ 2,
95
+ 2,
96
+ 2
97
+ ],
98
+ "upsample_initial_channel": 512,
99
+ "upsample_kernel_sizes": [
100
+ 16,
101
+ 16,
102
+ 8,
103
+ 2,
104
+ 2
105
+ ],
106
+ "n_layers_q": 3,
107
+ "use_spectral_norm": false,
108
+ "gin_channels": 512,
109
+ "slm": {
110
+ "model": "microsoft/wavlm-base-plus",
111
+ "sr": 16000,
112
+ "hidden": 768,
113
+ "nlayers": 13,
114
+ "initial_channel": 64
115
+ }
116
+ },
117
+ "version": "2.7.0-JP-Extra"
118
+ }
pytorch/speakers/amitaro_yofukashi/hayakoe_amitaro_yofukashi_e11_s2000.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:836dfc7745d9616a26afcad054b6140143dd2157b45cac0b71fa4f86cb10b425
3
+ size 251155732
pytorch/speakers/amitaro_yofukashi/style_vectors.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c249725ac0de81417da52cf0b3211bb7a14610f534df933f0d4a307fcaa659a0
3
+ size 1152