Duplicate from myshell-ai/OpenVoice
Browse files
Co-authored-by: XuminYu <XuminYu@users.noreply.huggingface.co>
- .gitattributes +35 -0
- README.md +33 -0
- checkpoints/base_speakers/EN/checkpoint.pth +3 -0
- checkpoints/base_speakers/EN/config.json +145 -0
- checkpoints/base_speakers/EN/en_default_se.pth +3 -0
- checkpoints/base_speakers/EN/en_style_se.pth +3 -0
- checkpoints/base_speakers/ZH/checkpoint.pth +3 -0
- checkpoints/base_speakers/ZH/config.json +137 -0
- checkpoints/base_speakers/ZH/zh_default_se.pth +3 -0
- checkpoints/converter/checkpoint.pth +3 -0
- checkpoints/converter/config.json +57 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
tags:
|
| 4 |
+
- audio
|
| 5 |
+
- text-to-speech
|
| 6 |
+
- instant-voice-cloning
|
| 7 |
+
language:
|
| 8 |
+
- en
|
| 9 |
+
- zh
|
| 10 |
+
inference: false
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# OpenVoice
|
| 14 |
+
|
| 15 |
+
<a href="https://trendshift.io/repositories/6161" target="_blank"><img src="https://trendshift.io/api/badge/repositories/6161" alt="myshell-ai%2FOpenVoice | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
| 16 |
+
|
| 17 |
+
OpenVoice is a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
|
| 18 |
+
|
| 19 |
+
<video controls autoplay src="https://cdn-uploads.huggingface.co/production/uploads/641de0213239b631552713e4/uCHTHD9OUotgOflqDu3QK.mp4"></video>
|
| 20 |
+
|
| 21 |
+
### Features
|
| 22 |
+
- **Accurate Tone Color Cloning.** OpenVoice can accurately clone the reference tone color and generate speech in multiple languages and accents.
|
| 23 |
+
- **Flexible Voice Style Control.** OpenVoice enables granular control over voice styles, such as emotion and accent, as well as other style parameters including rhythm, pauses, and intonation.
|
| 24 |
+
- **Zero-shot Cross-lingual Voice Cloning.** Neither the language of the generated speech nor the language of the reference speech needs to be present in the massive-speaker multi-lingual training dataset.
|
| 25 |
+
|
| 26 |
+
### How to Use
|
| 27 |
+
Please see [usage](https://github.com/myshell-ai/OpenVoice/blob/main/docs/USAGE.md) for detailed instructions.
|
| 28 |
+
|
| 29 |
+
### Links
|
| 30 |
+
- [GitHub](https://github.com/myshell-ai/OpenVoice)
|
| 31 |
+
- [HFDemo](https://huggingface.co/spaces/myshell-ai/OpenVoice)
|
| 32 |
+
- [Discord](https://discord.gg/myshell)
|
| 33 |
+
|
checkpoints/base_speakers/EN/checkpoint.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1db1ae1a5c8ded049bd1536051489aefbfad4a5077c01c2257e9e88fa1bb8422
|
| 3 |
+
size 160467309
|
checkpoints/base_speakers/EN/config.json
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"data": {
|
| 3 |
+
"text_cleaners": [
|
| 4 |
+
"cjke_cleaners2"
|
| 5 |
+
],
|
| 6 |
+
"sampling_rate": 22050,
|
| 7 |
+
"filter_length": 1024,
|
| 8 |
+
"hop_length": 256,
|
| 9 |
+
"win_length": 1024,
|
| 10 |
+
"n_mel_channels": 80,
|
| 11 |
+
"add_blank": true,
|
| 12 |
+
"cleaned_text": true,
|
| 13 |
+
"n_speakers": 10
|
| 14 |
+
},
|
| 15 |
+
"model": {
|
| 16 |
+
"inter_channels": 192,
|
| 17 |
+
"hidden_channels": 192,
|
| 18 |
+
"filter_channels": 768,
|
| 19 |
+
"n_heads": 2,
|
| 20 |
+
"n_layers": 6,
|
| 21 |
+
"n_layers_trans_flow": 3,
|
| 22 |
+
"kernel_size": 3,
|
| 23 |
+
"p_dropout": 0.1,
|
| 24 |
+
"resblock": "1",
|
| 25 |
+
"resblock_kernel_sizes": [
|
| 26 |
+
3,
|
| 27 |
+
7,
|
| 28 |
+
11
|
| 29 |
+
],
|
| 30 |
+
"resblock_dilation_sizes": [
|
| 31 |
+
[
|
| 32 |
+
1,
|
| 33 |
+
3,
|
| 34 |
+
5
|
| 35 |
+
],
|
| 36 |
+
[
|
| 37 |
+
1,
|
| 38 |
+
3,
|
| 39 |
+
5
|
| 40 |
+
],
|
| 41 |
+
[
|
| 42 |
+
1,
|
| 43 |
+
3,
|
| 44 |
+
5
|
| 45 |
+
]
|
| 46 |
+
],
|
| 47 |
+
"upsample_rates": [
|
| 48 |
+
8,
|
| 49 |
+
8,
|
| 50 |
+
2,
|
| 51 |
+
2
|
| 52 |
+
],
|
| 53 |
+
"upsample_initial_channel": 512,
|
| 54 |
+
"upsample_kernel_sizes": [
|
| 55 |
+
16,
|
| 56 |
+
16,
|
| 57 |
+
4,
|
| 58 |
+
4
|
| 59 |
+
],
|
| 60 |
+
"n_layers_q": 3,
|
| 61 |
+
"use_spectral_norm": false,
|
| 62 |
+
"gin_channels": 256
|
| 63 |
+
},
|
| 64 |
+
"symbols": [
|
| 65 |
+
"_",
|
| 66 |
+
",",
|
| 67 |
+
".",
|
| 68 |
+
"!",
|
| 69 |
+
"?",
|
| 70 |
+
"-",
|
| 71 |
+
"~",
|
| 72 |
+
"\u2026",
|
| 73 |
+
"N",
|
| 74 |
+
"Q",
|
| 75 |
+
"a",
|
| 76 |
+
"b",
|
| 77 |
+
"d",
|
| 78 |
+
"e",
|
| 79 |
+
"f",
|
| 80 |
+
"g",
|
| 81 |
+
"h",
|
| 82 |
+
"i",
|
| 83 |
+
"j",
|
| 84 |
+
"k",
|
| 85 |
+
"l",
|
| 86 |
+
"m",
|
| 87 |
+
"n",
|
| 88 |
+
"o",
|
| 89 |
+
"p",
|
| 90 |
+
"s",
|
| 91 |
+
"t",
|
| 92 |
+
"u",
|
| 93 |
+
"v",
|
| 94 |
+
"w",
|
| 95 |
+
"x",
|
| 96 |
+
"y",
|
| 97 |
+
"z",
|
| 98 |
+
"\u0251",
|
| 99 |
+
"\u00e6",
|
| 100 |
+
"\u0283",
|
| 101 |
+
"\u0291",
|
| 102 |
+
"\u00e7",
|
| 103 |
+
"\u026f",
|
| 104 |
+
"\u026a",
|
| 105 |
+
"\u0254",
|
| 106 |
+
"\u025b",
|
| 107 |
+
"\u0279",
|
| 108 |
+
"\u00f0",
|
| 109 |
+
"\u0259",
|
| 110 |
+
"\u026b",
|
| 111 |
+
"\u0265",
|
| 112 |
+
"\u0278",
|
| 113 |
+
"\u028a",
|
| 114 |
+
"\u027e",
|
| 115 |
+
"\u0292",
|
| 116 |
+
"\u03b8",
|
| 117 |
+
"\u03b2",
|
| 118 |
+
"\u014b",
|
| 119 |
+
"\u0266",
|
| 120 |
+
"\u207c",
|
| 121 |
+
"\u02b0",
|
| 122 |
+
"`",
|
| 123 |
+
"^",
|
| 124 |
+
"#",
|
| 125 |
+
"*",
|
| 126 |
+
"=",
|
| 127 |
+
"\u02c8",
|
| 128 |
+
"\u02cc",
|
| 129 |
+
"\u2192",
|
| 130 |
+
"\u2193",
|
| 131 |
+
"\u2191",
|
| 132 |
+
" "
|
| 133 |
+
],
|
| 134 |
+
"speakers": {
|
| 135 |
+
"default": 1,
|
| 136 |
+
"whispering": 2,
|
| 137 |
+
"shouting": 3,
|
| 138 |
+
"excited": 4,
|
| 139 |
+
"cheerful": 5,
|
| 140 |
+
"terrified": 6,
|
| 141 |
+
"angry": 7,
|
| 142 |
+
"sad": 8,
|
| 143 |
+
"friendly": 9
|
| 144 |
+
}
|
| 145 |
+
}
|
checkpoints/base_speakers/EN/en_default_se.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9cab24002eec738d0fe72cb73a34e57fbc3999c1bd4a1670a7b56ee4e3590ac9
|
| 3 |
+
size 1789
|
checkpoints/base_speakers/EN/en_style_se.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6f698153be5004b90a8642d1157c89cae7dd296752a3276450ced6a17b8b98a9
|
| 3 |
+
size 1783
|
checkpoints/base_speakers/ZH/checkpoint.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:de9fb0eb749f3254130fe0172fcbb20e75f88a9b16b54dd0b73cac0dc40da7d9
|
| 3 |
+
size 160467309
|
checkpoints/base_speakers/ZH/config.json
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"data": {
|
| 3 |
+
"text_cleaners": [
|
| 4 |
+
"cjke_cleaners2"
|
| 5 |
+
],
|
| 6 |
+
"sampling_rate": 22050,
|
| 7 |
+
"filter_length": 1024,
|
| 8 |
+
"hop_length": 256,
|
| 9 |
+
"win_length": 1024,
|
| 10 |
+
"n_mel_channels": 80,
|
| 11 |
+
"add_blank": true,
|
| 12 |
+
"cleaned_text": true,
|
| 13 |
+
"n_speakers": 10
|
| 14 |
+
},
|
| 15 |
+
"model": {
|
| 16 |
+
"inter_channels": 192,
|
| 17 |
+
"hidden_channels": 192,
|
| 18 |
+
"filter_channels": 768,
|
| 19 |
+
"n_heads": 2,
|
| 20 |
+
"n_layers": 6,
|
| 21 |
+
"n_layers_trans_flow": 3,
|
| 22 |
+
"kernel_size": 3,
|
| 23 |
+
"p_dropout": 0.1,
|
| 24 |
+
"resblock": "1",
|
| 25 |
+
"resblock_kernel_sizes": [
|
| 26 |
+
3,
|
| 27 |
+
7,
|
| 28 |
+
11
|
| 29 |
+
],
|
| 30 |
+
"resblock_dilation_sizes": [
|
| 31 |
+
[
|
| 32 |
+
1,
|
| 33 |
+
3,
|
| 34 |
+
5
|
| 35 |
+
],
|
| 36 |
+
[
|
| 37 |
+
1,
|
| 38 |
+
3,
|
| 39 |
+
5
|
| 40 |
+
],
|
| 41 |
+
[
|
| 42 |
+
1,
|
| 43 |
+
3,
|
| 44 |
+
5
|
| 45 |
+
]
|
| 46 |
+
],
|
| 47 |
+
"upsample_rates": [
|
| 48 |
+
8,
|
| 49 |
+
8,
|
| 50 |
+
2,
|
| 51 |
+
2
|
| 52 |
+
],
|
| 53 |
+
"upsample_initial_channel": 512,
|
| 54 |
+
"upsample_kernel_sizes": [
|
| 55 |
+
16,
|
| 56 |
+
16,
|
| 57 |
+
4,
|
| 58 |
+
4
|
| 59 |
+
],
|
| 60 |
+
"n_layers_q": 3,
|
| 61 |
+
"use_spectral_norm": false,
|
| 62 |
+
"gin_channels": 256
|
| 63 |
+
},
|
| 64 |
+
"symbols": [
|
| 65 |
+
"_",
|
| 66 |
+
",",
|
| 67 |
+
".",
|
| 68 |
+
"!",
|
| 69 |
+
"?",
|
| 70 |
+
"-",
|
| 71 |
+
"~",
|
| 72 |
+
"\u2026",
|
| 73 |
+
"N",
|
| 74 |
+
"Q",
|
| 75 |
+
"a",
|
| 76 |
+
"b",
|
| 77 |
+
"d",
|
| 78 |
+
"e",
|
| 79 |
+
"f",
|
| 80 |
+
"g",
|
| 81 |
+
"h",
|
| 82 |
+
"i",
|
| 83 |
+
"j",
|
| 84 |
+
"k",
|
| 85 |
+
"l",
|
| 86 |
+
"m",
|
| 87 |
+
"n",
|
| 88 |
+
"o",
|
| 89 |
+
"p",
|
| 90 |
+
"s",
|
| 91 |
+
"t",
|
| 92 |
+
"u",
|
| 93 |
+
"v",
|
| 94 |
+
"w",
|
| 95 |
+
"x",
|
| 96 |
+
"y",
|
| 97 |
+
"z",
|
| 98 |
+
"\u0251",
|
| 99 |
+
"\u00e6",
|
| 100 |
+
"\u0283",
|
| 101 |
+
"\u0291",
|
| 102 |
+
"\u00e7",
|
| 103 |
+
"\u026f",
|
| 104 |
+
"\u026a",
|
| 105 |
+
"\u0254",
|
| 106 |
+
"\u025b",
|
| 107 |
+
"\u0279",
|
| 108 |
+
"\u00f0",
|
| 109 |
+
"\u0259",
|
| 110 |
+
"\u026b",
|
| 111 |
+
"\u0265",
|
| 112 |
+
"\u0278",
|
| 113 |
+
"\u028a",
|
| 114 |
+
"\u027e",
|
| 115 |
+
"\u0292",
|
| 116 |
+
"\u03b8",
|
| 117 |
+
"\u03b2",
|
| 118 |
+
"\u014b",
|
| 119 |
+
"\u0266",
|
| 120 |
+
"\u207c",
|
| 121 |
+
"\u02b0",
|
| 122 |
+
"`",
|
| 123 |
+
"^",
|
| 124 |
+
"#",
|
| 125 |
+
"*",
|
| 126 |
+
"=",
|
| 127 |
+
"\u02c8",
|
| 128 |
+
"\u02cc",
|
| 129 |
+
"\u2192",
|
| 130 |
+
"\u2193",
|
| 131 |
+
"\u2191",
|
| 132 |
+
" "
|
| 133 |
+
],
|
| 134 |
+
"speakers": {
|
| 135 |
+
"default": 0
|
| 136 |
+
}
|
| 137 |
+
}
|
checkpoints/base_speakers/ZH/zh_default_se.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3b62e8264962059b8a84dd00b29e2fcccc92f5d3be90eec67dfa082c0cf58ccf
|
| 3 |
+
size 1789
|
checkpoints/converter/checkpoint.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:89ae83aa4e3668fef64b388b789ff7b0ce0def9f801069edfc18a00ea420748d
|
| 3 |
+
size 131327338
|
checkpoints/converter/config.json
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"data": {
|
| 3 |
+
"sampling_rate": 22050,
|
| 4 |
+
"filter_length": 1024,
|
| 5 |
+
"hop_length": 256,
|
| 6 |
+
"win_length": 1024,
|
| 7 |
+
"n_speakers": 0
|
| 8 |
+
},
|
| 9 |
+
"model": {
|
| 10 |
+
"inter_channels": 192,
|
| 11 |
+
"hidden_channels": 192,
|
| 12 |
+
"filter_channels": 768,
|
| 13 |
+
"n_heads": 2,
|
| 14 |
+
"n_layers": 6,
|
| 15 |
+
"kernel_size": 3,
|
| 16 |
+
"p_dropout": 0.1,
|
| 17 |
+
"resblock": "1",
|
| 18 |
+
"resblock_kernel_sizes": [
|
| 19 |
+
3,
|
| 20 |
+
7,
|
| 21 |
+
11
|
| 22 |
+
],
|
| 23 |
+
"resblock_dilation_sizes": [
|
| 24 |
+
[
|
| 25 |
+
1,
|
| 26 |
+
3,
|
| 27 |
+
5
|
| 28 |
+
],
|
| 29 |
+
[
|
| 30 |
+
1,
|
| 31 |
+
3,
|
| 32 |
+
5
|
| 33 |
+
],
|
| 34 |
+
[
|
| 35 |
+
1,
|
| 36 |
+
3,
|
| 37 |
+
5
|
| 38 |
+
]
|
| 39 |
+
],
|
| 40 |
+
"upsample_rates": [
|
| 41 |
+
8,
|
| 42 |
+
8,
|
| 43 |
+
2,
|
| 44 |
+
2
|
| 45 |
+
],
|
| 46 |
+
"upsample_initial_channel": 512,
|
| 47 |
+
"upsample_kernel_sizes": [
|
| 48 |
+
16,
|
| 49 |
+
16,
|
| 50 |
+
4,
|
| 51 |
+
4
|
| 52 |
+
],
|
| 53 |
+
"n_layers_q": 3,
|
| 54 |
+
"use_spectral_norm": false,
|
| 55 |
+
"gin_channels": 256
|
| 56 |
+
}
|
| 57 |
+
}
|