Egor3333 XuminYu committed on
Commit
5727c90
·
0 Parent(s):

Duplicate from myshell-ai/OpenVoice

Browse files

Co-authored-by: XuminYu <XuminYu@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ tags:
4
+ - audio
5
+ - text-to-speech
6
+ - instant-voice-cloning
7
+ language:
8
+ - en
9
+ - zh
10
+ inference: false
11
+ ---
12
+
13
+ # OpenVoice
14
+
15
+ <a href="https://trendshift.io/repositories/6161" target="_blank"><img src="https://trendshift.io/api/badge/repositories/6161" alt="myshell-ai/OpenVoice | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
16
+
17
+ OpenVoice is a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
18
+
19
+ <video controls autoplay src="https://cdn-uploads.huggingface.co/production/uploads/641de0213239b631552713e4/uCHTHD9OUotgOflqDu3QK.mp4"></video>
20
+
21
+ ### Features
22
+ - **Accurate Tone Color Cloning.** OpenVoice can accurately clone the reference tone color and generate speech in multiple languages and accents.
23
+ - **Flexible Voice Style Control.** OpenVoice enables granular control over voice styles, such as emotion and accent, as well as other style parameters including rhythm, pauses, and intonation.
24
+ - **Zero-shot Cross-lingual Voice Cloning.** Neither the language of the generated speech nor the language of the reference speech needs to be present in the massive-speaker multi-lingual training dataset.
25
+
26
+ ### How to Use
27
+ Please see [usage](https://github.com/myshell-ai/OpenVoice/blob/main/docs/USAGE.md) for detailed instructions.
28
+
29
+ ### Links
30
+ - [GitHub](https://github.com/myshell-ai/OpenVoice)
31
+ - [HFDemo](https://huggingface.co/spaces/myshell-ai/OpenVoice)
32
+ - [Discord](https://discord.gg/myshell)
33
+
checkpoints/base_speakers/EN/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1db1ae1a5c8ded049bd1536051489aefbfad4a5077c01c2257e9e88fa1bb8422
3
+ size 160467309
checkpoints/base_speakers/EN/config.json ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data": {
3
+ "text_cleaners": [
4
+ "cjke_cleaners2"
5
+ ],
6
+ "sampling_rate": 22050,
7
+ "filter_length": 1024,
8
+ "hop_length": 256,
9
+ "win_length": 1024,
10
+ "n_mel_channels": 80,
11
+ "add_blank": true,
12
+ "cleaned_text": true,
13
+ "n_speakers": 10
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "n_layers_trans_flow": 3,
22
+ "kernel_size": 3,
23
+ "p_dropout": 0.1,
24
+ "resblock": "1",
25
+ "resblock_kernel_sizes": [
26
+ 3,
27
+ 7,
28
+ 11
29
+ ],
30
+ "resblock_dilation_sizes": [
31
+ [
32
+ 1,
33
+ 3,
34
+ 5
35
+ ],
36
+ [
37
+ 1,
38
+ 3,
39
+ 5
40
+ ],
41
+ [
42
+ 1,
43
+ 3,
44
+ 5
45
+ ]
46
+ ],
47
+ "upsample_rates": [
48
+ 8,
49
+ 8,
50
+ 2,
51
+ 2
52
+ ],
53
+ "upsample_initial_channel": 512,
54
+ "upsample_kernel_sizes": [
55
+ 16,
56
+ 16,
57
+ 4,
58
+ 4
59
+ ],
60
+ "n_layers_q": 3,
61
+ "use_spectral_norm": false,
62
+ "gin_channels": 256
63
+ },
64
+ "symbols": [
65
+ "_",
66
+ ",",
67
+ ".",
68
+ "!",
69
+ "?",
70
+ "-",
71
+ "~",
72
+ "\u2026",
73
+ "N",
74
+ "Q",
75
+ "a",
76
+ "b",
77
+ "d",
78
+ "e",
79
+ "f",
80
+ "g",
81
+ "h",
82
+ "i",
83
+ "j",
84
+ "k",
85
+ "l",
86
+ "m",
87
+ "n",
88
+ "o",
89
+ "p",
90
+ "s",
91
+ "t",
92
+ "u",
93
+ "v",
94
+ "w",
95
+ "x",
96
+ "y",
97
+ "z",
98
+ "\u0251",
99
+ "\u00e6",
100
+ "\u0283",
101
+ "\u0291",
102
+ "\u00e7",
103
+ "\u026f",
104
+ "\u026a",
105
+ "\u0254",
106
+ "\u025b",
107
+ "\u0279",
108
+ "\u00f0",
109
+ "\u0259",
110
+ "\u026b",
111
+ "\u0265",
112
+ "\u0278",
113
+ "\u028a",
114
+ "\u027e",
115
+ "\u0292",
116
+ "\u03b8",
117
+ "\u03b2",
118
+ "\u014b",
119
+ "\u0266",
120
+ "\u207c",
121
+ "\u02b0",
122
+ "`",
123
+ "^",
124
+ "#",
125
+ "*",
126
+ "=",
127
+ "\u02c8",
128
+ "\u02cc",
129
+ "\u2192",
130
+ "\u2193",
131
+ "\u2191",
132
+ " "
133
+ ],
134
+ "speakers": {
135
+ "default": 1,
136
+ "whispering": 2,
137
+ "shouting": 3,
138
+ "excited": 4,
139
+ "cheerful": 5,
140
+ "terrified": 6,
141
+ "angry": 7,
142
+ "sad": 8,
143
+ "friendly": 9
144
+ }
145
+ }
checkpoints/base_speakers/EN/en_default_se.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cab24002eec738d0fe72cb73a34e57fbc3999c1bd4a1670a7b56ee4e3590ac9
3
+ size 1789
checkpoints/base_speakers/EN/en_style_se.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f698153be5004b90a8642d1157c89cae7dd296752a3276450ced6a17b8b98a9
3
+ size 1783
checkpoints/base_speakers/ZH/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de9fb0eb749f3254130fe0172fcbb20e75f88a9b16b54dd0b73cac0dc40da7d9
3
+ size 160467309
checkpoints/base_speakers/ZH/config.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data": {
3
+ "text_cleaners": [
4
+ "cjke_cleaners2"
5
+ ],
6
+ "sampling_rate": 22050,
7
+ "filter_length": 1024,
8
+ "hop_length": 256,
9
+ "win_length": 1024,
10
+ "n_mel_channels": 80,
11
+ "add_blank": true,
12
+ "cleaned_text": true,
13
+ "n_speakers": 10
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "n_layers_trans_flow": 3,
22
+ "kernel_size": 3,
23
+ "p_dropout": 0.1,
24
+ "resblock": "1",
25
+ "resblock_kernel_sizes": [
26
+ 3,
27
+ 7,
28
+ 11
29
+ ],
30
+ "resblock_dilation_sizes": [
31
+ [
32
+ 1,
33
+ 3,
34
+ 5
35
+ ],
36
+ [
37
+ 1,
38
+ 3,
39
+ 5
40
+ ],
41
+ [
42
+ 1,
43
+ 3,
44
+ 5
45
+ ]
46
+ ],
47
+ "upsample_rates": [
48
+ 8,
49
+ 8,
50
+ 2,
51
+ 2
52
+ ],
53
+ "upsample_initial_channel": 512,
54
+ "upsample_kernel_sizes": [
55
+ 16,
56
+ 16,
57
+ 4,
58
+ 4
59
+ ],
60
+ "n_layers_q": 3,
61
+ "use_spectral_norm": false,
62
+ "gin_channels": 256
63
+ },
64
+ "symbols": [
65
+ "_",
66
+ ",",
67
+ ".",
68
+ "!",
69
+ "?",
70
+ "-",
71
+ "~",
72
+ "\u2026",
73
+ "N",
74
+ "Q",
75
+ "a",
76
+ "b",
77
+ "d",
78
+ "e",
79
+ "f",
80
+ "g",
81
+ "h",
82
+ "i",
83
+ "j",
84
+ "k",
85
+ "l",
86
+ "m",
87
+ "n",
88
+ "o",
89
+ "p",
90
+ "s",
91
+ "t",
92
+ "u",
93
+ "v",
94
+ "w",
95
+ "x",
96
+ "y",
97
+ "z",
98
+ "\u0251",
99
+ "\u00e6",
100
+ "\u0283",
101
+ "\u0291",
102
+ "\u00e7",
103
+ "\u026f",
104
+ "\u026a",
105
+ "\u0254",
106
+ "\u025b",
107
+ "\u0279",
108
+ "\u00f0",
109
+ "\u0259",
110
+ "\u026b",
111
+ "\u0265",
112
+ "\u0278",
113
+ "\u028a",
114
+ "\u027e",
115
+ "\u0292",
116
+ "\u03b8",
117
+ "\u03b2",
118
+ "\u014b",
119
+ "\u0266",
120
+ "\u207c",
121
+ "\u02b0",
122
+ "`",
123
+ "^",
124
+ "#",
125
+ "*",
126
+ "=",
127
+ "\u02c8",
128
+ "\u02cc",
129
+ "\u2192",
130
+ "\u2193",
131
+ "\u2191",
132
+ " "
133
+ ],
134
+ "speakers": {
135
+ "default": 0
136
+ }
137
+ }
checkpoints/base_speakers/ZH/zh_default_se.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b62e8264962059b8a84dd00b29e2fcccc92f5d3be90eec67dfa082c0cf58ccf
3
+ size 1789
checkpoints/converter/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89ae83aa4e3668fef64b388b789ff7b0ce0def9f801069edfc18a00ea420748d
3
+ size 131327338
checkpoints/converter/config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data": {
3
+ "sampling_rate": 22050,
4
+ "filter_length": 1024,
5
+ "hop_length": 256,
6
+ "win_length": 1024,
7
+ "n_speakers": 0
8
+ },
9
+ "model": {
10
+ "inter_channels": 192,
11
+ "hidden_channels": 192,
12
+ "filter_channels": 768,
13
+ "n_heads": 2,
14
+ "n_layers": 6,
15
+ "kernel_size": 3,
16
+ "p_dropout": 0.1,
17
+ "resblock": "1",
18
+ "resblock_kernel_sizes": [
19
+ 3,
20
+ 7,
21
+ 11
22
+ ],
23
+ "resblock_dilation_sizes": [
24
+ [
25
+ 1,
26
+ 3,
27
+ 5
28
+ ],
29
+ [
30
+ 1,
31
+ 3,
32
+ 5
33
+ ],
34
+ [
35
+ 1,
36
+ 3,
37
+ 5
38
+ ]
39
+ ],
40
+ "upsample_rates": [
41
+ 8,
42
+ 8,
43
+ 2,
44
+ 2
45
+ ],
46
+ "upsample_initial_channel": 512,
47
+ "upsample_kernel_sizes": [
48
+ 16,
49
+ 16,
50
+ 4,
51
+ 4
52
+ ],
53
+ "n_layers_q": 3,
54
+ "use_spectral_norm": false,
55
+ "gin_channels": 256
56
+ }
57
+ }