Chuatury commited on
Commit
416031b
·
unverified ·
1 Parent(s): ddc2731

remove tts

Browse files
README.md CHANGED
@@ -5,11 +5,11 @@ colorFrom: blue
5
  colorTo: red
6
  sdk: gradio
7
  sdk_version: 3.48.0
8
- app_file: app_locally.py
9
  pinned: false
10
  license: mit
11
  models:
12
- - myshell-ai/OpenVoice
13
  ---
14
 
15
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
5
  colorTo: red
6
  sdk: gradio
7
  sdk_version: 3.48.0
8
+ app_file: app.py
9
  pinned: false
10
  license: mit
11
  models:
12
+ - myshell-ai/OpenVoice
13
  ---
14
 
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,268 +1,114 @@
1
  import os
 
 
2
  import gradio as gr
3
- import requests
4
- import langid
5
- import base64
6
- import json
7
- import time
8
 
9
- API_URL = os.environ.get("API_URL")
10
- TOKEN = os.environ.get("TOKEN")
11
- supported_languages = ['zh', 'en']
12
 
13
- output_dir = 'outputs'
14
- os.makedirs(output_dir, exist_ok=True)
15
-
16
- def audio_to_base64(audio_file):
17
- with open(audio_file, "rb") as audio_file:
18
- audio_data = audio_file.read()
19
- base64_data = base64.b64encode(audio_data).decode("utf-8")
20
- return base64_data
21
-
22
- def predict(prompt, style, audio_file_pth, agree):
23
- # initialize a empty info
24
- text_hint = ''
25
- # agree with the terms
26
- if agree == False:
27
- text_hint += '[ERROR] Please accept the Terms & Condition!\n'
28
- gr.Warning("Please accept the Terms & Condition!")
29
- return (
30
- text_hint,
31
- None,
32
- None,
33
- )
34
-
35
- # first detect the input language
36
- language_predicted = langid.classify(prompt)[0].strip()
37
- print(f"Detected language:{language_predicted}")
38
-
39
-
40
- if language_predicted not in supported_languages:
41
- text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
42
- gr.Warning(
43
- f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
44
- )
45
 
46
- return (
47
- text_hint,
48
- None,
49
- None,
50
- )
 
51
 
52
- if language_predicted == "en":
53
- if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
54
- text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
55
- gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
56
- return (
57
- text_hint,
58
- None,
59
- None,
60
- )
61
- style = 'en_' + style
62
- prompt_length = len(prompt.split(' '))
63
 
64
- else:
65
- if style not in ['default']:
66
- text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
67
- gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
68
- return (
69
- text_hint,
70
- None,
71
- None,
72
- )
73
- style = 'cn_' + style
74
- prompt_length = len(prompt)
75
 
76
- speaker_wav = audio_file_pth
 
 
 
 
 
 
 
77
 
78
- if prompt_length < 2:
79
- text_hint += f"[ERROR] Please give a longer prompt text \n"
80
- gr.Warning("Please give a longer prompt text")
81
- return (
82
- text_hint,
83
- None,
84
- None,
85
- )
86
- if prompt_length > 50:
87
- text_hint += f"[ERROR] Text length limited to 50 words for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
88
- gr.Warning(
89
- "Text length limited to 50 words for this demo, please try shorter text. You can clone our open-source repo for your usage"
 
 
 
 
 
90
  )
 
 
 
91
  return (
92
  text_hint,
93
  None,
94
  None,
95
  )
96
 
97
- save_path = f'{output_dir}/output.wav'
98
- speaker_audio_base64 = audio_to_base64(speaker_wav)
99
- data = {
100
- "text": prompt,
101
- "reference_speaker": speaker_audio_base64,
102
- "emotion": style
103
- }
104
-
105
- start = time.time()
106
- # Send the data as a POST request
107
-
108
- headers = {
109
- "Authorization": f"Bearer {TOKEN}"
110
- }
111
-
112
- response = requests.post(API_URL, json=data, headers=headers, timeout=60)
113
- print(f'Get response successfully within {time.time() - start}')
114
-
115
- task_id = response.json()['task_id']
116
- while True:
117
- response = requests.post(API_URL.replace('run', 'get_result'), json={'task_id': task_id}, headers=headers)
118
- json_data = response.json()
119
- status = json_data['status']
120
- if status in ["CREATED", "RUNNING"]:
121
- time.sleep(1)
122
- continue
123
- if status == 'FAILED':
124
- text_hint += f"[HTTP ERROR] {json_data['error']} \n"
125
- gr.Warning(
126
- f"[HTTP ERROR] {json_data['error']} \n"
127
- )
128
- return (
129
- text_hint,
130
- None,
131
- None,
132
- )
133
- else:
134
- decoded_bytes = base64.b64decode(json_data['result']['base64'].encode('utf-8'))
135
- with open(save_path, 'wb') as f:
136
- f.write(decoded_bytes)
137
-
138
- text_hint += f'''Get response successfully \n'''
139
- return (
140
- text_hint,
141
- save_path,
142
- speaker_wav,
143
- )
144
-
145
-
146
- title = "MyShell OpenVoice"
147
-
148
- description = """
149
- We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
150
- """
151
-
152
- markdown_table = """
153
- <div align="center" style="margin-bottom: 10px;">
154
-
155
- | | | |
156
- | :-----------: | :-----------: | :-----------: |
157
- | **OpenSource Repo** | **Project Page** | **Join the Community** |
158
- | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
159
-
160
- </div>
161
- """
162
-
163
- markdown_table_v2 = """
164
- <div align="center" style="margin-bottom: 2px;">
165
 
166
- | | | | |
167
- | :-----------: | :-----------: | :-----------: | :-----------: |
168
- | **Github Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | **Project Page** | [OpenVoice](https://research.myshell.ai/open-voice) |
169
 
170
- | | |
171
- | :-----------: | :-----------: |
172
- **Join the Community** | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
 
 
173
 
174
- </div>
175
- """
176
- content = """
177
- <div>
178
- <strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
179
- This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
180
- </div>
181
- """
182
- wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
183
-
184
-
185
- examples = [
186
- [
187
- "今天天气真好,我们一起出去吃饭吧。",
188
- 'default',
189
- "examples/speaker1.mp3",
190
- True,
191
- ],[
192
- "This audio is generated by open voice with a half-performance model.",
193
- 'whispering',
194
- "examples/speaker2.mp3",
195
- True,
196
- ],
197
- [
198
- "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
199
- 'sad',
200
- "examples/speaker0.mp3",
201
- True,
202
- ],
203
- ]
204
 
205
  with gr.Blocks(analytics_enabled=False) as demo:
206
 
207
  with gr.Row():
208
  with gr.Column():
209
- with gr.Row():
210
- gr.Markdown(
211
- """
212
- ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
213
- """
214
- )
215
- with gr.Row():
216
- gr.Markdown(markdown_table_v2)
217
- with gr.Row():
218
- gr.Markdown(description)
219
- with gr.Column():
220
- gr.Video('./open_voice.mp4', autoplay=True)
221
-
222
- with gr.Row():
223
- gr.HTML(wrapped_markdown_content)
224
-
225
- with gr.Row():
226
- with gr.Column():
227
- input_text_gr = gr.Textbox(
228
- label="Text Prompt",
229
- info="One or two sentences at a time is better. Up to 200 text characters.",
230
- value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
231
- )
232
- style_gr = gr.Dropdown(
233
- label="Style",
234
- info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
235
- choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
236
- max_choices=1,
237
- value="default",
238
- )
239
  ref_gr = gr.Audio(
240
  label="Reference Audio",
241
  info="Click on the ✎ button to upload your own target speaker audio",
242
  type="filepath",
243
- value="examples/speaker2.mp3",
244
  )
245
- tos_gr = gr.Checkbox(
246
- label="Agree",
247
- value=False,
248
- info="I agree to the terms of the MIT license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
 
249
  )
250
 
251
  tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
252
 
253
-
254
  with gr.Column():
255
  out_text_gr = gr.Text(label="Info")
256
- audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
257
  ref_audio_gr = gr.Audio(label="Reference Audio Used")
258
 
259
- gr.Examples(examples,
260
- label="Examples",
261
- inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
262
- outputs=[out_text_gr, audio_gr, ref_audio_gr],
263
- fn=predict,
264
- cache_examples=False,)
265
- tts_button.click(predict, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])
266
 
267
- demo.queue(concurrency_count=6)
268
- demo.launch(debug=True, show_api=True)
 
1
  import os
2
+ import torch
3
+ import argparse
4
  import gradio as gr
 
 
 
 
 
5
 
 
 
 
6
 
7
+ parser = argparse.ArgumentParser()
8
+ # parser.add_argument(
9
+ # "--online_checkpoint_url",
10
+ # default="https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_1226.zip",
11
+ # )
12
+ parser.add_argument(
13
+ "--share", action="store_true", default=False, help="make link public"
14
+ )
15
+ args = parser.parse_args()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
+ # first download the checkpoints from server
18
+ # if not os.path.exists("checkpoints/"):
19
+ # print("Downloading OpenVoice checkpoint ...")
20
+ # os.system(f"wget {args.online_checkpoint_url} -O ckpt.zip")
21
+ # print("Extracting OpenVoice checkpoint ...")
22
+ # ZipFile("ckpt.zip").extractall()
23
 
24
+ print("Starting OpenVoice")
 
 
 
 
 
 
 
 
 
 
25
 
26
+ from openvoice import se_extractor
27
+ from openvoice.api import ToneColorConverter
 
 
 
 
 
 
 
 
 
28
 
29
+ ckpt_converter = "checkpoints/converter"
30
+ device = "cuda" if torch.cuda.is_available() else "cpu"
31
+ output_dir = "outputs"
32
+ os.makedirs(output_dir, exist_ok=True)
33
+ tone_color_converter = ToneColorConverter(
34
+ f"{ckpt_converter}/config.json", device=device
35
+ )
36
+ tone_color_converter.load_ckpt(f"{ckpt_converter}/checkpoint.pth")
37
 
38
+ def predict(speaker_wav, transform_wav):
39
+ # initialize a empty info
40
+ text_hint = ""
41
+
42
+ # extract source_se
43
+ source_se, _ = se_extractor.get_se(
44
+ transform_wav,
45
+ tone_color_converter,
46
+ vad=True,
47
+ )
48
+
49
+ # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
50
+ try:
51
+ target_se, _ = se_extractor.get_se(
52
+ speaker_wav,
53
+ tone_color_converter,
54
+ vad=True,
55
  )
56
+ except Exception as e:
57
+ text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
58
+ gr.Warning("[ERROR] Get target tone color error {str(e)} \n")
59
  return (
60
  text_hint,
61
  None,
62
  None,
63
  )
64
 
65
+ save_path = f"{output_dir}/output.wav"
66
+ # Run the tone color converter
67
+ tone_color_converter.convert(
68
+ audio_src_path=transform_wav,
69
+ src_se=source_se,
70
+ tgt_se=target_se,
71
+ output_path=save_path,
72
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
+ text_hint += f"""Get response successfully \n"""
 
 
75
 
76
+ return (
77
+ text_hint,
78
+ save_path,
79
+ speaker_wav,
80
+ )
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  with gr.Blocks(analytics_enabled=False) as demo:
84
 
85
  with gr.Row():
86
  with gr.Column():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  ref_gr = gr.Audio(
88
  label="Reference Audio",
89
  info="Click on the ✎ button to upload your own target speaker audio",
90
  type="filepath",
91
+ value="examples/speaker0.mp3",
92
  )
93
+ tra_gr = gr.Audio(
94
+ label="Transform Audio",
95
+ info="Click on the ✎ button to upload your own target transform audio",
96
+ type="filepath",
97
+ value=None,
98
  )
99
 
100
  tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
101
 
 
102
  with gr.Column():
103
  out_text_gr = gr.Text(label="Info")
104
+ audio_gr = gr.Audio(label="Synthesized Audio", autoplay=True)
105
  ref_audio_gr = gr.Audio(label="Reference Audio Used")
106
 
107
+ tts_button.click(
108
+ predict,
109
+ [ref_gr, tra_gr],
110
+ outputs=[out_text_gr, audio_gr, ref_audio_gr],
111
+ )
 
 
112
 
113
+ demo.queue()
114
+ demo.launch(debug=True, show_api=True, share=args.share)
app_locally.py DELETED
@@ -1,222 +0,0 @@
1
- import os
2
- import torch
3
- import argparse
4
- import gradio as gr
5
- # from zipfile import ZipFile
6
- import langid
7
-
8
-
9
- parser = argparse.ArgumentParser()
10
- # parser.add_argument(
11
- # "--online_checkpoint_url",
12
- # default="https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_1226.zip",
13
- # )
14
- parser.add_argument(
15
- "--share", action="store_true", default=False, help="make link public"
16
- )
17
- args = parser.parse_args()
18
-
19
- # first download the checkpoints from server
20
- # if not os.path.exists("checkpoints/"):
21
- # print("Downloading OpenVoice checkpoint ...")
22
- # os.system(f"wget {args.online_checkpoint_url} -O ckpt.zip")
23
- # print("Extracting OpenVoice checkpoint ...")
24
- # ZipFile("ckpt.zip").extractall()
25
-
26
- print("Starting OpenVoice")
27
-
28
- # Init EN/ZH baseTTS and ToneConvertor
29
- from openvoice import se_extractor
30
- from openvoice.api import BaseSpeakerTTS, ToneColorConverter
31
-
32
- en_ckpt_base = "checkpoints/base_speakers/EN"
33
- zh_ckpt_base = "checkpoints/base_speakers/ZH"
34
- ckpt_converter = "checkpoints/converter"
35
- device = "cuda" if torch.cuda.is_available() else "cpu"
36
- output_dir = "outputs"
37
- os.makedirs(output_dir, exist_ok=True)
38
- en_base_speaker_tts = BaseSpeakerTTS(f"{en_ckpt_base}/config.json", device=device)
39
- en_base_speaker_tts.load_ckpt(f"{en_ckpt_base}/checkpoint.pth")
40
- zh_base_speaker_tts = BaseSpeakerTTS(f"{zh_ckpt_base}/config.json", device=device)
41
- zh_base_speaker_tts.load_ckpt(f"{zh_ckpt_base}/checkpoint.pth")
42
- tone_color_converter = ToneColorConverter(
43
- f"{ckpt_converter}/config.json", device=device
44
- )
45
- tone_color_converter.load_ckpt(f"{ckpt_converter}/checkpoint.pth")
46
- en_source_default_se = torch.load(f"{en_ckpt_base}/en_default_se.pth").to(device)
47
- zh_source_se = torch.load(f"{zh_ckpt_base}/zh_default_se.pth").to(device)
48
-
49
- supported_languages = ["zh", "en"]
50
-
51
-
52
- def predict(prompt, speaker_wav, transform_wav):
53
- # initialize a empty info
54
- text_hint = ""
55
-
56
- if transform_wav is not None:
57
- # if transform_wav is provided, use it as the source audio
58
- src_path = transform_wav
59
- text_hint += f"Using transform audio {src_path} as source audio \n"
60
-
61
- # extract source_se
62
- source_se, _ = se_extractor.get_se(
63
- transform_wav,
64
- tone_color_converter,
65
- vad=True,
66
- )
67
- else:
68
- # first detect the input language
69
- language_predicted = langid.classify(prompt)[0].strip()
70
- print(f"Detected language:{language_predicted}")
71
-
72
- if language_predicted not in supported_languages:
73
- text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
74
- gr.Warning(
75
- f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
76
- )
77
-
78
- return (
79
- text_hint,
80
- None,
81
- None,
82
- )
83
-
84
- if language_predicted == "zh":
85
- tts_model = zh_base_speaker_tts
86
- source_se = zh_source_se
87
- language = "Chinese"
88
-
89
- else:
90
- tts_model = en_base_speaker_tts
91
- source_se = en_source_default_se
92
- language = "English"
93
-
94
- text_hint += f"Using TTS to generate source audio from the prompt text \n"
95
- src_path = f"{output_dir}/tmp.wav"
96
- tts_model.tts(prompt, src_path, speaker="default", language=language)
97
-
98
- # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
99
- try:
100
- target_se, wavs_folder = se_extractor.get_se(
101
- speaker_wav,
102
- tone_color_converter,
103
- vad=True,
104
- )
105
- # os.system(f'rm -rf {wavs_folder}')
106
- except Exception as e:
107
- text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
108
- gr.Warning("[ERROR] Get target tone color error {str(e)} \n")
109
- return (
110
- text_hint,
111
- None,
112
- None,
113
- )
114
-
115
- save_path = f"{output_dir}/output.wav"
116
- # Run the tone color converter
117
- tone_color_converter.convert(
118
- audio_src_path=src_path,
119
- src_se=source_se,
120
- tgt_se=target_se,
121
- output_path=save_path,
122
- )
123
-
124
- text_hint += f"""Get response successfully \n"""
125
-
126
- return (
127
- text_hint,
128
- save_path,
129
- speaker_wav,
130
- src_path,
131
- )
132
-
133
-
134
- title = "MyShell OpenVoice"
135
-
136
- content = """
137
- <div>
138
- <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
139
- This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
140
- </div>
141
- """
142
- wrapped_markdown_content = (
143
- f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
144
- )
145
-
146
-
147
- examples = [
148
- [
149
- "今天天气真好,我们一起出去吃饭吧。",
150
- "examples/speaker0.mp3",
151
- None,
152
- ],
153
- [
154
- "This audio is generated by open voice with a half-performance model.",
155
- "examples/speaker1.mp3",
156
- None,
157
- ],
158
- [
159
- "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
160
- "examples/speaker2.mp3",
161
- None,
162
- ],
163
- ]
164
-
165
- with gr.Blocks(analytics_enabled=False) as demo:
166
-
167
- with gr.Row():
168
- with gr.Column():
169
- with gr.Row():
170
- gr.Markdown(
171
- """
172
- ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
173
- """
174
- )
175
-
176
- with gr.Row():
177
- gr.HTML(wrapped_markdown_content)
178
-
179
- with gr.Row():
180
- with gr.Column():
181
- input_text_gr = gr.Textbox(
182
- label="Text Prompt",
183
- info="One or two sentences at a time is better. Up to 200 text characters.",
184
- value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
185
- )
186
- ref_gr = gr.Audio(
187
- label="Reference Audio",
188
- info="Click on the ✎ button to upload your own target speaker audio",
189
- type="filepath",
190
- value="examples/speaker0.mp3",
191
- )
192
- tra_gr = gr.Audio(
193
- label="Transform Audio",
194
- info="Click on the ✎ button to upload your own target transform audio",
195
- type="filepath",
196
- value=None,
197
- )
198
-
199
- tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
200
-
201
- with gr.Column():
202
- out_text_gr = gr.Text(label="Info")
203
- audio_gr = gr.Audio(label="Synthesized Audio", autoplay=True)
204
- ref_audio_gr = gr.Audio(label="Reference Audio Used")
205
- tts_audio_gr = gr.Audio(label="TTS Generated Audio")
206
-
207
- gr.Examples(
208
- examples,
209
- label="Examples",
210
- inputs=[input_text_gr, ref_gr, tra_gr],
211
- outputs=[out_text_gr, audio_gr, ref_audio_gr, tts_audio_gr],
212
- fn=predict,
213
- cache_examples=False,
214
- )
215
- tts_button.click(
216
- predict,
217
- [input_text_gr, ref_gr, tra_gr],
218
- outputs=[out_text_gr, audio_gr, ref_audio_gr, tts_audio_gr],
219
- )
220
-
221
- demo.queue()
222
- demo.launch(debug=True, show_api=True, share=args.share)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/base_speakers/EN/checkpoint.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1db1ae1a5c8ded049bd1536051489aefbfad4a5077c01c2257e9e88fa1bb8422
3
- size 160467309
 
 
 
 
checkpoints/base_speakers/EN/config.json DELETED
@@ -1,145 +0,0 @@
1
- {
2
- "data": {
3
- "text_cleaners": [
4
- "cjke_cleaners2"
5
- ],
6
- "sampling_rate": 22050,
7
- "filter_length": 1024,
8
- "hop_length": 256,
9
- "win_length": 1024,
10
- "n_mel_channels": 80,
11
- "add_blank": true,
12
- "cleaned_text": true,
13
- "n_speakers": 10
14
- },
15
- "model": {
16
- "inter_channels": 192,
17
- "hidden_channels": 192,
18
- "filter_channels": 768,
19
- "n_heads": 2,
20
- "n_layers": 6,
21
- "n_layers_trans_flow": 3,
22
- "kernel_size": 3,
23
- "p_dropout": 0.1,
24
- "resblock": "1",
25
- "resblock_kernel_sizes": [
26
- 3,
27
- 7,
28
- 11
29
- ],
30
- "resblock_dilation_sizes": [
31
- [
32
- 1,
33
- 3,
34
- 5
35
- ],
36
- [
37
- 1,
38
- 3,
39
- 5
40
- ],
41
- [
42
- 1,
43
- 3,
44
- 5
45
- ]
46
- ],
47
- "upsample_rates": [
48
- 8,
49
- 8,
50
- 2,
51
- 2
52
- ],
53
- "upsample_initial_channel": 512,
54
- "upsample_kernel_sizes": [
55
- 16,
56
- 16,
57
- 4,
58
- 4
59
- ],
60
- "n_layers_q": 3,
61
- "use_spectral_norm": false,
62
- "gin_channels": 256
63
- },
64
- "symbols": [
65
- "_",
66
- ",",
67
- ".",
68
- "!",
69
- "?",
70
- "-",
71
- "~",
72
- "\u2026",
73
- "N",
74
- "Q",
75
- "a",
76
- "b",
77
- "d",
78
- "e",
79
- "f",
80
- "g",
81
- "h",
82
- "i",
83
- "j",
84
- "k",
85
- "l",
86
- "m",
87
- "n",
88
- "o",
89
- "p",
90
- "s",
91
- "t",
92
- "u",
93
- "v",
94
- "w",
95
- "x",
96
- "y",
97
- "z",
98
- "\u0251",
99
- "\u00e6",
100
- "\u0283",
101
- "\u0291",
102
- "\u00e7",
103
- "\u026f",
104
- "\u026a",
105
- "\u0254",
106
- "\u025b",
107
- "\u0279",
108
- "\u00f0",
109
- "\u0259",
110
- "\u026b",
111
- "\u0265",
112
- "\u0278",
113
- "\u028a",
114
- "\u027e",
115
- "\u0292",
116
- "\u03b8",
117
- "\u03b2",
118
- "\u014b",
119
- "\u0266",
120
- "\u207c",
121
- "\u02b0",
122
- "`",
123
- "^",
124
- "#",
125
- "*",
126
- "=",
127
- "\u02c8",
128
- "\u02cc",
129
- "\u2192",
130
- "\u2193",
131
- "\u2191",
132
- " "
133
- ],
134
- "speakers": {
135
- "default": 1,
136
- "whispering": 2,
137
- "shouting": 3,
138
- "excited": 4,
139
- "cheerful": 5,
140
- "terrified": 6,
141
- "angry": 7,
142
- "sad": 8,
143
- "friendly": 9
144
- }
145
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/base_speakers/EN/en_default_se.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9cab24002eec738d0fe72cb73a34e57fbc3999c1bd4a1670a7b56ee4e3590ac9
3
- size 1789
 
 
 
 
checkpoints/base_speakers/ZH/checkpoint.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:de9fb0eb749f3254130fe0172fcbb20e75f88a9b16b54dd0b73cac0dc40da7d9
3
- size 160467309
 
 
 
 
checkpoints/base_speakers/ZH/config.json DELETED
@@ -1,137 +0,0 @@
1
- {
2
- "data": {
3
- "text_cleaners": [
4
- "cjke_cleaners2"
5
- ],
6
- "sampling_rate": 22050,
7
- "filter_length": 1024,
8
- "hop_length": 256,
9
- "win_length": 1024,
10
- "n_mel_channels": 80,
11
- "add_blank": true,
12
- "cleaned_text": true,
13
- "n_speakers": 10
14
- },
15
- "model": {
16
- "inter_channels": 192,
17
- "hidden_channels": 192,
18
- "filter_channels": 768,
19
- "n_heads": 2,
20
- "n_layers": 6,
21
- "n_layers_trans_flow": 3,
22
- "kernel_size": 3,
23
- "p_dropout": 0.1,
24
- "resblock": "1",
25
- "resblock_kernel_sizes": [
26
- 3,
27
- 7,
28
- 11
29
- ],
30
- "resblock_dilation_sizes": [
31
- [
32
- 1,
33
- 3,
34
- 5
35
- ],
36
- [
37
- 1,
38
- 3,
39
- 5
40
- ],
41
- [
42
- 1,
43
- 3,
44
- 5
45
- ]
46
- ],
47
- "upsample_rates": [
48
- 8,
49
- 8,
50
- 2,
51
- 2
52
- ],
53
- "upsample_initial_channel": 512,
54
- "upsample_kernel_sizes": [
55
- 16,
56
- 16,
57
- 4,
58
- 4
59
- ],
60
- "n_layers_q": 3,
61
- "use_spectral_norm": false,
62
- "gin_channels": 256
63
- },
64
- "symbols": [
65
- "_",
66
- ",",
67
- ".",
68
- "!",
69
- "?",
70
- "-",
71
- "~",
72
- "\u2026",
73
- "N",
74
- "Q",
75
- "a",
76
- "b",
77
- "d",
78
- "e",
79
- "f",
80
- "g",
81
- "h",
82
- "i",
83
- "j",
84
- "k",
85
- "l",
86
- "m",
87
- "n",
88
- "o",
89
- "p",
90
- "s",
91
- "t",
92
- "u",
93
- "v",
94
- "w",
95
- "x",
96
- "y",
97
- "z",
98
- "\u0251",
99
- "\u00e6",
100
- "\u0283",
101
- "\u0291",
102
- "\u00e7",
103
- "\u026f",
104
- "\u026a",
105
- "\u0254",
106
- "\u025b",
107
- "\u0279",
108
- "\u00f0",
109
- "\u0259",
110
- "\u026b",
111
- "\u0265",
112
- "\u0278",
113
- "\u028a",
114
- "\u027e",
115
- "\u0292",
116
- "\u03b8",
117
- "\u03b2",
118
- "\u014b",
119
- "\u0266",
120
- "\u207c",
121
- "\u02b0",
122
- "`",
123
- "^",
124
- "#",
125
- "*",
126
- "=",
127
- "\u02c8",
128
- "\u02cc",
129
- "\u2192",
130
- "\u2193",
131
- "\u2191",
132
- " "
133
- ],
134
- "speakers": {
135
- "default": 0
136
- }
137
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/base_speakers/ZH/zh_default_se.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b62e8264962059b8a84dd00b29e2fcccc92f5d3be90eec67dfa082c0cf58ccf
3
- size 1789
 
 
 
 
openvoice/api.py CHANGED
@@ -1,12 +1,9 @@
1
  import torch
2
- import numpy as np
3
- import re
4
  import soundfile
5
  from openvoice import utils
6
  from openvoice import commons
7
  import os
8
  import librosa
9
- from openvoice.text import text_to_sequence
10
  from openvoice.mel_processing import spectrogram_torch
11
  from openvoice.models import SynthesizerTrn
12
 
@@ -39,65 +36,6 @@ class OpenVoiceBaseClass(object):
39
  print('missing/unexpected keys:', a, b)
40
 
41
 
42
- class BaseSpeakerTTS(OpenVoiceBaseClass):
43
- language_marks = {
44
- "english": "EN",
45
- "chinese": "ZH",
46
- }
47
-
48
- @staticmethod
49
- def get_text(text, hps, is_symbol):
50
- text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
51
- if hps.data.add_blank:
52
- text_norm = commons.intersperse(text_norm, 0)
53
- text_norm = torch.LongTensor(text_norm)
54
- return text_norm
55
-
56
- @staticmethod
57
- def audio_numpy_concat(segment_data_list, sr, speed=1.):
58
- audio_segments = []
59
- for segment_data in segment_data_list:
60
- audio_segments += segment_data.reshape(-1).tolist()
61
- audio_segments += [0] * int((sr * 0.05)/speed)
62
- audio_segments = np.array(audio_segments).astype(np.float32)
63
- return audio_segments
64
-
65
- @staticmethod
66
- def split_sentences_into_pieces(text, language_str):
67
- texts = utils.split_sentence(text, language_str=language_str)
68
- print(" > Text splitted to sentences.")
69
- print('\n'.join(texts))
70
- print(" > ===========================")
71
- return texts
72
-
73
- def tts(self, text, output_path, speaker, language='English', speed=1.0):
74
- mark = self.language_marks.get(language.lower(), None)
75
- assert mark is not None, f"language {language} is not supported"
76
-
77
- texts = self.split_sentences_into_pieces(text, mark)
78
-
79
- audio_list = []
80
- for t in texts:
81
- t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
82
- t = f'[{mark}]{t}[{mark}]'
83
- stn_tst = self.get_text(t, self.hps, False)
84
- device = self.device
85
- speaker_id = self.hps.speakers[speaker]
86
- with torch.no_grad():
87
- x_tst = stn_tst.unsqueeze(0).to(device)
88
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
89
- sid = torch.LongTensor([speaker_id]).to(device)
90
- audio = self.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6,
91
- length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
92
- audio_list.append(audio)
93
- audio = self.audio_numpy_concat(audio_list, sr=self.hps.data.sampling_rate, speed=speed)
94
-
95
- if output_path is None:
96
- return audio
97
- else:
98
- soundfile.write(output_path, audio, self.hps.data.sampling_rate)
99
-
100
-
101
  class ToneColorConverter(OpenVoiceBaseClass):
102
  def __init__(self, *args, **kwargs):
103
  super().__init__(*args, **kwargs)
 
1
  import torch
 
 
2
  import soundfile
3
  from openvoice import utils
4
  from openvoice import commons
5
  import os
6
  import librosa
 
7
  from openvoice.mel_processing import spectrogram_torch
8
  from openvoice.models import SynthesizerTrn
9
 
 
36
  print('missing/unexpected keys:', a, b)
37
 
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  class ToneColorConverter(OpenVoiceBaseClass):
40
  def __init__(self, *args, **kwargs):
41
  super().__init__(*args, **kwargs)
openvoice/openvoice_app.py DELETED
@@ -1,275 +0,0 @@
1
- import os
2
- import torch
3
- import argparse
4
- import gradio as gr
5
- from zipfile import ZipFile
6
- import langid
7
- from openvoice import se_extractor
8
- from openvoice.api import BaseSpeakerTTS, ToneColorConverter
9
-
10
- parser = argparse.ArgumentParser()
11
- parser.add_argument("--share", action='store_true', default=False, help="make link public")
12
- args = parser.parse_args()
13
-
14
- en_ckpt_base = 'checkpoints/base_speakers/EN'
15
- zh_ckpt_base = 'checkpoints/base_speakers/ZH'
16
- ckpt_converter = 'checkpoints/converter'
17
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
18
- output_dir = 'outputs'
19
- os.makedirs(output_dir, exist_ok=True)
20
-
21
- # load models
22
- en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
23
- en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
24
- zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
25
- zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
26
- tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
27
- tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
28
-
29
- # load speaker embeddings
30
- en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
31
- en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
32
- zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)
33
-
34
- # This online demo mainly supports English and Chinese
35
- supported_languages = ['zh', 'en']
36
-
37
- def predict(prompt, style, audio_file_pth, agree):
38
- # initialize a empty info
39
- text_hint = ''
40
- # agree with the terms
41
- if agree == False:
42
- text_hint += '[ERROR] Please accept the Terms & Condition!\n'
43
- gr.Warning("Please accept the Terms & Condition!")
44
- return (
45
- text_hint,
46
- None,
47
- None,
48
- )
49
-
50
- # first detect the input language
51
- language_predicted = langid.classify(prompt)[0].strip()
52
- print(f"Detected language:{language_predicted}")
53
-
54
- if language_predicted not in supported_languages:
55
- text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
56
- gr.Warning(
57
- f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
58
- )
59
-
60
- return (
61
- text_hint,
62
- None,
63
- None,
64
- )
65
-
66
- if language_predicted == "zh":
67
- tts_model = zh_base_speaker_tts
68
- source_se = zh_source_se
69
- language = 'Chinese'
70
- if style not in ['default']:
71
- text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
72
- gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
73
- return (
74
- text_hint,
75
- None,
76
- None,
77
- )
78
-
79
- else:
80
- tts_model = en_base_speaker_tts
81
- if style == 'default':
82
- source_se = en_source_default_se
83
- else:
84
- source_se = en_source_style_se
85
- language = 'English'
86
- if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
87
- text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
88
- gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
89
- return (
90
- text_hint,
91
- None,
92
- None,
93
- )
94
-
95
- speaker_wav = audio_file_pth
96
-
97
- if len(prompt) < 2:
98
- text_hint += f"[ERROR] Please give a longer prompt text \n"
99
- gr.Warning("Please give a longer prompt text")
100
- return (
101
- text_hint,
102
- None,
103
- None,
104
- )
105
- if len(prompt) > 200:
106
- text_hint += f"[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
107
- gr.Warning(
108
- "Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage"
109
- )
110
- return (
111
- text_hint,
112
- None,
113
- None,
114
- )
115
-
116
- # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
117
- try:
118
- target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir='processed', vad=True)
119
- except Exception as e:
120
- text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
121
- gr.Warning(
122
- "[ERROR] Get target tone color error {str(e)} \n"
123
- )
124
- return (
125
- text_hint,
126
- None,
127
- None,
128
- )
129
-
130
- src_path = f'{output_dir}/tmp.wav'
131
- tts_model.tts(prompt, src_path, speaker=style, language=language)
132
-
133
- save_path = f'{output_dir}/output.wav'
134
- # Run the tone color converter
135
- encode_message = "@MyShell"
136
- tone_color_converter.convert(
137
- audio_src_path=src_path,
138
- src_se=source_se,
139
- tgt_se=target_se,
140
- output_path=save_path,
141
- message=encode_message)
142
-
143
- text_hint += f'''Get response successfully \n'''
144
-
145
- return (
146
- text_hint,
147
- save_path,
148
- speaker_wav,
149
- )
150
-
151
-
152
-
153
- title = "MyShell OpenVoice"
154
-
155
- description = """
156
- We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
157
- """
158
-
159
- markdown_table = """
160
- <div align="center" style="margin-bottom: 10px;">
161
-
162
- | | | |
163
- | :-----------: | :-----------: | :-----------: |
164
- | **OpenSource Repo** | **Project Page** | **Join the Community** |
165
- | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
166
-
167
- </div>
168
- """
169
-
170
- markdown_table_v2 = """
171
- <div align="center" style="margin-bottom: 2px;">
172
-
173
- | | | | |
174
- | :-----------: | :-----------: | :-----------: | :-----------: |
175
- | **OpenSource Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | **Project Page** | [OpenVoice](https://research.myshell.ai/open-voice) |
176
-
177
- | | |
178
- | :-----------: | :-----------: |
179
- **Join the Community** | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
180
-
181
- </div>
182
- """
183
- content = """
184
- <div>
185
- <strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
186
- This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
187
- </div>
188
- """
189
- wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
190
-
191
-
192
- examples = [
193
- [
194
- "今天天气真好,我们一起出去吃饭吧。",
195
- 'default',
196
- "resources/demo_speaker1.mp3",
197
- True,
198
- ],[
199
- "This audio is generated by open voice with a half-performance model.",
200
- 'whispering',
201
- "resources/demo_speaker2.mp3",
202
- True,
203
- ],
204
- [
205
- "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
206
- 'sad',
207
- "resources/demo_speaker0.mp3",
208
- True,
209
- ],
210
- ]
211
-
212
- with gr.Blocks(analytics_enabled=False) as demo:
213
-
214
- with gr.Row():
215
- with gr.Column():
216
- with gr.Row():
217
- gr.Markdown(
218
- """
219
- ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
220
- """
221
- )
222
- with gr.Row():
223
- gr.Markdown(markdown_table_v2)
224
- with gr.Row():
225
- gr.Markdown(description)
226
- with gr.Column():
227
- gr.Video('https://github.com/myshell-ai/OpenVoice/assets/40556743/3cba936f-82bf-476c-9e52-09f0f417bb2f', autoplay=True)
228
-
229
- with gr.Row():
230
- gr.HTML(wrapped_markdown_content)
231
-
232
- with gr.Row():
233
- with gr.Column():
234
- input_text_gr = gr.Textbox(
235
- label="Text Prompt",
236
- info="One or two sentences at a time is better. Up to 200 text characters.",
237
- value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
238
- )
239
- style_gr = gr.Dropdown(
240
- label="Style",
241
- info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
242
- choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
243
- max_choices=1,
244
- value="default",
245
- )
246
- ref_gr = gr.Audio(
247
- label="Reference Audio",
248
- info="Click on the ✎ button to upload your own target speaker audio",
249
- type="filepath",
250
- value="resources/demo_speaker2.mp3",
251
- )
252
- tos_gr = gr.Checkbox(
253
- label="Agree",
254
- value=False,
255
- info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
256
- )
257
-
258
- tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
259
-
260
-
261
- with gr.Column():
262
- out_text_gr = gr.Text(label="Info")
263
- audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
264
- ref_audio_gr = gr.Audio(label="Reference Audio Used")
265
-
266
- gr.Examples(examples,
267
- label="Examples",
268
- inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
269
- outputs=[out_text_gr, audio_gr, ref_audio_gr],
270
- fn=predict,
271
- cache_examples=False,)
272
- tts_button.click(predict, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])
273
-
274
- demo.queue()
275
- demo.launch(debug=True, show_api=True, share=args.share)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
openvoice/text/__init__.py DELETED
@@ -1,79 +0,0 @@
1
- """ from https://github.com/keithito/tacotron """
2
- from openvoice.text import cleaners
3
- from openvoice.text.symbols import symbols
4
-
5
-
6
- # Mappings from symbol to numeric ID and vice versa:
7
- _symbol_to_id = {s: i for i, s in enumerate(symbols)}
8
- _id_to_symbol = {i: s for i, s in enumerate(symbols)}
9
-
10
-
11
- def text_to_sequence(text, symbols, cleaner_names):
12
- '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
13
- Args:
14
- text: string to convert to a sequence
15
- cleaner_names: names of the cleaner functions to run the text through
16
- Returns:
17
- List of integers corresponding to the symbols in the text
18
- '''
19
- sequence = []
20
- symbol_to_id = {s: i for i, s in enumerate(symbols)}
21
- clean_text = _clean_text(text, cleaner_names)
22
- print(clean_text)
23
- print(f" length:{len(clean_text)}")
24
- for symbol in clean_text:
25
- if symbol not in symbol_to_id.keys():
26
- continue
27
- symbol_id = symbol_to_id[symbol]
28
- sequence += [symbol_id]
29
- print(f" length:{len(sequence)}")
30
- return sequence
31
-
32
-
33
- def cleaned_text_to_sequence(cleaned_text, symbols):
34
- '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
35
- Args:
36
- text: string to convert to a sequence
37
- Returns:
38
- List of integers corresponding to the symbols in the text
39
- '''
40
- symbol_to_id = {s: i for i, s in enumerate(symbols)}
41
- sequence = [symbol_to_id[symbol] for symbol in cleaned_text if symbol in symbol_to_id.keys()]
42
- return sequence
43
-
44
-
45
-
46
- from openvoice.text.symbols import language_tone_start_map
47
- def cleaned_text_to_sequence_vits2(cleaned_text, tones, language, symbols, languages):
48
- """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
49
- Args:
50
- text: string to convert to a sequence
51
- Returns:
52
- List of integers corresponding to the symbols in the text
53
- """
54
- symbol_to_id = {s: i for i, s in enumerate(symbols)}
55
- language_id_map = {s: i for i, s in enumerate(languages)}
56
- phones = [symbol_to_id[symbol] for symbol in cleaned_text]
57
- tone_start = language_tone_start_map[language]
58
- tones = [i + tone_start for i in tones]
59
- lang_id = language_id_map[language]
60
- lang_ids = [lang_id for i in phones]
61
- return phones, tones, lang_ids
62
-
63
-
64
- def sequence_to_text(sequence):
65
- '''Converts a sequence of IDs back to a string'''
66
- result = ''
67
- for symbol_id in sequence:
68
- s = _id_to_symbol[symbol_id]
69
- result += s
70
- return result
71
-
72
-
73
- def _clean_text(text, cleaner_names):
74
- for name in cleaner_names:
75
- cleaner = getattr(cleaners, name)
76
- if not cleaner:
77
- raise Exception('Unknown cleaner: %s' % name)
78
- text = cleaner(text)
79
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
openvoice/text/cleaners.py DELETED
@@ -1,16 +0,0 @@
1
- import re
2
- from openvoice.text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
3
- from openvoice.text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
4
-
5
- def cjke_cleaners2(text):
6
- text = re.sub(r'\[ZH\](.*?)\[ZH\]',
7
- lambda x: chinese_to_ipa(x.group(1))+' ', text)
8
- text = re.sub(r'\[JA\](.*?)\[JA\]',
9
- lambda x: japanese_to_ipa2(x.group(1))+' ', text)
10
- text = re.sub(r'\[KO\](.*?)\[KO\]',
11
- lambda x: korean_to_ipa(x.group(1))+' ', text)
12
- text = re.sub(r'\[EN\](.*?)\[EN\]',
13
- lambda x: english_to_ipa2(x.group(1))+' ', text)
14
- text = re.sub(r'\s+$', '', text)
15
- text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
16
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
openvoice/text/english.py DELETED
@@ -1,188 +0,0 @@
1
- """ from https://github.com/keithito/tacotron """
2
-
3
- '''
4
- Cleaners are transformations that run over the input text at both training and eval time.
5
-
6
- Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7
- hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8
- 1. "english_cleaners" for English text
9
- 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10
- the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11
- 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12
- the symbols in symbols.py to match your data).
13
- '''
14
-
15
-
16
- # Regular expression matching whitespace:
17
-
18
-
19
- import re
20
- import inflect
21
- from unidecode import unidecode
22
- import eng_to_ipa as ipa
23
- _inflect = inflect.engine()
24
- _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
25
- _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
26
- _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
27
- _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
28
- _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
29
- _number_re = re.compile(r'[0-9]+')
30
-
31
- # List of (regular expression, replacement) pairs for abbreviations:
32
- _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
33
- ('mrs', 'misess'),
34
- ('mr', 'mister'),
35
- ('dr', 'doctor'),
36
- ('st', 'saint'),
37
- ('co', 'company'),
38
- ('jr', 'junior'),
39
- ('maj', 'major'),
40
- ('gen', 'general'),
41
- ('drs', 'doctors'),
42
- ('rev', 'reverend'),
43
- ('lt', 'lieutenant'),
44
- ('hon', 'honorable'),
45
- ('sgt', 'sergeant'),
46
- ('capt', 'captain'),
47
- ('esq', 'esquire'),
48
- ('ltd', 'limited'),
49
- ('col', 'colonel'),
50
- ('ft', 'fort'),
51
- ]]
52
-
53
-
54
- # List of (ipa, lazy ipa) pairs:
55
- _lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
56
- ('r', 'ɹ'),
57
- ('æ', 'e'),
58
- ('ɑ', 'a'),
59
- ('ɔ', 'o'),
60
- ('ð', 'z'),
61
- ('θ', 's'),
62
- ('ɛ', 'e'),
63
- ('ɪ', 'i'),
64
- ('ʊ', 'u'),
65
- ('ʒ', 'ʥ'),
66
- ('ʤ', 'ʥ'),
67
- ('ˈ', '↓'),
68
- ]]
69
-
70
- # List of (ipa, lazy ipa2) pairs:
71
- _lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
72
- ('r', 'ɹ'),
73
- ('ð', 'z'),
74
- ('θ', 's'),
75
- ('ʒ', 'ʑ'),
76
- ('ʤ', 'dʑ'),
77
- ('ˈ', '↓'),
78
- ]]
79
-
80
- # List of (ipa, ipa2) pairs
81
- _ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
82
- ('r', 'ɹ'),
83
- ('ʤ', 'dʒ'),
84
- ('ʧ', 'tʃ')
85
- ]]
86
-
87
-
88
- def expand_abbreviations(text):
89
- for regex, replacement in _abbreviations:
90
- text = re.sub(regex, replacement, text)
91
- return text
92
-
93
-
94
- def collapse_whitespace(text):
95
- return re.sub(r'\s+', ' ', text)
96
-
97
-
98
- def _remove_commas(m):
99
- return m.group(1).replace(',', '')
100
-
101
-
102
- def _expand_decimal_point(m):
103
- return m.group(1).replace('.', ' point ')
104
-
105
-
106
- def _expand_dollars(m):
107
- match = m.group(1)
108
- parts = match.split('.')
109
- if len(parts) > 2:
110
- return match + ' dollars' # Unexpected format
111
- dollars = int(parts[0]) if parts[0] else 0
112
- cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
113
- if dollars and cents:
114
- dollar_unit = 'dollar' if dollars == 1 else 'dollars'
115
- cent_unit = 'cent' if cents == 1 else 'cents'
116
- return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
117
- elif dollars:
118
- dollar_unit = 'dollar' if dollars == 1 else 'dollars'
119
- return '%s %s' % (dollars, dollar_unit)
120
- elif cents:
121
- cent_unit = 'cent' if cents == 1 else 'cents'
122
- return '%s %s' % (cents, cent_unit)
123
- else:
124
- return 'zero dollars'
125
-
126
-
127
- def _expand_ordinal(m):
128
- return _inflect.number_to_words(m.group(0))
129
-
130
-
131
- def _expand_number(m):
132
- num = int(m.group(0))
133
- if num > 1000 and num < 3000:
134
- if num == 2000:
135
- return 'two thousand'
136
- elif num > 2000 and num < 2010:
137
- return 'two thousand ' + _inflect.number_to_words(num % 100)
138
- elif num % 100 == 0:
139
- return _inflect.number_to_words(num // 100) + ' hundred'
140
- else:
141
- return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
142
- else:
143
- return _inflect.number_to_words(num, andword='')
144
-
145
-
146
- def normalize_numbers(text):
147
- text = re.sub(_comma_number_re, _remove_commas, text)
148
- text = re.sub(_pounds_re, r'\1 pounds', text)
149
- text = re.sub(_dollars_re, _expand_dollars, text)
150
- text = re.sub(_decimal_number_re, _expand_decimal_point, text)
151
- text = re.sub(_ordinal_re, _expand_ordinal, text)
152
- text = re.sub(_number_re, _expand_number, text)
153
- return text
154
-
155
-
156
- def mark_dark_l(text):
157
- return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text)
158
-
159
-
160
- def english_to_ipa(text):
161
- text = unidecode(text).lower()
162
- text = expand_abbreviations(text)
163
- text = normalize_numbers(text)
164
- phonemes = ipa.convert(text)
165
- phonemes = collapse_whitespace(phonemes)
166
- return phonemes
167
-
168
-
169
- def english_to_lazy_ipa(text):
170
- text = english_to_ipa(text)
171
- for regex, replacement in _lazy_ipa:
172
- text = re.sub(regex, replacement, text)
173
- return text
174
-
175
-
176
- def english_to_ipa2(text):
177
- text = english_to_ipa(text)
178
- text = mark_dark_l(text)
179
- for regex, replacement in _ipa_to_ipa2:
180
- text = re.sub(regex, replacement, text)
181
- return text.replace('...', '…')
182
-
183
-
184
- def english_to_lazy_ipa2(text):
185
- text = english_to_ipa(text)
186
- for regex, replacement in _lazy_ipa2:
187
- text = re.sub(regex, replacement, text)
188
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
openvoice/text/mandarin.py DELETED
@@ -1,326 +0,0 @@
1
- import os
2
- import sys
3
- import re
4
- from pypinyin import lazy_pinyin, BOPOMOFO
5
- import jieba
6
- import cn2an
7
- import logging
8
-
9
-
10
- # List of (Latin alphabet, bopomofo) pairs:
11
- _latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
12
- ('a', 'ㄟˉ'),
13
- ('b', 'ㄅㄧˋ'),
14
- ('c', 'ㄙㄧˉ'),
15
- ('d', 'ㄉㄧˋ'),
16
- ('e', 'ㄧˋ'),
17
- ('f', 'ㄝˊㄈㄨˋ'),
18
- ('g', 'ㄐㄧˋ'),
19
- ('h', 'ㄝˇㄑㄩˋ'),
20
- ('i', 'ㄞˋ'),
21
- ('j', 'ㄐㄟˋ'),
22
- ('k', 'ㄎㄟˋ'),
23
- ('l', 'ㄝˊㄛˋ'),
24
- ('m', 'ㄝˊㄇㄨˋ'),
25
- ('n', 'ㄣˉ'),
26
- ('o', 'ㄡˉ'),
27
- ('p', 'ㄆㄧˉ'),
28
- ('q', 'ㄎㄧㄡˉ'),
29
- ('r', 'ㄚˋ'),
30
- ('s', 'ㄝˊㄙˋ'),
31
- ('t', 'ㄊㄧˋ'),
32
- ('u', 'ㄧㄡˉ'),
33
- ('v', 'ㄨㄧˉ'),
34
- ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
35
- ('x', 'ㄝˉㄎㄨˋㄙˋ'),
36
- ('y', 'ㄨㄞˋ'),
37
- ('z', 'ㄗㄟˋ')
38
- ]]
39
-
40
- # List of (bopomofo, romaji) pairs:
41
- _bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [
42
- ('ㄅㄛ', 'p⁼wo'),
43
- ('ㄆㄛ', 'pʰwo'),
44
- ('ㄇㄛ', 'mwo'),
45
- ('ㄈㄛ', 'fwo'),
46
- ('ㄅ', 'p⁼'),
47
- ('ㄆ', 'pʰ'),
48
- ('ㄇ', 'm'),
49
- ('ㄈ', 'f'),
50
- ('ㄉ', 't⁼'),
51
- ('ㄊ', 'tʰ'),
52
- ('ㄋ', 'n'),
53
- ('ㄌ', 'l'),
54
- ('ㄍ', 'k⁼'),
55
- ('ㄎ', 'kʰ'),
56
- ('ㄏ', 'h'),
57
- ('ㄐ', 'ʧ⁼'),
58
- ('ㄑ', 'ʧʰ'),
59
- ('ㄒ', 'ʃ'),
60
- ('ㄓ', 'ʦ`⁼'),
61
- ('ㄔ', 'ʦ`ʰ'),
62
- ('ㄕ', 's`'),
63
- ('ㄖ', 'ɹ`'),
64
- ('ㄗ', 'ʦ⁼'),
65
- ('ㄘ', 'ʦʰ'),
66
- ('ㄙ', 's'),
67
- ('ㄚ', 'a'),
68
- ('ㄛ', 'o'),
69
- ('ㄜ', 'ə'),
70
- ('ㄝ', 'e'),
71
- ('ㄞ', 'ai'),
72
- ('ㄟ', 'ei'),
73
- ('ㄠ', 'au'),
74
- ('ㄡ', 'ou'),
75
- ('ㄧㄢ', 'yeNN'),
76
- ('ㄢ', 'aNN'),
77
- ('ㄧㄣ', 'iNN'),
78
- ('ㄣ', 'əNN'),
79
- ('ㄤ', 'aNg'),
80
- ('ㄧㄥ', 'iNg'),
81
- ('ㄨㄥ', 'uNg'),
82
- ('ㄩㄥ', 'yuNg'),
83
- ('ㄥ', 'əNg'),
84
- ('ㄦ', 'əɻ'),
85
- ('ㄧ', 'i'),
86
- ('ㄨ', 'u'),
87
- ('ㄩ', 'ɥ'),
88
- ('ˉ', '→'),
89
- ('ˊ', '↑'),
90
- ('ˇ', '↓↑'),
91
- ('ˋ', '↓'),
92
- ('˙', ''),
93
- (',', ','),
94
- ('。', '.'),
95
- ('!', '!'),
96
- ('?', '?'),
97
- ('—', '-')
98
- ]]
99
-
100
- # List of (romaji, ipa) pairs:
101
- _romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
102
- ('ʃy', 'ʃ'),
103
- ('ʧʰy', 'ʧʰ'),
104
- ('ʧ⁼y', 'ʧ⁼'),
105
- ('NN', 'n'),
106
- ('Ng', 'ŋ'),
107
- ('y', 'j'),
108
- ('h', 'x')
109
- ]]
110
-
111
- # List of (bopomofo, ipa) pairs:
112
- _bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
113
- ('ㄅㄛ', 'p⁼wo'),
114
- ('ㄆㄛ', 'pʰwo'),
115
- ('ㄇㄛ', 'mwo'),
116
- ('ㄈㄛ', 'fwo'),
117
- ('ㄅ', 'p⁼'),
118
- ('ㄆ', 'pʰ'),
119
- ('ㄇ', 'm'),
120
- ('ㄈ', 'f'),
121
- ('ㄉ', 't⁼'),
122
- ('ㄊ', 'tʰ'),
123
- ('ㄋ', 'n'),
124
- ('ㄌ', 'l'),
125
- ('ㄍ', 'k⁼'),
126
- ('ㄎ', 'kʰ'),
127
- ('ㄏ', 'x'),
128
- ('ㄐ', 'tʃ⁼'),
129
- ('ㄑ', 'tʃʰ'),
130
- ('ㄒ', 'ʃ'),
131
- ('ㄓ', 'ts`⁼'),
132
- ('ㄔ', 'ts`ʰ'),
133
- ('ㄕ', 's`'),
134
- ('ㄖ', 'ɹ`'),
135
- ('ㄗ', 'ts⁼'),
136
- ('ㄘ', 'tsʰ'),
137
- ('ㄙ', 's'),
138
- ('ㄚ', 'a'),
139
- ('ㄛ', 'o'),
140
- ('ㄜ', 'ə'),
141
- ('ㄝ', 'ɛ'),
142
- ('ㄞ', 'aɪ'),
143
- ('ㄟ', 'eɪ'),
144
- ('ㄠ', 'ɑʊ'),
145
- ('ㄡ', 'oʊ'),
146
- ('ㄧㄢ', 'jɛn'),
147
- ('ㄩㄢ', 'ɥæn'),
148
- ('ㄢ', 'an'),
149
- ('ㄧㄣ', 'in'),
150
- ('ㄩㄣ', 'ɥn'),
151
- ('ㄣ', 'ən'),
152
- ('ㄤ', 'ɑŋ'),
153
- ('ㄧㄥ', 'iŋ'),
154
- ('ㄨㄥ', 'ʊŋ'),
155
- ('ㄩㄥ', 'jʊŋ'),
156
- ('ㄥ', 'əŋ'),
157
- ('ㄦ', 'əɻ'),
158
- ('ㄧ', 'i'),
159
- ('ㄨ', 'u'),
160
- ('ㄩ', 'ɥ'),
161
- ('ˉ', '→'),
162
- ('ˊ', '↑'),
163
- ('ˇ', '↓↑'),
164
- ('ˋ', '↓'),
165
- ('˙', ''),
166
- (',', ','),
167
- ('。', '.'),
168
- ('!', '!'),
169
- ('?', '?'),
170
- ('—', '-')
171
- ]]
172
-
173
- # List of (bopomofo, ipa2) pairs:
174
- _bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
175
- ('ㄅㄛ', 'pwo'),
176
- ('ㄆㄛ', 'pʰwo'),
177
- ('ㄇㄛ', 'mwo'),
178
- ('ㄈㄛ', 'fwo'),
179
- ('ㄅ', 'p'),
180
- ('ㄆ', 'pʰ'),
181
- ('ㄇ', 'm'),
182
- ('ㄈ', 'f'),
183
- ('ㄉ', 't'),
184
- ('ㄊ', 'tʰ'),
185
- ('ㄋ', 'n'),
186
- ('ㄌ', 'l'),
187
- ('ㄍ', 'k'),
188
- ('ㄎ', 'kʰ'),
189
- ('ㄏ', 'h'),
190
- ('ㄐ', 'tɕ'),
191
- ('ㄑ', 'tɕʰ'),
192
- ('ㄒ', 'ɕ'),
193
- ('ㄓ', 'tʂ'),
194
- ('ㄔ', 'tʂʰ'),
195
- ('ㄕ', 'ʂ'),
196
- ('ㄖ', 'ɻ'),
197
- ('ㄗ', 'ts'),
198
- ('ㄘ', 'tsʰ'),
199
- ('ㄙ', 's'),
200
- ('ㄚ', 'a'),
201
- ('ㄛ', 'o'),
202
- ('ㄜ', 'ɤ'),
203
- ('ㄝ', 'ɛ'),
204
- ('ㄞ', 'aɪ'),
205
- ('ㄟ', 'eɪ'),
206
- ('ㄠ', 'ɑʊ'),
207
- ('ㄡ', 'oʊ'),
208
- ('ㄧㄢ', 'jɛn'),
209
- ('ㄩㄢ', 'yæn'),
210
- ('ㄢ', 'an'),
211
- ('ㄧㄣ', 'in'),
212
- ('ㄩㄣ', 'yn'),
213
- ('ㄣ', 'ən'),
214
- ('ㄤ', 'ɑŋ'),
215
- ('ㄧㄥ', 'iŋ'),
216
- ('ㄨㄥ', 'ʊŋ'),
217
- ('ㄩㄥ', 'jʊŋ'),
218
- ('ㄥ', 'ɤŋ'),
219
- ('ㄦ', 'əɻ'),
220
- ('ㄧ', 'i'),
221
- ('ㄨ', 'u'),
222
- ('ㄩ', 'y'),
223
- ('ˉ', '˥'),
224
- ('ˊ', '˧˥'),
225
- ('ˇ', '˨˩˦'),
226
- ('ˋ', '˥˩'),
227
- ('˙', ''),
228
- (',', ','),
229
- ('。', '.'),
230
- ('!', '!'),
231
- ('?', '?'),
232
- ('—', '-')
233
- ]]
234
-
235
-
236
- def number_to_chinese(text):
237
- numbers = re.findall(r'\d+(?:\.?\d+)?', text)
238
- for number in numbers:
239
- text = text.replace(number, cn2an.an2cn(number), 1)
240
- return text
241
-
242
-
243
- def chinese_to_bopomofo(text):
244
- text = text.replace('、', ',').replace(';', ',').replace(':', ',')
245
- words = jieba.lcut(text, cut_all=False)
246
- text = ''
247
- for word in words:
248
- bopomofos = lazy_pinyin(word, BOPOMOFO)
249
- if not re.search('[\u4e00-\u9fff]', word):
250
- text += word
251
- continue
252
- for i in range(len(bopomofos)):
253
- bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
254
- if text != '':
255
- text += ' '
256
- text += ''.join(bopomofos)
257
- return text
258
-
259
-
260
- def latin_to_bopomofo(text):
261
- for regex, replacement in _latin_to_bopomofo:
262
- text = re.sub(regex, replacement, text)
263
- return text
264
-
265
-
266
- def bopomofo_to_romaji(text):
267
- for regex, replacement in _bopomofo_to_romaji:
268
- text = re.sub(regex, replacement, text)
269
- return text
270
-
271
-
272
- def bopomofo_to_ipa(text):
273
- for regex, replacement in _bopomofo_to_ipa:
274
- text = re.sub(regex, replacement, text)
275
- return text
276
-
277
-
278
- def bopomofo_to_ipa2(text):
279
- for regex, replacement in _bopomofo_to_ipa2:
280
- text = re.sub(regex, replacement, text)
281
- return text
282
-
283
-
284
- def chinese_to_romaji(text):
285
- text = number_to_chinese(text)
286
- text = chinese_to_bopomofo(text)
287
- text = latin_to_bopomofo(text)
288
- text = bopomofo_to_romaji(text)
289
- text = re.sub('i([aoe])', r'y\1', text)
290
- text = re.sub('u([aoəe])', r'w\1', text)
291
- text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
292
- r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
293
- text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
294
- return text
295
-
296
-
297
- def chinese_to_lazy_ipa(text):
298
- text = chinese_to_romaji(text)
299
- for regex, replacement in _romaji_to_ipa:
300
- text = re.sub(regex, replacement, text)
301
- return text
302
-
303
-
304
- def chinese_to_ipa(text):
305
- text = number_to_chinese(text)
306
- text = chinese_to_bopomofo(text)
307
- text = latin_to_bopomofo(text)
308
- text = bopomofo_to_ipa(text)
309
- text = re.sub('i([aoe])', r'j\1', text)
310
- text = re.sub('u([aoəe])', r'w\1', text)
311
- text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
312
- r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
313
- text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
314
- return text
315
-
316
-
317
- def chinese_to_ipa2(text):
318
- text = number_to_chinese(text)
319
- text = chinese_to_bopomofo(text)
320
- text = latin_to_bopomofo(text)
321
- text = bopomofo_to_ipa2(text)
322
- text = re.sub(r'i([aoe])', r'j\1', text)
323
- text = re.sub(r'u([aoəe])', r'w\1', text)
324
- text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text)
325
- text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text)
326
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
openvoice/text/symbols.py DELETED
@@ -1,88 +0,0 @@
1
- '''
2
- Defines the set of symbols used in text input to the model.
3
- '''
4
-
5
- # japanese_cleaners
6
- # _pad = '_'
7
- # _punctuation = ',.!?-'
8
- # _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧ↓↑ '
9
-
10
-
11
- '''# japanese_cleaners2
12
- _pad = '_'
13
- _punctuation = ',.!?-~…'
14
- _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
15
- '''
16
-
17
-
18
- '''# korean_cleaners
19
- _pad = '_'
20
- _punctuation = ',.!?…~'
21
- _letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
22
- '''
23
-
24
- '''# chinese_cleaners
25
- _pad = '_'
26
- _punctuation = ',。!?—…'
27
- _letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
28
- '''
29
-
30
- # # zh_ja_mixture_cleaners
31
- # _pad = '_'
32
- # _punctuation = ',.!?-~…'
33
- # _letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '
34
-
35
-
36
- '''# sanskrit_cleaners
37
- _pad = '_'
38
- _punctuation = '।'
39
- _letters = 'ँंःअआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहऽािीुूृॄेैोौ्ॠॢ '
40
- '''
41
-
42
- '''# cjks_cleaners
43
- _pad = '_'
44
- _punctuation = ',.!?-~…'
45
- _letters = 'NQabdefghijklmnopstuvwxyzʃʧʥʦɯɹəɥçɸɾβŋɦː⁼ʰ`^#*=→↓↑ '
46
- '''
47
-
48
- '''# thai_cleaners
49
- _pad = '_'
50
- _punctuation = '.!? '
51
- _letters = 'กขฃคฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลวศษสหฬอฮฯะัาำิีึืุูเแโใไๅๆ็่้๊๋์'
52
- '''
53
-
54
- # # cjke_cleaners2
55
- _pad = '_'
56
- _punctuation = ',.!?-~…'
57
- _letters = 'NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ '
58
-
59
-
60
- '''# shanghainese_cleaners
61
- _pad = '_'
62
- _punctuation = ',.!?…'
63
- _letters = 'abdfghiklmnopstuvyzøŋȵɑɔɕəɤɦɪɿʑʔʰ̩̃ᴀᴇ15678 '
64
- '''
65
-
66
- '''# chinese_dialect_cleaners
67
- _pad = '_'
68
- _punctuation = ',.!?~…─'
69
- _letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚ᴀᴇ↑↓∅ⱼ '
70
- '''
71
-
72
- # Export all symbols:
73
- symbols = [_pad] + list(_punctuation) + list(_letters)
74
-
75
- # Special symbol ids
76
- SPACE_ID = symbols.index(" ")
77
-
78
- num_ja_tones = 1
79
- num_kr_tones = 1
80
- num_zh_tones = 6
81
- num_en_tones = 4
82
-
83
- language_tone_start_map = {
84
- "ZH": 0,
85
- "JP": num_zh_tones,
86
- "EN": num_zh_tones + num_ja_tones,
87
- 'KR': num_zh_tones + num_ja_tones + num_en_tones,
88
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,16 +1,8 @@
1
- langid
2
  librosa==0.9.1
3
  faster-whisper==0.9.0
4
  pydub==0.25.1
5
- wavmark==0.0.2
6
  numpy==1.22.0
7
- eng_to_ipa==0.0.2
8
- inflect==7.0.0
9
- unidecode==1.3.7
10
  whisper-timestamped==1.14.2
11
  openai
12
- python-dotenv
13
- pypinyin==0.50.0
14
- cn2an==0.5.22
15
- jieba==0.42.1
16
- torch
 
 
1
  librosa==0.9.1
2
  faster-whisper==0.9.0
3
  pydub==0.25.1
 
4
  numpy==1.22.0
 
 
 
5
  whisper-timestamped==1.14.2
6
  openai
7
+ torch
8
+ torchaudio