Azul Alysum committed on
Commit
99ed4b2
·
1 Parent(s): 48f1945

Remove unnecessary functions

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. app.py +100 -227
  3. client.py +5 -0
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 😊🎙️
4
  colorFrom: red
5
  colorTo: pink
6
  sdk: gradio
7
- sdk_version: 3.16.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
4
  colorFrom: red
5
  colorTo: pink
6
  sdk: gradio
7
+ sdk_version: 3.36.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
app.py CHANGED
@@ -13,6 +13,7 @@ import commons
13
  import utils
14
  import gradio as gr
15
  import gradio.utils as gr_utils
 
16
  import gradio.processing_utils as gr_processing_utils
17
  from models import SynthesizerTrn
18
  from text import text_to_sequence, _clean_text
@@ -27,7 +28,10 @@ def audio_postprocess(self, y):
27
  data = audio_postprocess_ori(self, y)
28
  if data is None:
29
  return None
30
- return gr_processing_utils.encode_url_or_file_to_base64(data["name"])
 
 
 
31
 
32
 
33
  gr.Audio.postprocess = audio_postprocess
@@ -41,92 +45,24 @@ def get_text(text, hps, is_symbol):
41
  return text_norm
42
 
43
 
44
- def create_tts_fn(model, hps, speaker_ids):
45
- def tts_fn(text, speaker, speed, is_symbol):
46
- if limitation:
47
- text_len = len(re.sub("\[([A-Z]{2})\]", "", text))
48
- max_len = 150
49
- if is_symbol:
50
- max_len *= 3
51
- if text_len > max_len:
52
- return "Error: Text is too long", None
53
 
54
- speaker_id = speaker_ids[speaker]
55
- stn_tst = get_text(text, hps, is_symbol)
56
- with no_grad():
57
- x_tst = stn_tst.unsqueeze(0).to(device)
58
- x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
59
- sid = LongTensor([speaker_id]).to(device)
60
- audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
61
- length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
62
- del stn_tst, x_tst, x_tst_lengths, sid
63
- return "Success", (hps.data.sampling_rate, audio)
64
-
65
- return tts_fn
66
-
67
-
68
- def create_vc_fn(model, hps, speaker_ids):
69
- def vc_fn(original_speaker, target_speaker, input_audio):
70
- if input_audio is None:
71
- return "You need to upload an audio", None
72
- sampling_rate, audio = input_audio
73
- duration = audio.shape[0] / sampling_rate
74
- if limitation and duration > 30:
75
- return "Error: Audio is too long", None
76
- original_speaker_id = speaker_ids[original_speaker]
77
- target_speaker_id = speaker_ids[target_speaker]
78
-
79
- audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
80
- if len(audio.shape) > 1:
81
- audio = librosa.to_mono(audio.transpose(1, 0))
82
- if sampling_rate != hps.data.sampling_rate:
83
- audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
84
- with no_grad():
85
- y = torch.FloatTensor(audio)
86
- y = y.unsqueeze(0)
87
- spec = spectrogram_torch(y, hps.data.filter_length,
88
- hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
89
- center=False).to(device)
90
- spec_lengths = LongTensor([spec.size(-1)]).to(device)
91
- sid_src = LongTensor([original_speaker_id]).to(device)
92
- sid_tgt = LongTensor([target_speaker_id]).to(device)
93
- audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
94
- 0, 0].data.cpu().float().numpy()
95
- del y, spec, spec_lengths, sid_src, sid_tgt
96
- return "Success", (hps.data.sampling_rate, audio)
97
-
98
- return vc_fn
99
-
100
-
101
- def create_soft_vc_fn(model, hps, speaker_ids):
102
- def soft_vc_fn(target_speaker, input_audio1, input_audio2):
103
- input_audio = input_audio1
104
- if input_audio is None:
105
- input_audio = input_audio2
106
- if input_audio is None:
107
- return "You need to upload an audio", None
108
- sampling_rate, audio = input_audio
109
- duration = audio.shape[0] / sampling_rate
110
- if limitation and duration > 30:
111
- return "Error: Audio is too long", None
112
- target_speaker_id = speaker_ids[target_speaker]
113
-
114
- audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
115
- if len(audio.shape) > 1:
116
- audio = librosa.to_mono(audio.transpose(1, 0))
117
- if sampling_rate != 16000:
118
- audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
119
- with torch.inference_mode():
120
- units = hubert.units(torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0).to(device))
121
- with no_grad():
122
- unit_lengths = LongTensor([units.size(1)]).to(device)
123
- sid = LongTensor([target_speaker_id]).to(device)
124
- audio = model.infer(units, unit_lengths, sid=sid, noise_scale=.667,
125
- noise_scale_w=0.8)[0][0, 0].data.cpu().float().numpy()
126
- del units, unit_lengths, sid
127
- return "Success", (hps.data.sampling_rate, audio)
128
-
129
- return soft_vc_fn
130
 
131
 
132
  def create_to_symbol_fn(hps):
@@ -156,6 +92,7 @@ download_audio_js = """
156
  """
157
 
158
  if __name__ == '__main__':
 
159
  parser = argparse.ArgumentParser()
160
  parser.add_argument('--device', type=str, default='cpu')
161
  parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
@@ -163,158 +100,94 @@ if __name__ == '__main__':
163
 
164
  device = torch.device(args.device)
165
  models_tts = []
166
- models_vc = []
167
  models_soft_vc = []
168
  with open("saved_model/info.json", "r", encoding="utf-8") as f:
169
  models_info = json.load(f)
170
- for i, info in models_info.items():
171
- name = info["title"]
172
- author = info["author"]
173
- lang = info["lang"]
174
- example = info["example"]
175
- config_path = f"saved_model/{i}/config.json"
176
- model_path = f"saved_model/{i}/model.pth"
177
- cover = info["cover"]
178
- cover_path = f"saved_model/{i}/{cover}" if cover else None
179
- hps = utils.get_hparams_from_file(config_path)
180
- model = SynthesizerTrn(
181
- len(hps.symbols),
182
- hps.data.filter_length // 2 + 1,
183
- hps.train.segment_size // hps.data.hop_length,
184
- n_speakers=hps.data.n_speakers,
185
- **hps.model)
186
- utils.load_checkpoint(model_path, model, None)
187
- model.eval().to(device)
188
- speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
189
- speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
190
-
191
- t = info["type"]
192
- if t == "vits":
193
- models_tts.append((name, author, cover_path, speakers, lang, example,
194
- hps.symbols, create_tts_fn(model, hps, speaker_ids),
195
- create_to_symbol_fn(hps)))
196
- models_vc.append((name, author, cover_path, speakers, create_vc_fn(model, hps, speaker_ids)))
197
- elif t == "soft-vits-vc":
198
- models_soft_vc.append((name, author, cover_path, speakers, create_soft_vc_fn(model, hps, speaker_ids)))
199
 
200
  hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).to(device)
201
 
202
  app = gr.Blocks()
203
 
204
  with app:
205
- gr.Markdown("# Moe TTS And Voice Conversion Using VITS Model\n\n"
206
- "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=skytnt.moegoe)\n\n"
207
- "[Open In Colab]"
208
- "(https://colab.research.google.com/drive/14Pb8lpmwZL-JI5Ub6jpG4sz2-8KS0kbS?usp=sharing)"
209
- " without queue and length limitation.\n\n"
210
- "Feel free to [open discussion](https://huggingface.co/spaces/skytnt/moe-tts/discussions/new) "
211
- "if you want to add your model to this app.")
212
  with gr.Tabs():
213
- with gr.TabItem("TTS"):
214
- with gr.Tabs():
215
- for i, (name, author, cover_path, speakers, lang, example, symbols, tts_fn,
216
- to_symbol_fn) in enumerate(models_tts):
217
- with gr.TabItem(f"model{i}"):
218
- with gr.Column():
219
- cover_markdown = f"![cover](file/{cover_path})\n\n" if cover_path else ""
220
- gr.Markdown(f"## {name}\n\n"
221
- f"{cover_markdown}"
222
- f"model author: {author}\n\n"
223
- f"language: {lang}")
224
- tts_input1 = gr.TextArea(label="Text (150 words limitation)", value=example,
225
- elem_id=f"tts-input{i}")
226
- tts_input2 = gr.Dropdown(label="Speaker", choices=speakers,
227
- type="index", value=speakers[0])
228
- tts_input3 = gr.Slider(label="Speed", value=1, minimum=0.5, maximum=2, step=0.1)
229
- with gr.Accordion(label="Advanced Options", open=False):
230
- temp_text_var = gr.Variable()
231
- symbol_input = gr.Checkbox(value=False, label="Symbol input")
232
- symbol_list = gr.Dataset(label="Symbol list", components=[tts_input1],
233
- samples=[[x] for x in symbols],
234
- elem_id=f"symbol-list{i}")
235
- symbol_list_json = gr.Json(value=symbols, visible=False)
236
- tts_submit = gr.Button("Generate", variant="primary")
237
- tts_output1 = gr.Textbox(label="Output Message")
238
- tts_output2 = gr.Audio(label="Output Audio", elem_id=f"tts-audio{i}")
239
- download = gr.Button("Download Audio")
240
- download.click(None, [], [], _js=download_audio_js.format(audio_id=f"tts-audio{i}"))
241
-
242
- tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, symbol_input],
243
- [tts_output1, tts_output2])
244
- symbol_input.change(to_symbol_fn,
245
- [symbol_input, tts_input1, temp_text_var],
246
- [tts_input1, temp_text_var])
247
- symbol_list.click(None, [symbol_list, symbol_list_json], [],
248
- _js=f"""
249
- (i,symbols) => {{
250
- let root = document.querySelector("body > gradio-app");
251
- if (root.shadowRoot != null)
252
- root = root.shadowRoot;
253
- let text_input = root.querySelector("#tts-input{i}").querySelector("textarea");
254
- let startPos = text_input.selectionStart;
255
- let endPos = text_input.selectionEnd;
256
- let oldTxt = text_input.value;
257
- let result = oldTxt.substring(0, startPos) + symbols[i] + oldTxt.substring(endPos);
258
- text_input.value = result;
259
- let x = window.scrollX, y = window.scrollY;
260
- text_input.focus();
261
- text_input.selectionStart = startPos + symbols[i].length;
262
- text_input.selectionEnd = startPos + symbols[i].length;
263
- text_input.blur();
264
- window.scrollTo(x, y);
265
- return [];
266
- }}""")
267
-
268
- with gr.TabItem("Voice Conversion"):
269
- with gr.Tabs():
270
- for i, (name, author, cover_path, speakers, vc_fn) in enumerate(models_vc):
271
- with gr.TabItem(f"model{i}"):
272
- cover_markdown = f"![cover](file/{cover_path})\n\n" if cover_path else ""
273
- gr.Markdown(f"## {name}\n\n"
274
- f"{cover_markdown}"
275
- f"model author: {author}")
276
- vc_input1 = gr.Dropdown(label="Original Speaker", choices=speakers, type="index",
277
- value=speakers[0])
278
- vc_input2 = gr.Dropdown(label="Target Speaker", choices=speakers, type="index",
279
- value=speakers[min(len(speakers) - 1, 1)])
280
- vc_input3 = gr.Audio(label="Input Audio (30s limitation)")
281
- vc_submit = gr.Button("Convert", variant="primary")
282
- vc_output1 = gr.Textbox(label="Output Message")
283
- vc_output2 = gr.Audio(label="Output Audio", elem_id=f"vc-audio{i}")
284
- download = gr.Button("Download Audio")
285
- download.click(None, [], [], _js=download_audio_js.format(audio_id=f"vc-audio{i}"))
286
- vc_submit.click(vc_fn, [vc_input1, vc_input2, vc_input3], [vc_output1, vc_output2])
287
- with gr.TabItem("Soft Voice Conversion"):
288
- with gr.Tabs():
289
- for i, (name, author, cover_path, speakers, soft_vc_fn) in enumerate(models_soft_vc):
290
- with gr.TabItem(f"model{i}"):
291
- cover_markdown = f"![cover](file/{cover_path})\n\n" if cover_path else ""
292
- gr.Markdown(f"## {name}\n\n"
293
- f"{cover_markdown}"
294
- f"model author: {author}")
295
- vc_input1 = gr.Dropdown(label="Target Speaker", choices=speakers, type="index",
296
- value=speakers[0])
297
- source_tabs = gr.Tabs()
298
- with source_tabs:
299
- with gr.TabItem("microphone"):
300
- vc_input2 = gr.Audio(label="Input Audio (30s limitation)", source="microphone")
301
- with gr.TabItem("upload"):
302
- vc_input3 = gr.Audio(label="Input Audio (30s limitation)", source="upload")
303
- vc_submit = gr.Button("Convert", variant="primary")
304
- vc_output1 = gr.Textbox(label="Output Message")
305
- vc_output2 = gr.Audio(label="Output Audio", elem_id=f"svc-audio{i}")
306
- download = gr.Button("Download Audio")
307
- download.click(None, [], [], _js=download_audio_js.format(audio_id=f"svc-audio{i}"))
308
- # clear inputs
309
- source_tabs.set_event_trigger("change", None, [], [vc_input2, vc_input3],
310
- js="()=>[null,null]")
311
- vc_submit.click(soft_vc_fn, [vc_input1, vc_input2, vc_input3],
312
- [vc_output1, vc_output2])
313
  gr.Markdown(
314
- "unofficial demo for \n\n"
315
  "- [https://github.com/CjangCjengh/MoeGoe](https://github.com/CjangCjengh/MoeGoe)\n"
316
- "- [https://github.com/Francis-Komizu/VITS](https://github.com/Francis-Komizu/VITS)\n"
317
- "- [https://github.com/luoyily/MoeTTS](https://github.com/luoyily/MoeTTS)\n"
318
- "- [https://github.com/Francis-Komizu/Sovits](https://github.com/Francis-Komizu/Sovits)"
319
  )
320
  app.queue(concurrency_count=3).launch(show_api=True, share=args.share)
 
13
  import utils
14
  import gradio as gr
15
  import gradio.utils as gr_utils
16
+ from gradio_client import utils as client_utils
17
  import gradio.processing_utils as gr_processing_utils
18
  from models import SynthesizerTrn
19
  from text import text_to_sequence, _clean_text
 
28
  data = audio_postprocess_ori(self, y)
29
  if data is None:
30
  return None
31
+ try:
32
+ return gr_processing_utils.encode_url_or_file_to_base64(data["name"])
33
+ except:
34
+ return client_utils.encode_url_or_file_to_base64(data["name"])
35
 
36
 
37
  gr.Audio.postprocess = audio_postprocess
 
45
  return text_norm
46
 
47
 
48
+ def tts_fn(text, speaker_id, speed, is_symbol):
49
+ if limitation:
50
+ text_len = len(re.sub("\[([A-Z]{2})\]", "", text))
51
+ max_len = 150
52
+ if is_symbol:
53
+ max_len *= 3
54
+ if text_len > max_len:
55
+ return "Error: Text is too long", None
 
56
 
57
+ stn_tst = get_text(text, hps, is_symbol)
58
+ with no_grad():
59
+ x_tst = stn_tst.unsqueeze(0).to(device)
60
+ x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
61
+ sid = LongTensor([speaker_id]).to(device)
62
+ audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
63
+ length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
64
+ del stn_tst, x_tst, x_tst_lengths, sid
65
+ return "Success", (hps.data.sampling_rate, audio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
 
68
  def create_to_symbol_fn(hps):
 
92
  """
93
 
94
  if __name__ == '__main__':
95
+ global speaker_ids, speakers
96
  parser = argparse.ArgumentParser()
97
  parser.add_argument('--device', type=str, default='cpu')
98
  parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
 
100
 
101
  device = torch.device(args.device)
102
  models_tts = []
 
103
  models_soft_vc = []
104
  with open("saved_model/info.json", "r", encoding="utf-8") as f:
105
  models_info = json.load(f)
106
+ info = models_info['0']
107
+ name = info["title"]
108
+ author = info["author"]
109
+ example = info["example"]
110
+ config_path = f"saved_model/0/config.json"
111
+ model_path = f"saved_model/0/model.pth"
112
+ hps = utils.get_hparams_from_file(config_path)
113
+ model = SynthesizerTrn(
114
+ len(hps.symbols),
115
+ hps.data.filter_length // 2 + 1,
116
+ hps.train.segment_size // hps.data.hop_length,
117
+ n_speakers=hps.data.n_speakers,
118
+ **hps.model)
119
+ utils.load_checkpoint(model_path, model, None)
120
+ model.eval().to(device)
121
+ speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
122
+ speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
123
+
124
+ models_tts.append((name, author
125
+ , speakers, example, hps.symbols, create_to_symbol_fn(hps)))
 
 
 
 
 
 
 
 
 
126
 
127
  hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).to(device)
128
 
129
  app = gr.Blocks()
130
 
131
  with app:
132
+ gr.Markdown("# Moe TTS And Voice Conversion Using VITS Model\n\n")
 
 
 
 
 
 
133
  with gr.Tabs():
134
+ name, author, speakers, example, symbols, to_symbol_fn = models_tts[0]
135
+ with gr.Tab("Model"):
136
+ with gr.Column():
137
+ gr.Markdown(f"## {name}\n\n"
138
+ f"Model Author: {author}\n\n")
139
+ tts_input1 = gr.TextArea(label="Text (150 words limitation)", value=f"[JA]{example}[JA]",
140
+ elem_id=f"tts-input0")
141
+ tts_input2 = gr.Number(label="Speaker ID (check next tab)", value=0, precision=0)
142
+ tts_input3 = gr.Slider(label="Speed", value=1, minimum=0.5, maximum=2, step=0.1)
143
+ with gr.Accordion(label="Advanced Options", open=False):
144
+ temp_text_var = gr.Variable()
145
+ symbol_input = gr.Checkbox(value=False, label="Symbol input")
146
+ symbol_list = gr.Dataset(label="Symbol list", components=[tts_input1],
147
+ samples=[[x] for x in symbols],
148
+ elem_id=f"symbol-list0")
149
+ symbol_list_json = gr.Json(value=symbols, visible=False)
150
+ tts_submit = gr.Button("Generate", variant="primary")
151
+ tts_test = gr.Button("Test", variant="primary")
152
+ tts_output1 = gr.Textbox(label="Output Message")
153
+ tts_output2 = gr.Audio(label="Output Audio", elem_id=f"tts-audio0")
154
+ download = gr.Button("Download Audio")
155
+ download.click(None, [], [], _js=download_audio_js.format(audio_id=f"tts-audio0"))
156
+
157
+ tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, symbol_input],
158
+ [tts_output1, tts_output2])
159
+ tts_test.click(tts_fn, [tts_input1, tts_input2, tts_input3, symbol_input],
160
+ [tts_output1, tts_output2])
161
+ symbol_input.change(to_symbol_fn,
162
+ [symbol_input, tts_input1, temp_text_var],
163
+ [tts_input1, temp_text_var])
164
+ symbol_list.click(None, [symbol_list, symbol_list_json], [],
165
+ _js=f"""
166
+ (i,symbols) => {{
167
+ let root = document.querySelector("body > gradio-app");
168
+ if (root.shadowRoot != null)
169
+ root = root.shadowRoot;
170
+ let text_input = root.querySelector("#tts-input0").querySelector("textarea");
171
+ let startPos = text_input.selectionStart;
172
+ let endPos = text_input.selectionEnd;
173
+ let oldTxt = text_input.value;
174
+ let result = oldTxt.substring(0, startPos) + symbols[i] + oldTxt.substring(endPos);
175
+ text_input.value = result;
176
+ let x = window.scrollX, y = window.scrollY;
177
+ text_input.focus();
178
+ text_input.selectionStart = startPos + symbols[i].length;
179
+ text_input.selectionEnd = startPos + symbols[i].length;
180
+ text_input.blur();
181
+ window.scrollTo(x, y);
182
+ return [];
183
+ }}""")
184
+ with gr.Tab("Voices"):
185
+ gr.Markdown("## List of speakers and their IDs\n\n")
186
+ with gr.Column():
187
+ for index, speaker in enumerate(speakers):
188
+ gr.Markdown(f" {index}: {speaker}\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  gr.Markdown(
190
+ "Model official repo \n\n"
191
  "- [https://github.com/CjangCjengh/MoeGoe](https://github.com/CjangCjengh/MoeGoe)\n"
 
 
 
192
  )
193
  app.queue(concurrency_count=3).launch(show_api=True, share=args.share)
client.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from gradio_client import Client
2
+
3
+ client = Client("http://127.0.0.1:7860/")
4
+ result = client.predict("Howdy!", 0, 0.5, True, fn_index=1)
5
+ print(result)