Spaces: Running on CPU Upgrade
Update app.py
Browse files
- app.py +22 -22
- requirements.txt +0 -1
app.py
CHANGED
|
@@ -1,8 +1,5 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
-
import psutil
|
| 4 |
-
import time
|
| 5 |
-
from threading import Timer
|
| 6 |
import librosa
|
| 7 |
import numpy as np
|
| 8 |
import torch
|
|
@@ -28,7 +25,7 @@ def get_text(text, hps, is_phoneme):
|
|
| 28 |
def create_tts_fn(model, hps, speaker_ids):
|
| 29 |
def tts_fn(text, speaker, speed, is_phoneme):
|
| 30 |
if limitation and ((len(text) > 60 and not is_phoneme) or (len(text) > 120 and is_phoneme)):
|
| 31 |
-
|
| 32 |
speaker_id = speaker_ids[speaker]
|
| 33 |
stn_tst = get_text(text, hps, is_phoneme)
|
| 34 |
with no_grad():
|
|
@@ -38,7 +35,7 @@ def create_tts_fn(model, hps, speaker_ids):
|
|
| 38 |
audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
|
| 39 |
length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
|
| 40 |
del stn_tst, x_tst, x_tst_lengths, sid
|
| 41 |
-
return
|
| 42 |
|
| 43 |
return tts_fn
|
| 44 |
|
|
@@ -46,11 +43,11 @@ def create_tts_fn(model, hps, speaker_ids):
|
|
| 46 |
def create_vc_fn(model, hps, speaker_ids):
|
| 47 |
def vc_fn(original_speaker, target_speaker, input_audio):
|
| 48 |
if input_audio is None:
|
| 49 |
-
|
| 50 |
sampling_rate, audio = input_audio
|
| 51 |
duration = audio.shape[0] / sampling_rate
|
| 52 |
if limitation and duration > 15:
|
| 53 |
-
|
| 54 |
original_speaker_id = speaker_ids[original_speaker]
|
| 55 |
target_speaker_id = speaker_ids[target_speaker]
|
| 56 |
|
|
@@ -71,7 +68,7 @@ def create_vc_fn(model, hps, speaker_ids):
|
|
| 71 |
audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
|
| 72 |
0, 0].data.cpu().float().numpy()
|
| 73 |
del y, spec, spec_lengths, sid_src, sid_tgt
|
| 74 |
-
return
|
| 75 |
|
| 76 |
return vc_fn
|
| 77 |
|
|
@@ -144,21 +141,25 @@ if __name__ == '__main__':
|
|
| 144 |
with advanced_options:
|
| 145 |
phoneme_input = gr.Checkbox(value=False, label="Phoneme input")
|
| 146 |
to_phoneme_btn = gr.Button("Covert text to phoneme")
|
| 147 |
-
phoneme_list = gr.
|
| 148 |
-
|
|
|
|
| 149 |
tts_submit = gr.Button("Generate", variant="primary")
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
|
|
|
| 158 |
tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, phoneme_input],
|
| 159 |
-
[
|
| 160 |
to_phoneme_btn.click(lambda x: _clean_text(x, hps.data.text_cleaners) if x != "" else x,
|
| 161 |
[tts_input1], [tts_input1])
|
|
|
|
|
|
|
| 162 |
|
| 163 |
with gr.TabItem("Voice Conversion"):
|
| 164 |
with gr.Tabs():
|
|
@@ -172,7 +173,6 @@ if __name__ == '__main__':
|
|
| 172 |
value=speakers[1])
|
| 173 |
vc_input3 = gr.Audio(label="Input Audio (15s limitation)")
|
| 174 |
vc_submit = gr.Button("Convert", variant="primary")
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
vc_submit.click(vc_fn, [vc_input1, vc_input2, vc_input3], [vc_output1, vc_output2])
|
| 178 |
app.launch()
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
|
|
|
|
|
|
|
|
|
| 3 |
import librosa
|
| 4 |
import numpy as np
|
| 5 |
import torch
|
|
|
|
| 25 |
def create_tts_fn(model, hps, speaker_ids):
|
| 26 |
def tts_fn(text, speaker, speed, is_phoneme):
|
| 27 |
if limitation and ((len(text) > 60 and not is_phoneme) or (len(text) > 120 and is_phoneme)):
|
| 28 |
+
raise gr.Error("Text is too long")
|
| 29 |
speaker_id = speaker_ids[speaker]
|
| 30 |
stn_tst = get_text(text, hps, is_phoneme)
|
| 31 |
with no_grad():
|
|
|
|
| 35 |
audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
|
| 36 |
length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
|
| 37 |
del stn_tst, x_tst, x_tst_lengths, sid
|
| 38 |
+
return hps.data.sampling_rate, audio
|
| 39 |
|
| 40 |
return tts_fn
|
| 41 |
|
|
|
|
| 43 |
def create_vc_fn(model, hps, speaker_ids):
|
| 44 |
def vc_fn(original_speaker, target_speaker, input_audio):
|
| 45 |
if input_audio is None:
|
| 46 |
+
raise gr.Error("You need to upload an audio")
|
| 47 |
sampling_rate, audio = input_audio
|
| 48 |
duration = audio.shape[0] / sampling_rate
|
| 49 |
if limitation and duration > 15:
|
| 50 |
+
raise gr.Error("Audio is too long")
|
| 51 |
original_speaker_id = speaker_ids[original_speaker]
|
| 52 |
target_speaker_id = speaker_ids[target_speaker]
|
| 53 |
|
|
|
|
| 68 |
audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
|
| 69 |
0, 0].data.cpu().float().numpy()
|
| 70 |
del y, spec, spec_lengths, sid_src, sid_tgt
|
| 71 |
+
return hps.data.sampling_rate, audio
|
| 72 |
|
| 73 |
return vc_fn
|
| 74 |
|
|
|
|
| 141 |
with advanced_options:
|
| 142 |
phoneme_input = gr.Checkbox(value=False, label="Phoneme input")
|
| 143 |
to_phoneme_btn = gr.Button("Covert text to phoneme")
|
| 144 |
+
phoneme_list = gr.Dataset(label="Phoneme list", components=[tts_input1],
|
| 145 |
+
samples=[[x] for x in symbols])
|
| 146 |
+
phoneme_list_json = gr.Json(value=symbols, visible=False)
|
| 147 |
tts_submit = gr.Button("Generate", variant="primary")
|
| 148 |
+
tts_output = gr.Audio(label="Output Audio")
|
| 149 |
+
advanced_button.click(None, [], [], _js="""
|
| 150 |
+
() => {
|
| 151 |
+
let options = document.querySelector("body > gradio-app");
|
| 152 |
+
if (options.shadowRoot != null)
|
| 153 |
+
options = options.shadowRoot;
|
| 154 |
+
options = options.querySelector("#advanced-options");
|
| 155 |
+
options.style.display = ["none", ""].includes(options.style.display) ? "flex" : "none";
|
| 156 |
+
}""")
|
| 157 |
tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, phoneme_input],
|
| 158 |
+
[tts_output])
|
| 159 |
to_phoneme_btn.click(lambda x: _clean_text(x, hps.data.text_cleaners) if x != "" else x,
|
| 160 |
[tts_input1], [tts_input1])
|
| 161 |
+
phoneme_list.click(None, [phoneme_list, phoneme_list_json, tts_input1], [tts_input1],
|
| 162 |
+
_js="(i,phonemes, text) => text + phonemes[i]")
|
| 163 |
|
| 164 |
with gr.TabItem("Voice Conversion"):
|
| 165 |
with gr.Tabs():
|
|
|
|
| 173 |
value=speakers[1])
|
| 174 |
vc_input3 = gr.Audio(label="Input Audio (15s limitation)")
|
| 175 |
vc_submit = gr.Button("Convert", variant="primary")
|
| 176 |
+
vc_output = gr.Audio(label="Output Audio")
|
| 177 |
+
vc_submit.click(vc_fn, [vc_input1, vc_input2, vc_input3], [vc_output])
|
|
|
|
| 178 |
app.launch()
|
requirements.txt
CHANGED
|
@@ -9,5 +9,4 @@ torch
|
|
| 9 |
torchvision
|
| 10 |
Unidecode
|
| 11 |
pyopenjtalk
|
| 12 |
-
psutil
|
| 13 |
gradio
|
|
|
|
| 9 |
torchvision
|
| 10 |
Unidecode
|
| 11 |
pyopenjtalk
|
|
|
|
| 12 |
gradio
|