Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,28 +1,18 @@
|
|
| 1 |
INTROTXT = """# StyleTTS 2
|
| 2 |
kudos to mrfakename for the base gradio code I'm borrowing here.
|
| 3 |
-
|
| 4 |
-
|
| 5 |
日本語用
|
| 6 |
-
|
| 7 |
You will probably experience slight artifacts at the beginning or at the end of the output, which is not there on my server.
|
| 8 |
-
|
| 9 |
Unfortunately, due to the variation in how floating-point operations are performed across different devices,
|
| 10 |
and given the intrinsic characteristics of models that incorporate diffusion components,
|
| 11 |
it is unlikely that you will achieve identical results to those obtained on my server, where the model was originally trained.
|
| 12 |
So, the output you're about to hear may not accurately reflect the true performance of the model.
|
| 13 |
it is also not limited to the artifacts, even the prosody and natural-ness of the speech is affected.
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
=========
|
| 18 |
-
|
| 19 |
音声の開始時または終了時に、もともと存在しなかったはずのアーティファクトが、ここで発生する可能性があります。
|
| 20 |
-
|
| 21 |
ๆฎๅฟตใชใใใ็ฐใชใใใใคในใงๆตฎๅๅฐๆฐ็นๆผ็ฎใ็ฐใชใๆนๆณใง่กใใใใใใใใใณDiffusionใณใณใใผใใณใใๅใๅ
ฅใใใขใใซใฎๅบๆใฎ็นๆงใ่ๆ
ฎใใใจใ
|
| 22 |
ใขใใซใๅ
ใ
ใใฌใผใใณใฐใใใใใใคในใงๅพใใใ็ตๆใจๅใ็ตๆใๅพใใใจใฏ้ฃใใใงใใใใ
|
| 23 |
ใใฎ็ตๆใไปฅไธใงไฝ้จใใใใใฉใผใใณในใฏใขใใซใฎ็ใฎๆง่ฝใๆญฃ็ขบใซๅๆ ใใฆใใพใใใ
|
| 24 |
ใใฎใใใใขใผใใฃใใกใฏใใฎๅ้กใ ใใงใฏใชใใใใใฅใฉใซใในใ้ณๅฃฐใฏใชใชใใฃใผใซใๅใณใพใใ
|
| 25 |
-
|
| 26 |
**
|
| 27 |
"""
|
| 28 |
import gradio as gr
|
|
@@ -59,7 +49,7 @@ for v in voicelist:
|
|
| 59 |
# # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
|
| 60 |
# return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
|
| 61 |
if not torch.cuda.is_available(): INTROTXT += "\n\n### on CPU, it'll run rather slower, but not too much."
|
| 62 |
-
def synthesize(text, voice,
|
| 63 |
if text.strip() == "":
|
| 64 |
raise gr.Error("You must enter some text")
|
| 65 |
if len(text) > 50000:
|
|
@@ -72,7 +62,7 @@ def synthesize(text, voice,embscale,alpha,beta, lngsteps, progress=gr.Progress()
|
|
| 72 |
audios = []
|
| 73 |
for t in progress.tqdm(texts):
|
| 74 |
print(t)
|
| 75 |
-
audios.append(styletts2importable.inference(t, voices[v], alpha=
|
| 76 |
return (24000, np.concatenate(audios))
|
| 77 |
# def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
|
| 78 |
# if password == os.environ['ACCESS_CODE']:
|
|
@@ -108,14 +98,18 @@ def clsynthesize(text, voice, vcsteps, embscale, alpha, beta, progress=gr.Progre
|
|
| 108 |
print(text)
|
| 109 |
print("*** end ***")
|
| 110 |
texts = txtsplit(text)
|
|
|
|
| 111 |
audios = []
|
| 112 |
# vs = styletts2importable.compute_style(voice)
|
| 113 |
-
|
| 114 |
# print(vs)
|
| 115 |
for t in progress.tqdm(texts):
|
| 116 |
-
audios.append(styletts2importable.inference(t,
|
| 117 |
# audios.append(styletts2importable.inference(t, vs, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=5))
|
| 118 |
return (24000, np.concatenate(audios))
|
|
|
|
|
|
|
|
|
|
| 119 |
def ljsynthesize(text, steps,embscale, progress=gr.Progress()):
|
| 120 |
# if text.strip() == "":
|
| 121 |
# raise gr.Error("You must enter some text")
|
|
@@ -141,23 +135,22 @@ def ljsynthesize(text, steps,embscale, progress=gr.Progress()):
|
|
| 141 |
with gr.Blocks() as vctk:
|
| 142 |
with gr.Row():
|
| 143 |
with gr.Column(scale=1):
|
| 144 |
-
|
| 145 |
-
voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.",
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha",
|
| 149 |
-
beta = gr.Slider(minimum=0, maximum=1, value=0.
|
| 150 |
-
# use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
|
| 151 |
with gr.Column(scale=1):
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
with gr.Blocks() as clone:
|
| 156 |
with gr.Row():
|
| 157 |
with gr.Column(scale=1):
|
| 158 |
clinp = gr.Textbox(label="Text", info="Enter the text | ใใญในใใๅ
ฅใใฆใใ ใใใ็ญใใใใจใฒใฉใใชใใพใ", interactive=True)
|
| 159 |
clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
|
| 160 |
-
vcsteps = gr.Slider(minimum=3, maximum=
|
| 161 |
embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
|
| 162 |
alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
|
| 163 |
beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
|
|
@@ -196,5 +189,4 @@ the base code was borrowed from -> [mrfakename](https://twitter.com/realmrfakena
|
|
| 196 |
""") # Please do not remove this line.
|
| 197 |
if __name__ == "__main__":
|
| 198 |
# demo.queue(api_open=False, max_size=15).launch(show_api=False)
|
| 199 |
-
demo.queue(api_open=False, max_size=15).launch(show_api=False)
|
| 200 |
-
|
|
|
|
| 1 |
INTROTXT = """# StyleTTS 2
|
| 2 |
kudos to mrfakename for the base gradio code I'm borrowing here.
|
|
|
|
|
|
|
| 3 |
日本語用
|
|
|
|
| 4 |
You will probably experience slight artifacts at the beginning or at the end of the output, which is not there on my server.
|
|
|
|
| 5 |
Unfortunately, due to the variation in how floating-point operations are performed across different devices,
|
| 6 |
and given the intrinsic characteristics of models that incorporate diffusion components,
|
| 7 |
it is unlikely that you will achieve identical results to those obtained on my server, where the model was originally trained.
|
| 8 |
So, the output you're about to hear may not accurately reflect the true performance of the model.
|
| 9 |
it is also not limited to the artifacts, even the prosody and natural-ness of the speech is affected.
|
|
|
|
|
|
|
|
|
|
| 10 |
=========
|
|
|
|
| 11 |
音声の開始時または終了時に、もともと存在しなかったはずのアーティファクトが、ここで発生する可能性があります。
|
|
|
|
| 12 |
ๆฎๅฟตใชใใใ็ฐใชใใใใคในใงๆตฎๅๅฐๆฐ็นๆผ็ฎใ็ฐใชใๆนๆณใง่กใใใใใใใใใณDiffusionใณใณใใผใใณใใๅใๅ
ฅใใใขใใซใฎๅบๆใฎ็นๆงใ่ๆ
ฎใใใจใ
|
| 13 |
ใขใใซใๅ
ใ
ใใฌใผใใณใฐใใใใใใคในใงๅพใใใ็ตๆใจๅใ็ตๆใๅพใใใจใฏ้ฃใใใงใใใใ
|
| 14 |
ใใฎ็ตๆใไปฅไธใงไฝ้จใใใใใฉใผใใณในใฏใขใใซใฎ็ใฎๆง่ฝใๆญฃ็ขบใซๅๆ ใใฆใใพใใใ
|
| 15 |
ใใฎใใใใขใผใใฃใใกใฏใใฎๅ้กใ ใใงใฏใชใใใใใฅใฉใซใในใ้ณๅฃฐใฏใชใชใใฃใผใซใๅใณใพใใ
|
|
|
|
| 16 |
**
|
| 17 |
"""
|
| 18 |
import gradio as gr
|
|
|
|
| 49 |
# # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
|
| 50 |
# return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
|
| 51 |
if not torch.cuda.is_available(): INTROTXT += "\n\n### on CPU, it'll run rather slower, but not too much."
|
| 52 |
+
def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
|
| 53 |
if text.strip() == "":
|
| 54 |
raise gr.Error("You must enter some text")
|
| 55 |
if len(text) > 50000:
|
|
|
|
| 62 |
audios = []
|
| 63 |
for t in progress.tqdm(texts):
|
| 64 |
print(t)
|
| 65 |
+
audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
|
| 66 |
return (24000, np.concatenate(audios))
|
| 67 |
# def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
|
| 68 |
# if password == os.environ['ACCESS_CODE']:
|
|
|
|
| 98 |
print(text)
|
| 99 |
print("*** end ***")
|
| 100 |
texts = txtsplit(text)
|
| 101 |
+
|
| 102 |
audios = []
|
| 103 |
# vs = styletts2importable.compute_style(voice)
|
| 104 |
+
|
| 105 |
# print(vs)
|
| 106 |
for t in progress.tqdm(texts):
|
| 107 |
+
audios.append(styletts2importable.inference(t, voices[v], alpha=alpha, beta=beta, diffusion_steps=vcsteps, embedding_scale=embscale))
|
| 108 |
# audios.append(styletts2importable.inference(t, vs, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=5))
|
| 109 |
return (24000, np.concatenate(audios))
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
|
| 113 |
def ljsynthesize(text, steps,embscale, progress=gr.Progress()):
|
| 114 |
# if text.strip() == "":
|
| 115 |
# raise gr.Error("You must enter some text")
|
|
|
|
| 135 |
with gr.Blocks() as vctk:
|
| 136 |
with gr.Row():
|
| 137 |
with gr.Column(scale=1):
|
| 138 |
+
clinp = gr.Textbox(label="Text", info="Enter the text | ใใญในใใๅ
ฅใใฆใใ ใใใ็ญใใใใจใฒใฉใใชใใพใ",value="ใใชใใใใชใใจใไธ็ใฏ่ฒ่คชใใฆ่ฆใใพใใใใชใใฎ็ฌ้กใ็งใฎๆฅใ
ใๆใใ็
งใใใฆใใพใใใใชใใใใชใๆฅใฏใใพใใงๅฌใฎใใใซๅฏใใๆใใงใ." interactive=True)
|
| 139 |
+
voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", interactive=True)
|
| 140 |
+
vcsteps = gr.Slider(minimum=3, maximum=20, value=5, step=1, label="Diffusion Steps", info="You'll get more variation in the results if you increase it, doesn't necessarily improve anything.| ใใใไธใใใใใฃใจใจใขใผใทใงใใซใช้ณๅฃฐใซใชใใพใ๏ผไธใใใใใฎ้๏ผใๅขใใใใใใจใ ใใซใชใใฎใงใใๆณจๆใใ ใใ", interactive=True)
|
| 141 |
+
embscale = gr.Slider(minimum=1, maximum=10, value=1.8, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
|
| 142 |
+
alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", interactive=True)
|
| 143 |
+
beta = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1, label="Beta", interactive=True)
|
|
|
|
| 144 |
with gr.Column(scale=1):
|
| 145 |
+
clbtn = gr.Button("Synthesize", variant="primary")
|
| 146 |
+
claudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
|
| 147 |
+
clbtn.click(clsynthesize, inputs=[clinp, voice, vcsteps, embscale, alpha, beta], outputs=[claudio], concurrency_limit=4)
|
| 148 |
with gr.Blocks() as clone:
|
| 149 |
with gr.Row():
|
| 150 |
with gr.Column(scale=1):
|
| 151 |
clinp = gr.Textbox(label="Text", info="Enter the text | ใใญในใใๅ
ฅใใฆใใ ใใใ็ญใใใใจใฒใฉใใชใใพใ", interactive=True)
|
| 152 |
clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
|
| 153 |
+
vcsteps = gr.Slider(minimum=3, maximum=10, value=2, step=1, label="Diffusion Steps", info="ใใใไธใใใใใฃใจใจใขใผใทใงใใซใช้ณๅฃฐใซใชใใพใ๏ผไธใใใใใฎ้๏ผใๅขใใใใใใจใ ใใซใชใใฎใงใใๆณจๆใใ ใใ", interactive=True)
|
| 154 |
embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
|
| 155 |
alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
|
| 156 |
beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
|
|
|
|
| 189 |
""") # Please do not remove this line.
|
| 190 |
if __name__ == "__main__":
|
| 191 |
# demo.queue(api_open=False, max_size=15).launch(show_api=False)
|
| 192 |
+
demo.queue(api_open=False, max_size=15).launch(show_api=False,share=True)
|
|
|