Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -59,7 +59,7 @@ for v in voicelist:
|
|
| 59 |
# # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
|
| 60 |
# return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
|
| 61 |
if not torch.cuda.is_available(): INTROTXT += "\n\n### on CPU, it'll run rather slower, but not too much."
|
| 62 |
-
def synthesize(text, voice, lngsteps,
|
| 63 |
if text.strip() == "":
|
| 64 |
raise gr.Error("You must enter some text")
|
| 65 |
if len(text) > 50000:
|
|
@@ -72,7 +72,7 @@ def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
|
|
| 72 |
audios = []
|
| 73 |
for t in progress.tqdm(texts):
|
| 74 |
print(t)
|
| 75 |
-
audios.append(styletts2importable.inference(t, voices[v], alpha=
|
| 76 |
return (24000, np.concatenate(audios))
|
| 77 |
# def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
|
| 78 |
# if password == os.environ['ACCESS_CODE']:
|
|
@@ -141,20 +141,21 @@ def ljsynthesize(text, steps,embscale, progress=gr.Progress()):
|
|
| 141 |
with gr.Blocks() as vctk:
|
| 142 |
with gr.Row():
|
| 143 |
with gr.Column(scale=1):
|
| 144 |
-
inp = gr.Textbox(label="Text", info="
|
| 145 |
voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
|
| 146 |
-
|
|
|
|
| 147 |
alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
|
| 148 |
beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
|
| 149 |
# use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
|
| 150 |
with gr.Column(scale=1):
|
| 151 |
btn = gr.Button("Synthesize", variant="primary")
|
| 152 |
audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
|
| 153 |
-
btn.click(synthesize, inputs=[inp, voice, multispeakersteps,alpha,beta], outputs=[audio], concurrency_limit=4)
|
| 154 |
with gr.Blocks() as clone:
|
| 155 |
with gr.Row():
|
| 156 |
with gr.Column(scale=1):
|
| 157 |
-
clinp = gr.Textbox(label="Text", info="
|
| 158 |
clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
|
| 159 |
vcsteps = gr.Slider(minimum=3, maximum=20, value=20, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
|
| 160 |
embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
|
|
@@ -179,8 +180,8 @@ with gr.Blocks() as lj:
|
|
| 179 |
with gr.Row():
|
| 180 |
with gr.Column(scale=1):
|
| 181 |
ljinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True, value="あなたがいないと、世界は色褪せて見えます。あなたの笑顔が私の日々を明るく照らしてくれます。あなたがいない日は、まるで冬のように寒くて暗いです.")
|
| 182 |
-
embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="
|
| 183 |
-
ljsteps = gr.Slider(minimum=3, maximum=20, value=
|
| 184 |
with gr.Column(scale=1):
|
| 185 |
ljbtn = gr.Button("Synthesize", variant="primary")
|
| 186 |
ljaudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
|
|
|
|
| 59 |
# # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
|
| 60 |
# return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
|
| 61 |
if not torch.cuda.is_available(): INTROTXT += "\n\n### on CPU, it'll run rather slower, but not too much."
|
| 62 |
+
def synthesize(text, voice,embscale,alpha,beta, lngsteps, progress=gr.Progress()):
|
| 63 |
if text.strip() == "":
|
| 64 |
raise gr.Error("You must enter some text")
|
| 65 |
if len(text) > 50000:
|
|
|
|
| 72 |
audios = []
|
| 73 |
for t in progress.tqdm(texts):
|
| 74 |
print(t)
|
| 75 |
+
audios.append(styletts2importable.inference(t, voices[v], alpha=alpha, beta=beta, diffusion_steps=lngsteps, embedding_scale=embscale))
|
| 76 |
return (24000, np.concatenate(audios))
|
| 77 |
# def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
|
| 78 |
# if password == os.environ['ACCESS_CODE']:
|
|
|
|
| 141 |
with gr.Blocks() as vctk:
|
| 142 |
with gr.Row():
|
| 143 |
with gr.Column(scale=1):
|
| 144 |
+
inp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください。短すぎるとひどくなります", interactive=True)
|
| 145 |
voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
|
| 146 |
+
embscale = gr.Slider(minimum=1, maximum=10, value=1.5, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="これを上げたら、もっとエモーショナルな音声になります（下げたらその逆）。増やしすぎるとだめになるので、ご注意ください", interactive=True)
|
| 147 |
+
multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1, label="Diffusion Steps", info="これを増やしたらクオリティーのいい結果になるとは限らない。", interactive=True)
|
| 148 |
alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
|
| 149 |
beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
|
| 150 |
# use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
|
| 151 |
with gr.Column(scale=1):
|
| 152 |
btn = gr.Button("Synthesize", variant="primary")
|
| 153 |
audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
|
| 154 |
+
btn.click(synthesize, inputs=[inp, voice,embscale, multispeakersteps,alpha,beta], outputs=[audio], concurrency_limit=4)
|
| 155 |
with gr.Blocks() as clone:
|
| 156 |
with gr.Row():
|
| 157 |
with gr.Column(scale=1):
|
| 158 |
+
clinp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください。短すぎるとひどくなります", interactive=True)
|
| 159 |
clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
|
| 160 |
vcsteps = gr.Slider(minimum=3, maximum=20, value=20, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
|
| 161 |
embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
|
|
|
|
| 180 |
with gr.Row():
|
| 181 |
with gr.Column(scale=1):
|
| 182 |
ljinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True, value="あなたがいないと、世界は色褪せて見えます。あなたの笑顔が私の日々を明るく照らしてくれます。あなたがいない日は、まるで冬のように寒くて暗いです.")
|
| 183 |
+
embscale = gr.Slider(minimum=1, maximum=10, value=1.5, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="これを上げたら、もっとエモーショナルな音声になります（下げたらその逆）。増やしすぎるとだめになるので、ご注意ください", interactive=True)
|
| 184 |
+
ljsteps = gr.Slider(minimum=3, maximum=20, value=5, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
|
| 185 |
with gr.Column(scale=1):
|
| 186 |
ljbtn = gr.Button("Synthesize", variant="primary")
|
| 187 |
ljaudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
|