Spaces:

ShoukanLabs
/

styletts2_Japanese

Runtime error

App Files Community

Respair committed on Mar 28, 2024

Commit

5782b8e

verified ·

1 Parent(s): ce7d002

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -3

app.py CHANGED Viewed

@@ -60,7 +60,7 @@ for v in voicelist:
 #     # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
 #     return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
 if not torch.cuda.is_available(): INTROTXT += "\n\n### on CPU, it'll run rather slower, but not too much."
-def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
     if text.strip() == "":
         raise gr.Error("You must enter some text")
     if len(text) > 50000:
@@ -73,7 +73,7 @@ def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
     audios = []
     for t in progress.tqdm(texts):
         print(t)
-        audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.4, diffusion_steps=lngsteps, embedding_scale=1.5))
     return (24000, np.concatenate(audios))
 # def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
 #     if password == os.environ['ACCESS_CODE']:
@@ -178,17 +178,21 @@ def ljsynthesize(text, steps,embscale, progress=gr.Progress()):
 #             clbtn = gr.Button("Synthesize", variant="primary")
 #             claudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
 #             clbtn.click(clsynthesize, inputs=[clinp, voice, vcsteps, embscale, alpha, beta], outputs=[claudio], concurrency_limit=4)
 with gr.Blocks() as vctk:
     with gr.Row():
         with gr.Column(scale=1):
             inp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください、短すぎるとひどくなります.", value="あなたがいないと、世界は色褪せて見えます。あなたの笑顔が私の日々を明るく照らしています。あなたがいない日は、まるで冬のように寒く、暗いです.",  interactive=True)
             voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
             multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", interactive=True)
             # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
         with gr.Column(scale=1):
             btn = gr.Button("Synthesize", variant="primary")
             audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
-            btn.click(synthesize, inputs=[inp, voice, multispeakersteps], outputs=[audio], concurrency_limit=4)

 #     # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
 #     return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
 if not torch.cuda.is_available(): INTROTXT += "\n\n### on CPU, it'll run rather slower, but not too much."
+def synthesize(text, voice, lngsteps,embscale,alpha, beta, password, progress=gr.Progress()):
     if text.strip() == "":
         raise gr.Error("You must enter some text")
     if len(text) > 50000:
     audios = []
     for t in progress.tqdm(texts):
         print(t)
+        audios.append(styletts2importable.inference(t, voices[v], alpha=alpha, beta=beta, diffusion_steps=lngsteps, embedding_scale=embscale))
     return (24000, np.concatenate(audios))
 # def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
 #     if password == os.environ['ACCESS_CODE']:
 #             clbtn = gr.Button("Synthesize", variant="primary")
 #             claudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
 #             clbtn.click(clsynthesize, inputs=[clinp, voice, vcsteps, embscale, alpha, beta], outputs=[claudio], concurrency_limit=4)
 with gr.Blocks() as vctk:
     with gr.Row():
         with gr.Column(scale=1):
             inp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください、短すぎるとひどくなります.", value="あなたがいないと、世界は色褪せて見えます。あなたの笑顔が私の日々を明るく照らしています。あなたがいない日は、まるで冬のように寒く、暗いです.",  interactive=True)
             voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
+            embscale = gr.Slider(minimum=1, maximum=10, value=1.8, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="これを上げたらもっとエモーショナルな音声になります（下げたらその逆）、増やしすぎるとだめになるので、ご注意ください", interactive=True)
+            alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", interactive=True)
+            beta = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1, label="Beta", interactive=True)
             multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", interactive=True)
             # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
         with gr.Column(scale=1):
             btn = gr.Button("Synthesize", variant="primary")
             audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
+            btn.click(synthesize, inputs=[inp, voice, multispeakersteps,embscale,alpha,beta], outputs=[audio], concurrency_limit=4)