Spaces:

ShoukanLabs
/

styletts2_Japanese

Runtime error

App Files Files Community

Respair committed on Mar 28, 2024

Commit

00bdfef

verified ·

1 Parent(s): 97dc1c6

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -27

app.py CHANGED Viewed

@@ -1,28 +1,18 @@
 INTROTXT = """# StyleTTS 2
 kudos to mrfakename for the base gradio code I'm borrowing here.
 日本語用
 You will probably experience slight artifacts at the beginning or at the end of the output, which is not there on my server.
 Unfortunately, due to the variation in how floating-point operations are performed across different devices,
 and given the intrinsic characteristics of models that incorporate diffusion components,
 it is unlikely that you will achieve identical results to those obtained on my server, where the model was originally trained.
 So, the output you're about to hear may not accurately reflect the true performance of the model.
 it is also not limited to the artifacts, even the prosody and natural-ness of the speech is affected.
 =========
 音声の開始時または終了時に、もともと存在しなかったはずのアーティファクトが、ここで発生する可能性があります。
 残念ながら、異なるデバイスで浮動小数点演算が異なる方法で行われるため、およびDiffusionコンポーネントを取り入れたモデルの固有の特性を考慮すると、
 モデルが元々トレーニングされたデバイスで得られた結果と同じ結果を得ることは難しいでしょう。
 その結果、以下で体験するパフォーマンスはモデルの真の性能を正確に反映していません。
 そのため、アーティファクトの問題だけではなく、ナチュラルネスや音声クオリティーにも及びます。
 **
 """
 import gradio as gr
@@ -59,7 +49,7 @@ for v in voicelist:
 #     # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
 #     return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
 if not torch.cuda.is_available(): INTROTXT += "\n\n### on CPU, it'll run rather slower, but not too much."
-def synthesize(text, voice,embscale,alpha,beta, lngsteps, progress=gr.Progress()):
     if text.strip() == "":
         raise gr.Error("You must enter some text")
     if len(text) > 50000:
@@ -72,7 +62,7 @@ def synthesize(text, voice,embscale,alpha,beta, lngsteps, progress=gr.Progress()
     audios = []
     for t in progress.tqdm(texts):
         print(t)
-        audios.append(styletts2importable.inference(t, voices[v], alpha=alpha, beta=beta, diffusion_steps=lngsteps, embedding_scale=embscale))
     return (24000, np.concatenate(audios))
 # def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
 #     if password == os.environ['ACCESS_CODE']:
@@ -108,14 +98,18 @@ def clsynthesize(text, voice, vcsteps, embscale, alpha, beta, progress=gr.Progre
     print(text)
     print("*** end ***")
     texts = txtsplit(text)
     audios = []
     # vs = styletts2importable.compute_style(voice)
-    vs = styletts2importable.compute_style(voice)
     # print(vs)
     for t in progress.tqdm(texts):
-        audios.append(styletts2importable.inference(t, vs, alpha=alpha, beta=beta, diffusion_steps=vcsteps, embedding_scale=embscale))
         # audios.append(styletts2importable.inference(t, vs, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=5))
     return (24000, np.concatenate(audios))
 def ljsynthesize(text, steps,embscale, progress=gr.Progress()):
     # if text.strip() == "":
     #     raise gr.Error("You must enter some text")
@@ -141,23 +135,22 @@ def ljsynthesize(text, steps,embscale, progress=gr.Progress()):
 with gr.Blocks() as vctk:
     with gr.Row():
         with gr.Column(scale=1):
-            inp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください、短すぎるとひどくなります", interactive=True)
-            voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
-            embscale = gr.Slider(minimum=1, maximum=10, value=1.5, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="これを上げたらもっとエモーショナルな音声になります（下げたらその逆）、増やしすぎるとだめになるので、ご注意ください", interactive=True)
-            multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1, label="Diffusion Steps", info="これを増えたらクオリティーのいい結果になるとは限らない。", interactive=True)
-            alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
-            beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
-            # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
         with gr.Column(scale=1):
-            btn = gr.Button("Synthesize", variant="primary")
-            audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
-            btn.click(synthesize, inputs=[inp, voice,embscale, multispeakersteps,alpha,beta], outputs=[audio], concurrency_limit=4)
 with gr.Blocks() as clone:
     with gr.Row():
         with gr.Column(scale=1):
             clinp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください、短すぎるとひどくなります", interactive=True)
             clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
-            vcsteps = gr.Slider(minimum=3, maximum=20, value=20, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
             embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
             alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
             beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
@@ -196,5 +189,4 @@ the base code was borrowed from -> [mrfakename](https://twitter.com/realmrfakena
 """) # Please do not remove this line.
 if __name__ == "__main__":
     # demo.queue(api_open=False, max_size=15).launch(show_api=False)
-    demo.queue(api_open=False, max_size=15).launch(show_api=False)

 INTROTXT = """# StyleTTS 2
 kudos to mrfakename for the base gradio code I'm borrowing here.
 日本語用
 You will probably experience slight artifacts at the beginning or at the end of the output, which is not there on my server.
 Unfortunately, due to the variation in how floating-point operations are performed across different devices,
 and given the intrinsic characteristics of models that incorporate diffusion components,
 it is unlikely that you will achieve identical results to those obtained on my server, where the model was originally trained.
 So, the output you're about to hear may not accurately reflect the true performance of the model.
 it is also not limited to the artifacts, even the prosody and natural-ness of the speech is affected.
 =========
 音声の開始時または終了時に、もともと存在しなかったはずのアーティファクトが、ここで発生する可能性があります。
 残念ながら、異なるデバイスで浮動小数点演算が異なる方法で行われるため、およびDiffusionコンポーネントを取り入れたモデルの固有の特性を考慮すると、
 モデルが元々トレーニングされたデバイスで得られた結果と同じ結果を得ることは難しいでしょう。
 その結果、以下で体験するパフォーマンスはモデルの真の性能を正確に反映していません。
 そのため、アーティファクトの問題だけではなく、ナチュラルネスや音声クオリティーにも及びます。
 **
 """
 import gradio as gr
 #     # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
 #     return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
 if not torch.cuda.is_available(): INTROTXT += "\n\n### on CPU, it'll run rather slower, but not too much."
+def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
     if text.strip() == "":
         raise gr.Error("You must enter some text")
     if len(text) > 50000:
     audios = []
     for t in progress.tqdm(texts):
         print(t)
+        audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
     return (24000, np.concatenate(audios))
 # def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
 #     if password == os.environ['ACCESS_CODE']:
     print(text)
     print("*** end ***")
     texts = txtsplit(text)
     audios = []
     # vs = styletts2importable.compute_style(voice)
     # print(vs)
     for t in progress.tqdm(texts):
+        audios.append(styletts2importable.inference(t, voices[v], alpha=alpha, beta=beta, diffusion_steps=vcsteps, embedding_scale=embscale))
         # audios.append(styletts2importable.inference(t, vs, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=5))
     return (24000, np.concatenate(audios))
 def ljsynthesize(text, steps,embscale, progress=gr.Progress()):
     # if text.strip() == "":
     #     raise gr.Error("You must enter some text")
 with gr.Blocks() as vctk:
     with gr.Row():
         with gr.Column(scale=1):
+            clinp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください、短すぎるとひどくなります",value="あなたがいないと、世界は色褪せて見えます。あなたの笑顔が私の日々を明るく照らしています。あなたがいない日は、まるで冬のように寒く、暗いです.", interactive=True)
+            voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", interactive=True)
+            vcsteps = gr.Slider(minimum=3, maximum=20, value=5, step=1, label="Diffusion Steps", info="You'll get more variation in the results if you increase it, doesn't necessarily improve anything.| これを上げたらもっとエモーショナルな音声になります（下げたらその逆）、増やしすぎるとだめになるので、ご注意ください", interactive=True)
+            embscale = gr.Slider(minimum=1, maximum=10, value=1.8, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
+            alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", interactive=True)
+            beta = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1, label="Beta", interactive=True)
         with gr.Column(scale=1):
+            clbtn = gr.Button("Synthesize", variant="primary")
+            claudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
+            clbtn.click(clsynthesize, inputs=[clinp, voice, vcsteps, embscale, alpha, beta], outputs=[claudio], concurrency_limit=4)
 with gr.Blocks() as clone:
     with gr.Row():
         with gr.Column(scale=1):
             clinp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください、短すぎるとひどくなります", interactive=True)
             clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
+            vcsteps = gr.Slider(minimum=3, maximum=10, value=2, step=1, label="Diffusion Steps", info="これを上げたらもっとエモーショナルな音声になります（下げたらその逆）、増やしすぎるとだめになるので、ご注意ください", interactive=True)
             embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
             alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
             beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
 """) # Please do not remove this line.
 if __name__ == "__main__":
     # demo.queue(api_open=False, max_size=15).launch(show_api=False)
+    demo.queue(api_open=False, max_size=15).launch(show_api=False,share=True)