Spaces:

AdalAbilbekov
/

EmotionalIntensityControl

Sleeping

App Files Files Community

AdalAbilbekov committed on Apr 1, 2024

Commit

9e9a056

1 Parent(s): 08f5766

1

Browse files

Files changed (4) hide show

app.py +36 -28
flagged/log.csv +3 -0
flagged/output/2e9a3b60dc40f07d4db8/audio.wav +0 -0
flagged/output/85c2e39535a1879bccc5/audio.wav +0 -0

app.py CHANGED Viewed

@@ -65,35 +65,42 @@ vocoder.remove_weight_norm()
 emotions = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
 spekears = ['Madi', 'Marzhan', 'Akzhol']
-def generate_audio(text, quantity, speaker, emotion_1, emotion_2):
-    x, x_lengths = convert_text(text)
-    emo_1, emo_2 = emotions.index(emotion_1), emotions.index(emotion_2)
-    emo1 = torch.LongTensor([emo_1]).to(device)
-    emo2 = torch.LongTensor([emo_2]).to(device)
-    sid = torch.LongTensor([spekears.index(speaker)]).to(device)
-    intensity = quantity / 100
-    y_enc, y_dec, attn = gradtts_uncond_model.classifier_guidance_decode_two_mixture(
-                        x, x_lengths,
-                        n_timesteps=10,
-                        temperature=2.0,
-                        stoc=args.stoc,
-                        spk=sid,
-                        emo1=emo1,
-                        emo2=emo2,
-                        emo1_weight=intensity,
-                        length_scale=1.,
-                        classifier_func=model.forward,
-                        guidance=300,
-                        classifier_type=model.model_type
-                    )
-    y_dec = y_dec.detach()
-    res = y_dec.squeeze().to(device).numpy()
-    x = torch.from_numpy(res).unsqueeze(0)
-    y_g_hat = vocoder(x)
-    audio = y_g_hat.squeeze()
-    audio = audio * 32768.0
-    audio = audio.detach().cpu().numpy().astype('int16')
     sr = 22050
     return (sr, audio)
@@ -102,6 +109,7 @@ demo = gr.Interface(
     [
         gr.Textbox(value='Батпақ соры шабындыққа және жыл бойғы жайылымға пайдаланылады.', label="Text you want to synthesize"),
         gr.Slider(0, 100, value=50, step=10, label="Count", info="Choose between 0 and 100"),
         gr.Dropdown(spekears, value=spekears[1], label="Narrator", info="Select a narrator."
         ),
         gr.Dropdown(emotions, value=emotions[0], label="Emotion 1", info="Select first emotion"),

 emotions = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
 spekears = ['Madi', 'Marzhan', 'Akzhol']
+def generate_audio(text, quantity, guid, speaker, emotion_1, emotion_2):
+    """Synthesize speech for ``text`` as a weighted mix of two emotions.
+
+    The decoder is run with classifier guidance ``guid``. If its output
+    contains NaNs, the guidance scale is lowered by 50 and the decode is
+    retried, until the output is NaN-free or the scale would drop to 0 or
+    below. Only the final decode is passed to the vocoder.
+
+    Args:
+        text: Sentence to synthesize; handed to ``convert_text``.
+        quantity: Slider value; ``quantity / 100`` is passed to the decoder
+            as ``emo1_weight``.
+        guid: Initial classifier-guidance scale.
+        speaker: Narrator name; must be an entry of ``spekears``.
+        emotion_1: First emotion; must be an entry of ``emotions``.
+        emotion_2: Second emotion; must be an entry of ``emotions``.
+
+    Returns:
+        ``(sr, audio)`` where ``sr`` is 22050 and ``audio`` is an int16
+        NumPy array.
+
+    Raises:
+        ValueError: If ``speaker`` or an emotion is not in its list
+            (raised by ``list.index``).
+    """
+    # None of these inputs depend on the guidance scale, so build them once
+    # instead of on every retry.
+    x, x_lengths = convert_text(text)
+    emo1 = torch.LongTensor([emotions.index(emotion_1)]).to(device)
+    emo2 = torch.LongTensor([emotions.index(emotion_2)]).to(device)
+    sid = torch.LongTensor([spekears.index(speaker)]).to(device)
+    intensity = quantity / 100
+
+    gui = guid
+    while True:
+        y_enc, y_dec, attn = gradtts_uncond_model.classifier_guidance_decode_two_mixture(
+                            x, x_lengths,
+                            n_timesteps=100,
+                            temperature=2.0,
+                            stoc=args.stoc,
+                            spk=sid,
+                            emo1=emo1,
+                            emo2=emo2,
+                            emo1_weight=intensity,
+                            length_scale=1.,
+                            classifier_func=model.forward,
+                            guidance=gui,
+                            classifier_type=model.model_type
+                        )
+        y_dec = y_dec.detach()
+        if not torch.isnan(y_dec).any():
+            break
+        # The decode produced NaNs: retry with a smaller guidance scale.
+        gui -= 50
+        if gui <= 0:
+            # No guidance left to back off to; continue best-effort with the
+            # last decode (its NaNs are zeroed out after vocoding).
+            print('shabuya')
+            break
+
+    # squeeze()/unsqueeze(0) gives the same shape as the previous NumPy round
+    # trip, without calling .numpy() on a tensor that may not be on the CPU.
+    # NOTE(review): assumes the vocoder lives on `device` -- confirm.
+    mel = y_dec.squeeze().unsqueeze(0).to(device)
+    with torch.no_grad():
+        y_g_hat = vocoder(mel)
+    audio = y_g_hat.squeeze() * 32768.0
+    # Zero out NaNs and clamp so the int16 cast can neither hit undefined
+    # values nor wrap a full-scale +1.0 sample around to -32768.
+    audio = torch.nan_to_num(audio).clamp(-32768.0, 32767.0)
+    audio = audio.detach().cpu().numpy().astype('int16')
     sr = 22050
     return (sr, audio)
     [
         gr.Textbox(value='Батпақ соры шабындыққа және жыл бойғы жайылымға пайдаланылады.', label="Text you want to synthesize"),
         gr.Slider(0, 100, value=50, step=10, label="Count", info="Choose between 0 and 100"),
+        gr.Slider(0, 1000, value=100, step=10, label="Guidance", info="Choose between 0 and 1000"),
         gr.Dropdown(spekears, value=spekears[1], label="Narrator", info="Select a narrator."
         ),
         gr.Dropdown(emotions, value=emotions[0], label="Emotion 1", info="Select first emotion"),

flagged/log.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+Text you want to synthesize,Count,Guidance,Narrator,Emotion 1,Emotion 2,output,flag,username,timestamp
+Батпақ соры шабындыққа және жыл бойғы жайылымға пайдаланылады.,50,20,Marzhan,angry,neutral,flagged/output/2e9a3b60dc40f07d4db8/audio.wav,,,2024-03-26 16:38:18.508507
+Батпақ соры шабындыққа және жыл бойғы жайылымға пайдаланылады.,50,300,Marzhan,happy,neutral,flagged/output/85c2e39535a1879bccc5/audio.wav,,,2024-03-29 22:08:19.838874

flagged/output/2e9a3b60dc40f07d4db8/audio.wav ADDED Viewed

Binary file (171 kB). View file

flagged/output/85c2e39535a1879bccc5/audio.wav ADDED Viewed

Binary file (173 kB). View file