Commit
·
9e9a056
1
Parent(s):
08f5766
- app.py +36 -28
- flagged/log.csv +3 -0
- flagged/output/2e9a3b60dc40f07d4db8/audio.wav +0 -0
- flagged/output/85c2e39535a1879bccc5/audio.wav +0 -0
app.py
CHANGED
|
@@ -65,35 +65,42 @@ vocoder.remove_weight_norm()
|
|
| 65 |
emotions = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
|
| 66 |
spekears = ['Madi', 'Marzhan', 'Akzhol']
|
| 67 |
|
| 68 |
-
def generate_audio(text, quantity, speaker, emotion_1, emotion_2):
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
sr = 22050
|
| 98 |
return (sr, audio)
|
| 99 |
|
|
@@ -102,6 +109,7 @@ demo = gr.Interface(
|
|
| 102 |
[
|
| 103 |
gr.Textbox(value='Батпақ соры шабындыққа және жыл бойғы жайылымға пайдаланылады.', label="Text you want to synthesize"),
|
| 104 |
gr.Slider(0, 100, value=50, step=10, label="Count", info="Choose between 0 and 100"),
|
|
|
|
| 105 |
gr.Dropdown(spekears, value=spekears[1], label="Narrator", info="Select a narrator."
|
| 106 |
),
|
| 107 |
gr.Dropdown(emotions, value=emotions[0], label="Emotion 1", info="Select first emotion"),
|
|
|
|
| 65 |
# Emotion labels in alphabetical order. generate_audio uses a label's index in
# this list as the integer emotion id it passes to the model.
emotions = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
|
| 66 |
# Narrator names offered in the UI. generate_audio uses a name's index in this
# list as the speaker id. NOTE: the identifier is a misspelling of "speakers"
# but is referenced elsewhere in the file, so it is left unchanged.
spekears = ['Madi', 'Marzhan', 'Akzhol']
|
| 67 |
|
| 68 |
+
def generate_audio(text, quantity, guid, speaker, emotion_1, emotion_2):
    """Synthesize speech for *text* as a weighted mixture of two emotions.

    The mel-spectrogram is decoded with classifier guidance starting at
    ``guid``. Whenever the decoded output contains NaN values the guidance is
    lowered by 50 and decoding is retried, until the output is NaN-free or
    the guidance would reach zero or below. The last decoded spectrogram is
    then passed through the vocoder.

    Args:
        text: Text to synthesize; passed to ``convert_text``.
        quantity: Weight of ``emotion_1`` in percent; divided by 100 and
            passed to the decoder as ``emo1_weight``.
        guid: Initial classifier-guidance value.
        speaker: Narrator name; must be an element of ``spekears``.
        emotion_1: First emotion label; must be an element of ``emotions``.
        emotion_2: Second emotion label; must be an element of ``emotions``.

    Returns:
        A ``(sample_rate, audio)`` tuple where ``sample_rate`` is 22050 and
        ``audio`` is an ``int16`` NumPy array.

    Raises:
        ValueError: If ``speaker``, ``emotion_1`` or ``emotion_2`` is not in
            its corresponding list (raised by ``list.index``).
    """
    # Everything that does not depend on the guidance value is built once,
    # outside the retry loop.
    x, x_lengths = convert_text(text)
    emo1 = torch.LongTensor([emotions.index(emotion_1)]).to(device)
    emo2 = torch.LongTensor([emotions.index(emotion_2)]).to(device)
    sid = torch.LongTensor([spekears.index(speaker)]).to(device)
    intensity = quantity / 100

    gui = guid
    while True:
        _, y_dec, _ = gradtts_uncond_model.classifier_guidance_decode_two_mixture(
            x, x_lengths,
            n_timesteps=100,
            temperature=2.0,
            stoc=args.stoc,
            spk=sid,
            emo1=emo1,
            emo2=emo2,
            emo1_weight=intensity,
            length_scale=1.,
            classifier_func=model.forward,
            guidance=gui,
            classifier_type=model.model_type
        )
        y_dec = y_dec.detach()
        if not torch.isnan(y_dec).any():
            break
        # Decoding produced NaNs: retry with weaker guidance.
        gui -= 50
        if gui <= 0:
            # Retries exhausted; fall through with the last (NaN) output.
            print('shabuya')
            break

    # Keep the spectrogram as a tensor on `device` instead of round-tripping
    # through NumPy: Tensor.numpy() only works for CPU tensors, so the old
    # conversion failed whenever `device` was not the CPU.
    # NOTE(review): assumes the vocoder lives on `device` -- confirm.
    mel = y_dec.squeeze().to(device).unsqueeze(0)

    # The vocoder is run once, on the final spectrogram only, and without
    # building an autograd graph.
    with torch.no_grad():
        y_g_hat = vocoder(mel)

    # Scale to the int16 range and clamp so that full-scale samples cannot
    # overflow when cast.
    audio = torch.clamp(y_g_hat.squeeze() * 32768.0, -32768.0, 32767.0)
    audio = audio.cpu().numpy().astype('int16')

    sr = 22050
    return (sr, audio)
|
| 106 |
|
|
|
|
| 109 |
[
|
| 110 |
gr.Textbox(value='Батпақ соры шабындыққа және жыл бойғы жайылымға пайдаланылады.', label="Text you want to synthesize"),
|
| 111 |
gr.Slider(0, 100, value=50, step=10, label="Count", info="Choose between 0 and 100"),
|
| 112 |
+
gr.Slider(0, 1000, value=100, step=10, label="Guidance", info="Choose between 0 and 1000"),
|
| 113 |
gr.Dropdown(spekears, value=spekears[1], label="Narrator", info="Select a narrator."
|
| 114 |
),
|
| 115 |
gr.Dropdown(emotions, value=emotions[0], label="Emotion 1", info="Select first emotion"),
|
flagged/log.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Text you want to synthesize,Count,Guidance,Narrator,Emotion 1,Emotion 2,output,flag,username,timestamp
|
| 2 |
+
Батпақ соры шабындыққа және жыл бойғы жайылымға пайдаланылады.,50,20,Marzhan,angry,neutral,flagged/output/2e9a3b60dc40f07d4db8/audio.wav,,,2024-03-26 16:38:18.508507
|
| 3 |
+
Батпақ соры шабындыққа және жыл бойғы жайылымға пайдаланылады.,50,300,Marzhan,happy,neutral,flagged/output/85c2e39535a1879bccc5/audio.wav,,,2024-03-29 22:08:19.838874
|
flagged/output/2e9a3b60dc40f07d4db8/audio.wav
ADDED
|
Binary file (171 kB). View file
|
|
|
flagged/output/85c2e39535a1879bccc5/audio.wav
ADDED
|
Binary file (173 kB). View file
|
|
|