Spaces:
Running
Running
Flush the KV cache before each generation.
Browse files- app.py +20 -20
- audiocraft.py +4 -2
app.py
CHANGED
|
@@ -145,14 +145,14 @@ def audionar_tts(text=None,
|
|
| 145 |
0.49 * x + 0.51 * background_audio,
|
| 146 |
0.51 * background_audio + 0.49 * x
|
| 147 |
], 0)
|
| 148 |
-
|
| 149 |
else:
|
| 150 |
-
|
| 151 |
-
final_audio = x
|
| 152 |
|
| 153 |
|
| 154 |
wavfile = '_vits_.wav'
|
| 155 |
-
audiofile.write(wavfile, final_audio, 16000)
|
| 156 |
return wavfile # 2x file for [audio out & state to pass to the Emotion reco tab]
|
| 157 |
|
| 158 |
|
|
@@ -166,23 +166,23 @@ def audionar_tts(text=None,
|
|
| 166 |
VOICES = [
|
| 167 |
'jv_ID_google-gmu_04982.wav',
|
| 168 |
'en_US_vctk_p303.wav',
|
| 169 |
-
'en_US_vctk_p306.wav',
|
| 170 |
'en_US_vctk_p318.wav',
|
| 171 |
'en_US_vctk_p269.wav',
|
| 172 |
'en_US_vctk_p316.wav',
|
| 173 |
'en_US_vctk_p362.wav', # cls
|
| 174 |
'fr_FR_tom.wav',
|
| 175 |
-
'bn_multi_5958.wav',
|
| 176 |
'en_US_vctk_p287.wav',
|
| 177 |
'en_US_vctk_p260.wav',
|
| 178 |
-
'en_US_cmu_arctic_fem.wav',
|
| 179 |
'en_US_cmu_arctic_rms.wav',
|
| 180 |
'fr_FR_m-ailabs_nadine_eckert_boulet.wav',
|
| 181 |
'en_US_vctk_p237.wav',
|
| 182 |
'en_US_vctk_p317.wav',
|
| 183 |
'tn_ZA_google-nwu_0378.wav',
|
| 184 |
'nl_pmk.wav',
|
| 185 |
-
'tn_ZA_google-nwu_3342.wav',
|
| 186 |
'ne_NP_ne-google_3997.wav',
|
| 187 |
'tn_ZA_google-nwu_8914.wav',
|
| 188 |
'en_US_vctk_p238.wav',
|
|
@@ -192,13 +192,13 @@ VOICES = [
|
|
| 192 |
'en_US_vctk_p326.wav',
|
| 193 |
'en_US_vctk_p264.wav',
|
| 194 |
'en_US_vctk_p295.wav',
|
| 195 |
-
'en_US_vctk_p294.wav',
|
| 196 |
'en_US_vctk_p330.wav',
|
| 197 |
-
'gu_IN_cmu-indic_cmu_indic_guj_ad.wav',
|
| 198 |
-
'jv_ID_google-gmu_05219.wav',
|
| 199 |
'en_US_vctk_p284.wav',
|
| 200 |
-
'en_US_m-ailabs_mary_ann.wav',
|
| 201 |
-
'bn_multi_01701.wav',
|
| 202 |
'en_US_vctk_p262.wav',
|
| 203 |
'en_US_vctk_p243.wav',
|
| 204 |
'en_US_vctk_p278.wav',
|
|
@@ -216,14 +216,14 @@ VOICES = [
|
|
| 216 |
'en_US_vctk_p230.wav',
|
| 217 |
'af_ZA_google-nwu_7214.wav',
|
| 218 |
'nl_nathalie.wav',
|
| 219 |
-
'en_US_cmu_arctic_lnh.wav',
|
| 220 |
'tn_ZA_google-nwu_6459.wav',
|
| 221 |
-
'tn_ZA_google-nwu_6206.wav',
|
| 222 |
'en_US_vctk_p323.wav',
|
| 223 |
'en_US_m-ailabs_judy_bieber.wav',
|
| 224 |
'en_US_vctk_p261.wav',
|
| 225 |
'fa_haaniye.wav',
|
| 226 |
-
# 'en_US_vctk_p339.wav',
|
| 227 |
'tn_ZA_google-nwu_7896.wav',
|
| 228 |
'en_US_vctk_p258.wav',
|
| 229 |
'tn_ZA_google-nwu_7674.wav',
|
|
@@ -246,14 +246,14 @@ VOICES = [
|
|
| 246 |
'tn_ZA_google-nwu_2839.wav',
|
| 247 |
'af_ZA_google-nwu_7130.wav',
|
| 248 |
'ne_NP_ne-google_2139.wav',
|
| 249 |
-
'jv_ID_google-gmu_04715.wav',
|
| 250 |
'en_US_vctk_p273.wav'
|
| 251 |
]
|
| 252 |
VOICES = [t[:-4] for t in VOICES] # crop .wav for visuals in gr.DropDown
|
| 253 |
|
| 254 |
_tts = StyleTTS2().to('cpu')
|
| 255 |
|
| 256 |
-
with gr.Blocks(
|
| 257 |
with gr.Row():
|
| 258 |
text_input = gr.Textbox(
|
| 259 |
label="Type text for TTS:",
|
|
@@ -272,7 +272,7 @@ with gr.Blocks(theme='huggingface') as demo:
|
|
| 272 |
label="AudioGen Txt"
|
| 273 |
)
|
| 274 |
kv_input = gr.Number(
|
| 275 |
-
label="
|
| 276 |
value=24,
|
| 277 |
)
|
| 278 |
generate_button = gr.Button("Generate Audio", variant="primary")
|
|
@@ -284,4 +284,4 @@ with gr.Blocks(theme='huggingface') as demo:
|
|
| 284 |
inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
|
| 285 |
outputs=[output_audio]
|
| 286 |
)
|
| 287 |
-
demo.launch(debug=True)
|
|
|
|
| 145 |
0.49 * x + 0.51 * background_audio,
|
| 146 |
0.51 * background_audio + 0.49 * x
|
| 147 |
], 0)
|
| 148 |
+
|
| 149 |
else:
|
| 150 |
+
|
| 151 |
+
final_audio = x
|
| 152 |
|
| 153 |
|
| 154 |
wavfile = '_vits_.wav'
|
| 155 |
+
audiofile.write(wavfile, final_audio, 16000)
|
| 156 |
return wavfile # 2x file for [audio out & state to pass to the Emotion reco tab]
|
| 157 |
|
| 158 |
|
|
|
|
| 166 |
VOICES = [
|
| 167 |
'jv_ID_google-gmu_04982.wav',
|
| 168 |
'en_US_vctk_p303.wav',
|
| 169 |
+
'en_US_vctk_p306.wav',
|
| 170 |
'en_US_vctk_p318.wav',
|
| 171 |
'en_US_vctk_p269.wav',
|
| 172 |
'en_US_vctk_p316.wav',
|
| 173 |
'en_US_vctk_p362.wav', # cls
|
| 174 |
'fr_FR_tom.wav',
|
| 175 |
+
'bn_multi_5958.wav',
|
| 176 |
'en_US_vctk_p287.wav',
|
| 177 |
'en_US_vctk_p260.wav',
|
| 178 |
+
'en_US_cmu_arctic_fem.wav',
|
| 179 |
'en_US_cmu_arctic_rms.wav',
|
| 180 |
'fr_FR_m-ailabs_nadine_eckert_boulet.wav',
|
| 181 |
'en_US_vctk_p237.wav',
|
| 182 |
'en_US_vctk_p317.wav',
|
| 183 |
'tn_ZA_google-nwu_0378.wav',
|
| 184 |
'nl_pmk.wav',
|
| 185 |
+
'tn_ZA_google-nwu_3342.wav',
|
| 186 |
'ne_NP_ne-google_3997.wav',
|
| 187 |
'tn_ZA_google-nwu_8914.wav',
|
| 188 |
'en_US_vctk_p238.wav',
|
|
|
|
| 192 |
'en_US_vctk_p326.wav',
|
| 193 |
'en_US_vctk_p264.wav',
|
| 194 |
'en_US_vctk_p295.wav',
|
| 195 |
+
'en_US_vctk_p294.wav',
|
| 196 |
'en_US_vctk_p330.wav',
|
| 197 |
+
'gu_IN_cmu-indic_cmu_indic_guj_ad.wav',
|
| 198 |
+
'jv_ID_google-gmu_05219.wav',
|
| 199 |
'en_US_vctk_p284.wav',
|
| 200 |
+
'en_US_m-ailabs_mary_ann.wav',
|
| 201 |
+
'bn_multi_01701.wav',
|
| 202 |
'en_US_vctk_p262.wav',
|
| 203 |
'en_US_vctk_p243.wav',
|
| 204 |
'en_US_vctk_p278.wav',
|
|
|
|
| 216 |
'en_US_vctk_p230.wav',
|
| 217 |
'af_ZA_google-nwu_7214.wav',
|
| 218 |
'nl_nathalie.wav',
|
| 219 |
+
'en_US_cmu_arctic_lnh.wav',
|
| 220 |
'tn_ZA_google-nwu_6459.wav',
|
| 221 |
+
'tn_ZA_google-nwu_6206.wav',
|
| 222 |
'en_US_vctk_p323.wav',
|
| 223 |
'en_US_m-ailabs_judy_bieber.wav',
|
| 224 |
'en_US_vctk_p261.wav',
|
| 225 |
'fa_haaniye.wav',
|
| 226 |
+
# 'en_US_vctk_p339.wav',
|
| 227 |
'tn_ZA_google-nwu_7896.wav',
|
| 228 |
'en_US_vctk_p258.wav',
|
| 229 |
'tn_ZA_google-nwu_7674.wav',
|
|
|
|
| 246 |
'tn_ZA_google-nwu_2839.wav',
|
| 247 |
'af_ZA_google-nwu_7130.wav',
|
| 248 |
'ne_NP_ne-google_2139.wav',
|
| 249 |
+
'jv_ID_google-gmu_04715.wav',
|
| 250 |
'en_US_vctk_p273.wav'
|
| 251 |
]
|
| 252 |
VOICES = [t[:-4] for t in VOICES] # crop .wav for visuals in gr.DropDown
|
| 253 |
|
| 254 |
_tts = StyleTTS2().to('cpu')
|
| 255 |
|
| 256 |
+
with gr.Blocks() as demo:
|
| 257 |
with gr.Row():
|
| 258 |
text_input = gr.Textbox(
|
| 259 |
label="Type text for TTS:",
|
|
|
|
| 272 |
label="AudioGen Txt"
|
| 273 |
)
|
| 274 |
kv_input = gr.Number(
|
| 275 |
+
label="Tokens",
|
| 276 |
value=24,
|
| 277 |
)
|
| 278 |
generate_button = gr.Button("Generate Audio", variant="primary")
|
|
|
|
| 284 |
inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
|
| 285 |
outputs=[output_audio]
|
| 286 |
)
|
| 287 |
+
demo.launch(debug=True)
|
audiocraft.py
CHANGED
|
@@ -66,6 +66,7 @@ class AudioGen(torch.nn.Module):
|
|
| 66 |
):
|
| 67 |
torch.manual_seed(42) # https://github.com/facebookresearch/audiocraft/issues/111#issuecomment-1614732858
|
| 68 |
n_draw = int(duration * 50 / (max_tokens * N_REPEAT)) + 1
|
|
|
|
| 69 |
with torch.autocast(device_type='cpu', dtype=torch.bfloat16):
|
| 70 |
gen_tokens = self.lm.generate(
|
| 71 |
text_condition=[prompt] * N_REPEAT + [''] * N_REPEAT,#['dogs', 'dogs...!', '', '']
|
|
@@ -476,6 +477,7 @@ class LMModel(nn.Module):
|
|
| 476 |
max_tokens=None,
|
| 477 |
text_condition=None
|
| 478 |
):
|
|
|
|
| 479 |
x = self.t5(text_condition)
|
| 480 |
bs = x.shape[0] // 2 # has null conditions - bs*2*N_REPEAT applys in builders.py
|
| 481 |
self._scale = .3 * torch.rand(1, 1, self.n_draw, 1, device=x.device) + 1.94
|
|
@@ -549,7 +551,7 @@ class LMModel(nn.Module):
|
|
| 549 |
out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens)
|
| 550 |
|
| 551 |
# flush for next API call
|
| 552 |
-
self.transformer._flush()
|
| 553 |
|
| 554 |
return out_codes # SKIP THE 4 fill 2048
|
| 555 |
|
|
@@ -727,5 +729,5 @@ if __name__ == '__main__':
|
|
| 727 |
|
| 728 |
import audiofile # pip uninstall flash-attn
|
| 729 |
model = AudioGen().to('cpu')
|
| 730 |
-
x = model.generate(prompt='swims in lake frogs', duration=
|
| 731 |
audiofile.write('_sound_.wav', x, 16000)
|
|
|
|
| 66 |
):
|
| 67 |
torch.manual_seed(42) # https://github.com/facebookresearch/audiocraft/issues/111#issuecomment-1614732858
|
| 68 |
n_draw = int(duration * 50 / (max_tokens * N_REPEAT)) + 1
|
| 69 |
+
print(f'{n_draw=} {duration=}seconds < {prompt=}')
|
| 70 |
with torch.autocast(device_type='cpu', dtype=torch.bfloat16):
|
| 71 |
gen_tokens = self.lm.generate(
|
| 72 |
text_condition=[prompt] * N_REPEAT + [''] * N_REPEAT,#['dogs', 'dogs...!', '', '']
|
|
|
|
| 477 |
max_tokens=None,
|
| 478 |
text_condition=None
|
| 479 |
):
|
| 480 |
+
self.transformer._flush() # perhaps long kv cache has been filled on previous call for unrelated sounds
|
| 481 |
x = self.t5(text_condition)
|
| 482 |
bs = x.shape[0] // 2 # has null conditions - bs*2*N_REPEAT applys in builders.py
|
| 483 |
self._scale = .3 * torch.rand(1, 1, self.n_draw, 1, device=x.device) + 1.94
|
|
|
|
| 551 |
out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens)
|
| 552 |
|
| 553 |
# flush for next API call
|
| 554 |
+
#self.transformer._flush() # Not reached if the gradio run is cancelled - flush at the beginning of every new generation instead
|
| 555 |
|
| 556 |
return out_codes # SKIP THE 4 fill 2048
|
| 557 |
|
|
|
|
| 729 |
|
| 730 |
import audiofile # pip uninstall flash-attn
|
| 731 |
model = AudioGen().to('cpu')
|
| 732 |
+
x = model.generate(prompt='swims in lake frogs', duration=6000.4, max_tokens=24).cpu().numpy()
|
| 733 |
audiofile.write('_sound_.wav', x, 16000)
|