Spaces:

Dionyssos
/

speech-analysis2

Running

App Files Files Community

Dionyssos committed on Sep 22

Commit

8f7e85e

1 Parent(s): a7e2983

kv flush before.

Browse files

Files changed (2) hide show

app.py +20 -20
audiocraft.py +4 -2

app.py CHANGED Viewed

@@ -145,14 +145,14 @@ def audionar_tts(text=None,
             0.49 * x + 0.51 * background_audio,
             0.51 * background_audio + 0.49 * x
         ], 0)
     else:
-        final_audio = x
     wavfile = '_vits_.wav'
-    audiofile.write(wavfile, final_audio, 16000)
     return wavfile  # 2x file for [audio out & state to pass to the Emotion reco tAB]
@@ -166,23 +166,23 @@ def audionar_tts(text=None,
 VOICES = [
     'jv_ID_google-gmu_04982.wav',
     'en_US_vctk_p303.wav',
-    'en_US_vctk_p306.wav',
     'en_US_vctk_p318.wav',
     'en_US_vctk_p269.wav',
     'en_US_vctk_p316.wav',
     'en_US_vctk_p362.wav', # cls
     'fr_FR_tom.wav',
-    'bn_multi_5958.wav',
     'en_US_vctk_p287.wav',
     'en_US_vctk_p260.wav',
-    'en_US_cmu_arctic_fem.wav',
     'en_US_cmu_arctic_rms.wav',
     'fr_FR_m-ailabs_nadine_eckert_boulet.wav',
     'en_US_vctk_p237.wav',
     'en_US_vctk_p317.wav',
     'tn_ZA_google-nwu_0378.wav',
     'nl_pmk.wav',
-    'tn_ZA_google-nwu_3342.wav',
     'ne_NP_ne-google_3997.wav',
     'tn_ZA_google-nwu_8914.wav',
     'en_US_vctk_p238.wav',
@@ -192,13 +192,13 @@ VOICES = [
     'en_US_vctk_p326.wav',
     'en_US_vctk_p264.wav',
     'en_US_vctk_p295.wav',
-    'en_US_vctk_p294.wav',
     'en_US_vctk_p330.wav',
-    'gu_IN_cmu-indic_cmu_indic_guj_ad.wav',
-    'jv_ID_google-gmu_05219.wav',
     'en_US_vctk_p284.wav',
-    'en_US_m-ailabs_mary_ann.wav',
-    'bn_multi_01701.wav',
     'en_US_vctk_p262.wav',
     'en_US_vctk_p243.wav',
     'en_US_vctk_p278.wav',
@@ -216,14 +216,14 @@ VOICES = [
     'en_US_vctk_p230.wav',
     'af_ZA_google-nwu_7214.wav',
     'nl_nathalie.wav',
-    'en_US_cmu_arctic_lnh.wav',
     'tn_ZA_google-nwu_6459.wav',
-    'tn_ZA_google-nwu_6206.wav',
     'en_US_vctk_p323.wav',
     'en_US_m-ailabs_judy_bieber.wav',
     'en_US_vctk_p261.wav',
     'fa_haaniye.wav',
-    # 'en_US_vctk_p339.wav',
     'tn_ZA_google-nwu_7896.wav',
     'en_US_vctk_p258.wav',
     'tn_ZA_google-nwu_7674.wav',
@@ -246,14 +246,14 @@ VOICES = [
     'tn_ZA_google-nwu_2839.wav',
     'af_ZA_google-nwu_7130.wav',
     'ne_NP_ne-google_2139.wav',
-    'jv_ID_google-gmu_04715.wav',
     'en_US_vctk_p273.wav'
     ]
 VOICES = [t[:-4] for t in VOICES]  # crop .wav for visuals in gr.DropDown
 _tts = StyleTTS2().to('cpu')
-with gr.Blocks(theme='huggingface') as demo:
     with gr.Row():
         text_input = gr.Textbox(
             label="Type text for TTS:",
@@ -272,7 +272,7 @@ with gr.Blocks(theme='huggingface') as demo:
             label="AudioGen Txt"
         )
         kv_input = gr.Number(
-            label="Num Tokens",
             value=24,
         )
         generate_button = gr.Button("Generate Audio", variant="primary")
@@ -284,4 +284,4 @@ with gr.Blocks(theme='huggingface') as demo:
         inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
         outputs=[output_audio]
     )
-demo.launch(debug=True)

             0.49 * x + 0.51 * background_audio,
             0.51 * background_audio + 0.49 * x
         ], 0)
     else:
+        final_audio = x
     wavfile = '_vits_.wav'
+    audiofile.write(wavfile, final_audio, 16000)
     return wavfile  # 2x file for [audio out & state to pass to the Emotion reco tAB]
 VOICES = [
     'jv_ID_google-gmu_04982.wav',
     'en_US_vctk_p303.wav',
+    'en_US_vctk_p306.wav',
     'en_US_vctk_p318.wav',
     'en_US_vctk_p269.wav',
     'en_US_vctk_p316.wav',
     'en_US_vctk_p362.wav', # cls
     'fr_FR_tom.wav',
+    'bn_multi_5958.wav',
     'en_US_vctk_p287.wav',
     'en_US_vctk_p260.wav',
+    'en_US_cmu_arctic_fem.wav',
     'en_US_cmu_arctic_rms.wav',
     'fr_FR_m-ailabs_nadine_eckert_boulet.wav',
     'en_US_vctk_p237.wav',
     'en_US_vctk_p317.wav',
     'tn_ZA_google-nwu_0378.wav',
     'nl_pmk.wav',
+    'tn_ZA_google-nwu_3342.wav',
     'ne_NP_ne-google_3997.wav',
     'tn_ZA_google-nwu_8914.wav',
     'en_US_vctk_p238.wav',
     'en_US_vctk_p326.wav',
     'en_US_vctk_p264.wav',
     'en_US_vctk_p295.wav',
+    'en_US_vctk_p294.wav',
     'en_US_vctk_p330.wav',
+    'gu_IN_cmu-indic_cmu_indic_guj_ad.wav',
+    'jv_ID_google-gmu_05219.wav',
     'en_US_vctk_p284.wav',
+    'en_US_m-ailabs_mary_ann.wav',
+    'bn_multi_01701.wav',
     'en_US_vctk_p262.wav',
     'en_US_vctk_p243.wav',
     'en_US_vctk_p278.wav',
     'en_US_vctk_p230.wav',
     'af_ZA_google-nwu_7214.wav',
     'nl_nathalie.wav',
+    'en_US_cmu_arctic_lnh.wav',
     'tn_ZA_google-nwu_6459.wav',
+    'tn_ZA_google-nwu_6206.wav',
     'en_US_vctk_p323.wav',
     'en_US_m-ailabs_judy_bieber.wav',
     'en_US_vctk_p261.wav',
     'fa_haaniye.wav',
+    # 'en_US_vctk_p339.wav',
     'tn_ZA_google-nwu_7896.wav',
     'en_US_vctk_p258.wav',
     'tn_ZA_google-nwu_7674.wav',
     'tn_ZA_google-nwu_2839.wav',
     'af_ZA_google-nwu_7130.wav',
     'ne_NP_ne-google_2139.wav',
+    'jv_ID_google-gmu_04715.wav',
     'en_US_vctk_p273.wav'
     ]
 VOICES = [t[:-4] for t in VOICES]  # crop .wav for visuals in gr.DropDown
 _tts = StyleTTS2().to('cpu')
+with gr.Blocks() as demo:
     with gr.Row():
         text_input = gr.Textbox(
             label="Type text for TTS:",
             label="AudioGen Txt"
         )
         kv_input = gr.Number(
+            label="Tokens",
             value=24,
         )
         generate_button = gr.Button("Generate Audio", variant="primary")
         inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
         outputs=[output_audio]
     )
+demo.launch(debug=True)

audiocraft.py CHANGED Viewed

@@ -66,6 +66,7 @@ class AudioGen(torch.nn.Module):
                  ):
         torch.manual_seed(42)  # https://github.com/facebookresearch/audiocraft/issues/111#issuecomment-1614732858
         n_draw = int(duration * 50 / (max_tokens * N_REPEAT)) + 1
         with torch.autocast(device_type='cpu', dtype=torch.bfloat16):
             gen_tokens = self.lm.generate(
                 text_condition=[prompt] * N_REPEAT  + [''] * N_REPEAT,#['dogs', 'dogs...!', '', '']
@@ -476,6 +477,7 @@ class LMModel(nn.Module):
                  max_tokens=None,
                  text_condition=None
                  ):
         x = self.t5(text_condition)
         bs = x.shape[0] // 2  # has null conditions - bs*2*N_REPEAT applys in builders.py
         self._scale = .3 * torch.rand(1, 1, self.n_draw, 1, device=x.device) + 1.94
@@ -549,7 +551,7 @@ class LMModel(nn.Module):
         out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens)
         # flush for next API call
-        self.transformer._flush()
         return out_codes  # SKIP THE 4 fill 2048
@@ -727,5 +729,5 @@ if __name__ == '__main__':
     import audiofile  # pip uninstall flash-attn
     model = AudioGen().to('cpu')
-    x = model.generate(prompt='swims in lake frogs', duration=56.4, max_tokens=24).cpu().numpy()
     audiofile.write('_sound_.wav', x, 16000)

                  ):
         torch.manual_seed(42)  # https://github.com/facebookresearch/audiocraft/issues/111#issuecomment-1614732858
         n_draw = int(duration * 50 / (max_tokens * N_REPEAT)) + 1
+        print(f'{n_draw=}            {duration=}seconds < {prompt=}')
         with torch.autocast(device_type='cpu', dtype=torch.bfloat16):
             gen_tokens = self.lm.generate(
                 text_condition=[prompt] * N_REPEAT  + [''] * N_REPEAT,#['dogs', 'dogs...!', '', '']
                  max_tokens=None,
                  text_condition=None
                  ):
+        self.transformer._flush()  # perhaps long kv cache has been filled on previous call for unrelated sounds
         x = self.t5(text_condition)
         bs = x.shape[0] // 2  # has null conditions - bs*2*N_REPEAT applys in builders.py
         self._scale = .3 * torch.rand(1, 1, self.n_draw, 1, device=x.device) + 1.94
         out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens)
         # flush for next API call
+        #self.transformer._flush()  # Not reached if the gradio run is canceled - flush at the beginning of every new generation instead
         return out_codes  # SKIP THE 4 fill 2048
     import audiofile  # pip uninstall flash-attn
     model = AudioGen().to('cpu')
+    x = model.generate(prompt='swims in lake frogs', duration=6000.4, max_tokens=24).cpu().numpy()
     audiofile.write('_sound_.wav', x, 16000)