Dionyssos committed on
Commit
8f7e85e
·
1 Parent(s): a7e2983

Flush the KV cache before each generation (instead of after).

Browse files
Files changed (2) hide show
  1. app.py +20 -20
  2. audiocraft.py +4 -2
app.py CHANGED
@@ -145,14 +145,14 @@ def audionar_tts(text=None,
145
  0.49 * x + 0.51 * background_audio,
146
  0.51 * background_audio + 0.49 * x
147
  ], 0)
148
-
149
  else:
150
-
151
- final_audio = x
152
 
153
 
154
  wavfile = '_vits_.wav'
155
- audiofile.write(wavfile, final_audio, 16000)
156
  return wavfile # 2x file for [audio out & state to pass to the Emotion reco tAB]
157
 
158
 
@@ -166,23 +166,23 @@ def audionar_tts(text=None,
166
  VOICES = [
167
  'jv_ID_google-gmu_04982.wav',
168
  'en_US_vctk_p303.wav',
169
- 'en_US_vctk_p306.wav',
170
  'en_US_vctk_p318.wav',
171
  'en_US_vctk_p269.wav',
172
  'en_US_vctk_p316.wav',
173
  'en_US_vctk_p362.wav', # cls
174
  'fr_FR_tom.wav',
175
- 'bn_multi_5958.wav',
176
  'en_US_vctk_p287.wav',
177
  'en_US_vctk_p260.wav',
178
- 'en_US_cmu_arctic_fem.wav',
179
  'en_US_cmu_arctic_rms.wav',
180
  'fr_FR_m-ailabs_nadine_eckert_boulet.wav',
181
  'en_US_vctk_p237.wav',
182
  'en_US_vctk_p317.wav',
183
  'tn_ZA_google-nwu_0378.wav',
184
  'nl_pmk.wav',
185
- 'tn_ZA_google-nwu_3342.wav',
186
  'ne_NP_ne-google_3997.wav',
187
  'tn_ZA_google-nwu_8914.wav',
188
  'en_US_vctk_p238.wav',
@@ -192,13 +192,13 @@ VOICES = [
192
  'en_US_vctk_p326.wav',
193
  'en_US_vctk_p264.wav',
194
  'en_US_vctk_p295.wav',
195
- 'en_US_vctk_p294.wav',
196
  'en_US_vctk_p330.wav',
197
- 'gu_IN_cmu-indic_cmu_indic_guj_ad.wav',
198
- 'jv_ID_google-gmu_05219.wav',
199
  'en_US_vctk_p284.wav',
200
- 'en_US_m-ailabs_mary_ann.wav',
201
- 'bn_multi_01701.wav',
202
  'en_US_vctk_p262.wav',
203
  'en_US_vctk_p243.wav',
204
  'en_US_vctk_p278.wav',
@@ -216,14 +216,14 @@ VOICES = [
216
  'en_US_vctk_p230.wav',
217
  'af_ZA_google-nwu_7214.wav',
218
  'nl_nathalie.wav',
219
- 'en_US_cmu_arctic_lnh.wav',
220
  'tn_ZA_google-nwu_6459.wav',
221
- 'tn_ZA_google-nwu_6206.wav',
222
  'en_US_vctk_p323.wav',
223
  'en_US_m-ailabs_judy_bieber.wav',
224
  'en_US_vctk_p261.wav',
225
  'fa_haaniye.wav',
226
- # 'en_US_vctk_p339.wav',
227
  'tn_ZA_google-nwu_7896.wav',
228
  'en_US_vctk_p258.wav',
229
  'tn_ZA_google-nwu_7674.wav',
@@ -246,14 +246,14 @@ VOICES = [
246
  'tn_ZA_google-nwu_2839.wav',
247
  'af_ZA_google-nwu_7130.wav',
248
  'ne_NP_ne-google_2139.wav',
249
- 'jv_ID_google-gmu_04715.wav',
250
  'en_US_vctk_p273.wav'
251
  ]
252
  VOICES = [t[:-4] for t in VOICES] # crop .wav for visuals in gr.DropDown
253
 
254
  _tts = StyleTTS2().to('cpu')
255
 
256
- with gr.Blocks(theme='huggingface') as demo:
257
  with gr.Row():
258
  text_input = gr.Textbox(
259
  label="Type text for TTS:",
@@ -272,7 +272,7 @@ with gr.Blocks(theme='huggingface') as demo:
272
  label="AudioGen Txt"
273
  )
274
  kv_input = gr.Number(
275
- label="Num Tokens",
276
  value=24,
277
  )
278
  generate_button = gr.Button("Generate Audio", variant="primary")
@@ -284,4 +284,4 @@ with gr.Blocks(theme='huggingface') as demo:
284
  inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
285
  outputs=[output_audio]
286
  )
287
- demo.launch(debug=True)
 
145
  0.49 * x + 0.51 * background_audio,
146
  0.51 * background_audio + 0.49 * x
147
  ], 0)
148
+
149
  else:
150
+
151
+ final_audio = x
152
 
153
 
154
  wavfile = '_vits_.wav'
155
+ audiofile.write(wavfile, final_audio, 16000)
156
  return wavfile # 2x file for [audio out & state to pass to the Emotion reco tAB]
157
 
158
 
 
166
  VOICES = [
167
  'jv_ID_google-gmu_04982.wav',
168
  'en_US_vctk_p303.wav',
169
+ 'en_US_vctk_p306.wav',
170
  'en_US_vctk_p318.wav',
171
  'en_US_vctk_p269.wav',
172
  'en_US_vctk_p316.wav',
173
  'en_US_vctk_p362.wav', # cls
174
  'fr_FR_tom.wav',
175
+ 'bn_multi_5958.wav',
176
  'en_US_vctk_p287.wav',
177
  'en_US_vctk_p260.wav',
178
+ 'en_US_cmu_arctic_fem.wav',
179
  'en_US_cmu_arctic_rms.wav',
180
  'fr_FR_m-ailabs_nadine_eckert_boulet.wav',
181
  'en_US_vctk_p237.wav',
182
  'en_US_vctk_p317.wav',
183
  'tn_ZA_google-nwu_0378.wav',
184
  'nl_pmk.wav',
185
+ 'tn_ZA_google-nwu_3342.wav',
186
  'ne_NP_ne-google_3997.wav',
187
  'tn_ZA_google-nwu_8914.wav',
188
  'en_US_vctk_p238.wav',
 
192
  'en_US_vctk_p326.wav',
193
  'en_US_vctk_p264.wav',
194
  'en_US_vctk_p295.wav',
195
+ 'en_US_vctk_p294.wav',
196
  'en_US_vctk_p330.wav',
197
+ 'gu_IN_cmu-indic_cmu_indic_guj_ad.wav',
198
+ 'jv_ID_google-gmu_05219.wav',
199
  'en_US_vctk_p284.wav',
200
+ 'en_US_m-ailabs_mary_ann.wav',
201
+ 'bn_multi_01701.wav',
202
  'en_US_vctk_p262.wav',
203
  'en_US_vctk_p243.wav',
204
  'en_US_vctk_p278.wav',
 
216
  'en_US_vctk_p230.wav',
217
  'af_ZA_google-nwu_7214.wav',
218
  'nl_nathalie.wav',
219
+ 'en_US_cmu_arctic_lnh.wav',
220
  'tn_ZA_google-nwu_6459.wav',
221
+ 'tn_ZA_google-nwu_6206.wav',
222
  'en_US_vctk_p323.wav',
223
  'en_US_m-ailabs_judy_bieber.wav',
224
  'en_US_vctk_p261.wav',
225
  'fa_haaniye.wav',
226
+ # 'en_US_vctk_p339.wav',
227
  'tn_ZA_google-nwu_7896.wav',
228
  'en_US_vctk_p258.wav',
229
  'tn_ZA_google-nwu_7674.wav',
 
246
  'tn_ZA_google-nwu_2839.wav',
247
  'af_ZA_google-nwu_7130.wav',
248
  'ne_NP_ne-google_2139.wav',
249
+ 'jv_ID_google-gmu_04715.wav',
250
  'en_US_vctk_p273.wav'
251
  ]
252
  VOICES = [t[:-4] for t in VOICES] # crop .wav for visuals in gr.DropDown
253
 
254
  _tts = StyleTTS2().to('cpu')
255
 
256
+ with gr.Blocks() as demo:
257
  with gr.Row():
258
  text_input = gr.Textbox(
259
  label="Type text for TTS:",
 
272
  label="AudioGen Txt"
273
  )
274
  kv_input = gr.Number(
275
+ label="Tokens",
276
  value=24,
277
  )
278
  generate_button = gr.Button("Generate Audio", variant="primary")
 
284
  inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
285
  outputs=[output_audio]
286
  )
287
+ demo.launch(debug=True)
audiocraft.py CHANGED
@@ -66,6 +66,7 @@ class AudioGen(torch.nn.Module):
66
  ):
67
  torch.manual_seed(42) # https://github.com/facebookresearch/audiocraft/issues/111#issuecomment-1614732858
68
  n_draw = int(duration * 50 / (max_tokens * N_REPEAT)) + 1
 
69
  with torch.autocast(device_type='cpu', dtype=torch.bfloat16):
70
  gen_tokens = self.lm.generate(
71
  text_condition=[prompt] * N_REPEAT + [''] * N_REPEAT,#['dogs', 'dogs...!', '', '']
@@ -476,6 +477,7 @@ class LMModel(nn.Module):
476
  max_tokens=None,
477
  text_condition=None
478
  ):
 
479
  x = self.t5(text_condition)
480
  bs = x.shape[0] // 2 # has null conditions - bs*2*N_REPEAT applys in builders.py
481
  self._scale = .3 * torch.rand(1, 1, self.n_draw, 1, device=x.device) + 1.94
@@ -549,7 +551,7 @@ class LMModel(nn.Module):
549
  out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens)
550
 
551
  # flush for next API call
552
- self.transformer._flush()
553
 
554
  return out_codes # SKIP THE 4 fill 2048
555
 
@@ -727,5 +729,5 @@ if __name__ == '__main__':
727
 
728
  import audiofile # pip uninstall flash-attn
729
  model = AudioGen().to('cpu')
730
- x = model.generate(prompt='swims in lake frogs', duration=56.4, max_tokens=24).cpu().numpy()
731
  audiofile.write('_sound_.wav', x, 16000)
 
66
  ):
67
  torch.manual_seed(42) # https://github.com/facebookresearch/audiocraft/issues/111#issuecomment-1614732858
68
  n_draw = int(duration * 50 / (max_tokens * N_REPEAT)) + 1
69
+ print(f'{n_draw=} {duration=}seconds < {prompt=}')
70
  with torch.autocast(device_type='cpu', dtype=torch.bfloat16):
71
  gen_tokens = self.lm.generate(
72
  text_condition=[prompt] * N_REPEAT + [''] * N_REPEAT,#['dogs', 'dogs...!', '', '']
 
477
  max_tokens=None,
478
  text_condition=None
479
  ):
480
+ self.transformer._flush() # perhaps long kv cache has been filled on previous call for unrelated sounds
481
  x = self.t5(text_condition)
482
  bs = x.shape[0] // 2 # has null conditions - bs*2*N_REPEAT applys in builders.py
483
  self._scale = .3 * torch.rand(1, 1, self.n_draw, 1, device=x.device) + 1.94
 
551
  out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens)
552
 
553
  # flush for next API call
554
+ #self.transformer._flush() # Not reached if the gradio run is cancelled? - flush at the beginning of every new generation instead
555
 
556
  return out_codes # SKIP THE 4 fill 2048
557
 
 
729
 
730
  import audiofile # pip uninstall flash-attn
731
  model = AudioGen().to('cpu')
732
+ x = model.generate(prompt='swims in lake frogs', duration=6000.4, max_tokens=24).cpu().numpy()
733
  audiofile.write('_sound_.wav', x, 16000)