Dionyssos committed on
Commit
18fcbae
·
1 Parent(s): 2f811de
Files changed (2) hide show
  1. app.py +3 -3
  2. audiocraft.py +3 -4
app.py CHANGED
@@ -119,7 +119,7 @@ def audionar_tts(text=None,
119
  background_audio = audiogen.generate(
120
  soundscape,
121
  duration=target_duration,
122
- max_tokens=min( max(4, int(max_tokens)), 500 ) # limit sounds tokens (clone beyond)
123
  ).numpy()
124
 
125
  # PAD
@@ -263,13 +263,13 @@ with gr.Blocks() as demo:
263
  )
264
  choice_dropdown = gr.Dropdown(
265
  choices=VOICES + language_names,
266
- label="Vox:",
267
  value=VOICES[0]
268
  )
269
  soundscape_input = gr.Textbox(
270
  lines=1,
271
  value="swims in lake frogs",
272
- label="AudioGen Txt"
273
  )
274
  kv_input = gr.Number(
275
  label="Tokens",
 
119
  background_audio = audiogen.generate(
120
  soundscape,
121
  duration=target_duration,
122
+ max_tokens=min( max(7, int(max_tokens)), 288 ) # limit sounds tokens (clone beyond)
123
  ).numpy()
124
 
125
  # PAD
 
263
  )
264
  choice_dropdown = gr.Dropdown(
265
  choices=VOICES + language_names,
266
+ label="Vox :",
267
  value=VOICES[0]
268
  )
269
  soundscape_input = gr.Textbox(
270
  lines=1,
271
  value="swims in lake frogs",
272
+ label="AudioGen Txt:"
273
  )
274
  kv_input = gr.Number(
275
  label="Tokens",
audiocraft.py CHANGED
@@ -76,9 +76,8 @@ class AudioGen(torch.nn.Module):
76
  x = []
77
 
78
 
79
- for i in range(7, gen_tokens.shape[2], self._chunk_len): # min soundscape 2s assures 10 tokens
80
-
81
- decoded_chunk = self.compression_model.decode(gen_tokens[:, :, i-7:i+self._chunk_len])
82
 
83
  x.append(decoded_chunk)
84
 
@@ -729,5 +728,5 @@ if __name__ == '__main__':
729
 
730
  import audiofile # pip uninstall flash-attn
731
  model = AudioGen().to('cpu')
732
- x = model.generate(prompt='swims in lake frogs', duration=6000.4, max_tokens=24).cpu().numpy()
733
  audiofile.write('_sound_.wav', x, 16000)
 
76
  x = []
77
 
78
 
79
+ for i in range(6, gen_tokens.shape[2], self._chunk_len): # has to run at least once
80
+ decoded_chunk = self.compression_model.decode(gen_tokens[:, :, i-6:i+self._chunk_len]) # see 6 token twice
 
81
 
82
  x.append(decoded_chunk)
83
 
 
728
 
729
  import audiofile # pip uninstall flash-attn
730
  model = AudioGen().to('cpu')
731
+ x = model.generate(prompt='swims in lake frogs', duration=6000.4, max_tokens=7).cpu().numpy() # assure 7 for the vocoder loop to run at least once
732
  audiofile.write('_sound_.wav', x, 16000)