Spaces:
Running
Running
fx 7 toks
Browse files- app.py +3 -3
- audiocraft.py +3 -4
app.py
CHANGED
|
@@ -119,7 +119,7 @@ def audionar_tts(text=None,
|
|
| 119 |
background_audio = audiogen.generate(
|
| 120 |
soundscape,
|
| 121 |
duration=target_duration,
|
| 122 |
-
max_tokens=min( max(
|
| 123 |
).numpy()
|
| 124 |
|
| 125 |
# PAD
|
|
@@ -263,13 +263,13 @@ with gr.Blocks() as demo:
|
|
| 263 |
)
|
| 264 |
choice_dropdown = gr.Dropdown(
|
| 265 |
choices=VOICES + language_names,
|
| 266 |
-
label="Vox:",
|
| 267 |
value=VOICES[0]
|
| 268 |
)
|
| 269 |
soundscape_input = gr.Textbox(
|
| 270 |
lines=1,
|
| 271 |
value="swims in lake frogs",
|
| 272 |
-
label="AudioGen Txt"
|
| 273 |
)
|
| 274 |
kv_input = gr.Number(
|
| 275 |
label="Tokens",
|
|
|
|
| 119 |
background_audio = audiogen.generate(
|
| 120 |
soundscape,
|
| 121 |
duration=target_duration,
|
| 122 |
+
max_tokens=min( max(7, int(max_tokens)), 288 ) # limit sounds tokens (clone beyond)
|
| 123 |
).numpy()
|
| 124 |
|
| 125 |
# PAD
|
|
|
|
| 263 |
)
|
| 264 |
choice_dropdown = gr.Dropdown(
|
| 265 |
choices=VOICES + language_names,
|
| 266 |
+
label="Vox :",
|
| 267 |
value=VOICES[0]
|
| 268 |
)
|
| 269 |
soundscape_input = gr.Textbox(
|
| 270 |
lines=1,
|
| 271 |
value="swims in lake frogs",
|
| 272 |
+
label="AudioGen Txt:"
|
| 273 |
)
|
| 274 |
kv_input = gr.Number(
|
| 275 |
label="Tokens",
|
audiocraft.py
CHANGED
|
@@ -76,9 +76,8 @@ class AudioGen(torch.nn.Module):
|
|
| 76 |
x = []
|
| 77 |
|
| 78 |
|
| 79 |
-
for i in range(
|
| 80 |
-
|
| 81 |
-
decoded_chunk = self.compression_model.decode(gen_tokens[:, :, i-7:i+self._chunk_len])
|
| 82 |
|
| 83 |
x.append(decoded_chunk)
|
| 84 |
|
|
@@ -729,5 +728,5 @@ if __name__ == '__main__':
|
|
| 729 |
|
| 730 |
import audiofile # pip uninstall flash-attn
|
| 731 |
model = AudioGen().to('cpu')
|
| 732 |
-
x = model.generate(prompt='swims in lake frogs', duration=6000.4, max_tokens=
|
| 733 |
audiofile.write('_sound_.wav', x, 16000)
|
|
|
|
| 76 |
x = []
|
| 77 |
|
| 78 |
|
| 79 |
+
for i in range(6, gen_tokens.shape[2], self._chunk_len): # has to run at least once
|
| 80 |
+
decoded_chunk = self.compression_model.decode(gen_tokens[:, :, i-6:i+self._chunk_len]) # see 6 token twice
|
|
|
|
| 81 |
|
| 82 |
x.append(decoded_chunk)
|
| 83 |
|
|
|
|
| 728 |
|
| 729 |
import audiofile # pip uninstall flash-attn
|
| 730 |
model = AudioGen().to('cpu')
|
| 731 |
+
x = model.generate(prompt='swims in lake frogs', duration=6000.4, max_tokens=7).cpu().numpy() # assure 7 for the vocoder loop to run at least once
|
| 732 |
audiofile.write('_sound_.wav', x, 16000)
|