Spaces:

Dionyssos
/

speech-analysis2

Running

Dionyssos committed on Sep 22, 2025

Commit

18fcbae

1 Parent(s): 2f811de

fx 7 toks

Files changed (2) hide show

app.py CHANGED Viewed

@@ -119,7 +119,7 @@ def audionar_tts(text=None,
         background_audio = audiogen.generate(
             soundscape,
             duration=target_duration,
-            max_tokens=min( max(4, int(max_tokens)), 500 )  # limit sounds tokens (clone beyond)
          ).numpy()
         # PAD
@@ -263,13 +263,13 @@ with gr.Blocks() as demo:
         )
         choice_dropdown = gr.Dropdown(
             choices=VOICES + language_names,
-            label="Vox:",
             value=VOICES[0]
         )
         soundscape_input = gr.Textbox(
             lines=1,
             value="swims in lake frogs",
-            label="AudioGen Txt"
         )
         kv_input = gr.Number(
             label="Tokens",

         background_audio = audiogen.generate(
             soundscape,
             duration=target_duration,
+            max_tokens=min( max(7, int(max_tokens)), 288 )  # limit sounds tokens (clone beyond)
          ).numpy()
         # PAD
         )
         choice_dropdown = gr.Dropdown(
             choices=VOICES + language_names,
+            label="Vox :",
             value=VOICES[0]
         )
         soundscape_input = gr.Textbox(
             lines=1,
             value="swims in lake frogs",
+            label="AudioGen Txt:"
         )
         kv_input = gr.Number(
             label="Tokens",

audiocraft.py CHANGED Viewed

@@ -76,9 +76,8 @@ class AudioGen(torch.nn.Module):
         x = []
-        for i in range(7, gen_tokens.shape[2], self._chunk_len):  # min soundscape 2s assures 10 tokens
-            decoded_chunk = self.compression_model.decode(gen_tokens[:, :, i-7:i+self._chunk_len])
             x.append(decoded_chunk)
@@ -729,5 +728,5 @@ if __name__ == '__main__':
     import audiofile  # pip uninstall flash-attn
     model = AudioGen().to('cpu')
-    x = model.generate(prompt='swims in lake frogs', duration=6000.4, max_tokens=24).cpu().numpy()
     audiofile.write('_sound_.wav', x, 16000)

         x = []
+        for i in range(6, gen_tokens.shape[2], self._chunk_len):  # has to run at least once
+            decoded_chunk = self.compression_model.decode(gen_tokens[:, :, i-6:i+self._chunk_len])  # see 6 token twice
             x.append(decoded_chunk)
     import audiofile  # pip uninstall flash-attn
     model = AudioGen().to('cpu')
+    x = model.generate(prompt='swims in lake frogs', duration=6000.4, max_tokens=7).cpu().numpy()  # assure 7 for the vocoder loop to run at least once
     audiofile.write('_sound_.wav', x, 16000)