cache
Browse files
- README.md +2 -2
- app.py +4 -6
- audiocraft.py +6 -5
README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
---
|
| 2 |
title: Audiogen
|
| 3 |
emoji: 🍍
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.41.1
|
| 8 |
app_file: app.py
|
|
|
|
| 1 |
---
|
| 2 |
title: Audiogen
|
| 3 |
emoji: 🍍
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.41.1
|
| 8 |
app_file: app.py
|
app.py
CHANGED
|
@@ -35,7 +35,7 @@ def audionar_tts(text=None,
|
|
| 35 |
lang='Romanian',
|
| 36 |
soundscape='frogs',
|
| 37 |
max_tokens=24,
|
| 38 |
-
cache_lim=
|
| 39 |
|
| 40 |
# https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
|
| 41 |
|
|
@@ -98,12 +98,11 @@ def audionar_tts(text=None,
|
|
| 98 |
speech_duration_secs = len(x) / 16000
|
| 99 |
target_duration = max(speech_duration_secs + 0.74, 2.0)
|
| 100 |
# Sink Attn
|
| 101 |
-
audiogen.cache_lim = min( max(0, int(cache_lim)), 2000)
|
| 102 |
-
|
| 103 |
background_audio = audiogen.generate(
|
| 104 |
soundscape[:64], # to have shape of cross attention not grow large of T5 Num tokens
|
| 105 |
duration=target_duration,
|
| 106 |
-
max_tokens=min( max(7, int(max_tokens)), 288 ) # limit sounds tokens (clone beyond)
|
|
|
|
| 107 |
).numpy()
|
| 108 |
|
| 109 |
# PAD
|
|
@@ -140,7 +139,6 @@ def audionar_tts(text=None,
|
|
| 140 |
soundfile.write(wavfile, final_audio, 16000) # soundfile needs [time, channels]
|
| 141 |
return wavfile
|
| 142 |
|
| 143 |
-
|
| 144 |
# TTS
|
| 145 |
|
| 146 |
|
|
@@ -165,7 +163,7 @@ with gr.Blocks() as demo:
|
|
| 165 |
)
|
| 166 |
cache_lim = gr.Number(
|
| 167 |
label="Flush kv",
|
| 168 |
-
value=
|
| 169 |
)
|
| 170 |
n_tokens = gr.Number(
|
| 171 |
label="Tokens",
|
|
|
|
| 35 |
lang='Romanian',
|
| 36 |
soundscape='frogs',
|
| 37 |
max_tokens=24,
|
| 38 |
+
cache_lim=-1):
|
| 39 |
|
| 40 |
# https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
|
| 41 |
|
|
|
|
| 98 |
speech_duration_secs = len(x) / 16000
|
| 99 |
target_duration = max(speech_duration_secs + 0.74, 2.0)
|
| 100 |
# Sink Attn
|
|
|
|
|
|
|
| 101 |
background_audio = audiogen.generate(
|
| 102 |
soundscape[:64], # to have shape of cross attention not grow large of T5 Num tokens
|
| 103 |
duration=target_duration,
|
| 104 |
+
max_tokens=min( max(7, int(max_tokens)), 288 ), # limit sounds tokens (clone beyond)
|
| 105 |
+
cache_lim=min( max(6, int(cache_lim)), 2000),
|
| 106 |
).numpy()
|
| 107 |
|
| 108 |
# PAD
|
|
|
|
| 139 |
soundfile.write(wavfile, final_audio, 16000) # soundfile needs [time, channels]
|
| 140 |
return wavfile
|
| 141 |
|
|
|
|
| 142 |
# TTS
|
| 143 |
|
| 144 |
|
|
|
|
| 163 |
)
|
| 164 |
cache_lim = gr.Number(
|
| 165 |
label="Flush kv",
|
| 166 |
+
value=71,
|
| 167 |
)
|
| 168 |
n_tokens = gr.Number(
|
| 169 |
label="Tokens",
|
audiocraft.py
CHANGED
|
@@ -63,6 +63,7 @@ class AudioGen(torch.nn.Module):
|
|
| 63 |
prompt='dogs mewo',
|
| 64 |
duration=2.24, # seconds of audio
|
| 65 |
max_tokens=24, # actual num of A/R iterations - above is obtained as clone
|
|
|
|
| 66 |
):
|
| 67 |
torch.manual_seed(42) # https://github.com/facebookresearch/audiocraft/issues/111#issuecomment-1614732858
|
| 68 |
n_draw = int(duration * 50 / (max_tokens * N_REPEAT)) + 1
|
|
@@ -70,7 +71,8 @@ class AudioGen(torch.nn.Module):
|
|
| 70 |
with torch.autocast(device_type='cpu', dtype=torch.bfloat16):
|
| 71 |
gen_tokens = self.lm.generate(
|
| 72 |
text_condition=[prompt] * N_REPEAT + [''] * N_REPEAT,#['dogs', 'dogs...!', '', '']
|
| 73 |
-
max_tokens=max_tokens)
|
|
|
|
| 74 |
|
| 75 |
# OOM if vocode all tokens
|
| 76 |
x = []
|
|
@@ -435,7 +437,6 @@ class LMModel(nn.Module):
|
|
| 435 |
self.card = card # 2048
|
| 436 |
self.n_draw = 1 # draw > 1 tokens of different CFG scale
|
| 437 |
# batch size > 1 is slower from n_draw as calls transformer on larger batch
|
| 438 |
-
self.cache_lim = 71
|
| 439 |
self.emb = nn.ModuleList([nn.Embedding(self.card + 1, dim) for _ in range(n_q)]) # EMBEDDING HAS 2049
|
| 440 |
self.transformer = StreamingTransformer()
|
| 441 |
self.out_norm = nn.LayerNorm(dim, eps=1e-5)
|
|
@@ -475,8 +476,8 @@ class LMModel(nn.Module):
|
|
| 475 |
@torch.no_grad()
|
| 476 |
def generate(self,
|
| 477 |
max_tokens=None,
|
| 478 |
-
text_condition=None
|
| 479 |
-
):
|
| 480 |
self.transformer._flush() # perhaps long kv cache has been filled on previous call for unrelated sounds
|
| 481 |
x = self.t5(text_condition)
|
| 482 |
bs = x.shape[0] // 2 # has null conditions - bs*2*N_REPEAT applies in builders.py
|
|
@@ -540,7 +541,7 @@ class LMModel(nn.Module):
|
|
| 540 |
|
| 541 |
out_codes[:, :, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = next_token
|
| 542 |
# Sink Attn
|
| 543 |
-
if (offset > 0) and (offset % self.cache_lim) == 0:
|
| 544 |
n_preserve = 4
|
| 545 |
self.transformer._flush(n_preserve=n_preserve)
|
| 546 |
cache_position = n_preserve
|
|
|
|
| 63 |
prompt='dogs mewo',
|
| 64 |
duration=2.24, # seconds of audio
|
| 65 |
max_tokens=24, # actual num of A/R iterations - above is obtained as clone
|
| 66 |
+
cache_lim=71,
|
| 67 |
):
|
| 68 |
torch.manual_seed(42) # https://github.com/facebookresearch/audiocraft/issues/111#issuecomment-1614732858
|
| 69 |
n_draw = int(duration * 50 / (max_tokens * N_REPEAT)) + 1
|
|
|
|
| 71 |
with torch.autocast(device_type='cpu', dtype=torch.bfloat16):
|
| 72 |
gen_tokens = self.lm.generate(
|
| 73 |
text_condition=[prompt] * N_REPEAT + [''] * N_REPEAT,#['dogs', 'dogs...!', '', '']
|
| 74 |
+
max_tokens=max_tokens,
|
| 75 |
+
cache_lim=cache_lim)
|
| 76 |
|
| 77 |
# OOM if vocode all tokens
|
| 78 |
x = []
|
|
|
|
| 437 |
self.card = card # 2048
|
| 438 |
self.n_draw = 1 # draw > 1 tokens of different CFG scale
|
| 439 |
# batch size > 1 is slower from n_draw as calls transformer on larger batch
|
|
|
|
| 440 |
self.emb = nn.ModuleList([nn.Embedding(self.card + 1, dim) for _ in range(n_q)]) # EMBEDDING HAS 2049
|
| 441 |
self.transformer = StreamingTransformer()
|
| 442 |
self.out_norm = nn.LayerNorm(dim, eps=1e-5)
|
|
|
|
| 476 |
@torch.no_grad()
|
| 477 |
def generate(self,
|
| 478 |
max_tokens=None,
|
| 479 |
+
text_condition=None,
|
| 480 |
+
cache_lim=71):
|
| 481 |
self.transformer._flush() # perhaps long kv cache has been filled on previous call for unrelated sounds
|
| 482 |
x = self.t5(text_condition)
|
| 483 |
bs = x.shape[0] // 2 # has null conditions - bs*2*N_REPEAT applies in builders.py
|
|
|
|
| 541 |
|
| 542 |
out_codes[:, :, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = next_token
|
| 543 |
# Sink Attn
|
| 544 |
+
if (offset > 0) and (offset % cache_lim) == 0:
|
| 545 |
n_preserve = 4
|
| 546 |
self.transformer._flush(n_preserve=n_preserve)
|
| 547 |
cache_position = n_preserve
|