Spaces:

Dionyssos
/

speech-analysis2

Running

App Files Community

Dionyssos committed on Sep 22

Commit

a7e2983

1 Parent(s): 0f06964

user defines num tokens

Browse files

Files changed (2) hide show

app.py +3 -3
audiocraft.py +9 -8

app.py CHANGED Viewed

@@ -39,7 +39,7 @@ language_names = ['Ancient greek',
 def audionar_tts(text=None,
                  lang='Romanian',
                  soundscape='frogs',
-                 cache_lim=24):
     # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
@@ -119,7 +119,7 @@ def audionar_tts(text=None,
         background_audio = audiogen.generate(
             soundscape,
             duration=target_duration,
-            cache_lim=max(4, int(cache_lim))  # at least allow 10 A/R stEps
          ).numpy()
         # PAD
@@ -272,7 +272,7 @@ with gr.Blocks(theme='huggingface') as demo:
             label="AudioGen Txt"
         )
         kv_input = gr.Number(
-            label="Diversy",
             value=24,
         )
         generate_button = gr.Button("Generate Audio", variant="primary")

 def audionar_tts(text=None,
                  lang='Romanian',
                  soundscape='frogs',
+                 max_tokens=24):
     # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
         background_audio = audiogen.generate(
             soundscape,
             duration=target_duration,
+            max_tokens=max(4, int(max_tokens))  # at least allow 10 A/R stEps
          ).numpy()
         # PAD
             label="AudioGen Txt"
         )
         kv_input = gr.Number(
+            label="Num Tokens",
             value=24,
         )
         generate_button = gr.Button("Generate Audio", variant="primary")

audiocraft.py CHANGED Viewed

@@ -62,15 +62,14 @@ class AudioGen(torch.nn.Module):
     def generate(self,
                  prompt='dogs mewo',
                  duration=2.24,  # seconds of audio
-                 cache_lim=71,   # flush kv cache after cache_lim tok
                  ):
         torch.manual_seed(42)  # https://github.com/facebookresearch/audiocraft/issues/111#issuecomment-1614732858
-        self.lm.cache_lim = cache_lim
-        self.lm.n_draw = int(.8 * duration) + 1  # different beam every 0.47 seconds of audio
         with torch.autocast(device_type='cpu', dtype=torch.bfloat16):
             gen_tokens = self.lm.generate(
                 text_condition=[prompt] * N_REPEAT  + [''] * N_REPEAT,#['dogs', 'dogs...!', '', '']
-                max_tokens=max(int(duration / (N_REPEAT * self.lm.n_draw) * 50) + 5, 12))
         # OOM if vocode all tokens
         x = []
@@ -81,9 +80,11 @@ class AudioGen(torch.nn.Module):
             decoded_chunk = self.compression_model.decode(gen_tokens[:, :, i-7:i+self._chunk_len])
             x.append(decoded_chunk)
         x = torch.cat(x, 2)  # [bs, 1, 114000]
         x = _shift(x)  # clone() to have xN
         return x.reshape(-1) #x / (x.abs().max() + 1e-7)
@@ -430,7 +431,6 @@ class LMModel(nn.Module):
                  dim = 1536
                  ):
         super().__init__()
-        self.cache_lim = -1
         self.t5 = T5()
         self.card = card # 2048
         self.n_draw = 1  # draw > 1 tokens of different CFG scale
@@ -468,6 +468,7 @@ class LMModel(nn.Module):
         # divide large probs with exp(prob) If prob=.001 then 1/exp(1*.001) -> almost by 0  --> exp doesnt really produce (0, Inf)
         p = p.argmax(dim=3, keepdim=True)  # [bs, 4, n_draw, 24]
         tok = ix.gather(dim=3, index=p).to(torch.int64)  # [bs, 4, n_draw, 1]
         return tok[:, :, :, 0].transpose(1, 2)  # [bs, n_draw, 4]
     @torch.no_grad()
@@ -537,7 +538,7 @@ class LMModel(nn.Module):
             out_codes[:, :, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = next_token
             # Sink Attn
-            if (offset > 0) and (offset % self.cache_lim) == 0:
                 n_preserve = 4
                 self.transformer._flush(n_preserve=n_preserve)
                 cache_position = n_preserve
@@ -726,5 +727,5 @@ if __name__ == '__main__':
     import audiofile  # pip uninstall flash-attn
     model = AudioGen().to('cpu')
-    x = model.generate(prompt='swims in lake frogs', duration=6.4).cpu().numpy()
     audiofile.write('_sound_.wav', x, 16000)

     def generate(self,
                  prompt='dogs mewo',
                  duration=2.24,  # seconds of audio
+                 max_tokens=71,  # actual num of A/R iterations - above is obtained as clone
                  ):
         torch.manual_seed(42)  # https://github.com/facebookresearch/audiocraft/issues/111#issuecomment-1614732858
+        n_draw = int(duration * 50 / (max_tokens * N_REPEAT)) + 1
         with torch.autocast(device_type='cpu', dtype=torch.bfloat16):
             gen_tokens = self.lm.generate(
                 text_condition=[prompt] * N_REPEAT  + [''] * N_REPEAT,#['dogs', 'dogs...!', '', '']
+                max_tokens=max_tokens)
         # OOM if vocode all tokens
         x = []
             decoded_chunk = self.compression_model.decode(gen_tokens[:, :, i-7:i+self._chunk_len])
             x.append(decoded_chunk)
         x = torch.cat(x, 2)  # [bs, 1, 114000]
+        x = x.repeat(1, 1, n_draw)
         x = _shift(x)  # clone() to have xN
         return x.reshape(-1) #x / (x.abs().max() + 1e-7)
                  dim = 1536
                  ):
         super().__init__()
         self.t5 = T5()
         self.card = card # 2048
         self.n_draw = 1  # draw > 1 tokens of different CFG scale
         # divide large probs with exp(prob) If prob=.001 then 1/exp(1*.001) -> almost by 0  --> exp doesnt really produce (0, Inf)
         p = p.argmax(dim=3, keepdim=True)  # [bs, 4, n_draw, 24]
         tok = ix.gather(dim=3, index=p).to(torch.int64)  # [bs, 4, n_draw, 1]
         return tok[:, :, :, 0].transpose(1, 2)  # [bs, n_draw, 4]
     @torch.no_grad()
             out_codes[:, :, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = next_token
             # Sink Attn
+            if (offset > 0) and (offset % 476) == 0:
                 n_preserve = 4
                 self.transformer._flush(n_preserve=n_preserve)
                 cache_position = n_preserve
     import audiofile  # pip uninstall flash-attn
     model = AudioGen().to('cpu')
+    x = model.generate(prompt='swims in lake frogs', duration=56.4, max_tokens=24).cpu().numpy()
     audiofile.write('_sound_.wav', x, 16000)