fix k,v history after projection
- audiocraft/lm.py +1 -1
- audiocraft/transformer.py +20 -16
audiocraft/lm.py
CHANGED

@@ -147,7 +147,7 @@ class LMModel(nn.Module):
         super().__init__()
         self.cfg_coef = cfg_coef

-        self.n_draw =
+        self.n_draw = 2
         self.condition_provider = condition_provider
         self.fuser = fuser
         self.card = card  # 2048 ?
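Note on the lm.py change: the commit pins self.n_draw to 2 (the previous value is not visible in this view). From the name alone, n_draw most plausibly counts how many candidate tokens are drawn per decoding step; the diff does not show where it is consumed. A minimal, hypothetical sketch of that reading, with names (logits, probs, tokens) and shapes that are illustrative rather than taken from the repo:

import torch

# Hypothetical illustration of n_draw: keep n_draw sampled candidates per step.
n_draw = 2
card = 2048                                    # codebook size, matching the self.card comment above
logits = torch.randn(1, 4, card)               # (batch, n_codebooks, card) for one decoding step
probs = torch.softmax(logits, dim=-1)
# draw n_draw token ids per (batch, codebook) row of the flattened distribution
tokens = torch.multinomial(probs.view(-1, card), num_samples=n_draw, replacement=True)
tokens = tokens.view(1, 4, n_draw)             # (batch, n_codebooks, n_draw) candidate ids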
audiocraft/transformer.py
CHANGED

@@ -194,21 +194,12 @@ class StreamingMultiheadAttention(nn.Module):

                 q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
             else:
-                if self.k_history is not None:
+                # 1st projected makes k,v (instantaneous)
+                # 2nd cat
+
+
                 # HISTORY - DIFFERENT FOR EACH TRANSF LAYER
-
-                    #
-                    # pk.shape=torch.Size([2, 24, 3, 64]) k.shape=torch.Size([2, 24, 1, 64]) CONCAT
-                    # has to be 4D with batch 1 due to single condition 3=seqlen
-                    # 24 heads 64 dimofh
-                    self.k_history = torch.cat([self.k_history, query], 2)
-                    self.v_history = torch.cat([self.v_history, query], 2)
-                else:
-                    # init on 1st token (for all 47 transf layers)
-                    self.k_history = query
-                    self.v_history = query
-
-
+
                 projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
                 if self.kv_repeat == 1:
                     # if time_dim == 2:
@@ -217,7 +208,21 @@ class StreamingMultiheadAttention(nn.Module):
                     # bound_layout = "b t p h d"
                     packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
                     q, k, v = ops.unbind(packed, dim=2)
+
+                    if self.k_history is not None:
+                        #
+                        # pk.shape=torch.Size([2, 24, 3, 64]) k.shape=torch.Size([2, 24, 1, 64]) CONCAT
+                        # has to be 4D with batch 1 due to single condition 3=seqlen
+                        # 24 heads 64 dimofh
+                        self.k_history = torch.cat([self.k_history, k], 2)
+                        self.v_history = torch.cat([self.v_history, v], 2)
+                    else:
+                        # init on 1st token (for all 47 transf layers)
+                        self.k_history = k
+                        self.v_history = v

+                    k = self.k_history
+                    v = self.v_history



@@ -235,8 +240,7 @@ class StreamingMultiheadAttention(nn.Module):
             # k, v = self._complete_kv(k, v)
             # print(k.sum(), v.sum(), k.shape, v.shape,'ATTNext')

-
-            q, k, v = [x.float() for x in [q, k, v]]
+            print(f'{self.attention_as_float32=}')
             if self.memory_efficient:
                 # print('EVER IN MEMORY EFFICIENT A')

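Note on the transformer.py change: the old code concatenated the raw, pre-projection query into k_history/v_history; the fixed code first runs the in-projection and unbinds q/k/v, then appends the projected k and v to the per-layer history, which is what the commit title refers to. A minimal self-contained sketch of that cache-after-projection pattern; the toy module TinyStreamingAttention, its shapes, and the use of scaled_dot_product_attention are illustrative stand-ins, not audiocraft's StreamingMultiheadAttention:

import torch
import torch.nn as nn

class TinyStreamingAttention(nn.Module):
    def __init__(self, d_model=64, num_heads=4):
        super().__init__()
        self.num_heads = num_heads
        self.in_proj = nn.Linear(d_model, 3 * d_model)
        self.out_proj = nn.Linear(d_model, d_model)
        self.k_history = None   # per-layer cache, grows along the time axis
        self.v_history = None

    def forward(self, query):   # query: (B, 1, d_model), one new decoding step
        B, T, C = query.shape
        H, D = self.num_heads, C // self.num_heads
        q, k, v = self.in_proj(query).chunk(3, dim=-1)
        q, k, v = [x.view(B, T, H, D).transpose(1, 2) for x in (q, k, v)]  # (B, H, T, D)
        if self.k_history is not None:
            # append the projected k/v (not the raw query), as the fixed code does
            self.k_history = torch.cat([self.k_history, k], dim=2)
            self.v_history = torch.cat([self.v_history, v], dim=2)
        else:
            # init on the first token
            self.k_history = k
            self.v_history = v
        k, v = self.k_history, self.v_history   # attend over the whole history
        out = nn.functional.scaled_dot_product_attention(q, k, v)   # (B, H, T, D)
        return self.out_proj(out.transpose(1, 2).reshape(B, T, C))

# usage: feed one token per call; the cache grows by one step each time
attn = TinyStreamingAttention()
for step in torch.randn(5, 1, 1, 64):
    y = attn(step)

With this ordering, each step projects only the new token while attention still sees every cached key and value; caching the unprojected query, as the code did before this commit, would have mixed unprojected activations into the attention inputs.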