dkounadis
/

artificial-styletts2

@@ -147,7 +147,7 @@ class LMModel(nn.Module):
         super().__init__()
         self.cfg_coef = cfg_coef
-        self.n_draw = 5
         self.condition_provider = condition_provider
         self.fuser = fuser
         self.card = card  # 2048 ?
@@ -235,19 +235,16 @@ class LMModel(nn.Module):
     def forward(self,
                 sequence,
                 condition_tensors=None,
-                stage = -1):
         B, K, S = sequence.shape    # linears are n_q
-        input_ = sum([self.emb[k](sequence[:, k]) for k in range(K)])
         # input_, cross_attention_input = self.fuser(input_, condition_tensors)
         cross_attention_input = condition_tensors['description'][0]
-        # print(f'{input_.shape=}  {cross_attention_input.shape=}  FUSER LLM')
-        out = self.transformer(input_, cross_attention_src=cross_attention_input,
-                               src_mask=(self.attn_mask_per_stage[stage] if stage >= 0 else None))
         if self.out_norm:
             out = self.out_norm(out)
         # K = 2 because of llm producing 2 tokens?
@@ -323,38 +320,23 @@ class LMModel(nn.Module):
                             ]
-        for offset in range(1, _gen_sequence.shape[2]):  # gen_sequence shape is [B, K, S]):
-            # print(f'{_gen_sequence.shape=}')  # [1,4,16]
-            # starts from 1 not 0 thus uses the 0:1 as curr sequence
-            # although this is empty contains -1 ?
-            # ====================== SAMPLE NEXT TOK
-            # next_token = self._sample_next_token(
-            #     _gen_sequence[..., :offset],
-            #     cfg_conditions)  # [5, 4, 1]
-            # --
-            # def _sample_next_token(self,
-            #            sequence,
-            #            cfg_conditions):
-            model = self if self._fsdp is None else self._fsdp
-            logits = model(_gen_sequence[..., :offset],
-                        condition_tensors=cfg_conditions)
-            # print(logits.shape, 'Next Logits')  # [1, 4, 2, 2048] why 2 tokens on query
-            # use cfg
-            # logits = (3 * logits[1, :, :, :] - 2.4 * logits[0, :, :, :]).transpose(1,0)
-            # or use 1 of logits
-            logits = logits[0, :, 0:1, :]  # [1,4,2048]
             next_token = utils.sample_top_k(logits, n_draw=self.n_draw)  # [1,4,2048] logits
-            # =================================
-            _gen_sequence[:, :, offset] = next_token[0, :, 0]  #gen_sequence.shape=torch.Size([1, 4, 39])
             duplicate_draw.append(next_token)
@@ -396,7 +378,10 @@ class LMModel(nn.Module):
         # <=> CODES out_codes.shape=torch.Size([1, 4, 35]) 30  2024
         return out_codes  #

         super().__init__()
         self.cfg_coef = cfg_coef
+        self.n_draw = 8
         self.condition_provider = condition_provider
         self.fuser = fuser
         self.card = card  # 2048 ?
     def forward(self,
                 sequence,
                 condition_tensors=None,
+                token_count=None):
         B, K, S = sequence.shape    # linears are n_q
+        input_ = sum([self.emb[k](sequence[:, k]) for k in range(K)])
         # input_, cross_attention_input = self.fuser(input_, condition_tensors)
         cross_attention_input = condition_tensors['description'][0]
+        print(f'{input_.shape=}')
+        out = self.transformer(input_,
+                               cross_attention_src=cross_attention_input,
+                               token_count=token_count)
         if self.out_norm:
             out = self.out_norm(out)
         # K = 2 because of llm producing 2 tokens?
                             ]
+        for offset in range(1, _gen_sequence.shape[2]):
+            logits = self.forward(_gen_sequence[:, :, offset-1:offset],  # bs/n_draw, 4, 1
+                                  condition_tensors=cfg_conditions,
+                                  token_count=offset)
+            # print(f'BEF {logits.shape=} BEF utils.SampleTop5')  # AGREES 4 BEF logits.shape=torch.Size([1, 4, 1, 2048]) BEF utils.SampleTop5
             next_token = utils.sample_top_k(logits, n_draw=self.n_draw)  # [1,4,2048] logits
+            _gen_sequence[:, :, offset] = next_token[0, :, 0]  #  next_token=[1,4,6] gen_seq=[1, 4, 39]
             duplicate_draw.append(next_token)
         # <=> CODES out_codes.shape=torch.Size([1, 4, 35]) 30  2024
+        # Clean Transformer MHA k_history v_history
+        for lay in self.transformer.layers:
+             lay.self_attn.k_history = None
+             lay.self_attn.v_history = None
         return out_codes  #

audiocraft/transformer.py CHANGED Viewed

@@ -3,26 +3,36 @@ from einops import rearrange
 import torch
 import torch.nn as nn
 from torch.nn import functional as F
 from xformers import ops
 _efficient_attention_backend: str = 'torch'
-def set_efficient_attention_backend(backend: str = 'torch'):
-    # Using torch by default, it seems a bit faster on older P100 GPUs (~20% faster).
-    global _efficient_attention_backend
-    assert _efficient_attention_backend in ['xformers', 'torch']
-    _efficient_attention_backend = backend
-def create_norm_fn(norm_type, dim, **kwargs):
     if norm_type == 'layer_norm':
         return nn.LayerNorm(dim, eps=1e-5, **kwargs)
     else:
@@ -48,11 +58,27 @@ def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float =
     adim = torch.arange(half_dim, device=positions.device, dtype=dtype).view(1, 1, -1)
     max_period_tensor = torch.full([], max_period, device=positions.device, dtype=dtype)  # avoid sync point
     phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
-    # print('==============CONCAT 3 ============'
-    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
@@ -62,36 +88,37 @@ class StreamingMultiheadAttention(nn.Module):
     def __init__(self,
                  embed_dim,
-                 num_heads,
-                 dropout=0.0,
-                 bias: bool = True,
-                 causal: bool = False,
-                 past_context: tp.Optional[int] = None,
-                 custom: bool = False,
-                 memory_efficient: bool = False,
-                 attention_as_float32: bool = False,
                  cross_attention: bool = False,
-                 qk_layer_norm: bool = False,
                  kv_repeat: int = 1,
                  device=None, dtype=None):
         super().__init__()
         factory_kwargs = {'device': device, 'dtype': dtype}
         if past_context is not None:
             assert causal
         self.embed_dim = embed_dim
-        self.causal = causal
-        self.past_context = past_context
         self.memory_efficient = memory_efficient
         self.attention_as_float32 = attention_as_float32
         self.cross_attention = cross_attention
         self.num_heads = num_heads
         self.dropout = dropout
         self.kv_repeat = kv_repeat
-        if cross_attention:
-            assert not causal, "Causal cannot work with cross attention."
-        if memory_efficient:
-            _verify_xformers_memory_efficient_compat()
-        self.custom = _is_custom(custom, memory_efficient)
         if self.custom:
             out_dim = embed_dim
             assert num_heads % kv_repeat == 0
@@ -109,12 +136,11 @@ class StreamingMultiheadAttention(nn.Module):
             if bias:
                 self.out_proj.bias.data.zero_()
         else:
-            print('mha ini else')
-        self.qk_layer_norm = qk_layer_norm
-        if qk_layer_norm:
-            print('QK norm')
     def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
         if not self.custom:
@@ -124,185 +150,140 @@ class StreamingMultiheadAttention(nn.Module):
                 if prefix + key in state_dict:
                     state_dict[prefix + "mha." + key] = state_dict.pop(prefix + key)
         super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
     def forward(self,
                 query,
-                key,
-                value,
-                key_padding_mask=None,
-                need_weights=False,
-                attn_mask=None,
-                is_causal=False):
-# 2=cond/uncond
-# 24=heads
-# 1=seqlen
-# 64=channel
-#
-# q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 7, 64]) v.shape=torch.Size([2, 24, 7, 64]) CROSSSattn
-# 43
-# ____________
-# SELF
-# q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 25, 64]) v.shape=torch.Size([2, 24, 25, 64]) CROSSSattn
-# sa_ x.shape=torch.Size([2, 1, 1536])
-# X
-# q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 7, 64]) v.shape=torch.Size([2, 24, 7, 64]) CROSSSattn
-# 44
-# ____________
-# SELF
-# q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 25, 64]) v.shape=torch.Size([2, 24, 25, 64]) CROSSSattn
-# sa_ x.shape=torch.Size([2, 1, 1536])
-# X
-# q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 7, 64]) v.shape=torch.Size([2, 24, 7, 64]) CROSSSattn
-# 45
-# ____________
-# SELF
-# q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 25, 64]) v.shape=torch.Size([2, 24, 25, 64]) CROSSSattn
-# sa_ x.shape=torch.Size([2, 1, 1536])
-# X
-# q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 7, 64]) v.shape=torch.Size([2, 24, 7, 64]) CROSSSattn
-# 46
-# ____________
-# SELF
-# q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 25, 64]) v.shape=torch.Size([2, 24, 25, 64]) CROSSSattn
-# sa_ x.shape=torch.Size([2, 1, 1536])
-# X
-# q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 7, 64]) v.shape=torch.Size([2, 24, 7, 64]) CROSSSattn
-# 47
-# ____________
-# SELF
-# q.shape=torch.Size([2, 24, 1, 64]) k.shape=torch.Size([2, 24, 25, 64]) v.shape=torch.Size([2, 24, 25, 64]) CROSSSattn
-# sa_ x.shape=torch.Size([2, 1, 1536])
-        assert not is_causal, ("New param added in torch 2.0.1 not supported, "
-                               "use the causal args in the constructor.")
-        # print(f'{query.shape=} {key.shape=} {value.shape=} MHA')
-        time_dim = 2
-        if time_dim == 2:
-            layout = "b h t d"
-        else:
-            layout = "b t h d"
-        dtype = query.dtype
-        custom_attn_mask = attn_mask is not None
         if self.custom:
             if self.cross_attention:
-                # print('\n\n\n\nCROSS\n\n\n\n')
                 dim = self.in_proj_weight.shape[0] // 3
                 if self.in_proj_bias is None:
                     bias_q, bias_k, bias_v = None, None, None
                 else:
-                    print('no self proj bi')
                 q = nn.functional.linear(query, self.in_proj_weight[:dim], bias_q)
-                # print(f'{q.shape=} TRANSF FORW who concaten')
                 # todo: when streaming, we could actually save k, v and check the shape actually match.
                 k = nn.functional.linear(key, self.in_proj_weight[dim: 2 * dim], bias_k)
                 v = nn.functional.linear(value, self.in_proj_weight[2 * dim:], bias_v)
-                if self.qk_layer_norm is True:
-                    q = self.q_layer_norm(q)
-                    k = self.k_layer_norm(k)
                 q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
             else:
                 projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
                 if self.kv_repeat == 1:
-                    if time_dim == 2:
-                        bound_layout = "b h p t d"
-                    else:
-                        bound_layout = "b t p h d"
                     packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
-                    # print(f'{query.shape=} before unbind')  # [2, 1, 4 , 2048] already bs=2
                     q, k, v = ops.unbind(packed, dim=2)
-                    # print(f'{q.shape=} {v.shape=} @L331 trasnforemr.py') # packed is bs=2
-                else:
-                    print("ELSE kv rp")
-                if self.qk_layer_norm is True:
-                    print('QL lay norm')
-                if self.kv_repeat > 1:
-                    print('Expand repear 2')
-            if self.attention_as_float32:
-                print('AS FLOAT32')
             if self.memory_efficient:
-                if custom_attn_mask:
-                    print('CUSTOM ATTN MSK')
                 p = self.dropout if self.training else 0
                 if _efficient_attention_backend == 'torch':
-                    # print(f'{q.shape=} {k.shape=} {v.shape=}   90')
-                    print(f'{x.sum()=} {q.sum()=} {k.sum()=} {v.sum()=}   90 variation of qkv during 47')
-                    # the k.sum(),v.sum() changes over the 47transfs how is that possible if self._sa
-                    # has q-len = 1.
-                    #
-                    #
                     x = torch.nn.functional.scaled_dot_product_attention(
-                        q, k, v, is_causal=attn_mask is not None, dropout_p=p)
-                else:
-                    print('MHA OPS')
-            else:
-                print('CONSISTENCY ')
-            x = x.to(dtype)
             x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
             x = self.out_proj(x)
-        else:
-            raise NotImplementedError
-        return x, None
-class StreamingTransformerLayer(nn.TransformerEncoderLayer):
-    def __init__(self,
-                 d_model,
-                 num_heads,
-                 dim_feedforward=2048,
-                 dropout=0.1,
-                 bias_ff: bool = True, bias_attn: bool = True, causal: bool = False,
-                 past_context: tp.Optional[int] = None, custom: bool = False,
-                 memory_efficient: bool = False, attention_as_float32: bool = False,
-                 qk_layer_norm: bool = False, qk_layer_norm_cross: bool = False,
-                 cross_attention: bool = False,
-                #  rope=None,
                  attention_dropout: tp.Optional[float] = None,
-                 kv_repeat: int = 1, norm: str = 'layer_norm', device=None, dtype=None, **kwargs):
-        super().__init__(d_model, num_heads, dim_feedforward, dropout,
-                         device=device, dtype=dtype, batch_first=True, **kwargs)
         factory_kwargs = {'device': device, 'dtype': dtype}
         # Redefine self_attn to our streaming multi-head attention
         attn_kwargs: tp.Dict[str, tp.Any] = {
@@ -314,123 +295,84 @@ class StreamingTransformerLayer(nn.TransformerEncoderLayer):
             'memory_efficient': memory_efficient,
             'attention_as_float32': attention_as_float32,
         }
-        self.self_attn=StreamingMultiheadAttention(
-            causal=causal,
-            past_context=past_context,
-            # rope=rope,
-            qk_layer_norm=qk_layer_norm,
-            kv_repeat=kv_repeat, **attn_kwargs, **factory_kwargs)  # type: ignore
         # Redefine feedforward layers to expose bias parameter
         self.linear1 = nn.Linear(d_model, dim_feedforward, bias=bias_ff, **factory_kwargs)
         self.linear2 = nn.Linear(dim_feedforward, d_model, bias=bias_ff, **factory_kwargs)
-        self.cross_attention = None  # default
         if cross_attention:
             self.cross_attention = StreamingMultiheadAttention(
-                cross_attention=True, qk_layer_norm=qk_layer_norm_cross,
-                **attn_kwargs, **factory_kwargs)
-            # Norm and dropout
             self.dropout_cross = nn.Dropout(dropout)
-            # eps value matching that used in PyTorch reference implementation.
-            self.norm_cross = nn.LayerNorm(d_model, eps=1e-5, **factory_kwargs)
         self.norm1 = create_norm_fn(norm, d_model, **factory_kwargs)  # type: ignore
         self.norm2 = create_norm_fn(norm, d_model, **factory_kwargs)  # type: ignore
-# ENVS....d4/lib/python3.10/site-packages/torch/nn/modules/transformer.py @TransformerEncoderLayer
-    def _sa_block(self, q, k, v):
-        x = self.self_attn(q,
-                           k,
-                           v,
-                           attn_mask=None,
-                           key_padding_mask=None,
-                           need_weights=False,
-                           is_causal=None)[0]
-        return self.dropout1(x)
-    def _cross_attention_block(self,
-                               src,
-                               cross_attention_src):
-        # queries are from src, keys and values from cross_attention_src.
-        x = self.cross_attention(
-            src, cross_attention_src, cross_attention_src, need_weights=False)[0]
-        return self.dropout_cross(x)  # type: ignore
-    def forward(self,
-                src,
-                src_mask=None,
-                src_key_padding_mask=None,  # key = value = looooong I think I pass them inversed
-                cross_attention_src=None):
-        if self.norm_first:
-            print('selfattn')
-            history = self.norm1(src)
-            x = history[:, -1:, :]
-            # THIS IS COMPUTED with 1 timestep
-            # just before the call there is cat([past_k, k])
-            # Thus we just
-            x = x + self._sa_block(x,  # THIS should be square as the history is updated
-                                   # then the -1 item of history goes to the text x text
-                                   #
-                                   history,
-                                   history)
-            print('crossattn')
-            if cross_attention_src is not None:
-                x = x + self._cross_attention_block(
-                        self.norm_cross(x),
-                        cross_attention_src)
-            else:
-                print('NOT IMPL')
-            x = x + self._ff_block(self.norm2(x))
-        else:
-            print('NLAST')
         return x
 class StreamingTransformer(nn.Module):
-    '''layer_class=<class 'audiocraft.transformer.StreamingTransformerLayer'>  StrTrnsf'''
-    def __init__(self, d_model: int, num_heads: int, num_layers: int, dim_feedforward: int = 2048,
-                 dropout: float = 0.1, bias_ff: bool = True, bias_attn: bool = True,
-                 causal: bool = False, past_context: tp.Optional[int] = None,
-                 custom: bool = False, memory_efficient: bool = False, attention_as_float32: bool = False,
                  cross_attention: bool = False,
-                 positional_embedding: str = 'sin', max_period: float = 10_000, positional_scale: float = 1.,
-                 xpos=False,
-                 lr=None,
-                 weight_decay=None,
                  layer_class=StreamingTransformerLayer,
-                 checkpointing='none',
                  device=None,
-                 dtype=None,
-                 **kwargs):
         super().__init__()
         assert d_model % num_heads == 0
         self.positional_embedding = positional_embedding
         self.max_period = max_period
         self.positional_scale = positional_scale
-        self.weight_decay = weight_decay
-        self.lr = lr
-        assert positional_embedding in ['sin', 'rope', 'sin_rope']
         self.checkpointing = checkpointing
-        assert checkpointing in ['none', 'torch', 'xformers_default', 'xformers_mm']
-        if self.checkpointing.startswith('xformers'):
-            _verify_xformers_internal_compat()
         self.layers = nn.ModuleList()
         for idx in range(num_layers):
@@ -438,90 +380,35 @@ class StreamingTransformer(nn.Module):
                 layer_class(
                     d_model=d_model, num_heads=num_heads, dim_feedforward=dim_feedforward,
                     dropout=dropout, bias_ff=bias_ff, bias_attn=bias_attn,
-                    causal=causal, past_context=past_context, custom=custom,
                     memory_efficient=memory_efficient, attention_as_float32=attention_as_float32,
-                    cross_attention=cross_attention,
-                    # rope=self.rope,
                     device=device, dtype=dtype, **kwargs))
         if self.checkpointing != 'none':
-            print('Checkpointing????????????')
             for layer in self.layers:
                 # see audiocraft/optim/fsdp.py, magic signal to indicate this requires fixing the
                 # backward hook inside of FSDP...
                 layer._magma_checkpointed = True  # type: ignore
     def forward(self, x: torch.Tensor, *args, **kwargs):
-        # print(f'{x.shape=} StreamingTransf')   # [1, 1, 1536]  Always no batch==2 here
-        # why is this called with time-len = 1? Shouldnt be called with context?
-        B, T, C = x.shape
-        if self.positional_embedding in ['sin',
-                                         'sin_rope']:
-            positions = torch.arange(T, device=x.device).view(1, -1, 1)
             pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
             x = x + self.positional_scale * pos_emb
-        # 47x transformer layers for frozen history
-        #       -> history is updated by self._sa() althought her length is fixed
-        #       -> the q that comes out of the text x text cross attn
-        #          is given as q to the next lay's self._sa() with updated history
-        #       ->
-        #       ->
-        for _, lay in enumerate(self.layers):
-            print(f'_________________\n{_}')
-            # 1 q = last_token x history x history
-            # 2 next_token = q x text x text
-            # x preserves full history for self._sa(). After all transformers we return only last -1 tok
-            x, history = lay(
-                x,
-                history=history,   # only updated by self_attn (the cross sees only last token)
-                cross_attention_src=kwargs["cross_attention_src"],
-                src_mask=kwargs['src_mask']
-                )  # x : [bs, 24, 37, 64]
-        return x
-# special attention related function
-def _verify_xformers_memory_efficient_compat():
-    try:
-        from xformers.ops import memory_efficient_attention, LowerTriangularMask  # noqa
-    except ImportError:
-        raise ImportError(
-            "xformers is not installed. Please install it and try again.\n"
-            "To install on AWS and Azure, run \n"
-            "FORCE_CUDA=1 TORCH_CUDA_ARCH_LIST='8.0'\\\n"
-            "pip install -U git+https://git@github.com/fairinternal/xformers.git#egg=xformers\n"
-            "To install on FAIR Cluster, run \n"
-            "FORCE_CUDA=1 TORCH_CUDA_ARCH_LIST='6.0;7.0'\\\n"
-            "pip install -U git+https://git@github.com/fairinternal/xformers.git#egg=xformers\n")
-def _verify_xformers_internal_compat():
-    try:
-        from xformers.checkpoint_fairinternal import checkpoint, _get_default_policy  # noqa
-    except ImportError:
-        raise ImportError(
-            "Francisco's fairinternal xformers is not installed. Please install it and try again.\n"
-            "To install on AWS and Azure, run \n"
-            "FORCE_CUDA=1 TORCH_CUDA_ARCH_LIST='8.0'\\\n"
-            "pip install -U git+https://git@github.com/fairinternal/xformers.git#egg=xformers\n"
-            "To install on FAIR Cluster, run \n"
-            "FORCE_CUDA=1 TORCH_CUDA_ARCH_LIST='6.0;7.0'\\\n"
-            "pip install -U git+https://git@github.com/fairinternal/xformers.git#egg=xformers\n")
-def _is_custom(custom: bool, memory_efficient: bool):
-    return custom or memory_efficient

 import torch
 import torch.nn as nn
 from torch.nn import functional as F
+from torch.utils.checkpoint import checkpoint as torch_checkpoint
 from xformers import ops
 _efficient_attention_backend: str = 'torch'
+def _get_attention_time_dimension(memory_efficient: bool) -> int:
+    """Return the time-dimension index implied by the attention backend.
+
+    The result is 2 when the module-level backend is 'torch' and
+    `memory_efficient` is set, and 1 in every other case.
+    """
+    torch_sdpa_layout = _efficient_attention_backend == 'torch' and memory_efficient
+    return 2 if torch_sdpa_layout else 1
+def create_norm_fn(norm_type: str, dim: int, **kwargs) -> nn.Module:
+    """Create normalization module for transformer encoder layer.
+    Args:
+        norm_type (str): Normalization method.
+        dim (int): Dimension of the normalized layer.
+        **kwargs (dict): Additional parameters for normalization layer.
+    Returns:
+        nn.Module: Normalization module.
+    """
     if norm_type == 'layer_norm':
         return nn.LayerNorm(dim, eps=1e-5, **kwargs)
     else:
     adim = torch.arange(half_dim, device=positions.device, dtype=dtype).view(1, 1, -1)
     max_period_tensor = torch.full([], max_period, device=positions.device, dtype=dtype)  # avoid sync point
     phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
+    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
+def expand_repeated_kv(x: torch.Tensor, n_rep: int, memory_efficient: bool) -> torch.Tensor:
+    """Repeat every key/value head `n_rep` times along the head axis.
+
+    Equivalent to `torch.repeat_interleave` over the head axis. With the
+    'torch' backend and `memory_efficient` set, `x` is unpacked as
+    (batch, heads, time, dim); otherwise as (batch, time, heads, dim).
+    `x` is returned untouched when `n_rep` is 1.
+    """
+    if n_rep == 1:
+        return x
+    heads_first = _efficient_attention_backend == 'torch' and memory_efficient
+    if heads_first:
+        batch, heads, steps, width = x.shape
+        # Insert a repeat axis right after the heads, then fold it into them.
+        tiled = x.unsqueeze(2).expand(batch, heads, n_rep, steps, width)
+        return tiled.reshape(batch, heads * n_rep, steps, width)
+    batch, steps, heads, width = x.shape
+    tiled = x.unsqueeze(3).expand(batch, steps, heads, n_rep, width)
+    return tiled.reshape(batch, steps, heads * n_rep, width)
     def __init__(self,
                  embed_dim,
+                 num_heads, dropout: float = 0.0, bias: bool = True,
+                 causal: bool = False, past_context: tp.Optional[int] = None, custom: bool = False,
+                 memory_efficient: bool = False, attention_as_float32: bool = False,
                  cross_attention: bool = False,
                  kv_repeat: int = 1,
                  device=None, dtype=None):
         super().__init__()
         factory_kwargs = {'device': device, 'dtype': dtype}
         if past_context is not None:
             assert causal
         self.embed_dim = embed_dim
+        self.k_history = None  # previous k from the previous tokens seen in the current generation - only for selt.attn
+        self.v_history = None  # clean up IN LM after finishing GENERATION - Each 1...47 mha has different kv history
         self.memory_efficient = memory_efficient
         self.attention_as_float32 = attention_as_float32
         self.cross_attention = cross_attention
         self.num_heads = num_heads
         self.dropout = dropout
         self.kv_repeat = kv_repeat
+        self.custom = True #_is_custom(custom, memory_efficient)
+        if not self.custom:
+            print(f'{self.custom}')
         if self.custom:
             out_dim = embed_dim
             assert num_heads % kv_repeat == 0
             if bias:
                 self.out_proj.bias.data.zero_()
         else:
+            assert kv_repeat == 1
+            self.mha = nn.MultiheadAttention(
+                embed_dim, num_heads, dropout=dropout, bias=bias, batch_first=True,
+                **factory_kwargs)
     def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
         if not self.custom:
                 if prefix + key in state_dict:
                     state_dict[prefix + "mha." + key] = state_dict.pop(prefix + key)
         super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
     def forward(self,
                 query,
+                key=None,   # only read when self.cross_attention is True
+                value=None):
+        """Run attention for `query`, caching self-attention keys/values.
+
+        Args:
+            query: Tensor projected to the queries (and, in self-attention
+                mode, to the keys and values too). The rearrange patterns
+                below expect it as [batch, time, channels].
+            key: Source of the keys; only read when `self.cross_attention`.
+            value: Source of the values; only read when `self.cross_attention`.
+
+        Returns:
+            The result of `self.out_proj` applied to the attention output.
+
+        In self-attention mode the projected keys/values of every call are
+        appended (time axis) to `self.k_history` / `self.v_history`, so a
+        single-token query still attends over all tokens seen so far. Both
+        attributes must be reset to None before a new sequence is started.
+        """
+        layout = "b h t d"
+        dtype = query.dtype  # restored after the optional float32 attention
         if self.custom:
             if self.cross_attention:
+                # Different queries, keys, values: the packed projection
+                # weights have to be split manually before the linear.
                 dim = self.in_proj_weight.shape[0] // 3
                 if self.in_proj_bias is None:
                     bias_q, bias_k, bias_v = None, None, None
                 else:
+                    bias_q = self.in_proj_bias[:dim]
+                    bias_k = self.in_proj_bias[dim: 2 * dim]
+                    bias_v = self.in_proj_bias[2 * dim:]
                 q = nn.functional.linear(query, self.in_proj_weight[:dim], bias_q)
                 # todo: when streaming, we could actually save k, v and check the shape actually match.
                 k = nn.functional.linear(key, self.in_proj_weight[dim: 2 * dim], bias_k)
                 v = nn.functional.linear(value, self.in_proj_weight[2 * dim:], bias_v)
                 q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
             else:
                 projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
                 if self.kv_repeat == 1:
+                    bound_layout = "b h p t d"
                     packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
                     q, k, v = ops.unbind(packed, dim=2)
+                # KV cache, kept separately by every attention module.
+                # After the unbind, k and v are laid out as "b h t d", so the
+                # time axis is dim 2: prepend the cached keys/values there and
+                # remember the extended tensors for the next call. The cache
+                # holds the *projected* k/v, never the raw query.
+                if self.k_history is not None:
+                    k = torch.cat([self.k_history, k], dim=2)
+                    v = torch.cat([self.v_history, v], dim=2)
+                self.k_history = k
+                self.v_history = v
+            if self.attention_as_float32:
+                q, k, v = [x.float() for x in [q, k, v]]
+            # NOTE: x is only assigned for the memory-efficient 'torch'
+            # backend; no other configuration is implemented here.
             if self.memory_efficient:
                 p = self.dropout if self.training else 0
                 if _efficient_attention_backend == 'torch':
                     x = torch.nn.functional.scaled_dot_product_attention(
+                        q, k, v, is_causal=False, dropout_p=p
+                    )
+            # Cast back to the input dtype; q may have been promoted to
+            # float32 above, so q.dtype is not the dtype to restore.
+            x = x.to(dtype)
             x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
             x = self.out_proj(x)
+        return x
+class StreamingTransformerLayer(nn.Module): #nn.TransformerEncoderLayer):
+    # INHERITS MHA !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    def __init__(self,
+                 d_model: int,
+                 num_heads: int,
+                 dim_feedforward: int = 2048,
+                 dropout: float = 0.1,
+                 bias_ff: bool = True,
+                 bias_attn: bool = True,
+                 custom: bool = False,
+                 memory_efficient: bool = False,
+                 attention_as_float32: bool = False,
+                 cross_attention: bool = False,
                  attention_dropout: tp.Optional[float] = None,
+                 kv_repeat: int = 1,
+                 norm: str = 'layer_norm',
+                 device=None,
+                 dtype=None,
+                 **kwargs):
+        super().__init__() #d_model, num_heads, dim_feedforward, dropout,
+                         #device=device, dtype=dtype, batch_first=True, **kwargs)
+        # print(kwargs['activation'], 'ACTIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII\n\n\n\n')
+        # -- EN Layer
+        # DOES NOT INHERIT NO VARIABLE FROM nn.TransformerEncoderLayer only the _sa_block function
+        # -- EN layer
         factory_kwargs = {'device': device, 'dtype': dtype}
         # Redefine self_attn to our streaming multi-head attention
         attn_kwargs: tp.Dict[str, tp.Any] = {
             'memory_efficient': memory_efficient,
             'attention_as_float32': attention_as_float32,
         }
+        self.self_attn = StreamingMultiheadAttention(
+            kv_repeat=kv_repeat,
+            **attn_kwargs,
+            **factory_kwargs)  # type: ignore
         # Redefine feedforward layers to expose bias parameter
         self.linear1 = nn.Linear(d_model, dim_feedforward, bias=bias_ff, **factory_kwargs)
         self.linear2 = nn.Linear(dim_feedforward, d_model, bias=bias_ff, **factory_kwargs)
+        # print('LAYER scale', layer_scale, '\n\n\n\n\n\n\n\n\n')   # always
+        self.cross_attention= None
         if cross_attention:
             self.cross_attention = StreamingMultiheadAttention(
+                cross_attention=True,
+                **attn_kwargs,
+                **factory_kwargs)
             self.dropout_cross = nn.Dropout(dropout)
+            self.norm_cross = nn.LayerNorm(d_model, eps=1e-5, **factory_kwargs)
         self.norm1 = create_norm_fn(norm, d_model, **factory_kwargs)  # type: ignore
         self.norm2 = create_norm_fn(norm, d_model, **factory_kwargs)  # type: ignore
+    def forward(self,
+                src,
+                cross_attention_src=None):  # txtcond
+        '''T layer'''
+        x = src
+        x = x + self.self_attn(self.norm1(x))
+        if cross_attention_src is not None:
+            x = x + self.cross_attention(
+                                    query = self.norm_cross(x),
+                                    key   = cross_attention_src,
+                                    value = cross_attention_src)  # txtcondition
+        x = x + self.linear2(F.gelu(self.linear1(   self.norm2(x)    )))
         return x
 class StreamingTransformer(nn.Module):
+    def __init__(self, d_model: int,
+                 num_heads: int,
+                 num_layers: int,
+                 dim_feedforward: int = 2048,
+                 dropout: float = 0.1,
+                 bias_ff: bool = True,
+                 bias_attn: bool = True,
+                 custom: bool = False,
+                 memory_efficient: bool = False,
+                 attention_as_float32: bool = False,
                  cross_attention: bool = False,
+                 positional_embedding: str = 'sin',
+                 max_period: float = 10_000,
+                 positional_scale: float = 1,
                  layer_class=StreamingTransformerLayer,
+                 checkpointing: str = 'none',
                  device=None,
+                 dtype=None, **kwargs):
         super().__init__()
         assert d_model % num_heads == 0
         self.positional_embedding = positional_embedding
         self.max_period = max_period
         self.positional_scale = positional_scale
+        # self._stream_off = 0  # the llm should reinitialize this at every generate()
         self.checkpointing = checkpointing
         self.layers = nn.ModuleList()
         for idx in range(num_layers):
                 layer_class(
                     d_model=d_model, num_heads=num_heads, dim_feedforward=dim_feedforward,
                     dropout=dropout, bias_ff=bias_ff, bias_attn=bias_attn,
+                    custom=custom,
                     memory_efficient=memory_efficient, attention_as_float32=attention_as_float32,
+                    cross_attention=cross_attention,
                     device=device, dtype=dtype, **kwargs))
         if self.checkpointing != 'none':
             for layer in self.layers:
                 # see audiocraft/optim/fsdp.py, magic signal to indicate this requires fixing the
                 # backward hook inside of FSDP...
                 layer._magma_checkpointed = True  # type: ignore
     def forward(self, x: torch.Tensor, *args, **kwargs):
+        B, T, C = x.shape
+        if self.positional_embedding in ['sin', 'sin_rope']:
+            positions = torch.arange(T, device=x.device).view(1, -1, 1)
+            positions = positions + kwargs['token_count']  #offsets.view(-1, 1, 1)
             pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
             x = x + self.positional_scale * pos_emb
+        for j, lay in enumerate(self.layers):
+            print(f'_________________________{j}___________________')
+            x = lay(x, cross_attention_src=kwargs["cross_attention_src"])  # txt cond
+            # each layer (mha) keeps history of its own k,v for all tokens
+        return x

demo.py CHANGED Viewed

@@ -4,10 +4,10 @@ import numpy as np
 print('\n\n\n\n___________________')
-txt = 'dogs in the street'
 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
-sound_generator.set_generation_params(duration=.74)   # why is generating so long at 14 seconds
 x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
 x /= np.abs(x).max() + 1e-7

 print('\n\n\n\n___________________')
+txt = 'dogs barging in the street'
 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
+sound_generator.set_generation_params(duration=.46)   # why is generating so long at 14 seconds
 x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
 x /= np.abs(x).max() + 1e-7