fx pos sinus

- README.md +4 -0
- audiocraft/transformer.py +50 -67
README.md CHANGED

@@ -67,6 +67,10 @@ CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=
 
 Following examples need `api.py` to be running. [Set this IP](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/tts.py#L85) to the IP shown when starting `api.py`.
 
+### Foreign Lang TTS
+
+This will produce the following [video](https://www.youtube.com/watch?v=UeJEAsKxRZU)
+
 ```
 python tts.py --text assets/ocr.txt --image assets/ocr.jpg --soundscape "battle hero" --voice romanian
 ```
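The note above implies a two-shell sequence for every example; a minimal sketch (it assumes the IP edit in `tts.py` linked above has already been made):

```
python api.py     # shell 1: keep running, note the IP it prints
python tts.py --text assets/ocr.txt --image assets/ocr.jpg --soundscape "battle hero" --voice romanian     # shell 2
```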
audiocraft/transformer.py CHANGED

@@ -3,44 +3,30 @@ import torch.nn as nn
 from torch.nn import functional as F
 from einops import rearrange
 
-def create_sin_embedding(positions,
-                         dim,
-                         max_period = 10000,
-                         dtype = torch.float32):
-    """Create sinusoidal positional embedding, with shape `[B, T, C]`.
-
-    Args:
-        positions (torch.Tensor): LongTensor of positions.
-        dim (int): Dimension of the embedding.
-        max_period (float): Maximum period of the cosine/sine functions.
-        dtype (torch.dtype or str): dtype to use to generate the embedding.
-    Returns:
-        torch.Tensor: Sinusoidal positional embedding.
-    """
-    # We aim for BTC format
+def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float = 10000):
     assert dim % 2 == 0
     half_dim = dim // 2
-    positions = positions.to(dtype)
-    adim = torch.arange(half_dim, device=positions.device, dtype=dtype).view(1, 1, -1)
-    max_period_tensor = torch.full([], max_period, device=positions.device, dtype=dtype)
+    positions = positions.to(torch.float)
+    adim = torch.arange(half_dim, device=positions.device, dtype=torch.float).view(1, 1, -1)
+    max_period_tensor = torch.full([], max_period, device=positions.device, dtype=torch.float)  # avoid sync point
     phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
-    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
+    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)  # OFFICIAL is torch.float32, HOWEVER self_attn.in_proj_weight = torch.float16
 
 
 class StreamingMultiheadAttention(nn.Module):
 
     def __init__(self,
                  embed_dim,
                  num_heads,
                  cross_attention = False,
                  ):
         super().__init__()
         self.cross_attention = cross_attention
         self.embed_dim = embed_dim
         self.k_history = None  # previous k from the tokens already seen in the current generation - only for self-attn
         self.v_history = None  # clean up IN LM after finishing GENERATION - each of the 1...47 MHA layers has a different kv history
         self.num_heads = num_heads
         self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)
         self.register_buffer('in_proj_weight', torch.ones((3 * embed_dim, embed_dim),
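For orientation, a minimal shape check of the rewritten `create_sin_embedding`; the batch size and `dim=1536` are illustrative (they mirror the `StreamingTransformer` defaults further down), not a test from the repo:

```python
import torch
from audiocraft.transformer import create_sin_embedding

# positions of shape [B, T, 1] broadcast against the half_dim frequency axis -> output [B, T, dim]
positions = torch.zeros(2, 1, 1) + 7   # index of the token currently being generated, per batch item
pos_emb = create_sin_embedding(positions, dim=1536, max_period=10000)
print(pos_emb.shape)                   # torch.Size([2, 1, 1536])
```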
@@ -52,58 +38,56 @@ class StreamingMultiheadAttention(nn.Module):
                 value=None):
         layout = "b h t d"
         if self.cross_attention:
             # Different queries, keys, values: we have to split the in_proj_weight manually
             dim = self.in_proj_weight.shape[0] // 3
             q = nn.functional.linear(query, self.in_proj_weight[:dim])
             k = nn.functional.linear(key, self.in_proj_weight[dim: 2 * dim])
             v = nn.functional.linear(value, self.in_proj_weight[2 * dim:])
             q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
             # print(q.shape, k.shape, v.shape, q.sum(), k.sum(), v.sum(), 'CROSS A5')
         else:
             # The 1st projection makes k, v (instantaneous).
             # This else branch is self-attention of audio with itself (above is cross-attention with txt).
             # HISTORY - DIFFERENT FOR EACH TRANSF LAYER
             projected = nn.functional.linear(query, self.in_proj_weight)
 
             bound_layout = "b h p t d"
             packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
             q, k, v = packed.unbind(dim=2)
 
             if self.k_history is not None:
                 # flush
                 if self.k_history.shape[2] > 71:
                     self.k_history = torch.cat([self.k_history[:, :, :4, :], self.k_history[:, :, -1:, :]], 2)
                     self.v_history = torch.cat([self.v_history[:, :, :4, :], self.v_history[:, :, -1:, :]], 2)
                 # fill new k/v
                 self.k_history = torch.cat([self.k_history, k], 2)  # if ctrl^c lands here during a live demo it is non-atomic: k != v,
                 self.v_history = torch.cat([self.v_history, v], 2)  # so it would try to continue with incompatible k/v dims!
             else:
                 # init
                 self.k_history = k
                 self.v_history = v
             # For self-attn prepare
             k = self.k_history
             v = self.v_history
 
             # KV COMPLETION ONLY ON SELF ATTENTION
 
         x = torch.nn.functional.scaled_dot_product_attention(
             q, k, v, is_causal=False, dropout_p=0
         )
-        x = x.to(q.dtype)
         x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
         x = self.out_proj(x)
         return x
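A small sketch of the flush rule in the self-attention branch above; only the `> 71` threshold and the keep-first-4 / keep-last-1 slicing come from the code, the tensor sizes are hypothetical:

```python
import torch

k_history = torch.randn(1, 24, 72, 64)   # (batch, heads, cached steps, head_dim) - illustrative sizes
if k_history.shape[2] > 71:
    # keep the 4 oldest and the single most recent cached steps, drop everything in between
    k_history = torch.cat([k_history[:, :, :4, :], k_history[:, :, -1:, :]], 2)
print(k_history.shape[2])                 # 5; the k of the current step is concatenated after this
```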
@@ -111,14 +95,14 @@ class StreamingMultiheadAttention(nn.Module):
 
 class StreamingTransformerLayer(nn.Module):
 
     def __init__(self,
                  d_model,
                  num_heads,
                  dim_feedforward):
         super().__init__()
         self.self_attn = StreamingMultiheadAttention(embed_dim=d_model,
                                                      num_heads=num_heads)
         self.linear1 = nn.Linear(d_model, dim_feedforward, bias=False)
@@ -126,7 +110,7 @@ class StreamingTransformerLayer(nn.Module):
         self.cross_attention = StreamingMultiheadAttention(embed_dim=d_model,
                                                            num_heads=num_heads,
                                                            cross_attention=True)
         self.norm_cross = nn.LayerNorm(d_model, eps=1e-5)
         self.norm1 = nn.LayerNorm(d_model, eps=1e-5)
         self.norm2 = nn.LayerNorm(d_model, eps=1e-5)
@@ -135,30 +119,30 @@ class StreamingTransformerLayer(nn.Module):
                 src,
                 cross_attention_src=None):  # txtcond
         '''T is saved float16 weights - should we cast src to float16'''
         x = src
         x = x + self.self_attn(self.norm1(x))
         if cross_attention_src is not None:
             x = x + self.cross_attention(
                 query=self.norm_cross(x),
                 key=cross_attention_src,
                 value=cross_attention_src)  # txt condition
         x = x + self.linear2(F.gelu(self.linear1(self.norm2(x))))
         return x
 
 
 class StreamingTransformer(nn.Module):
 
     def __init__(self,
                  d_model=1536,
                  num_heads=24,
                  num_layers=48,
                  dim_feedforward=6144,
                  cross_attention = True,
                  positional_embedding: str = 'sin',
                  max_period: float = 10_000
                  ):
         super().__init__()
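A hypothetical smoke test of `StreamingTransformerLayer` as it stands after this change (the sizes follow the `StreamingTransformer` defaults above; the text-conditioning length of 12 is made up):

```python
import torch
from audiocraft.transformer import StreamingTransformerLayer

layer = StreamingTransformerLayer(d_model=1536, num_heads=24, dim_feedforward=6144)
audio = torch.randn(2, 1, 1536)   # [B, T, C] audio hidden states, one new token per step
txt = torch.randn(2, 12, 1536)    # [B, S, C] text conditioning for the cross-attention branch
out = layer(audio, cross_attention_src=txt)
print(out.shape)                  # torch.Size([2, 1, 1536])
```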
@@ -170,23 +154,22 @@ class StreamingTransformer(nn.Module):
         for idx in range(num_layers):
             self.layers.append(
                 StreamingTransformerLayer(
                     d_model=d_model,
                     num_heads=num_heads,
                     dim_feedforward=dim_feedforward
                 )
             )
 
     def forward(self,
                 x,
                 token_count=None,
                 cross_attention_src=None):
         if self.positional_embedding in ['sin', 'sin_rope']:
-
-
+            pos_emb = create_sin_embedding(torch.tensor([[[.0]], [[.0]]], device=x.device) + token_count, x.shape[2], max_period=self.max_period)
+
             x = x + pos_emb
         for j, lay in enumerate(self.layers):
-            # print(f'Transf Layer c{j} {pos_emb.sum()=} {pos_emb.shape=}{x.sum()=}___________________')
             x = lay(x, cross_attention_src=cross_attention_src)  # cross_attention_src = txt-cond x audio
             # self attn = audio x audio
             # Every layer (mha) keeps its own kv cache
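And a sketch of how the new positional-embedding line in `StreamingTransformer.forward` offsets the sinusoid by `token_count` during generation (values illustrative; the hard-coded `[[[.0]], [[.0]]]` implies a batch of two streams, as in the diff):

```python
import torch
from audiocraft.transformer import create_sin_embedding

x = torch.randn(2, 1, 1536)                 # [B, T, C]: one new token per decoding step
token_count = 7                             # how many tokens have already been generated
pos = torch.tensor([[[.0]], [[.0]]], device=x.device) + token_count   # [2, 1, 1]
pos_emb = create_sin_embedding(pos, x.shape[2], max_period=10_000)
x = x + pos_emb                             # same addition as in forward() above
```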