Cleanup model implementation
- Added support for an inference cache (see the usage sketch below).
- Refactored common code in the attention implementations.
- Removed unused code (fragments from another project).
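As a rough illustration of what the new cache path enables, the sketch below runs generation with use_cache=True. The checkpoint path, tokenizer, and prompt are placeholders and not part of this commit; the generate() call assumes the standard transformers generation API.

# Sketch only: checkpoint path and prompt are hypothetical.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/walsh-checkpoint")
model = AutoModelForCausalLM.from_pretrained("path/to/walsh-checkpoint", trust_remote_code=True)

inputs = tokenizer("The Walsh transform", return_tensors="pt")
with torch.no_grad():
    # With use_cache=True, each attention layer stores its key/value projections
    # in past_key_values, so each decoding step only projects the newest token
    # instead of re-running the projections over the whole prefix.
    output_ids = model.generate(**inputs, max_new_tokens=32, use_cache=True)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))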
- modelling_walsh.py +64 -294
modelling_walsh.py
CHANGED
@@ -340,7 +340,6 @@ class HFCausalModel(PreTrainedModel):
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

-        # NOTE: "RSWalsh" models don't need to have their absolute positions adjusted to zero; they are trained for this.
        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
@@ -420,6 +419,7 @@ class HFCausalModel(PreTrainedModel):
            num_heads=config.num_attention_heads,
            attn_type=attn_type,
            layer_idx=layer_idx,
+            config=config,
            **config.attention_args,
        )

@@ -516,25 +516,6 @@ class Transformer(nn.Module):
        init.constant_(self.output_projection.bias, 0.)
        init.normal_(self.embedding.weight, std=self.d_model**-0.5)

-# A vanilla positional encoder
-class PositionalEncoder(nn.Module):
-    def __init__(self, d_embed, max_seq):
-        super().__init__()
-        self.d_embed = d_embed
-        self.max_seq = max_seq
-
-        weight = torch.zeros(max_seq, d_embed)
-        position = torch.arange(0, max_seq, dtype=torch.float).unsqueeze(1)
-        div_term = torch.exp(torch.arange(0, d_embed, 2).float() * (-math.log(10000.0) / d_embed))
-        weight[:, 0::2] = torch.sin(position * div_term)
-        weight[:, 1::2] = torch.cos(position * div_term)
-        weight = weight.unsqueeze(0)
-        self.register_buffer('weight', weight)
-
-    def forward(self, x):
-        seq_len = x.size(-2)
-        return x + self.weight[:, :seq_len]
-
 # Converts a torch array of integers into their equivalent binary codes.
 def binary_tensor(x, bits):
     mask = 2**torch.arange(bits).to(x.device, x.dtype)
@@ -791,42 +772,6 @@ class FeedforwardLayer(nn.Module):
        init.constant_(self.linear1.bias, 0.)
        init.constant_(self.linear2.bias, 0.)

-# GLU Variants Improve Transformer
-# https://arxiv.org/pdf/2002.05202v1.pdf
-class SwiGLUFeedforwardLayer(nn.Module):
-    def __init__(
-        self,
-        d_model,
-        d_feedforward,
-        layer_idx,
-        beta=1.0,
-        dropout=0.1
-    ):
-        super().__init__()
-        self.d_model = d_model
-        self.d_feedforward = d_feedforward
-        self.beta = 1.0
-
-        self.linear1 = nn.Linear(self.d_model, self.d_feedforward * 2, bias=False)
-        self.linear2 = nn.Linear(self.d_feedforward, self.d_model, bias=False)
-        self.dropout = nn.Dropout(dropout)
-        self.reset_parameters()
-
-    def forward(self, x):
-        x, gate = self.linear1(x).chunk(2, dim=-1)
-        x = x * F.silu(gate)
-        x = self.dropout(x)
-        x = self.linear2(x)
-        return x
-
-    def reset_parameters(self):
-        # Deepnet initialization
-        # https://arxiv.org/pdf/2203.00555.pdf
-        w, g = self.linear1.weight.chunk(2, dim=0)
-        init.xavier_uniform_(w, gain=self.beta)
-        init.xavier_uniform_(g, gain=self.beta)
-        init.xavier_uniform_(self.linear2.weight, gain=self.beta)
-
 class CausalSelfAttention(nn.Module):
     def __init__(
         self,
@@ -838,6 +783,7 @@ class CausalSelfAttention(nn.Module):
        # flash2: Use Flash-Attention2 implementation; fastest; limited to int16 and bfloat16 types; least memory usage.
        attn_type,
        layer_idx,
+        config,
        beta=1.0,
        dropout=0.1,
    ):
@@ -847,6 +793,7 @@ class CausalSelfAttention(nn.Module):
        self.beta = beta
        self.attn_type = attn_type
        self.layer_idx = layer_idx
+        self.config = config

        assert d_model % num_heads == 0, "d_model must be evenly divisible by num_heads"

@@ -877,9 +824,21 @@ class CausalSelfAttention(nn.Module):
        init.constant_(self.in_proj.bias, 0.)
        init.constant_(self.output_linear.bias, 0.)

-    def project_input(self, qkv):
+    # Project QKV input through input matrices, reshape to (batch_size, n_heads, seq_len, d_model), and apply cache.
+    def project_input(self, qkv, past_key_values):
+        batch_size, seq_len, d_embed = qkv.shape
        proj = self.in_proj(qkv)
-        return proj.chunk(chunks=3, dim=-1)
+        query, key, value = proj.chunk(chunks=3, dim=-1)
+
+        # Split projections into multiple heads and swap position of sequence / heads dimension
+        query = query.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
+        key = key.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
+        value = value.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
+
+        # Update the cache values.
+        if past_key_values is not None:
+            key, value = past_key_values.update(key, value, self.layer_idx)
+        return query, key, value

    def forward(
        self,
@@ -888,7 +847,15 @@ class CausalSelfAttention(nn.Module):
        past_key_values,
        use_cache,
    ):
-        if self.attn_type == "flash2":
+        attn_type = self.attn_type
+        if output_attentions and attn_type != "native":
+            logger.warning_once(
+                "CausalSelfAttention(output_attentions=True) and attn_type is not 'native': "
+                "Forcing native attention."
+            )
+            attn_type = "native"
+
+        if attn_type == "flash2":
            if use_cache is None or use_cache == False:
                return self.flash2_forward(qkv)
            else:
@@ -898,21 +865,15 @@ class CausalSelfAttention(nn.Module):
        batch_size, seq_len, d_embed = qkv.shape

        # Feed the inputs through the K, Q, V matrices.
-        query, key, value = self.project_input(qkv)
-
-
-        query = query.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
-        key = key.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
-        value = value.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
-
-        # Update the cache values.
-        if past_key_values is not None:
-            key, value = past_key_values.update(key, value, self.layer_idx)
-
+        query, key, value = self.project_input(qkv, past_key_values)
+        kv_seq_len = key.shape[-2]
+
        # Default to returning empty attention weights.
        attentions = None
+
+        # https://github.com/pytorch/pytorch/issues/112577

-        if self.attn_type == "torch":
+        if attn_type == "torch":
            # This context manager can be used to force which implementation to use.
            #with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
            attended_values = F.scaled_dot_product_attention(
@@ -921,7 +882,7 @@ class CausalSelfAttention(nn.Module):
                value,
                attn_mask=None,
                dropout_p=self.dropout.p if self.training else 0.0,
-                is_causal=True,
+                is_causal=(seq_len > 1),
                scale=self.dot_product_scale
            )
        # "native" scaled-dot-product attention implementation.
@@ -930,13 +891,14 @@ class CausalSelfAttention(nn.Module):
            scores = torch.matmul(query, key.transpose(-2, -1)) * self.dot_product_scale

            # Mask future positions from the past
-            scores.masked_fill_(
-                torch.tril(
-                    torch.ones(seq_len, seq_len, dtype=torch.bool, device=qkv.device),
-                    diagonal=0,
-                ).logical_not(),
-                float('-inf'),
-            )
+            if seq_len > 1:
+                scores.masked_fill_(
+                    torch.tril(
+                        torch.ones(seq_len, kv_seq_len, dtype=torch.bool, device=qkv.device),
+                        diagonal=0,
+                    ).logical_not(),
+                    float('-inf'),
+                )

            # Calculate the attention weights; avoid NANs that might emerge from zeros in softmax's denominator
            attentions = self.dropout(torch.softmax(scores, dim=-1).clamp(min=1e-10))
@@ -956,10 +918,10 @@ class CausalSelfAttention(nn.Module):
        return dict(
            hidden_states=attended_values,
            attentions=attentions,
-
-            past_key_values=None
+            past_key_values=past_key_values
        )
-
+
+    # No cache support, but faster
    def flash2_forward(
        self,
        qkv,
@@ -977,9 +939,9 @@ class CausalSelfAttention(nn.Module):
            -1,
            (3, self.num_heads, self.d_head)
        )
-
+
        attended_values = flash_attn_qkvpacked_func(
-            qkv.bfloat16(),
+            self._downcast_to_float16(qkv)[0],
            dropout_p=self.dropout.p if self.training else 0.0,
            softmax_scale=self.dot_product_scale,
            causal=True,
@@ -1007,18 +969,8 @@ class CausalSelfAttention(nn.Module):
        batch_size, seq_len, d_embed = qkv.shape

        # Feed the inputs through the K, Q, V matrices.
-        query, key, value = self.project_input(qkv)
-
-        # TODO: Refactor -- this code is repeated in the baseline implementation.
-        # Split projections into multiple heads and swap position of sequence / heads dimension
-        query = query.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
-        key = key.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
-        value = value.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
-
-        if past_key_values is not None:
-            key, value = past_key_values.update(key, value, self.layer_idx)
-
-        #query, key, value = self._downcast_to_float16(query, key, value)
+        query, key, value = self.project_input(qkv, past_key_values)
+        query, key, value = self._downcast_to_float16(query, key, value)

        # Expected inputs to flash2:
        # q: (batch_size, seqlen, nheads, headdim)
@@ -1049,204 +1001,22 @@ class CausalSelfAttention(nn.Module):
            past_key_values=past_key_values
        )

-
-
-
-
-
-
-
-
-
-            elif hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
-            else:
-                target_dtype = self.q_proj.weight.dtype
-            logger.warning_once(
-                f"The input hidden states seems to be silently casted in float32, this might be related to"
-                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
-                f" {target_dtype}."
-            )
-            query = query.to(target_dtype)
-            key = key.to(target_dtype)
-            value = value.to(target_dtype)
-        return query, key, value
-
-
-########### TODO: Update to newer API, with inference cache
-
-# Attention layer with ALiBi relative positional encoding
-# TRAIN SHORT, TEST LONG: ATTENTION WITH LINEAR BIASES ENABLES INPUT LENGTH EXTRAPOLATION
-# https://arxiv.org/pdf/2108.12409.pdf
-def alibi_biases(query_len, key_len, device='cpu'):
-    x = torch.arange(key_len, device=device)[None, :]
-    y = torch.arange(query_len, device=device)[:, None]
-    return x - y
-
-class CausalAlibiAttention(nn.Module):
-    def __init__(
-        self,
-        d_model,
-        num_heads,
-        beta=1.0,
-        dropout=0.1,
-        # values:
-        # native: Use local impementation; slowest option; good for debugging; useful when experimenting with non-standard stuff.
-        # torch: Use pytorch "scaled_dot_product_attention()"; faster; generally good compatibility; does not support returning attn weights.
-        # flash2: Use Flash-Attention2 implementation; fastest; limited to int16 and bfloat16 types; can't train Alibi weights; least memory usage.
-        # Note: You can perform initial training with "torch," then switch to "flash2," after the Alibi weights have settled.
-        window_size=None,
-        attn_type="native",
-        freeze_alibi=True,
-    ):
-        super().__init__()
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.beta = beta
-        self.attn_type = attn_type
-
-        assert d_model % num_heads == 0, "d_model must be evenly divisible by num_heads"
-
-        # The dimension of each head.
-        self.d_head = d_model // num_heads
-
-        # We scale the attention scores by the inverse-square-root of the head dimension
-        # this shifts the temerature of softmax.
-        self.dot_product_scale = 1.0 / math.sqrt(self.d_head)
-
-        self.in_proj = nn.Parameter(torch.empty(3 * self.d_model, self.d_model))
-        self.output_linear = nn.Linear(self.d_model, self.d_model, bias=False)
-
-        if window_size is not None:
-            self.window_size=(window_size, -1)
-        else:
-            self.window_size = (-1, -1)
-
-        self.dropout = nn.Dropout(dropout)
-
-        # This generates the original slope distribution from the paper.
-        # Observations with trainable slopes suggest that the high half of the slopes shift
-        # towards / past 1.0 and the low half approach zero or even go slightly negative.
-        # alibi_slopes = 1.0 / torch.logspace(1, 8, self.num_heads, base=2, dtype=torch.float)
-
-        # These appear to work better, as initial values, in practice.
-        alibi_slopes = 1.0 / torch.logspace(0, 7, self.num_heads, base=2, dtype=torch.float)
-
-        # If not trainable, it can improve performance somewhat if the low half are set to zero. Apparently
-        # making roughly half of the slopes position-agnostic is somehow closer to optimal?
-        # alibi_slopes.masked_fill_(torch.where(torch.arange(0, self.num_heads) >= (self.num_heads / 2), True, False), 0)
-
-        self.alibi_slopes = nn.Parameter(alibi_slopes)
-
-        # Optionally, allow/disallow training of ALiBi slopes.
-        self.alibi_slopes.requires_grad = (not freeze_alibi)
-        self.reset_parameters()
-
-    def extra_repr(self) -> str:
-        return f'd_model={self.d_model}, num_heads={self.num_heads}, beta={self.beta}, attn_type={self.attn_type}, window_size={self.window_size}, dropout={self.dropout}'
-
-    def reset_parameters(self):
-        # Deepnet initialization
-        # https://arxiv.org/pdf/2203.00555.pdf
-
-        q, k, v = self.in_proj.chunk(3)
-        init.xavier_uniform_(q, gain=1.0)
-        init.xavier_uniform_(k, gain=1.0)
-        init.xavier_uniform_(v, gain=self.beta)
-        init.xavier_uniform_(self.output_linear.weight, gain=self.beta)
-
-    def project_input(self, qkv):
-        proj = F.linear(qkv, self.in_proj)
-        return proj.chunk(chunks=3, dim=-1)
-
-    def forward(self, qkv, need_weights):
-        if self.attn_type == "flash2":
-            return self.flash2_forward(qkv)
-
-        # qkv: (batch_size, seq_len, d_embed)
-        batch_size, seq_len, d_embed = qkv.shape
-
-        # Feed the inputs through the K, Q, V matrices.
-        query, key, value = self.project_input(qkv)
-
-        # Split projections into multiple heads and swap position of sequence / heads dimension
-        query = query.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
-        key = key.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
-        value = value.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
-
-        # Apply Alibi relative positional biases.
-        attn_bias = alibi_biases(seq_len, seq_len, device=query.device) * self.alibi_slopes.view(-1, 1, 1)
-
-        # Mask future positions from the past
-        causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=qkv.device), diagonal=0)
-        attn_bias.masked_fill_(causal_mask.logical_not(), float('-inf'))
-        del causal_mask
-
-        # Default to returning empty attention weights.
-        attention_weights = None
-
-        if self.attn_type == "torch":
-            # This context manager can be used to force which implementation to use.
-            #with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
-            attended_values = F.scaled_dot_product_attention(
-                query,
-                key,
-                value,
-                attn_mask=attn_bias.to(dtype=query.dtype),
-                dropout_p=self.dropout.p if self.training else 0.0,
-                is_causal=False,
-                scale=self.dot_product_scale
-            )
-        # "native" scaled-dot-product attention implementation.
+    def _downcast_to_float16(self, *args):
+        if args[0].dtype != torch.float32:
+            return args
+
+        if torch.is_autocast_enabled():
+            target_dtype = torch.get_autocast_gpu_dtype()
+        # Handle the case where the model is quantized
+        elif hasattr(self.config, "_pre_quantization_dtype"):
+            target_dtype = self.config._pre_quantization_dtype
        else:
-
-
-
-
-
-
-            # Calculate the attention weights; avoid NANs that might emerge from zeros in softmax's denominator
-            attention_weights = self.dropout(torch.softmax(scores, dim=-1).clamp(min=1e-10))
-
-            # Use the attention weights to get a weighted combination of value vectors
-            attended_values = torch.matmul(attention_weights, value)
-        if not output_attentions:
-            attention_weights = None
-
-        # Concatenate attention heads and project to original embedding size using the output linear layer
-        attended_values = attended_values.transpose(1, 2).contiguous().view(batch_size, seq_len, d_embed)
-
-        # Project the concatenated output through the output matrix.
-        attended_values = self.output_linear(attended_values)
-        return attended_values, attention_weights
-
-    def flash2_forward(self, qkv):
-        batch_size, seq_len, d_embed = qkv.shape
-
-        # Feed the inputs through the K, Q, V matrices.
-        # query : (batch_size, seq_len, d_model)
-        # qkv : (batch_size, seq_len, 3, num_heads, d_kq)
-        qkv = F.linear(
-            qkv,
-            self.in_proj,
-        ).unflatten(
-            -1,
-            (3, self.num_heads, self.d_head)
+            target_dtype = self.output_linear.weight.dtype
+
+        logger.warning_once(
+            f"The input hidden states seems to be silently casted in float32, this might be related to"
+            f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+            f" {target_dtype}."
        )

-
-            qkv.bfloat16(),
-            dropout_p=self.dropout.p if self.training else 0.0,
-            softmax_scale=self.dot_product_scale,
-            causal=True,
-            window_size=self.window_size,
-            alibi_slopes=self.alibi_slopes.float(),
-        ).to(dtype=qkv.dtype)
-        # attended_values: (batch_size, seqlen, nheads, headdim)
-
-        # Concatentate heads back into d_embed
-        attended_values = attended_values.view(batch_size, seq_len, d_embed)
-
-        # Project the concatenated output through the output matrix.
-        attended_values = self.output_linear(attended_values)
-        return attended_values, None
+        return (arg.to(target_dtype) for arg in args)