Update modeling_gptbert.py
modeling_gptbert.py  CHANGED  (+32 −77)
@@ -96,6 +96,7 @@ class CastedLinearIn(nn.Linear):
 class MultiCastedLinearOrthoIn(nn.Module):
     def __init__(self, in_features, out_features, bias):
         super().__init__()
+
         self.in_features = in_features
         self.out_features = out_features
 
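The only change in this hunk is whitespace, but `MultiCastedLinearOrthoIn` reappears below, where FeedForward passes `out_features` as a list. A loudly hypothetical sketch of what such a multi-output, orthogonally-initialized, input-casting linear could look like; none of this body is taken from the file:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiLinearSketch(nn.Module):
    """Hypothetical reading of MultiCastedLinearOrthoIn: `out_features` is a list,
    one orthogonally-initialized weight per entry, outputs concatenated."""
    def __init__(self, in_features: int, out_features: list, bias: bool = False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weights = nn.ParameterList(
            [nn.Parameter(torch.empty(n, in_features)) for n in out_features]
        )
        for weight in self.weights:
            nn.init.orthogonal_(weight)
        assert not bias, "sketch only covers the bias=False case used in this file"

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Cast the input to the weight dtype (the "CastedIn" part), then concatenate
        # the per-weight projections along the feature dimension.
        return torch.cat([F.linear(x.type_as(w), w) for w in self.weights], dim=-1)
```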
@@ -178,15 +179,10 @@ class Embedding(nn.Module):
     def __init__(self, config: GptBertConfig):
         super().__init__()
 
-        assert hasattr(config, "vocab_size"), "The config must have a vocab_size attribute!"
-        assert hasattr(config, "hidden_size"), "The config must have a hidden_size attribute!"
-        assert hasattr(config, "embedding_dropout_p"), "The model must have a embedding_dropout_p attribute!"
-
         self.word_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
-        self.word_norm = nn.LayerNorm(config.hidden_size, eps=config.…
+        self.word_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False, bias=False)
         self.word_scale = nn.Parameter(torch.zeros(config.hidden_size))
-
-        self.dropout = nn.Dropout(config.embedding_dropout_p)
+        self.dropout = nn.Dropout(config.embedding_dropout)
 
     def forward(self, input_ids: torch.Tensor):
         word_embedding = self.word_embedding(input_ids)
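The embedding change swaps an affine LayerNorm for a parameter-free one (`elementwise_affine=False, bias=False`, which requires PyTorch ≥ 2.1) plus an explicit learned gain, `word_scale`, initialized to zeros. A minimal sketch of the pattern, assuming the gain is applied as `1 + word_scale`; the `forward` body here is illustrative, not this file's actual code:

```python
import torch
import torch.nn as nn

class EmbeddingSketch(nn.Module):
    # Illustrative re-statement of the new Embedding layout, not the file's code.
    def __init__(self, vocab_size: int, hidden_size: int, eps: float = 1e-5, dropout_p: float = 0.1):
        super().__init__()
        self.word_embedding = nn.Embedding(vocab_size, hidden_size)
        # Parameter-free norm: no weight or bias inside the LayerNorm itself.
        self.word_norm = nn.LayerNorm(hidden_size, eps=eps, elementwise_affine=False, bias=False)
        # Learned gain kept outside the norm; zero-init suggests a (1 + scale) application.
        self.word_scale = nn.Parameter(torch.zeros(hidden_size))
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        x = self.word_embedding(input_ids)
        x = self.word_norm(x.float()).type_as(x)  # normalize in fp32, cast back
        x = x * (1.0 + self.word_scale)           # assumed placement of the gain
        return self.dropout(x)
```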
@@ -200,9 +196,10 @@ class Classifier(nn.Module):
     def __init__(self, config: GptBertConfig, n_labels: int):
         super().__init__()
 
-        self.pre_norm = nn.LayerNorm(config.hidden_size, eps=config.…
+        self.pre_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False)
         self.projection = CastedLinearIn(config.hidden_size, config.hidden_size, bias=False)
-        self.post_norm = nn.LayerNorm(config.hidden_size, eps=config.…
+        self.post_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False)
+        self.dropout = nn.Dropout(config.classifier_dropout)
         self.emb2vocab = CastedLinearIn(config.hidden_size, n_labels, bias=True)
 
     def forward(self, x: torch.Tensor):
@@ -210,22 +207,13 @@ class Classifier(nn.Module):
         x = self.projection(x)
         x = gelu_new(x)
         x = self.post_norm(x.float()).type_as(x)
+        x = self.dropout(x)
         x = self.emb2vocab(x)
         return x
 
 
-def flash_attention_forward(
-    qkv: torch.Tensor,
-    rotary_emb: UnpaddedRotaryEmbedding,
-    cu_seqlens: torch.Tensor,
-    max_seqlen: int,
-    causal: bool,
-    local_attention: Tuple[int, int],
-    dropout_p: float,
-    deterministic: bool,
-    target_dtype: torch.dtype = torch.bfloat16,
-    **_kwargs,
-):
+# from https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modeling_modernbert.py
+def flash_attention_forward(qkv: torch.Tensor, rotary_emb: UnpaddedRotaryEmbedding, cu_seqlens: torch.Tensor, max_seqlen: int, causal: bool, local_attention: Tuple[int, int], dropout_p: float, deterministic: bool, target_dtype: torch.dtype = torch.bfloat16, **_kwargs):
     qkv = rotary_emb(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)
 
     convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
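`flash_attention_forward` is collapsed to a single-line signature and follows ModernBERT's unpadded path: tokens from all sequences are packed into one `(total_nnz, 3, nheads, headdim)` QKV tensor, with batch boundaries carried in `cu_seqlens`. For orientation, a hedged sketch of how such a packed tensor is typically handed to FlashAttention's varlen kernel (assuming the `flash_attn` package is installed; the commit's own function body is not shown in full in this hunk, and `local_attention` presumably maps to `window_size`):

```python
import torch
from flash_attn import flash_attn_varlen_qkvpacked_func

def varlen_attention_sketch(qkv: torch.Tensor, cu_seqlens: torch.Tensor, max_seqlen: int,
                            dropout_p: float = 0.0, causal: bool = False,
                            window_size: tuple = (-1, -1), deterministic: bool = False):
    """Sketch: run FlashAttention over an unpadded (total_nnz, 3, nheads, headdim) tensor.

    FlashAttention only supports fp16/bf16, hence the cast-and-restore dance that
    flash_attention_forward performs around this call.
    """
    orig_dtype = qkv.dtype
    if orig_dtype not in (torch.float16, torch.bfloat16):
        qkv = qkv.to(torch.bfloat16)
    out = flash_attn_varlen_qkvpacked_func(
        qkv,
        cu_seqlens=cu_seqlens,      # (batch + 1,) int32 prefix sums of sequence lengths
        max_seqlen=max_seqlen,      # longest sequence in the batch
        dropout_p=dropout_p,
        causal=causal,
        window_size=window_size,    # (-1, -1) = global; (w, w) = sliding window
        deterministic=deterministic,
    )
    return out.to(orig_dtype)       # (total_nnz, nheads, headdim)
```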
@@ -265,8 +253,8 @@ class SelfAttention(nn.Module):
         self.config = config
         self.layer_idx = layer_idx
 
-        self.d_qk = config.d_qk
-        self.d_v = config.d_v
+        self.d_qk = config.query_key_head_size
+        self.d_v = config.value_head_size
         self.num_attention_heads = config.num_attention_heads
         self.num_kv_heads = config.num_kv_heads
         self.hidden_size = config.hidden_size
@@ -279,23 +267,21 @@ class SelfAttention(nn.Module):
         self.v_proj = CastedLinearIn(self.hidden_size, self.v_out_dim, bias=False)
         self.out_proj = CastedLinearIn(self.d_v*self.num_attention_heads, self.hidden_size, bias=False)
 
-        self.pre_v_norm = nn.LayerNorm(config.hidden_size, eps=config.…
-        self.pre_qk_norm = nn.LayerNorm(config.hidden_size, eps=config.…
-        self.inter_norm = nn.LayerNorm(self.d_v * self.num_attention_heads, eps=config.…
-        self.q_norm = nn.LayerNorm(…
-        self.k_norm = nn.LayerNorm(…
-        self.k_scale = nn.Parameter(torch.ones(self.num_kv_heads, …
-        self.q_scale = nn.Parameter(torch.ones(self.num_attention_heads, …
+        self.pre_v_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False)
+        self.pre_qk_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False)
+        self.inter_norm = nn.LayerNorm(self.d_v * self.num_attention_heads, eps=config.layer_norm_eps, elementwise_affine=False)
+        self.q_norm = nn.LayerNorm(self.d_qk, eps=config.layer_norm_eps, elementwise_affine=False, bias=False)
+        self.k_norm = nn.LayerNorm(self.d_qk, eps=config.layer_norm_eps, elementwise_affine=False, bias=False)
+        self.k_scale = nn.Parameter(torch.ones(self.num_kv_heads, self.d_qk))
+        self.q_scale = nn.Parameter(torch.ones(self.num_attention_heads, self.d_qk))
 
-        self.dropout = nn.Dropout(config.…
-        self.attention_dropout = config.attention_dropout if hasattr(config, "attention_dropout") else 0.0
-        self.deterministic_flash_attn = getattr(config, "deterministic_flash_attn", False)
+        self.dropout = nn.Dropout(config.hidden_dropout)
 
         theta = 160_000 if (layer_idx + 1) % config.short_long_ratio == 0 else 10_000
 
         # Initialize rotary embeddings based on whether FlashAttention is available
         if self.config._attn_implementation == "flash_attention_2":
-            self.rope_embedding = UnpaddedRotaryEmbedding(dim=…
+            self.rope_embedding = UnpaddedRotaryEmbedding(dim=self.d_qk, base=theta, max_seqlen=config.max_sequence_length)
         else:
             self.rope_embedding = RotaryPositionalEmbeddings(config, theta)
 
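The rewritten attention setup norms queries and keys per head with parameter-free LayerNorms over `d_qk`, and reintroduces learnable gains through `q_scale`/`k_scale` (ones-initialized, one gain vector per head). A minimal sketch of how these pieces typically compose; the shapes and the exact application point are assumptions:

```python
import torch
import torch.nn as nn

def qk_norm_sketch(q: torch.Tensor, k: torch.Tensor,
                   q_norm: nn.LayerNorm, k_norm: nn.LayerNorm,
                   q_scale: torch.Tensor, k_scale: torch.Tensor):
    """Sketch: QK-norm with learnable per-head gains.

    q: (batch, num_heads, seq, d_qk), k: (batch, num_kv_heads, seq, d_qk)
    q_scale: (num_heads, d_qk), k_scale: (num_kv_heads, d_qk), ones-initialized.
    """
    # Normalize each head's query/key vectors in fp32 for stability, then cast back.
    q = q_norm(q.float()).type_as(q)
    k = k_norm(k.float()).type_as(k)
    # Broadcast the per-head gain over the batch and sequence dimensions.
    q = q * q_scale.unsqueeze(-2)   # (num_heads, 1, d_qk)
    k = k * k_scale.unsqueeze(-2)   # (num_kv_heads, 1, d_qk)
    return q, k
```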
@@ -338,7 +324,7 @@ class SelfAttention(nn.Module):
             key=key,
             value=value,
             attn_mask=attention_mask,
-            dropout_p=self.attention_dropout if self.training else 0.0,
+            dropout_p=self.config.attention_dropout if self.training else 0.0,
             is_causal=self.is_causal
         )
         return output
@@ -394,8 +380,8 @@ class SelfAttention(nn.Module):
             max_seqlen,
             self.is_causal,
             local_attention,
-            self.attention_dropout if self.training else 0.0,
-            self.deterministic_flash_attn
+            self.config.attention_dropout if self.training else 0.0,
+            self.config.deterministic_flash_attn
         )
 
         # Reshape output back
@@ -434,12 +420,12 @@ class SelfAttention(nn.Module):
 class FeedForward(nn.Module):
     def __init__(self, config: GptBertConfig):
         super().__init__()
-        self.pre_norm = nn.LayerNorm(config.hidden_size, eps=config.…
+        self.pre_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False)
         self.up_proj = MultiCastedLinearOrthoIn(config.hidden_size, [config.intermediate_size, config.intermediate_size], bias=False)
         self.activation = GeGLU()
-        self.inter_norm = nn.LayerNorm(config.intermediate_size, eps=config.…
+        self.inter_norm = nn.LayerNorm(config.intermediate_size, eps=config.layer_norm_eps, elementwise_affine=False)
         self.down_proj = CastedLinearIn(config.intermediate_size, config.hidden_size, bias=False)
-        self.dropout = nn.Dropout(config.…
+        self.dropout = nn.Dropout(config.hidden_dropout)
 
     def forward(self, x: torch.Tensor):
         x = self.pre_norm(x.float()).type_as(x)
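`up_proj` produces two `intermediate_size` halves (hence the `[config.intermediate_size, config.intermediate_size]` output spec of `MultiCastedLinearOrthoIn`), which the `GeGLU` activation combines by gating one half with the GELU of the other. A sketch of the standard GeGLU formulation this presumably implements; the `approximate="tanh"` choice mirrors the `gelu_new` used elsewhere in the file, but whether `GeGLU` uses the exact or approximate GELU is an assumption:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class GeGLUSketch(nn.Module):
    """Sketch of GeGLU: gate half of the up-projection with GELU of the other half."""
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (..., 2 * intermediate_size), produced by the doubled up-projection
        value, gate = x.chunk(2, dim=-1)
        return value * F.gelu(gate, approximate="tanh")
```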
@@ -451,16 +437,10 @@ class FeedForward(nn.Module):
         return x
 
 
+# from https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modeling_modernbert.py
 class ApplyRotaryEmbUnpad(torch.autograd.Function):
     @staticmethod
-    def forward(
-        ctx,
-        qkv,
-        cos,
-        sin,
-        cu_seqlens: Optional[torch.Tensor] = None,
-        max_seqlen: Optional[int] = None,
-    ):
+    def forward(ctx, qkv, cos, sin, cu_seqlens: Optional[torch.Tensor] = None, max_seqlen: Optional[int] = None):
         # (total_nnz, 3, nheads, headdim)
         qkv = qkv.contiguous()
         total_nnz, _three, _nheads, headdim = qkv.shape
@@ -468,16 +448,7 @@ class ApplyRotaryEmbUnpad(torch.autograd.Function):
         # we get the same tensor
         # qk = rearrange(qkv[:, :2], "b_s t h d -> b_s (t h) d")
         qk = qkv[:, :2].view(total_nnz, -1, headdim)
-        apply_rotary(
-            qk,
-            cos,
-            sin,
-            seqlen_offsets=0,
-            cu_seqlens=cu_seqlens,
-            max_seqlen=max_seqlen,
-            interleaved=False,
-            inplace=True,
-        )
+        apply_rotary(qk, cos, sin, seqlen_offsets=0, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, interleaved=False, inplace=True)
 
         ctx.save_for_backward(cos, sin, cu_seqlens)
         ctx.max_seqlen = max_seqlen
@@ -506,10 +477,12 @@ class ApplyRotaryEmbUnpad(torch.autograd.Function):
         return do, None, None, None, None, None, None
 
 
+# from https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modeling_modernbert.py
 def apply_rotary_unpadded(qkv, cos, sin, cu_seqlens: Optional[torch.Tensor] = None, max_seqlen: Optional[int] = None):
     return ApplyRotaryEmbUnpad.apply(qkv, cos, sin, cu_seqlens, max_seqlen)
 
 
+# from https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modeling_modernbert.py
 class UnpaddedRotaryEmbedding(RotaryEmbedding):
     def __init__(self, dim: int, base: float = 10000.0, max_seqlen: Optional[int] = None):
         super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=None, interleaved=False)
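`apply_rotary_unpadded` and `UnpaddedRotaryEmbedding` (both credited to ModernBERT) rotate Q and K in place on the packed tensor via the custom autograd Function, so no padded intermediate is ever materialized. A hypothetical usage sketch; the shapes are invented, and the cos/sin values are placeholders for the tables the parent `RotaryEmbedding` would normally cache:

```python
import torch

# Two sequences of lengths 3 and 5 packed into total_nnz = 8 tokens (hypothetical shapes).
total_nnz, nheads, headdim = 8, 12, 64
qkv = torch.randn(total_nnz, 3, nheads, headdim, device="cuda", dtype=torch.bfloat16)
cu_seqlens = torch.tensor([0, 3, 8], dtype=torch.int32, device="cuda")  # prefix sums
max_seqlen = 5

# Placeholder tables; in the module these come from the parent RotaryEmbedding's
# cached cos/sin buffers, one row per position, headdim // 2 wide.
cos = torch.randn(max_seqlen, headdim // 2, device="cuda")
sin = torch.randn(max_seqlen, headdim // 2, device="cuda")

# Rotates the Q and K slices of qkv in place (V is untouched) and stays
# differentiable through ApplyRotaryEmbUnpad.
qkv = apply_rotary_unpadded(qkv, cos, sin, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)
```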
@@ -537,19 +510,7 @@ class RotaryPositionalEmbeddings(nn.Module):
     def __init__(self, config, theta: int):
         super().__init__()
 
-
-        assert hasattr(config, "max_sequence_length"), "The config must have a max_sequence_length attribute!"
-
-        self.inv_freq: torch.Tensor
-        self.cos_matrix: torch.Tensor
-        self.sin_matrix: torch.Tensor
-        head_size: int
-        max_seq_len: int
-        inv_freq: torch.Tensor
-        pos: torch.Tensor
-        embedding: torch.Tensor
-
-        head_size = config.d_qk
+        head_size = config.query_key_head_size
         assert head_size % 2 == 0
         max_seq_len = config.max_sequence_length
 
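Between these lines (outside the hunk) the module builds its tables from `inv_freq` and the position indices, using the per-layer base chosen earlier (`theta` is 10_000 on short-range layers and 160_000 on every `short_long_ratio`-th layer). A standard construction consistent with the surviving names (`inv_freq`, `pos`, `embedding`, `cos_matrix`, `sin_matrix`), offered as an assumption rather than the file's exact code:

```python
import torch

def build_rope_tables_sketch(head_size: int, max_seq_len: int, theta: float):
    # One frequency per pair of channels, geometrically spaced by the base theta.
    inv_freq = 1.0 / (theta ** (torch.arange(0, head_size, 2).float() / head_size))
    pos = torch.arange(max_seq_len).float()
    freqs = torch.outer(pos, inv_freq)             # (max_seq_len, head_size // 2)
    embedding = torch.cat((freqs, freqs), dim=-1)  # (max_seq_len, head_size)
    return embedding.cos(), embedding.sin()        # cos_matrix, sin_matrix buffers
```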
@@ -561,12 +522,6 @@ class RotaryPositionalEmbeddings(nn.Module):
         self.register_buffer("sin_matrix", embedding.sin(), persistent=False)
 
     def forward(self, x: torch.Tensor):
-        seq_len: int
-        cos_matrix: torch.Tensor
-        sin_matrix: torch.Tensor
-        x_rotate_half: torch.Tensor
-        out: torch.Tensor
-
         hidden_layer = x.float()
 
         seq_len = x.shape[2]
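On the non-FlashAttention path, `RotaryPositionalEmbeddings` keeps fp32 `cos_matrix`/`sin_matrix` buffers and applies the rotation with the usual rotate-half identity (the removed annotations even named an `x_rotate_half` temporary). A minimal sketch of that forward, assuming `(batch, heads, seq, head_size)` inputs and buffers of shape `(max_seq_len, head_size)`:

```python
import torch

def rope_rotate_half_sketch(x: torch.Tensor, cos_matrix: torch.Tensor, sin_matrix: torch.Tensor):
    """Sketch: x is (batch, heads, seq, head_size); buffers are (max_seq_len, head_size)."""
    seq_len = x.shape[2]
    hidden = x.float()                  # rotate in fp32, as the module does
    cos = cos_matrix[:seq_len]          # (seq, head_size), broadcasts over batch/heads
    sin = sin_matrix[:seq_len]
    # rotate-half: (x1, x2) -> (-x2, x1) on the two halves of the head dimension
    x1, x2 = hidden.chunk(2, dim=-1)
    rotated = torch.cat((-x2, x1), dim=-1)
    out = hidden * cos + rotated * sin
    return out.type_as(x)
```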