koichi12 commited on
Commit
d33aea4
·
verified ·
1 Parent(s): 640f355

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete list.
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/xformers/_flash_attn/__pycache__/__init__.cpython-311.pyc +0 -0
  2. .venv/lib/python3.11/site-packages/xformers/_flash_attn/__pycache__/bert_padding.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/xformers/_flash_attn/__pycache__/flash_attn_interface.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/xformers/_flash_attn/__pycache__/flash_attn_triton.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/xformers/_flash_attn/__pycache__/flash_attn_triton_og.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/xformers/_flash_attn/__pycache__/flash_blocksparse_attention.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/xformers/_flash_attn/__pycache__/flash_blocksparse_attn_interface.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/xformers/_flash_attn/__pycache__/fused_softmax.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/xformers/_flash_attn/layers/__init__.py +0 -0
  10. .venv/lib/python3.11/site-packages/xformers/_flash_attn/layers/__pycache__/__init__.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/xformers/_flash_attn/layers/__pycache__/patch_embed.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/xformers/_flash_attn/layers/__pycache__/rotary.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/xformers/_flash_attn/layers/patch_embed.py +67 -0
  14. .venv/lib/python3.11/site-packages/xformers/_flash_attn/layers/rotary.py +481 -0
  15. .venv/lib/python3.11/site-packages/xformers/_flash_attn/losses/__init__.py +0 -0
  16. .venv/lib/python3.11/site-packages/xformers/_flash_attn/losses/__pycache__/__init__.cpython-311.pyc +0 -0
  17. .venv/lib/python3.11/site-packages/xformers/_flash_attn/losses/__pycache__/cross_entropy.cpython-311.pyc +0 -0
  18. .venv/lib/python3.11/site-packages/xformers/_flash_attn/losses/cross_entropy.py +85 -0
  19. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__init__.py +0 -0
  20. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/__init__.cpython-311.pyc +0 -0
  21. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/baichuan.cpython-311.pyc +0 -0
  22. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/bert.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/bigcode.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/btlm.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/falcon.cpython-311.pyc +0 -0
  26. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/gpt.cpython-311.pyc +0 -0
  27. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/gpt_neox.cpython-311.pyc +0 -0
  28. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/gptj.cpython-311.pyc +0 -0
  29. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/llama.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/opt.cpython-311.pyc +0 -0
  31. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/vit.cpython-311.pyc +0 -0
  32. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/baichuan.py +151 -0
  33. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/bert.py +764 -0
  34. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/bigcode.py +233 -0
  35. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/btlm.py +102 -0
  36. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/falcon.py +143 -0
  37. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/gpt.py +1080 -0
  38. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/gpt_neox.py +124 -0
  39. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/gptj.py +109 -0
  40. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/llama.py +422 -0
  41. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/opt.py +116 -0
  42. .venv/lib/python3.11/site-packages/xformers/_flash_attn/models/vit.py +373 -0
  43. .venv/lib/python3.11/site-packages/xformers/_flash_attn/ops/__init__.py +0 -0
  44. .venv/lib/python3.11/site-packages/xformers/_flash_attn/ops/__pycache__/__init__.cpython-311.pyc +0 -0
  45. .venv/lib/python3.11/site-packages/xformers/_flash_attn/ops/__pycache__/activations.cpython-311.pyc +0 -0
  46. .venv/lib/python3.11/site-packages/xformers/_flash_attn/ops/__pycache__/fused_dense.cpython-311.pyc +0 -0
  47. .venv/lib/python3.11/site-packages/xformers/_flash_attn/ops/__pycache__/layer_norm.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/xformers/_flash_attn/ops/__pycache__/rms_norm.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/xformers/_flash_attn/ops/activations.py +135 -0
  50. .venv/lib/python3.11/site-packages/xformers/_flash_attn/ops/fused_dense.py +688 -0
.venv/lib/python3.11/site-packages/xformers/_flash_attn/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (606 Bytes). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/__pycache__/bert_padding.cpython-311.pyc ADDED
Binary file (11 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/__pycache__/flash_attn_interface.cpython-311.pyc ADDED
Binary file (46.3 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/__pycache__/flash_attn_triton.cpython-311.pyc ADDED
Binary file (44.5 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/__pycache__/flash_attn_triton_og.cpython-311.pyc ADDED
Binary file (16.2 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/__pycache__/flash_blocksparse_attention.cpython-311.pyc ADDED
Binary file (8.15 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/__pycache__/flash_blocksparse_attn_interface.cpython-311.pyc ADDED
Binary file (7.53 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/__pycache__/fused_softmax.cpython-311.pyc ADDED
Binary file (9.45 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/layers/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/xformers/_flash_attn/layers/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (200 Bytes). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/layers/__pycache__/patch_embed.cpython-311.pyc ADDED
Binary file (3.34 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/layers/__pycache__/rotary.cpython-311.pyc ADDED
Binary file (20.1 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/layers/patch_embed.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # We use the same API as https://github.com/rwightman/pytorch-image-models/blob/v0.6.11/timm/models/layers/patch_embed.py
2
+ # But we use nn.Linear instead of Conv2d and it's about 8x faster.
3
+
4
+ from functools import partial
5
+
6
+ import torch.nn as nn
7
+ from einops import rearrange
8
+ from torch import _assert
9
+ from torch.nn.modules.utils import _pair
10
+
11
+ try:
12
+ from flash_attn.ops.fused_dense import FusedDense
13
+ except ImportError:
14
+ FusedDense = None
15
+
16
+
17
+ class PatchEmbed(nn.Module):
18
+ """2D Image to Patch Embedding"""
19
+
20
+ def __init__(
21
+ self,
22
+ img_size=224,
23
+ patch_size=16,
24
+ in_chans=3,
25
+ embed_dim=768,
26
+ norm_layer=None,
27
+ flatten=True,
28
+ bias=True,
29
+ fused_bias_fc=False,
30
+ ):
31
+ super().__init__()
32
+ img_size = _pair(img_size)
33
+ patch_size = _pair(patch_size)
34
+ self.img_size = img_size
35
+ self.patch_size = patch_size
36
+ self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
37
+ self.num_patches = self.grid_size[0] * self.grid_size[1]
38
+ self.flatten = flatten
39
+ if fused_bias_fc and FusedDense is None:
40
+ raise ImportError("fused_dense is not installed")
41
+
42
+ linear_cls = nn.Linear if not fused_bias_fc or not bias else FusedDense
43
+ self.proj = linear_cls(in_chans * patch_size[0] * patch_size[1], embed_dim, bias=bias)
44
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
45
+
46
+ def forward(self, x):
47
+ _, _, H, W = x.shape
48
+ _assert(
49
+ H == self.img_size[0],
50
+ f"Input image height ({H}) doesn't match model ({self.img_size[0]}).",
51
+ )
52
+ _assert(
53
+ W == self.img_size[1],
54
+ f"Input image width ({W}) doesn't match model ({self.img_size[1]}).",
55
+ )
56
+ x = self.proj(
57
+ rearrange(
58
+ x,
59
+ "b c (h p1) (w p2) -> b h w (c p1 p2)",
60
+ p1=self.patch_size[0],
61
+ p2=self.patch_size[1],
62
+ )
63
+ )
64
+ if self.flatten:
65
+ x = rearrange(x, "b h w c -> b (h w) c")
66
+ x = self.norm(x)
67
+ return x
.venv/lib/python3.11/site-packages/xformers/_flash_attn/layers/rotary.py ADDED
@@ -0,0 +1,481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, Tri Dao.
2
+
3
+ import math
4
+ from typing import Optional, Tuple, Union
5
+
6
+ import torch
7
+ from einops import rearrange, repeat
8
+ from flash_attn.ops.triton.rotary import apply_rotary
9
+
10
+
11
def rotate_half(x, interleaved=False):
    """Rotate the last dimension of ``x`` by 90 degrees in 2D sub-planes.

    With ``interleaved=False`` (GPT-NeoX style) the halves (x1, x2) map to
    (-x2, x1); with ``interleaved=True`` (GPT-J style) the same rotation is
    applied to (even, odd) channel pairs.
    """
    if interleaved:
        even, odd = x[..., ::2], x[..., 1::2]
        return rearrange(torch.stack((-odd, even), dim=-1), "... d two -> ... (d two)", two=2)
    first_half, second_half = x.chunk(2, dim=-1)
    return torch.cat((-second_half, first_half), dim=-1)
18
+
19
+
20
def apply_rotary_emb_torch(x, cos, sin, interleaved=False):
    """Pure-PyTorch rotary embedding (reference implementation).

    x: (batch_size, seqlen, nheads, headdim)
    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)

    Only the first ``rotary_dim`` channels of x are rotated; the rest pass
    through unchanged.
    """
    rotary_dim = cos.shape[-1] * 2
    assert rotary_dim <= x.shape[-1]
    # Duplicate each cos/sin entry across its channel pair and broadcast over heads.
    pattern = "... d -> ... 1 (d 2)" if interleaved else "... d -> ... 1 (2 d)"
    cos = repeat(cos, pattern)
    sin = repeat(sin, pattern)
    x_rot = x[..., :rotary_dim]
    rotated = x_rot * cos + rotate_half(x_rot, interleaved) * sin
    return torch.cat([rotated, x[..., rotary_dim:]], dim=-1)
33
+
34
+
35
class ApplyRotaryEmb(torch.autograd.Function):
    """Autograd wrapper around the fused ``apply_rotary`` kernel.

    The backward pass runs the same kernel with ``conjugate=True``, i.e. it
    rotates the incoming gradient by the opposite angle.
    """

    @staticmethod
    def forward(
        ctx,
        x,
        cos,
        sin,
        interleaved=False,
        inplace=False,
        seqlen_offsets: Union[int, torch.Tensor] = 0,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
    ):
        out = apply_rotary(
            x,
            cos,
            sin,
            seqlen_offsets=seqlen_offsets,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            interleaved=interleaved,
            inplace=inplace,
        )
        if isinstance(seqlen_offsets, int):
            ctx.save_for_backward(cos, sin, cu_seqlens)  # Can't save int with save_for_backward
            ctx.seqlen_offsets = seqlen_offsets
        else:
            # Tensor-valued offsets must go through save_for_backward.
            ctx.save_for_backward(cos, sin, cu_seqlens, seqlen_offsets)
            ctx.seqlen_offsets = None
        ctx.interleaved = interleaved
        ctx.inplace = inplace
        ctx.max_seqlen = max_seqlen
        return out if not inplace else x

    @staticmethod
    def backward(ctx, do):
        seqlen_offsets = ctx.seqlen_offsets
        if seqlen_offsets is None:
            # Offsets were a tensor, so they were stashed via save_for_backward.
            cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
        else:
            cos, sin, cu_seqlens = ctx.saved_tensors
        # TD [2023-09-02]: For some reason Triton (2.0.0.post1) errors with
        # "[CUDA]: invalid device context", and cloning makes it work. Idk why. Triton 2.1.0 works.
        if not ctx.interleaved and not ctx.inplace:
            do = do.clone()
        dx = apply_rotary(
            do,
            cos,
            sin,
            seqlen_offsets=seqlen_offsets,
            cu_seqlens=cu_seqlens,
            max_seqlen=ctx.max_seqlen,
            interleaved=ctx.interleaved,
            inplace=ctx.inplace,
            conjugate=True,  # inverse rotation for the gradient
        )
        # One gradient slot per forward argument (x + 7 non-differentiable args).
        return dx, None, None, None, None, None, None, None
92
+
93
+
94
def apply_rotary_emb(
    x,
    cos,
    sin,
    interleaved=False,
    inplace=False,
    seqlen_offsets: Union[int, torch.Tensor] = 0,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[int] = None,
):
    """Apply rotary embedding to the first rotary_dim channels of x.

    Arguments:
        x: (batch_size, seqlen, nheads, headdim) when cu_seqlens is None,
            otherwise (total_seqlen, nheads, headdim) for the varlen layout.
        cos, sin: (seqlen_rotary, rotary_dim / 2) precomputed tables.
        interleaved: if True, rotate pairs of even and odd dimensions
            (GPT-J style) instead of 1st half and 2nd half (GPT-NeoX style).
        inplace: if True, apply rotary embedding in-place.
        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by
            this amount; most commonly used in inference with a KV cache.
        cu_seqlens: (batch + 1,) cumulative sequence lengths, or None.
        max_seqlen: int, required together with cu_seqlens.
    Return:
        Tensor of the same shape as x. rotary_dim must be <= headdim.
    """
    fn_args = (x, cos, sin, interleaved, inplace, seqlen_offsets, cu_seqlens, max_seqlen)
    return ApplyRotaryEmb.apply(*fn_args)
125
+
126
+
127
+ # For backward compatibility
128
+ apply_rotary_emb_func = apply_rotary_emb
129
+
130
+
131
class ApplyRotaryEmbQKV_(torch.autograd.Function):
    """In-place rotary embedding on a packed QKV tensor, with autograd.

    Q and K are rotated in place; V is untouched. When Q and K use the same
    cos/sin tables and qkv is contiguous, Q and K are reshaped together so a
    single kernel launch handles both.
    """

    @staticmethod
    def forward(
        ctx,
        qkv,
        cos,
        sin,
        cos_k=None,
        sin_k=None,
        interleaved=False,
        seqlen_offsets: Union[int, torch.Tensor] = 0,
    ):
        batch, seqlen, three, nheads, headdim = qkv.shape
        assert three == 3
        if cos_k is None and sin_k is None and qkv.is_contiguous():
            # Call 1 kernel instead of 2 kernels
            # We need qkv to be contiguous so that when we reshape to combine (3, nheads)
            # dimensions, we get the same tensor
            # qk = rearrange(qkv[:, :, :2], "b s t h d -> b s (t h) d")
            qk = qkv[:, :, :2].reshape(batch, seqlen, -1, headdim)
            apply_rotary(
                qk, cos, sin, seqlen_offsets=seqlen_offsets, interleaved=interleaved, inplace=True
            )
        else:
            cos_k = cos if cos_k is None else cos_k
            sin_k = sin if sin_k is None else sin_k
            q, k = qkv[:, :, 0], qkv[:, :, 1]
            apply_rotary(q, cos, sin, seqlen_offsets, interleaved=interleaved, inplace=True)
            apply_rotary(k, cos_k, sin_k, seqlen_offsets, interleaved=interleaved, inplace=True)
        # NOTE: a redundant ctx.save_for_backward(cos, sin, cos_k, sin_k) that
        # used to sit at the end of the else-branch was removed: the branch
        # below always calls save_for_backward, and a later call replaces the
        # tensors stored by an earlier one, so the first call was dead code.
        if isinstance(seqlen_offsets, int):
            ctx.save_for_backward(cos, sin, cos_k, sin_k)  # Can't save int with save_for_backward
            ctx.seqlen_offsets = seqlen_offsets
        else:
            ctx.save_for_backward(cos, sin, cos_k, sin_k, seqlen_offsets)
            ctx.seqlen_offsets = None
        ctx.interleaved = interleaved
        return qkv

    @staticmethod
    def backward(ctx, dqkv):
        seqlen_offsets = ctx.seqlen_offsets
        if seqlen_offsets is None:
            # Offsets were a tensor, stashed via save_for_backward.
            cos, sin, cos_k, sin_k, seqlen_offsets = ctx.saved_tensors
        else:
            cos, sin, cos_k, sin_k = ctx.saved_tensors
        if cos_k is None and sin_k is None and dqkv.is_contiguous():
            # Call 1 kernel instead of 2 kernels
            # We need dqkv to be contiguous so that when we reshape to combine (3, nheads)
            # dimensions, we get the same tensor
            dqk = rearrange(dqkv[:, :, :2], "b s t h d -> b s (t h) d")
            apply_rotary(
                dqk,
                cos,
                sin,
                seqlen_offsets=seqlen_offsets,
                interleaved=ctx.interleaved,
                inplace=True,
                conjugate=True,  # inverse rotation for the gradient
            )
        else:
            cos_k = cos if cos_k is None else cos_k
            sin_k = sin if sin_k is None else sin_k
            dq, dk = dqkv[:, :, 0], dqkv[:, :, 1]
            apply_rotary(
                dq, cos, sin, seqlen_offsets, interleaved=ctx.interleaved, inplace=True, conjugate=True
            )
            apply_rotary(
                dk,
                cos_k,
                sin_k,
                seqlen_offsets,
                interleaved=ctx.interleaved,
                inplace=True,
                conjugate=True,
            )
        # One gradient slot per forward argument (qkv + 6 non-differentiable args).
        return dqkv, None, None, None, None, None, None
208
+
209
+
210
def apply_rotary_emb_qkv_(
    qkv,
    cos,
    sin,
    cos_k=None,
    sin_k=None,
    interleaved=False,
    seqlen_offsets: Union[int, torch.Tensor] = 0,
):
    """Apply rotary embedding *inplace* to the first rotary_dim of Q and K.

    Arguments:
        qkv: (batch_size, seqlen, 3, nheads, headdim)
        cos, sin: (seqlen, rotary_dim / 2) tables for Q (and K when
            cos_k / sin_k are not given).
        cos_k, sin_k: (seqlen, rotary_dim / 2), optional separate tables for K.
        interleaved: if True, rotate pairs of even and odd dimensions
            (GPT-J style) instead of 1st half and 2nd half (GPT-NeoX style).
        seqlen_offsets: (batch_size,) or int. Each sequence in Q and K is
            shifted by this amount; most commonly used in inference with a
            KV cache.
    Return:
        qkv: (batch_size, seqlen, 3, nheads, headdim); rotary_dim must be
        <= headdim.
    """
    fn_args = (qkv, cos, sin, cos_k, sin_k, interleaved, seqlen_offsets)
    return ApplyRotaryEmbQKV_.apply(*fn_args)
234
+
235
+
236
class ApplyRotaryEmbKV_(torch.autograd.Function):
    """In-place rotary embedding on a packed KV tensor, with autograd.

    Only K (kv[:, :, 0]) is rotated; V passes through unchanged, so the
    backward pass only needs to counter-rotate dK.
    """

    @staticmethod
    def forward(ctx, kv, cos, sin, interleaved=False, seqlen_offsets: Union[int, torch.Tensor] = 0):
        batch, seqlen, two, nheads, headdim = kv.shape
        assert two == 2
        k = kv[:, :, 0]
        apply_rotary(
            k, cos, sin, seqlen_offsets=seqlen_offsets, interleaved=interleaved, inplace=True
        )
        if isinstance(seqlen_offsets, int):
            ctx.save_for_backward(cos, sin)  # Can't save int with save_for_backward
            ctx.seqlen_offsets = seqlen_offsets
        else:
            # Tensor-valued offsets must go through save_for_backward.
            ctx.save_for_backward(cos, sin, seqlen_offsets)
            ctx.seqlen_offsets = None
        ctx.interleaved = interleaved
        return kv

    @staticmethod
    def backward(ctx, dkv):
        seqlen_offsets = ctx.seqlen_offsets
        if seqlen_offsets is None:
            # Offsets were a tensor, stashed via save_for_backward.
            cos, sin, seqlen_offsets = ctx.saved_tensors
        else:
            cos, sin = ctx.saved_tensors
        apply_rotary(
            dkv[:, :, 0],
            cos,
            sin,
            seqlen_offsets=seqlen_offsets,
            interleaved=ctx.interleaved,
            inplace=True,
            conjugate=True,  # inverse rotation for the gradient
        )
        # One gradient slot per forward argument (kv + 4 non-differentiable args).
        return dkv, None, None, None, None
271
+
272
+
273
+ apply_rotary_emb_kv_ = ApplyRotaryEmbKV_.apply
274
+
275
+
276
def apply_rotary_emb_kv_(
    kv,
    cos,
    sin,
    interleaved=False,
    seqlen_offsets: Union[int, torch.Tensor] = 0,
):
    """Apply rotary embedding *inplace* to the first rotary_dim of K.

    Arguments:
        kv: (batch_size, seqlen, 2, nheads, headdim)
        cos, sin: (seqlen, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions
            (GPT-J style) instead of 1st half and 2nd half (GPT-NeoX style).
        seqlen_offsets: (batch_size,) or int. Each sequence in K is shifted by
            this amount; most commonly used in inference with a KV cache.
    Return:
        kv: (batch_size, seqlen, 2, nheads, headdim); rotary_dim must be
        <= headdim.
    """
    fn_args = (kv, cos, sin, interleaved, seqlen_offsets)
    return ApplyRotaryEmbKV_.apply(*fn_args)
297
+
298
+
299
class RotaryEmbedding(torch.nn.Module):
    """
    The rotary position embeddings from RoFormer_ (Su et. al).
    A crucial insight from the method is that the query and keys are
    transformed by rotation matrices which depend on the relative positions.

    Other implementations are available in the Rotary Transformer repo_ and in
    GPT-NeoX_, GPT-NeoX was an inspiration

    .. _RoFormer: https://arxiv.org/abs/2104.09864
    .. _repo: https://github.com/ZhuiyiTechnology/roformer
    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox

    If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
    A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96
    Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py
    """

    def __init__(
        self,
        dim: int,
        base=10000.0,
        interleaved=False,
        scale_base=None,
        pos_idx_in_fp32=True,
        device=None,
    ):
        """
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
            otherwise they might be in lower precision.
            This option was added because previously (before 2023-07-02), when we construct
            the position indices, we use the dtype of self.inv_freq. In most cases this would
            be fp32, but if the model is trained in pure bf16 (not mixed precision), then
            self.inv_freq would be bf16, and the position indices are also in bf16.
            Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the
            embeddings for some positions will coincide.
            To maintain compatibility with models previously trained in pure bf16,
            we add this option.
        """
        super().__init__()
        self.dim = dim
        self.base = float(base)
        self.pos_idx_in_fp32 = pos_idx_in_fp32
        # Generate and save the inverse frequency buffer (non trainable)
        inv_freq = self._compute_inv_freq(device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.interleaved = interleaved
        self.scale_base = scale_base
        # XPos per-channel scale; only built when scale_base is given.
        scale = (
            (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
            if scale_base is not None
            else None
        )
        self.register_buffer("scale", scale, persistent=False)

        # Lazily-built cos/sin tables; (re)computed on demand by
        # _update_cos_sin_cache when seqlen / device / dtype change.
        self._seq_len_cached = 0
        self._cos_cached = None
        self._sin_cached = None
        self._cos_k_cached = None
        self._sin_k_cached = None

    def _compute_inv_freq(self, device=None):
        """Return the fp32 inverse-frequency vector 1 / base^(2i / dim), shape (dim / 2,)."""
        return 1.0 / (
            self.base
            ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim)
        )

    def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
        # Reset the tables if the sequence length has changed,
        # if we're on a new device (possibly due to tracing for instance),
        # or if we're switching from inference mode to training
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached is None
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
            or (self.training and self._cos_cached.is_inference())
        ):
            self._seq_len_cached = seqlen
            # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
            # And the output of arange can be quite large, so bf16 would lose a lot of precision.
            # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
            if self.pos_idx_in_fp32:
                t = torch.arange(seqlen, device=device, dtype=torch.float32)
                # We want fp32 here as well since inv_freq will be multiplied with t, and the output
                # will be large. Having it in bf16 will lose a lot of precision and cause the
                # cos & sin output to change significantly.
                # We want to recompute self.inv_freq if it was not loaded in fp32
                if self.inv_freq.dtype != torch.float32:
                    inv_freq = self._compute_inv_freq(device=device)
                else:
                    inv_freq = self.inv_freq
            else:
                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
                inv_freq = self.inv_freq
            # Don't do einsum, it converts fp32 to fp16 under AMP
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
            freqs = torch.outer(t, inv_freq)
            if self.scale is None:
                self._cos_cached = torch.cos(freqs).to(dtype)
                self._sin_cached = torch.sin(freqs).to(dtype)
            else:
                # XPos: positions are centered around seqlen // 2 so that the
                # scale is balanced between queries and keys.
                power = (
                    torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device)
                    - seqlen // 2
                ) / self.scale_base
                scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
                # We want the multiplication by scale to happen in fp32
                self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
                self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
                # Keys use the reciprocal scale.
                self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
                self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)

    def forward(
        self,
        qkv: torch.Tensor,
        kv: Optional[torch.Tensor] = None,
        seqlen_offset: Union[int, torch.Tensor] = 0,
        max_seqlen: Optional[int] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        qkv: (batch, seqlen, 3, nheads, headdim) if kv is none,
            else it's just q of shape (batch, seqlen, nheads, headdim)
        kv: (batch, seqlen, 2, nheads, headdim)
        seqlen_offset: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
            If it's a tensor of shape (batch_size,), then to update the cos / sin cache, one
            should pass in max_seqlen, which will update the cos / sin cache up to that length.
        Apply rotary embedding *inplace* to qkv and / or kv.
        """
        seqlen = qkv.shape[1]
        # With tensor offsets and no max_seqlen, the cache is assumed to be
        # long enough already (no update is performed here).
        if max_seqlen is not None:
            self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)
        elif isinstance(seqlen_offset, int):
            self._update_cos_sin_cache(seqlen + seqlen_offset, device=qkv.device, dtype=qkv.dtype)
        if kv is None:
            # Packed QKV path.
            if self.scale is None:
                return apply_rotary_emb_qkv_(
                    qkv,
                    self._cos_cached,
                    self._sin_cached,
                    interleaved=self.interleaved,
                    seqlen_offsets=seqlen_offset,
                )
            else:
                # XPos: K uses the reciprocal-scaled tables.
                return apply_rotary_emb_qkv_(
                    qkv,
                    self._cos_cached,
                    self._sin_cached,
                    self._cos_k_cached,
                    self._sin_k_cached,
                    interleaved=self.interleaved,
                    seqlen_offsets=seqlen_offset,
                )
        else:
            # Separate Q and KV path.
            q = qkv
            q = apply_rotary_emb_func(
                q,
                self._cos_cached,
                self._sin_cached,
                interleaved=self.interleaved,
                inplace=True,
                seqlen_offsets=seqlen_offset,
            )
            if self.scale is None:
                kv = apply_rotary_emb_kv_(
                    kv,
                    self._cos_cached,
                    self._sin_cached,
                    interleaved=self.interleaved,
                    seqlen_offsets=seqlen_offset,
                )
            else:
                kv = apply_rotary_emb_kv_(
                    kv,
                    self._cos_k_cached,
                    self._sin_k_cached,
                    interleaved=self.interleaved,
                    seqlen_offsets=seqlen_offset,
                )
            return q, kv
.venv/lib/python3.11/site-packages/xformers/_flash_attn/losses/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/xformers/_flash_attn/losses/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (200 Bytes). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/losses/__pycache__/cross_entropy.cpython-311.pyc ADDED
Binary file (3.89 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/losses/cross_entropy.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024, Tri Dao.
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from flash_attn.ops.triton.cross_entropy import cross_entropy_loss
7
+
8
+
9
class CrossEntropyLoss(nn.Module):
    """Cross-entropy loss backed by the fused Triton ``cross_entropy_loss`` kernel.

    Supports label smoothing, logit scaling, a z-loss term, in-place backward,
    and tensor-parallel vocab sharding via ``process_group``.
    """

    def __init__(
        self,
        ignore_index=-100,
        reduction="mean",
        label_smoothing=0.0,
        logit_scale=1.0,
        lse_square_scale=0.0,
        inplace_backward=False,
        process_group=None,
        return_z_loss=False,
    ):
        """
        Arguments:
            ignore_index: int. If labels == ignore_index, the loss is set to 0.0.
            reduction: "mean", "sum", or "none".
            label_smoothing: float
            logit_scale: float. Logits are multiplied by this before the loss.
            lse_square_scale: float. If > 0, we add lse_square_scale * lse(logits) ^ 2 to the loss.
                This is also referred to as "z-loss".
            inplace_backward: bool. If True, we do the backward pass in-place by modifying the logits.
                This saves memory.
            process_group: if not None, we're doing Tensor Parallel: each process is responsible for
                one part of the vocab. The loss will be aggregated across processes.
            return_z_loss: bool. If True, we return the component of the loss contributed by
                the lse_square_scale value. This value is only for logging and does not support
                backprop.
        Raises:
            NotImplementedError: if reduction is not one of "mean", "none", "sum".
        """
        super().__init__()
        if reduction not in ["mean", "none", "sum"]:
            raise NotImplementedError("Only support reduction = 'mean' or 'none' or 'sum'")
        self.ignore_index = ignore_index
        self.reduction = reduction
        self.label_smoothing = label_smoothing
        self.logit_scale = logit_scale
        self.lse_square_scale = lse_square_scale
        self.inplace_backward = inplace_backward
        self.process_group = process_group
        self.return_z_loss = return_z_loss

    def _reduce(self, values, target):
        # Apply self.reduction to a per-sample loss vector; the "mean" divisor
        # counts only the non-ignored targets.
        if self.reduction == "mean":
            return values.sum() / (target != self.ignore_index).sum()
        if self.reduction == "sum":
            return values.sum()
        return values  # reduction == "none"

    def forward(self, input, target, precomputed_lse=None):
        """
        Arguments:
            input: (batch, vocab_size)
            target: (batch,)
            precomputed_lse: optional precomputed logsumexp of the logits,
                forwarded to the kernel.
        Returns:
            losses: (batch,) if reduction is 'none', else (1,), dtype float
            z_loss: (batch,) if reduction is 'none', else (1,), dtype float (if self.return_z_loss)
        """
        assert input.is_cuda and target.is_cuda, "Only support CUDA tensors"
        loss, z_loss = cross_entropy_loss(
            input,
            target,
            precomputed_lse=precomputed_lse,
            label_smoothing=self.label_smoothing,
            logit_scale=self.logit_scale,
            lse_square_scale=self.lse_square_scale,
            ignore_index=self.ignore_index,
            inplace_backward=self.inplace_backward,
            process_group=self.process_group,
        )
        loss = self._reduce(loss, target)
        if not self.return_z_loss:
            return loss
        return loss, self._reduce(z_loss, target)
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (200 Bytes). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/baichuan.cpython-311.pyc ADDED
Binary file (7.82 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/bert.cpython-311.pyc ADDED
Binary file (41.9 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/bigcode.cpython-311.pyc ADDED
Binary file (13.1 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/btlm.cpython-311.pyc ADDED
Binary file (7.7 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/falcon.cpython-311.pyc ADDED
Binary file (8.53 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/gpt.cpython-311.pyc ADDED
Binary file (54.6 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/gpt_neox.cpython-311.pyc ADDED
Binary file (7.87 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/gptj.cpython-311.pyc ADDED
Binary file (7.35 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/llama.cpython-311.pyc ADDED
Binary file (23.1 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/opt.cpython-311.pyc ADDED
Binary file (7.75 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/__pycache__/vit.cpython-311.pyc ADDED
Binary file (16.7 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/baichuan.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, GGGGGGXY, Tri Dao.
2
+
3
+ import math
4
+ import json
5
+ import re
6
+ from pathlib import Path
7
+
8
+ from collections import OrderedDict
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+
13
+ from einops import rearrange
14
+ from transformers import GPT2Config, AutoConfig, PretrainedConfig
15
+
16
+
17
+ def remap_state_dict_hf_baichuan(state_dict, config):
18
+ def key_mapping_layers(key):
19
+ return re.sub(r"^model.", "transformer.", key)
20
+
21
+ state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
22
+
23
+ # Word embedding
24
+ def key_mapping_emb(key):
25
+ return re.sub(
26
+ r"^transformer.embed_tokens.",
27
+ "transformer.embeddings.word_embeddings.",
28
+ key,
29
+ )
30
+
31
+ state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
32
+ word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
33
+ # It's possible that vocab_size is padded to be a multiple of 8, for example.
34
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
35
+ vocab_size = (
36
+ math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple)
37
+ * pad_vocab_size_multiple
38
+ )
39
+ state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
40
+ word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
41
+ )
42
+ if getattr(config, "tie_word_embeddings"):
43
+ state_dict["lm_head.weight"] = state_dict[
44
+ "transformer.embeddings.word_embeddings.weight"
45
+ ]
46
+ else:
47
+ output_embeddings = state_dict.pop("lm_head.weight")
48
+ # Need to recompute vocab_size since Baichuan shards the word embeddings and output embeddings
49
+ # differently.
50
+ vocab_size = (
51
+ math.ceil(output_embeddings.shape[0] / pad_vocab_size_multiple)
52
+ * pad_vocab_size_multiple
53
+ )
54
+ # It's possible that vocab_size is padded to be a multiple of 8, for example.
55
+ state_dict["lm_head.weight"] = F.pad(
56
+ output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
57
+ )
58
+
59
+ # LayerNorm
60
+ def key_mapping_ln(key):
61
+ key = re.sub(r"^transformer.norm.", r"transformer.ln_f.", key)
62
+ key = re.sub(
63
+ r"^transformer.layers.(\d+).input_layernorm.",
64
+ r"transformer.layers.\1.norm1.",
65
+ key,
66
+ )
67
+ key = re.sub(
68
+ r"^transformer.layers.(\d+).post_attention_layernorm.",
69
+ r"transformer.layers.\1.norm2.",
70
+ key,
71
+ )
72
+ return key
73
+
74
+ state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
75
+
76
+ # MLP
77
+ for l in range(config.n_layer):
78
+ w1 = state_dict.pop(f"transformer.layers.{l}.mlp.gate_proj.weight")
79
+ w3 = state_dict.pop(f"transformer.layers.{l}.mlp.up_proj.weight")
80
+ # Our ordering is different
81
+ state_dict[f"transformer.layers.{l}.mlp.fc1.weight"] = torch.cat(
82
+ [w3, w1], dim=0
83
+ )
84
+
85
+ def key_mapping_mlp(key):
86
+ return re.sub(
87
+ r"^transformer.layers.(\d+).mlp.down_proj.",
88
+ r"transformer.layers.\1.mlp.fc2.",
89
+ key,
90
+ )
91
+
92
+ state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
93
+
94
+ # Attention
95
+ def key_mapping_attn(key):
96
+ key = re.sub(
97
+ r"^transformer.layers.(\d+).self_attn.W_pack.",
98
+ r"transformer.layers.\1.mixer.Wqkv.",
99
+ key,
100
+ )
101
+ key = re.sub(
102
+ r"^transformer.layers.(\d+).self_attn.o_proj.",
103
+ r"transformer.layers.\1.mixer.out_proj.",
104
+ key,
105
+ )
106
+ return key
107
+
108
+ state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
109
+ for l in range(config.n_layer):
110
+ # pop rotary_emb.inv_freq from state dict
111
+ state_dict.pop(f"transformer.layers.{l}.self_attn.rotary_emb.inv_freq", None)
112
+ return state_dict
113
+
114
+
115
+ def baichuan_config_to_gpt2_config(baichuan_config: PretrainedConfig) -> GPT2Config:
116
+ # HACK: the config doesn't have say whether it's rotary or alibi.
117
+ # So we have to infer from the hidden size (7B -> rotary, 13B -> alibi).
118
+ # HACK: the config doesn't have say whether it uses norm head.
119
+ # So we have to infer from the vocab size
120
+ # (v1, vocab size 64k, no norm head; v2, vocab size 128k, norm head).
121
+ use_rotary = baichuan_config.hidden_size < 5000
122
+ return GPT2Config(
123
+ vocab_size=baichuan_config.vocab_size,
124
+ n_positions=0, # No absolute position embedding
125
+ n_embd=baichuan_config.hidden_size,
126
+ n_layer=baichuan_config.num_hidden_layers,
127
+ n_head=baichuan_config.num_attention_heads,
128
+ n_inner=baichuan_config.intermediate_size,
129
+ activation_function="swiglu", # Hardcode since HF calls it 'silu'
130
+ # baichuan doesn't have dropout, idk if it's because they only release the inference code
131
+ resid_pdrop=0.0,
132
+ embd_pdrop=0.0,
133
+ attn_pdrop=0.0,
134
+ layer_norm_epsilon=baichuan_config.rms_norm_eps,
135
+ initializer_range=baichuan_config.initializer_range,
136
+ bos_token_id=baichuan_config.bos_token_id,
137
+ eos_token_id=baichuan_config.eos_token_id,
138
+ # These are new arguments not in the original GPT2Config
139
+ pad_token_id=baichuan_config.pad_token_id, # Idk if this does anything
140
+ rms_norm=True,
141
+ rotary_emb_fraction=1.0 if use_rotary else 0.0,
142
+ rotary_emb_interleaved=False,
143
+ use_alibi=not use_rotary,
144
+ use_flash_attn=not use_rotary, # Alibi code path requires flash_attn
145
+ tie_word_embeddings=False,
146
+ norm_head=baichuan_config.vocab_size > 70000,
147
+ qkv_proj_bias=False,
148
+ out_proj_bias=False,
149
+ mlp_fc1_bias=False,
150
+ mlp_fc2_bias=False,
151
+ )
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/bert.py ADDED
@@ -0,0 +1,764 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, Tri Dao.
2
+ # This BERT implementation is based on our MLPerf 2.0 and MLPerf 2.1 BERT implementation.
3
+ # https://github.com/mlcommons/training_results_v2.0/blob/main/HazyResearch/benchmarks/bert/implementations/pytorch/modeling.py
4
+ # https://github.com/mlcommons/training_results_v2.1/blob/main/Azure-HazyResearch/benchmarks/bert/implementations/ND96amsr_A100_v4/modeling.py
5
+
6
+ # Inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py
7
+
8
+ import logging
9
+ import re
10
+ from collections import OrderedDict
11
+ from collections.abc import Sequence
12
+ from functools import partial
13
+ from typing import Any, Mapping
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.nn.functional as F
18
+ from einops import rearrange
19
+ from transformers import BertConfig, PretrainedConfig
20
+ from transformers.models.bert.modeling_bert import (
21
+ BaseModelOutputWithPoolingAndCrossAttentions,
22
+ BertForPreTrainingOutput,
23
+ )
24
+
25
+ from flash_attn.bert_padding import (
26
+ index_first_axis,
27
+ index_first_axis_residual,
28
+ pad_input,
29
+ unpad_input,
30
+ )
31
+ from flash_attn.modules.block import Block
32
+ from flash_attn.modules.embedding import BertEmbeddings
33
+ from flash_attn.modules.mha import MHA
34
+ from flash_attn.modules.mlp import FusedMLP, Mlp
35
+ from flash_attn.utils.pretrained import state_dict_from_pretrained
36
+
37
+ try:
38
+ from flash_attn.ops.fused_dense import FusedDense
39
+ except ImportError:
40
+ FusedDense = None
41
+
42
+ try:
43
+ from flash_attn.ops.triton.layer_norm import layer_norm_fn
44
+ except ImportError:
45
+ layer_norm_fn = None
46
+
47
+
48
+ try:
49
+ from flash_attn.losses.cross_entropy import CrossEntropyLoss
50
+ except ImportError:
51
+ CrossEntropyLoss = None
52
+
53
+
54
+ logger = logging.getLogger(__name__)
55
+
56
+
57
+ def create_mixer_cls(config, cross_attn=False, return_residual=False):
58
+ use_flash_attn = getattr(config, "use_flash_attn", False)
59
+ fused_bias_fc = getattr(config, "fused_bias_fc", False)
60
+ rotary_kwargs = {}
61
+ if config.position_embedding_type == "rotary":
62
+ rotary_kwargs["rotary_emb_dim"] = getattr(config, "rotary_emb_dim", config.hidden_size)
63
+ rotary_kwargs["rotary_emb_base"] = getattr(config, "rotary_emb_base", 10000.0)
64
+ rotary_kwargs["rotary_emb_scale_base"] = getattr(config, "rotary_emb_scale_base", None)
65
+ rotary_kwargs["rotary_emb_interleaved"] = getattr(config, "rotary_emb_interleaved", False)
66
+ mixer_cls = partial(
67
+ MHA,
68
+ num_heads=config.num_attention_heads,
69
+ cross_attn=cross_attn,
70
+ dropout=config.attention_probs_dropout_prob,
71
+ causal=False,
72
+ fused_bias_fc=fused_bias_fc,
73
+ use_flash_attn=use_flash_attn,
74
+ return_residual=return_residual,
75
+ **rotary_kwargs,
76
+ )
77
+ return mixer_cls
78
+
79
+
80
+ def create_mlp_cls(config, layer_idx=None, return_residual=False):
81
+ inner_dim = config.intermediate_size
82
+ fused_mlp = getattr(config, "fused_mlp", False)
83
+ if fused_mlp:
84
+ assert config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"], (
85
+ "fused_mlp only " "supports approximate gelu"
86
+ )
87
+ if not fused_mlp:
88
+ approximate = (
89
+ "tanh"
90
+ if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
91
+ else "none"
92
+ )
93
+ mlp_cls = partial(
94
+ Mlp,
95
+ hidden_features=inner_dim,
96
+ activation=partial(F.gelu, approximate=approximate),
97
+ return_residual=return_residual,
98
+ )
99
+ else:
100
+ if FusedMLP is None:
101
+ raise ImportError("fused_dense is not installed")
102
+ mlp_checkpoint_lvl = getattr(config, "mlp_checkpoint_lvl", 0)
103
+ # mlp_checkpoint_lvl could be a list, which contains the checkpoint_lvl for each layer
104
+ if isinstance(mlp_checkpoint_lvl, Sequence):
105
+ assert layer_idx is not None
106
+ mlp_checkpoint_lvl = mlp_checkpoint_lvl[layer_idx]
107
+ mlp_cls = partial(
108
+ FusedMLP,
109
+ hidden_features=inner_dim,
110
+ checkpoint_lvl=mlp_checkpoint_lvl,
111
+ return_residual=return_residual,
112
+ )
113
+ return mlp_cls
114
+
115
+
116
+ def create_block(config, layer_idx=None):
117
+ last_layer_subset = getattr(config, "last_layer_subset", False)
118
+ cross_attn = last_layer_subset and layer_idx == config.num_hidden_layers - 1
119
+ # TD [2022-12-19]: For cross attention (last layer), we actually want to return the
120
+ # residual x_kv, not residual x. But it's annoying to change the API (and it only affects
121
+ # one layer) so we just choose not to return residual in this case.
122
+ return_residual = not cross_attn
123
+ mixer_cls = create_mixer_cls(config, cross_attn, return_residual=return_residual)
124
+ mlp_cls = create_mlp_cls(config, layer_idx, return_residual=return_residual)
125
+ norm_cls = partial(nn.LayerNorm, eps=config.layer_norm_eps)
126
+ block = Block(
127
+ config.hidden_size,
128
+ mixer_cls,
129
+ mlp_cls,
130
+ norm_cls=norm_cls,
131
+ prenorm=False,
132
+ resid_dropout1=config.hidden_dropout_prob,
133
+ resid_dropout2=config.hidden_dropout_prob,
134
+ fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False),
135
+ return_residual=return_residual,
136
+ )
137
+ return block
138
+
139
+
140
+ # https://github.com/huggingface/transformers/blob/7032e0203262ebb2ebf55da8d2e01f873973e835/src/transformers/models/bert/modeling_bert.py#L748
141
+ def _init_weights(module, initializer_range=0.02):
142
+ if isinstance(module, nn.Linear):
143
+ nn.init.normal_(module.weight, std=initializer_range)
144
+ if module.bias is not None:
145
+ nn.init.zeros_(module.bias)
146
+ elif isinstance(module, nn.Embedding):
147
+ nn.init.normal_(module.weight, std=initializer_range)
148
+ if module.padding_idx is not None:
149
+ nn.init.zeros_(module.weight[module.padding_idx])
150
+
151
+
152
+ class BertEncoder(nn.Module):
153
+ def __init__(self, config: BertConfig):
154
+ super().__init__()
155
+ self.use_flash_attn = getattr(config, "use_flash_attn", False)
156
+ self.layers = nn.ModuleList(
157
+ [create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
158
+ )
159
+
160
+ def forward(self, hidden_states, key_padding_mask=None, subset_mask=None):
161
+ """If subset_mask is not None, we only want output for the subset of the sequence.
162
+ This means that we only compute the last layer output for these tokens.
163
+ subset_mask: (batch, seqlen), dtype=torch.bool
164
+ """
165
+ if key_padding_mask is None or not self.use_flash_attn:
166
+ mixer_kwargs = (
167
+ {"key_padding_mask": key_padding_mask} if key_padding_mask is not None else None
168
+ )
169
+ for layer in self.layers:
170
+ hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
171
+ if subset_mask is not None:
172
+ hidden_states = hidden_states[subset_mask]
173
+ else:
174
+ batch, seqlen = hidden_states.shape[:2]
175
+ hidden_states, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(
176
+ hidden_states, key_padding_mask
177
+ )
178
+ mixer_kwargs = {"cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen_in_batch}
179
+ if subset_mask is None:
180
+ for layer in self.layers:
181
+ hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
182
+ hidden_states = pad_input(hidden_states, indices, batch, seqlen)
183
+ else:
184
+ for layer in self.layers[:-1]:
185
+ hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
186
+ if key_padding_mask is not None:
187
+ subset_idx = torch.nonzero(
188
+ subset_mask[key_padding_mask], as_tuple=False
189
+ ).flatten()
190
+ subset_seqlens = (subset_mask & key_padding_mask).sum(dim=-1, dtype=torch.int32)
191
+ subset_cu_seqlens = F.pad(
192
+ torch.cumsum(subset_seqlens, dim=0, dtype=torch.torch.int32), (1, 0)
193
+ )
194
+ else:
195
+ subset_idx = torch.nonzero(subset_mask, as_tuple=False).flatten()
196
+ subset_seqlens = subset_mask.sum(dim=-1, dtype=torch.int32)
197
+ subset_cu_seqlens = F.pad(
198
+ torch.cumsum(subset_seqlens, dim=0, dtype=torch.torch.int32), (1, 0)
199
+ )
200
+ hidden_states_subset, hidden_states = index_first_axis_residual(
201
+ hidden_states, subset_idx
202
+ )
203
+ # It's ok to set max_seqlen_q to be much larger
204
+ mixer_kwargs = {
205
+ "x_kv": hidden_states,
206
+ "cu_seqlens": subset_cu_seqlens,
207
+ "max_seqlen": max_seqlen_in_batch,
208
+ "cu_seqlens_k": cu_seqlens,
209
+ "max_seqlen_k": max_seqlen_in_batch,
210
+ }
211
+ hidden_states = self.layers[-1](hidden_states_subset, mixer_kwargs=mixer_kwargs)
212
+ return hidden_states
213
+
214
+
215
+ class BertPooler(nn.Module):
216
+ def __init__(self, config):
217
+ super().__init__()
218
+ fused_bias_fc = getattr(config, "fused_bias_fc", False)
219
+ if fused_bias_fc and FusedDense is None:
220
+ raise ImportError("fused_dense is not installed")
221
+ linear_cls = nn.Linear if not fused_bias_fc else FusedDense
222
+ self.dense = linear_cls(config.hidden_size, config.hidden_size)
223
+ self.activation = nn.Tanh()
224
+
225
+ def forward(self, hidden_states, pool=True):
226
+ # We "pool" the model by simply taking the hidden state corresponding
227
+ # to the first token.
228
+ first_token_tensor = hidden_states[:, 0] if pool else hidden_states
229
+ pooled_output = self.dense(first_token_tensor)
230
+ pooled_output = self.activation(pooled_output)
231
+ return pooled_output
232
+
233
+
234
+ class BertPredictionHeadTransform(nn.Module):
235
+ def __init__(self, config):
236
+ super().__init__()
237
+ fused_bias_fc = getattr(config, "fused_bias_fc", False)
238
+ if fused_bias_fc and FusedDense is None:
239
+ raise ImportError("fused_dense is not installed")
240
+ self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
241
+ if self.fused_dropout_add_ln and layer_norm_fn is None:
242
+ raise ImportError("Triton is not installed")
243
+ linear_cls = nn.Linear if not fused_bias_fc else FusedDense
244
+ self.dense = linear_cls(config.hidden_size, config.hidden_size)
245
+ approximate = (
246
+ "tanh"
247
+ if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
248
+ else "none"
249
+ )
250
+ self.transform_act_fn = nn.GELU(approximate=approximate)
251
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
252
+
253
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
254
+ hidden_states = self.dense(hidden_states)
255
+ hidden_states = self.transform_act_fn(hidden_states)
256
+ if not self.fused_dropout_add_ln:
257
+ hidden_states = self.layer_norm(hidden_states)
258
+ else:
259
+ hidden_states = layer_norm_fn(
260
+ hidden_states, self.layer_norm.weight, self.layer_norm.bias, eps=self.layer_norm.eps
261
+ )
262
+ return hidden_states
263
+
264
+
265
+ class BertLMPredictionHead(nn.Module):
266
+ def __init__(self, config):
267
+ super().__init__()
268
+ fused_bias_fc = getattr(config, "fused_bias_fc", False)
269
+ if fused_bias_fc and FusedDense is None:
270
+ raise ImportError("fused_dense is not installed")
271
+ linear_cls = nn.Linear if not fused_bias_fc else FusedDense
272
+
273
+ self.transform = BertPredictionHeadTransform(config)
274
+
275
+ # The output weights are the same as the input embeddings, but there is
276
+ # an output-only bias for each token.
277
+ self.decoder = linear_cls(config.hidden_size, config.vocab_size, bias=True)
278
+
279
+ def forward(self, hidden_states):
280
+ hidden_states = self.transform(hidden_states)
281
+ hidden_states = self.decoder(hidden_states)
282
+ return hidden_states
283
+
284
+
285
+ class BertPreTrainingHeads(nn.Module):
286
+ def __init__(self, config):
287
+ super().__init__()
288
+ self.predictions = BertLMPredictionHead(config)
289
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
290
+
291
+ def forward(self, sequence_output, pooled_output):
292
+ prediction_scores = self.predictions(sequence_output)
293
+ seq_relationship_score = self.seq_relationship(pooled_output)
294
+ return prediction_scores, seq_relationship_score
295
+
296
+
297
+ class BertPreTrainedModel(nn.Module):
298
+ """An abstract class to handle weights initialization and
299
+ a simple interface for dowloading and loading pretrained models.
300
+ """
301
+
302
+ def __init__(self, config, *inputs, **kwargs):
303
+ super().__init__()
304
+ if not isinstance(config, BertConfig):
305
+ raise ValueError(
306
+ "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
307
+ "To create a model from a Google pretrained model use "
308
+ "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
309
+ self.__class__.__name__, self.__class__.__name__
310
+ )
311
+ )
312
+ self.config = config
313
+
314
+ @classmethod
315
+ def from_pretrained(cls, model_name, config, *inputs, **kwargs):
316
+ """
317
+ Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
318
+ Download and cache the pre-trained model file if needed.
319
+
320
+ Params:
321
+ pretrained_model_name_or_path: either:
322
+ - a path or url to a pretrained model archive containing:
323
+ . `bert_config.json` a configuration file for the model
324
+ . `pytorch_model.bin` a PyTorch dump of a BertForPretraining instance
325
+ - a path or url to a pretrained model archive containing:
326
+ . `bert_config.json` a configuration file for the model
327
+ . `model.chkpt` a TensorFlow checkpoint
328
+ *inputs, **kwargs: additional input for the specific Bert class
329
+ (ex: num_labels for BertForSequenceClassification)
330
+ """
331
+ # Instantiate model.
332
+ model = cls(config, *inputs, **kwargs)
333
+ load_return = model.load_state_dict(
334
+ remap_state_dict(state_dict_from_pretrained(model_name), config), strict=False
335
+ )
336
+ logger.info(load_return)
337
+ return model
338
+
339
+
340
+ class BertModel(BertPreTrainedModel):
341
+ def __init__(self, config: BertConfig, add_pooling_layer=True):
342
+ super().__init__(config)
343
+ self.pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
344
+ if config.vocab_size % self.pad_vocab_size_multiple != 0:
345
+ config.vocab_size += self.pad_vocab_size_multiple - (
346
+ config.vocab_size % self.pad_vocab_size_multiple
347
+ )
348
+ self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
349
+ if self.fused_dropout_add_ln and layer_norm_fn is None:
350
+ raise ImportError("Triton is not installed")
351
+ assert config.hidden_act in ["gelu", "gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
352
+
353
+ self.embeddings = BertEmbeddings(
354
+ config.hidden_size,
355
+ config.vocab_size,
356
+ config.max_position_embeddings,
357
+ config.type_vocab_size,
358
+ padding_idx=config.pad_token_id,
359
+ )
360
+ self.emb_drop = nn.Dropout(config.hidden_dropout_prob)
361
+ self.emb_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
362
+ self.encoder = BertEncoder(config)
363
+ self.pooler = BertPooler(config) if add_pooling_layer else None
364
+
365
+ self.apply(partial(_init_weights, initializer_range=config.initializer_range))
366
+
367
+ def forward(
368
+ self,
369
+ input_ids,
370
+ position_ids=None,
371
+ token_type_ids=None,
372
+ attention_mask=None,
373
+ masked_tokens_mask=None,
374
+ ):
375
+ """If masked_tokens_mask is not None (i.e. last_layer_subset == True in BertForPreTraining),
376
+ we only want the output for the masked tokens. This means that we only compute the last
377
+ layer output for these tokens.
378
+ masked_tokens_mask: (batch, seqlen), dtype=torch.bool
379
+ """
380
+ hidden_states = self.embeddings(
381
+ input_ids, position_ids=position_ids, token_type_ids=token_type_ids
382
+ )
383
+ # TD [2022-12:18]: Don't need to force residual in fp32
384
+ # BERT puts embedding LayerNorm before embedding dropout.
385
+ if not self.fused_dropout_add_ln:
386
+ hidden_states = self.emb_ln(hidden_states)
387
+ else:
388
+ hidden_states = layer_norm_fn(
389
+ hidden_states, self.emb_ln.weight, self.emb_ln.bias, eps=self.emb_ln.eps
390
+ )
391
+ hidden_states = self.emb_drop(hidden_states)
392
+
393
+ if masked_tokens_mask is not None:
394
+ batch_size, seqlen = input_ids.shape[:2]
395
+ # We also need the first column for the CLS token
396
+ first_col_mask = torch.zeros(
397
+ batch_size, seqlen, dtype=torch.bool, device=input_ids.device
398
+ )
399
+ first_col_mask[:, 0] = True
400
+ subset_mask = masked_tokens_mask | first_col_mask
401
+ else:
402
+ subset_mask = None
403
+
404
+ sequence_output = self.encoder(
405
+ hidden_states, key_padding_mask=attention_mask, subset_mask=subset_mask
406
+ )
407
+
408
+ if masked_tokens_mask is None:
409
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
410
+ else:
411
+ # TD [2022-03-01]: the indexing here is very tricky.
412
+ if attention_mask is not None:
413
+ subset_idx = subset_mask[attention_mask]
414
+ pool_input = sequence_output[first_col_mask[attention_mask][subset_idx]]
415
+ sequence_output = sequence_output[masked_tokens_mask[attention_mask][subset_idx]]
416
+ else:
417
+ pool_input = sequence_output[first_col_mask[subset_mask]]
418
+ sequence_output = sequence_output[masked_tokens_mask[subset_mask]]
419
+ pooled_output = self.pooler(pool_input, pool=False) if self.pooler is not None else None
420
+
421
+ return BaseModelOutputWithPoolingAndCrossAttentions(
422
+ last_hidden_state=sequence_output,
423
+ pooler_output=pooled_output,
424
+ )
425
+
426
+
427
+ class BertForPreTraining(BertPreTrainedModel):
428
+ def __init__(self, config: BertConfig):
429
+ super().__init__(config)
430
+ # If dense_seq_output, we only need to pass the hidden states for the masked out tokens
431
+ # (around 15%) to the classifier heads.
432
+ self.dense_seq_output = getattr(config, "dense_seq_output", False)
433
+ # If last_layer_subset, we only need the compute the last layer for a subset of tokens
434
+ # (e.g., the tokens we need to compute the masked LM loss and the next-sentence prediction).
435
+ self.last_layer_subset = getattr(config, "last_layer_subset", False)
436
+ if self.last_layer_subset:
437
+ assert self.dense_seq_output, "last_layer_subset requires dense_seq_output"
438
+ use_xentropy = getattr(config, "use_xentropy", False)
439
+ if use_xentropy and CrossEntropyLoss is None:
440
+ raise ImportError("xentropy_cuda is not installed")
441
+ loss_cls = (
442
+ nn.CrossEntropyLoss
443
+ if not use_xentropy
444
+ else partial(CrossEntropyLoss, inplace_backward=True)
445
+ )
446
+
447
+ self.bert = BertModel(config)
448
+ self.cls = BertPreTrainingHeads(config)
449
+ self.mlm_loss = loss_cls(ignore_index=0)
450
+ self.nsp_loss = loss_cls(ignore_index=-1)
451
+
452
+ # Initialize weights and apply final processing
453
+ self.apply(partial(_init_weights, initializer_range=config.initializer_range))
454
+ self.tie_weights()
455
+
456
+ def tie_weights(self):
457
+ self.cls.predictions.decoder.weight = self.bert.embeddings.word_embeddings.weight
458
+
459
+ def forward(
460
+ self,
461
+ input_ids,
462
+ position_ids=None,
463
+ token_type_ids=None,
464
+ attention_mask=None,
465
+ labels=None,
466
+ next_sentence_label=None,
467
+ ):
468
+ """
469
+ If labels are provided, they must be 0 for masked out tokens (as specified in the attention
470
+ mask).
471
+ Outputs:
472
+ if `labels` and `next_sentence_label` are not `None`:
473
+ Outputs the total_loss which is the sum of the masked language modeling loss and the next
474
+ sentence classification loss.
475
+ if `labels` or `next_sentence_label` is `None`:
476
+ Outputs a tuple comprising
477
+ - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
478
+ - the next sentence classification logits of shape [batch_size, 2].
479
+
480
+ """
481
+ masked_tokens_mask = labels > 0 if (self.last_layer_subset and labels is not None) else None
482
+ outputs = self.bert(
483
+ input_ids,
484
+ position_ids=position_ids,
485
+ token_type_ids=token_type_ids,
486
+ attention_mask=attention_mask.bool() if attention_mask is not None else None,
487
+ masked_tokens_mask=masked_tokens_mask,
488
+ )
489
+ sequence_output, pooled_output = outputs.last_hidden_state, outputs.pooler_output
490
+ if self.dense_seq_output and labels is not None:
491
+ masked_token_idx = torch.nonzero(labels.flatten() > 0, as_tuple=False).flatten()
492
+ if not self.last_layer_subset:
493
+ sequence_output = index_first_axis(
494
+ rearrange(sequence_output, "b s d -> (b s) d"), masked_token_idx
495
+ )
496
+ prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
497
+
498
+ total_loss = None
499
+ if labels is not None and next_sentence_label is not None:
500
+ if (
501
+ self.dense_seq_output and labels is not None
502
+ ): # prediction_scores are already flattened
503
+ masked_lm_loss = self.mlm_loss(
504
+ prediction_scores, labels.flatten()[masked_token_idx]
505
+ )
506
+ else:
507
+ masked_lm_loss = self.mlm_loss(
508
+ rearrange(prediction_scores, "... v -> (...) v"),
509
+ rearrange(labels, "... -> (...)"),
510
+ )
511
+ next_sentence_loss = self.nsp_loss(
512
+ rearrange(seq_relationship_score, "... t -> (...) t"),
513
+ rearrange(next_sentence_label, "... -> (...)"),
514
+ )
515
+ total_loss = masked_lm_loss.float() + next_sentence_loss.float()
516
+
517
+ return BertForPreTrainingOutput(
518
+ loss=total_loss,
519
+ prediction_logits=prediction_scores,
520
+ seq_relationship_logits=seq_relationship_score,
521
+ )
522
+
523
+
524
+ def remap_state_dict(state_dict, config: PretrainedConfig):
525
+ """
526
+ Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
527
+ """
528
+
529
+ # LayerNorm
530
+ def key_mapping_ln_gamma_beta(key):
531
+ key = re.sub(r"LayerNorm.gamma$", "LayerNorm.weight", key)
532
+ key = re.sub(r"LayerNorm.beta$", "LayerNorm.bias", key)
533
+ return key
534
+
535
+ state_dict = OrderedDict((key_mapping_ln_gamma_beta(k), v) for k, v in state_dict.items())
536
+
537
+ # Layers
538
+ def key_mapping_layers(key):
539
+ return re.sub(r"^bert.encoder.layer.", "bert.encoder.layers.", key)
540
+
541
+ state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
542
+
543
+ # LayerNorm
544
+ def key_mapping_ln(key):
545
+ key = re.sub(r"^bert.embeddings.LayerNorm.", "bert.emb_ln.", key)
546
+ key = re.sub(
547
+ r"^bert.encoder.layers.(\d+).attention.output.LayerNorm.(weight|bias)",
548
+ r"bert.encoder.layers.\1.norm1.\2",
549
+ key,
550
+ )
551
+ key = re.sub(
552
+ r"^bert.encoder.layers.(\d+).output.LayerNorm.(weight|bias)",
553
+ r"bert.encoder.layers.\1.norm2.\2",
554
+ key,
555
+ )
556
+ key = re.sub(
557
+ r"^cls.predictions.transform.LayerNorm.(weight|bias)",
558
+ r"cls.predictions.transform.layer_norm.\1",
559
+ key,
560
+ )
561
+ return key
562
+
563
+ state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
564
+
565
+ # MLP
566
+ def key_mapping_mlp(key):
567
+ key = re.sub(
568
+ r"^bert.encoder.layers.(\d+).intermediate.dense.(weight|bias)",
569
+ r"bert.encoder.layers.\1.mlp.fc1.\2",
570
+ key,
571
+ )
572
+ key = re.sub(
573
+ r"^bert.encoder.layers.(\d+).output.dense.(weight|bias)",
574
+ r"bert.encoder.layers.\1.mlp.fc2.\2",
575
+ key,
576
+ )
577
+ return key
578
+
579
+ state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
580
+
581
+ # Attention
582
+ last_layer_subset = getattr(config, "last_layer_subset", False)
583
+ for d in range(config.num_hidden_layers):
584
+ Wq = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.weight")
585
+ Wk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.weight")
586
+ Wv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.weight")
587
+ bq = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.bias")
588
+ bk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.bias")
589
+ bv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.bias")
590
+ if not (last_layer_subset and d == config.num_hidden_layers - 1):
591
+ state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.weight"] = torch.cat(
592
+ [Wq, Wk, Wv], dim=0
593
+ )
594
+ state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.bias"] = torch.cat([bq, bk, bv], dim=0)
595
+ else:
596
+ state_dict[f"bert.encoder.layers.{d}.mixer.Wq.weight"] = Wq
597
+ state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.weight"] = torch.cat([Wk, Wv], dim=0)
598
+ state_dict[f"bert.encoder.layers.{d}.mixer.Wq.bias"] = bq
599
+ state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.bias"] = torch.cat([bk, bv], dim=0)
600
+
601
+ def key_mapping_attn(key):
602
+ return re.sub(
603
+ r"^bert.encoder.layers.(\d+).attention.output.dense.(weight|bias)",
604
+ r"bert.encoder.layers.\1.mixer.out_proj.\2",
605
+ key,
606
+ )
607
+
608
+ state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
609
+
610
+ def key_mapping_decoder_bias(key):
611
+ return re.sub(r"^cls.predictions.bias", "cls.predictions.decoder.bias", key)
612
+
613
+ state_dict = OrderedDict((key_mapping_decoder_bias(k), v) for k, v in state_dict.items())
614
+
615
+ # Word embedding
616
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
617
+ if pad_vocab_size_multiple > 1:
618
+ word_embeddings = state_dict["bert.embeddings.word_embeddings.weight"]
619
+ state_dict["bert.embeddings.word_embeddings.weight"] = F.pad(
620
+ word_embeddings, (0, 0, 0, config.vocab_size - word_embeddings.shape[0])
621
+ )
622
+ decoder_weight = state_dict["cls.predictions.decoder.weight"]
623
+ state_dict["cls.predictions.decoder.weight"] = F.pad(
624
+ decoder_weight, (0, 0, 0, config.vocab_size - decoder_weight.shape[0])
625
+ )
626
+ # If the vocab was padded, we want to set the decoder bias for those padded indices to be
627
+ # strongly negative (i.e. the decoder shouldn't predict those indices).
628
+ # TD [2022-05-09]: I don't think it affects the MLPerf training.
629
+ decoder_bias = state_dict["cls.predictions.decoder.bias"]
630
+ state_dict["cls.predictions.decoder.bias"] = F.pad(
631
+ decoder_bias, (0, config.vocab_size - decoder_bias.shape[0]), value=-100.0
632
+ )
633
+
634
+ return state_dict
635
+
636
+
637
def inv_remap_state_dict(state_dict, config: PretrainedConfig):
    """
    Map the state_dict of a flash_attn model to be Huggingface BERT compatible.

    This function is meant to be the inverse of remap_state_dict.
    """
    # Word embedding: undo the vocab-size padding applied by remap_state_dict by
    # truncating the embedding / decoder rows back to the original vocabulary size.
    # NOTE(review): assumes `config.orig_vocab_size` was recorded when the vocab was
    # padded -- confirm against the forward remap's caller.
    pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
    if pad_vocab_size_multiple > 1:
        word_embeddings = state_dict["bert.embeddings.word_embeddings.weight"]
        decoder_weight = state_dict["cls.predictions.decoder.weight"]
        decoder_bias = state_dict["cls.predictions.decoder.bias"]
        # unpad embeddings
        state_dict["bert.embeddings.word_embeddings.weight"] = word_embeddings[
            : config.orig_vocab_size, :
        ]
        state_dict["cls.predictions.decoder.weight"] = decoder_weight[: config.orig_vocab_size, :]
        state_dict["cls.predictions.decoder.bias"] = decoder_bias[: config.orig_vocab_size]

    # Attention: split the fused projections back into separate query/key/value.
    for d in range(config.num_hidden_layers):
        last_layer_subset = getattr(config, "last_layer_subset", False)
        if not last_layer_subset or d != (config.num_hidden_layers - 1):
            # The fused Wqkv stacks rows as [q | k | v], each one third of dim 0.
            Wqkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.weight")
            Wqkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.bias")
            state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = Wqkv_weights[
                : Wqkv_weights.shape[0] // 3, :
            ]
            state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = Wqkv_weights[
                Wqkv_weights.shape[0] // 3 : 2 * Wqkv_weights.shape[0] // 3, :
            ]
            state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = Wqkv_weights[
                2 * Wqkv_weights.shape[0] // 3 :, :
            ]
            state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wqkv_biases[
                : Wqkv_biases.shape[0] // 3
            ]
            state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wqkv_biases[
                Wqkv_biases.shape[0] // 3 : 2 * Wqkv_biases.shape[0] // 3
            ]
            state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = Wqkv_biases[
                2 * Wqkv_biases.shape[0] // 3 :
            ]
        else:
            # With last_layer_subset, the final layer keeps Q separate and only
            # fuses [k | v] (each one half of dim 0).
            Wq_weight = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.weight")
            Wkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.weight")
            Wq_bias = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.bias")
            Wkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.bias")
            state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = Wq_weight
            state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = Wkv_weights[
                : Wkv_weights.shape[0] // 2, :
            ]
            state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = Wkv_weights[
                Wkv_weights.shape[0] // 2 :, :
            ]
            state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wq_bias
            state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wkv_biases[
                : Wkv_biases.shape[0] // 2
            ]
            state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = Wkv_biases[
                Wkv_biases.shape[0] // 2 :
            ]

    # The key renames below invert the renames done in remap_state_dict.
    def inv_key_mapping_ln(key):
        key = re.sub(r"bert.emb_ln.", "bert.embeddings.LayerNorm.", key)
        key = re.sub(
            r"bert.encoder.layers.(\d+).norm1.(weight|bias)",
            r"bert.encoder.layers.\1.attention.output.LayerNorm.\2",
            key,
        )
        key = re.sub(
            r"bert.encoder.layers.(\d+).norm2.(weight|bias)",
            r"bert.encoder.layers.\1.output.LayerNorm.\2",
            key,
        )
        key = re.sub(
            r"cls.predictions.transform.layer_norm.(weight|bias)",
            r"cls.predictions.transform.LayerNorm.\1",
            key,
        )
        return key

    def inv_key_mapping_ln_gamma_beta(key):
        # Rename LayerNorm weight/bias to the gamma/beta key names.
        key = re.sub(r"LayerNorm.weight$", "LayerNorm.gamma", key)
        key = re.sub(r"LayerNorm.bias$", "LayerNorm.beta", key)
        return key

    def inv_key_mapping_layers(key):
        return re.sub(r"bert.encoder.layers.", "bert.encoder.layer.", key)

    def inv_key_mapping_mlp(key):
        key = re.sub(
            r"bert.encoder.layer.(\d+).mlp.fc1.(weight|bias)",
            r"bert.encoder.layer.\1.intermediate.dense.\2",
            key,
        )
        key = re.sub(
            r"bert.encoder.layer.(\d+).mlp.fc2.(weight|bias)",
            r"bert.encoder.layer.\1.output.dense.\2",
            key,
        )
        return key

    def inv_key_mapping_attn(key):
        return re.sub(
            r"bert.encoder.layer.(\d+).mixer.out_proj.(weight|bias)",
            r"bert.encoder.layer.\1.attention.output.dense.\2",
            key,
        )

    def inv_key_mapping_decoder_bias(key):
        return re.sub(r"cls.predictions.decoder.bias", "cls.predictions.bias", key)

    # The mappings are applied in sequence and the order matters: the
    # "layers" -> "layer" rename must run before the mlp/attn renames, whose
    # patterns match the singular "layer." form.
    state_dict = OrderedDict((inv_key_mapping_ln(key), value) for key, value in state_dict.items())
    state_dict = OrderedDict(
        (inv_key_mapping_ln_gamma_beta(key), value) for key, value in state_dict.items()
    )
    state_dict = OrderedDict(
        (inv_key_mapping_layers(key), value) for key, value in state_dict.items()
    )
    state_dict = OrderedDict((inv_key_mapping_mlp(key), value) for key, value in state_dict.items())
    state_dict = OrderedDict(
        (inv_key_mapping_attn(key), value) for key, value in state_dict.items()
    )
    state_dict = OrderedDict(
        (inv_key_mapping_decoder_bias(key), value) for key, value in state_dict.items()
    )

    return state_dict
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/bigcode.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import re
3
+ from collections import OrderedDict
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from transformers import GPT2Config, GPTBigCodeConfig, PretrainedConfig
8
+
9
+
10
def remap_state_dict_hf_bigcode(state_dict, config: PretrainedConfig):
    """
    Map the state_dict of a Huggingface BigCode model to be flash_attn compatible.
    """

    # Word embedding and position embedding
    def key_mapping_pos_emb(key):
        return re.sub(r"^transformer.wpe.", "transformer.embeddings.position_embeddings.", key)

    state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items())
    word_embeddings = state_dict.pop("transformer.wte.weight")
    # It's possible that vocab_size is padded to be a multiple of 8, for example.
    pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
    vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
    state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
        word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
    )
    # The LM head is tied to the (padded) input embeddings.
    state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]

    # LayerNorm
    def key_mapping_ln(key):
        # The ln_f sub rewrites the key to itself (kept for symmetry with the
        # other model remap functions).
        key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
        key = re.sub(
            r"^transformer.h.(\d+).ln_(1|2).(weight|bias)",
            r"transformer.layers.\1.norm\2.\3",
            key,
        )
        return key

    state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())

    # MLP: c_fc -> fc1, c_proj -> fc2 (pure key renames, no tensor changes).
    def key_mapping_mlp(key):
        key = re.sub(
            r"^transformer.h.(\d+).mlp.c_fc.weight",
            r"transformer.layers.\1.mlp.fc1.weight",
            key,
        )
        key = re.sub(
            r"^transformer.h.(\d+).mlp.c_proj.weight",
            r"transformer.layers.\1.mlp.fc2.weight",
            key,
        )
        key = re.sub(
            r"^transformer.h.(\d+).mlp.c_fc.bias",
            r"transformer.layers.\1.mlp.fc1.bias",
            key,
        )
        key = re.sub(
            r"^transformer.h.(\d+).mlp.c_proj.bias",
            r"transformer.layers.\1.mlp.fc2.bias",
            key,
        )
        return key

    state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())

    # TODO: add support for multi-head attention
    assert config.multi_query, "Only multi-query attention is supported"

    # Attention
    for d in range(config.num_hidden_layers):
        embed_dim = config.n_embd
        head_dim = embed_dim // config.n_head

        c_attn_weight = state_dict.pop(f"transformer.h.{d}.attn.c_attn.weight")
        # With multi-query attention the fused projection stacks one full-size Q
        # block plus a single K head and a single V head along dim 0, i.e. shape
        # (embed_dim + head_dim + head_dim, embed_dim).
        # see https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py#L112
        # see also https://github.com/ggerganov/ggml/blob/dd1d575956e54c5bdc07632f25506b3b1884dbd2/examples/starcoder/convert-hf-to-ggml.py#L183
        # ((n_head + 2) * head_dim, embed_dim) -> (3 * n_heads * head_dim, hidden_dim)
        q, k, v = torch.split(c_attn_weight, [embed_dim, head_dim, head_dim], dim=0)
        # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim)
        k = torch.tile(k, (config.n_head, 1))
        v = torch.tile(v, (config.n_head, 1))
        state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = torch.cat((q, k, v), dim=0)

        # same deal with the bias
        c_attn_bias = state_dict.pop(f"transformer.h.{d}.attn.c_attn.bias")
        # (embed_dim + 2 * head_dim,) -> (3 * n_heads * head_dim,)
        q, k, v = torch.split(c_attn_bias, [embed_dim, head_dim, head_dim], dim=0)
        # duplicate k, v along the first axis: (head_dim,) -> (n_heads * head_dim,)
        k = torch.tile(k, (config.n_head,))
        v = torch.tile(v, (config.n_head,))
        state_dict[f"transformer.layers.{d}.mixer.Wqkv.bias"] = torch.cat((q, k, v), dim=0)

    def key_mapping_attn(key):
        key = re.sub(
            r"^transformer.h.(\d+).attn.c_proj.weight",
            r"transformer.layers.\1.mixer.out_proj.weight",
            key,
        )
        key = re.sub(
            r"^transformer.h.(\d+).attn.c_proj.bias",
            r"transformer.layers.\1.mixer.out_proj.bias",
            key,
        )
        return key

    state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())

    return state_dict
110
+
111
+
112
def inv_remap_state_dict_hf_bigcode(state_dict, config: PretrainedConfig):
    """
    Map the state_dict of a flash_attn model to be Huggingface BigCode compatible.

    This function is meant to be the inverse of remap_state_dict_hf_bigcode.
    """

    # Word embedding and position embeddings
    def inv_key_mapping_pos_emb(key):
        return re.sub(r"^transformer.embeddings.position_embeddings.", "transformer.wpe.", key)

    state_dict = OrderedDict((inv_key_mapping_pos_emb(k), v) for k, v in state_dict.items())
    word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")

    # The forward remap pads the vocabulary along dim 0 (extra token rows), via
    # F.pad(word_embeddings, (0, 0, 0, vocab_size - rows)). The inverse therefore
    # truncates rows. (Slicing dim 1 instead would cut the embedding dimension
    # and leave the padded rows in place.)
    word_embeddings = word_embeddings[: config.vocab_size, :]
    state_dict["transformer.wte.weight"] = word_embeddings
    # Input and output embeddings are tied.
    state_dict["lm_head.weight"] = word_embeddings

    # LayerNorm
    def inv_key_mapping_ln(key):
        # The ln_f sub rewrites the key to itself (kept for symmetry with the
        # forward remap).
        key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
        key = re.sub(
            r"^transformer.layers.(\d+).norm(1|2).(weight|bias)",
            r"transformer.h.\1.ln_\2.\3",
            key,
        )
        return key

    state_dict = OrderedDict((inv_key_mapping_ln(k), v) for k, v in state_dict.items())

    # MLPs: fc1 -> c_fc, fc2 -> c_proj (pure key renames).
    def inv_key_mapping_mlp(key):
        key = re.sub(
            r"^transformer.layers.(\d+).mlp.fc1.weight",
            r"transformer.h.\1.mlp.c_fc.weight",
            key,
        )
        key = re.sub(
            r"^transformer.layers.(\d+).mlp.fc2.weight",
            r"transformer.h.\1.mlp.c_proj.weight",
            key,
        )
        key = re.sub(
            r"^transformer.layers.(\d+).mlp.fc1.bias",
            r"transformer.h.\1.mlp.c_fc.bias",
            key,
        )
        key = re.sub(
            r"^transformer.layers.(\d+).mlp.fc2.bias",
            r"transformer.h.\1.mlp.c_proj.bias",
            key,
        )
        return key

    state_dict = OrderedDict((inv_key_mapping_mlp(k), v) for k, v in state_dict.items())

    # Attention
    for d in range(config.num_hidden_layers):
        embed_dim = config.n_embd
        head_dim = embed_dim // config.n_head

        # The flash_attn layout replicated K and V across all heads; keep only the
        # first head's slice to recover the multi-query [q | k | v] layout.
        Wqkv_weight = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.weight")
        q, k, v = torch.split(
            Wqkv_weight, [embed_dim, head_dim * config.n_head, head_dim * config.n_head], dim=0
        )
        c_attn_weight = torch.cat((q, k[:head_dim], v[:head_dim]), dim=0)
        state_dict[f"transformer.h.{d}.attn.c_attn.weight"] = c_attn_weight

        # Same deal with the bias
        Wqkv_bias = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.bias")
        q, k, v = torch.split(
            Wqkv_bias, [embed_dim, head_dim * config.n_head, head_dim * config.n_head], dim=0
        )
        c_attn_bias = torch.cat((q, k[:head_dim], v[:head_dim]), dim=0)
        state_dict[f"transformer.h.{d}.attn.c_attn.bias"] = c_attn_bias

    def inv_key_mapping_attn(key):
        key = re.sub(
            r"^transformer.layers.(\d+).mixer.out_proj.weight",
            r"transformer.h.\1.attn.c_proj.weight",
            key,
        )
        key = re.sub(
            r"^transformer.layers.(\d+).mixer.out_proj.bias",
            r"transformer.h.\1.attn.c_proj.bias",
            key,
        )
        return key

    state_dict = OrderedDict((inv_key_mapping_attn(k), v) for k, v in state_dict.items())

    return state_dict
204
+
205
+
206
def bigcode_config_to_gpt2_config(bigcode_config: GPTBigCodeConfig) -> GPT2Config:
    """Build a ``GPT2Config`` mirroring the given ``GPTBigCodeConfig``.

    Every field consumed here carries the same name on both config classes, so
    the translation is a straight field-by-field copy.
    """
    copied_fields = (
        "activation_function",
        "attn_pdrop",
        "bos_token_id",
        "embd_pdrop",
        "eos_token_id",
        "initializer_range",
        "layer_norm_epsilon",
        "max_batch_size",
        "max_sequence_length",
        "model_type",
        "multi_query",
        "n_embd",
        "n_head",
        "n_inner",
        "n_layer",
        "n_positions",
        "resid_pdrop",
        "scale_attn_weights",
        "summary_activation",
        "summary_first_dropout",
        "summary_proj_to_labels",
        "summary_type",
        "summary_use_proj",
        "use_cache",
        "vocab_size",
    )
    return GPT2Config(**{name: getattr(bigcode_config, name) for name in copied_fields})
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/btlm.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, Tri Dao.
2
+
3
+ import math
4
+ import json
5
+ import re
6
+ from pathlib import Path
7
+
8
+ from collections import OrderedDict
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+
13
+ from einops import rearrange
14
+ from transformers import GPT2Config, AutoConfig, PretrainedConfig
15
+
16
+
17
def remap_state_dict_hf_btlm(state_dict, config):
    """Convert a Huggingface BTLM checkpoint state_dict into the flash_attn layout.

    Returns a new OrderedDict. Conv1D-style (in, out) projection weights are
    transposed into nn.Linear (out, in) layout, the two gated-MLP input
    projections are fused into a single fc1, and the stored Alibi slopes buffer
    is dropped.
    """
    # Absolute position embeddings only exist for non-Alibi checkpoints.
    if "transformer.wpe.weight" in state_dict:
        state_dict = OrderedDict(
            (
                re.sub(r"^transformer.wpe.", "transformer.embeddings.position_embeddings.", name),
                tensor,
            )
            for name, tensor in state_dict.items()
        )

    # Word embeddings, padded up to a multiple of pad_vocab_size_multiple.
    embeddings = state_dict.pop("transformer.wte.weight")
    multiple = getattr(config, "pad_vocab_size_multiple", 1)
    padded_vocab = math.ceil(config.vocab_size / multiple) * multiple
    padded_embeddings = F.pad(embeddings, (0, 0, 0, padded_vocab - embeddings.shape[0]))
    state_dict["transformer.embeddings.word_embeddings.weight"] = padded_embeddings
    # LM head is tied to the (padded) input embeddings.
    state_dict["lm_head.weight"] = padded_embeddings

    # LayerNorms: ln_1/ln_2 -> norm1/norm2 (ln_f keeps its name).
    state_dict = OrderedDict(
        (
            re.sub(
                r"^transformer.h.(\d+).ln_(1|2).(weight|bias)",
                r"transformer.layers.\1.norm\2.\3",
                name,
            ),
            tensor,
        )
        for name, tensor in state_dict.items()
    )

    # Gated MLP: fuse c_fc (W1) and c_fc2 (W3) into one fc1, transposing from
    # Conv1D (in, out) layout to Linear (out, in) layout.
    for layer in range(config.num_hidden_layers):
        src = f"transformer.h.{layer}.mlp"
        dst = f"transformer.layers.{layer}.mlp"
        w1 = state_dict.pop(f"{src}.c_fc.weight")
        w3 = state_dict.pop(f"{src}.c_fc2.weight")
        state_dict[f"{dst}.fc1.weight"] = torch.cat([w1.t(), w3.t()], dim=0)
        b1 = state_dict.pop(f"{src}.c_fc.bias")
        b3 = state_dict.pop(f"{src}.c_fc2.bias")
        state_dict[f"{dst}.fc1.bias"] = torch.cat([b1, b3], dim=0)
        state_dict[f"{dst}.fc2.weight"] = state_dict.pop(f"{src}.c_proj.weight").t()

    # The fc2 bias needs no tensor change, only a key rename.
    state_dict = OrderedDict(
        (
            re.sub(
                r"^transformer.h.(\d+).mlp.c_proj.bias", r"transformer.layers.\1.mlp.fc2.bias", name
            ),
            tensor,
        )
        for name, tensor in state_dict.items()
    )

    # Attention: transpose the fused QKV and the output projection.
    for layer in range(config.num_hidden_layers):
        wqkv = state_dict.pop(f"transformer.h.{layer}.attn.c_attn.weight")
        state_dict[f"transformer.layers.{layer}.mixer.Wqkv.weight"] = wqkv.t()
        wout = state_dict.pop(f"transformer.h.{layer}.attn.c_proj.weight")
        state_dict[f"transformer.layers.{layer}.mixer.out_proj.weight"] = wout.t()
    # We don't store the Alibi slopes; they are recomputed at runtime.
    state_dict.pop("transformer.relative_pe.slopes")

    def rename_attn_biases(name):
        name = re.sub(
            r"^transformer.h.(\d+).attn.c_attn.bias", r"transformer.layers.\1.mixer.Wqkv.bias", name
        )
        return re.sub(
            r"^transformer.h.(\d+).attn.c_proj.bias",
            r"transformer.layers.\1.mixer.out_proj.bias",
            name,
        )

    state_dict = OrderedDict((rename_attn_biases(k), v) for k, v in state_dict.items())

    return state_dict
76
+
77
+
78
def btlm_config_to_gpt2_config(btlm_config: PretrainedConfig) -> GPT2Config:
    """Translate a BTLM Huggingface config into the GPT2Config used by flash_attn."""
    # BTLM uses Alibi rather than absolute position embeddings; the Alibi code
    # path here is only implemented on top of flash-attn.
    alibi = btlm_config.position_embedding_type == "alibi"
    kwargs = dict(
        vocab_size=btlm_config.vocab_size,
        n_positions=0 if alibi else btlm_config.n_positions,
        n_embd=btlm_config.hidden_size,
        n_layer=btlm_config.num_hidden_layers,
        n_head=btlm_config.num_attention_heads,
        n_inner=btlm_config.n_inner,
        activation_function=btlm_config.activation_function,
        resid_pdrop=btlm_config.resid_pdrop,
        embd_pdrop=btlm_config.embd_pdrop,
        attn_pdrop=btlm_config.attn_pdrop,
        layer_norm_epsilon=btlm_config.layer_norm_epsilon,
        initializer_range=btlm_config.initializer_range,
        bos_token_id=btlm_config.bos_token_id,
        eos_token_id=btlm_config.eos_token_id,
    )
    # The arguments below are flash_attn extensions, not part of the stock GPT2Config.
    kwargs.update(
        use_alibi=alibi,
        use_flash_attn=alibi,  # Alibi code path requires flash_attn
        mup_width_scale=btlm_config.mup_width_scale,
        mup_embeddings_multiplier=btlm_config.mup_embeddings_scale,
        mup_output_multiplier=btlm_config.mup_output_alpha,
        mup_scale_qk_dot_by_d=btlm_config.mup_scale_qk_dot_by_d,
        mlp_multiple_of=1,
    )
    return GPT2Config(**kwargs)
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/falcon.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, Tri Dao.
2
+
3
+ import math
4
+ import re
5
+ from collections import OrderedDict
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from einops import rearrange
10
+ from transformers import FalconConfig, GPT2Config
11
+
12
+
13
def remap_state_dict_hf_falcon(state_dict, config):
    """Map a Huggingface Falcon state_dict to the flash_attn GPT layout.

    ``config`` is the GPT2Config produced by falcon_config_to_gpt2_config.
    """

    def key_mapping_layers(key):
        return re.sub(r"^transformer.h.", "transformer.layers.", key)

    state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())

    # Word embedding
    def key_mapping_emb(key):
        return re.sub(
            r"^transformer.word_embeddings.", "transformer.embeddings.word_embeddings.", key
        )

    state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
    word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
    # It's possible that vocab_size is padded to be a multiple of 8, for example.
    pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
    vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
    state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
        word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
    )
    if getattr(config, "tie_word_embeddings"):
        # Tied embeddings: the LM head shares the (padded) input embedding matrix.
        state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
    else:
        # Untied: pad the separate output embedding (and its bias) the same way.
        output_embeddings = state_dict.pop("lm_head.weight")
        # It's possible that vocab_size is padded to be a multiple of 8, for example.
        state_dict["lm_head.weight"] = F.pad(
            output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
        )
        output_embeddings_bias = state_dict.pop("lm_head.bias")
        state_dict["lm_head.bias"] = F.pad(
            output_embeddings_bias, (0, vocab_size - output_embeddings_bias.shape[0])
        )

    # LayerNorm. Falcon-7B-style checkpoints use input_layernorm /
    # post_attention_layernorm; 40B-style checkpoints use ln_attn / ln_mlp.
    # Both spellings are mapped to norm1 / norm2.
    def key_mapping_ln(key):
        key = re.sub(
            r"^transformer.layers.(\d+).input_layernorm.", r"transformer.layers.\1.norm1.", key
        )
        key = re.sub(
            r"^transformer.layers.(\d+).post_attention_layernorm.",
            r"transformer.layers.\1.norm2.",
            key,
        )
        key = re.sub(r"^transformer.layers.(\d+).ln_attn.", r"transformer.layers.\1.norm1.", key)
        key = re.sub(r"^transformer.layers.(\d+).ln_mlp.", r"transformer.layers.\1.norm2.", key)
        return key

    state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())

    # MLP
    def key_mapping_mlp(key):
        key = re.sub(
            r"^transformer.layers.(\d+).mlp.dense_h_to_4h.", r"transformer.layers.\1.mlp.fc1.", key
        )
        key = re.sub(
            r"^transformer.layers.(\d+).mlp.dense_4h_to_h.", r"transformer.layers.\1.mlp.fc2.", key
        )
        return key

    state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())

    def key_mapping_attn(key):
        key = re.sub(
            r"^transformer.layers.(\d+).self_attention.query_key_value.",
            r"transformer.layers.\1.mixer.Wqkv.",
            key,
        )
        key = re.sub(
            r"^transformer.layers.(\d+).self_attention.dense.",
            r"transformer.layers.\1.mixer.out_proj.",
            key,
        )
        return key

    state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
    n_head = config.n_head
    n_head_kv = getattr(config, "n_head_kv", 1)
    headdim = config.hidden_size // n_head
    for l in range(config.n_layer):
        # The weights are stored in a different layout compared to our implementation:
        # HF Falcon groups rows per KV group as [q ... q, k, v] (n_head // n_head_kv
        # query heads followed by one key and one value head per group). Regroup
        # them into the flash_attn [all q | all k | all v] row order.
        Wqkv = rearrange(
            state_dict.pop(f"transformer.layers.{l}.mixer.Wqkv.weight"),
            "(group ratio headdim) ... -> group ratio headdim ...",
            ratio=n_head // n_head_kv + 2,
            headdim=headdim,
        )
        Wq = rearrange(Wqkv[:, :-2], "group ratio headdim ... -> (group ratio headdim) ...")
        Wk = rearrange(Wqkv[:, [-2]], "group ratio headdim ... -> (group ratio headdim) ...")
        Wv = rearrange(Wqkv[:, [-1]], "group ratio headdim ... -> (group ratio headdim) ...")
        state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0)

    return state_dict
104
+
105
+
106
def falcon_config_to_gpt2_config(falcon_config: FalconConfig) -> GPT2Config:
    """Translate a Huggingface ``FalconConfig`` into flash_attn's extended ``GPT2Config``."""
    # The 40b config uses "n_head_kv" instead of "num_kv_heads".
    kv_heads = getattr(
        falcon_config,
        "n_head_kv",
        1 if getattr(falcon_config, "multi_query", False) else falcon_config.n_head,
    )
    # HACK: the 40b config has 2 LN per layer instead of 1, but that's not
    # reflected in the config. So we have to infer it from the number of heads
    # in the key/value block.
    tied_norm = kv_heads == 1
    kwargs = dict(
        vocab_size=falcon_config.vocab_size,
        n_positions=0,  # No absolute position embedding
        n_embd=falcon_config.hidden_size,
        n_layer=falcon_config.n_layer,
        n_head=falcon_config.n_head,
        n_inner=falcon_config.hidden_size * 4,
        activation_function="gelu",
        resid_pdrop=falcon_config.hidden_dropout,
        embd_pdrop=0.0,  # There doesn't seem to be any embedding dropout
        attn_pdrop=falcon_config.attention_dropout,
        layer_norm_epsilon=falcon_config.layer_norm_epsilon,
        initializer_range=falcon_config.initializer_range,
        bos_token_id=falcon_config.bos_token_id,
        eos_token_id=falcon_config.eos_token_id,
    )
    # The arguments below are flash_attn extensions, not part of the stock GPT2Config.
    kwargs.update(
        parallel_block=falcon_config.parallel_attn,
        n_head_kv=kv_heads,
        parallel_block_tied_norm=tied_norm,
        rotary_emb_fraction=1.0,
        rotary_emb_interleaved=False,
        tie_word_embeddings=True,
        qkv_proj_bias=falcon_config.bias,
        out_proj_bias=falcon_config.bias,
        mlp_fc1_bias=falcon_config.bias,
        mlp_fc2_bias=falcon_config.bias,
        lm_head_bias=False,
    )
    return GPT2Config(**kwargs)
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/gpt.py ADDED
@@ -0,0 +1,1080 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024, Tri Dao.
2
+
3
+ import logging
4
+ import math
5
+ import re
6
+ from collections import OrderedDict, namedtuple
7
+ from collections.abc import Sequence
8
+ from functools import partial
9
+ from typing import Dict, List
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ from einops import rearrange
15
+ from transformers import GPT2Config
16
+
17
+ from flash_attn.models.bigcode import remap_state_dict_hf_bigcode
18
+ from flash_attn.models.falcon import remap_state_dict_hf_falcon
19
+ from flash_attn.models.gpt_neox import remap_state_dict_hf_gpt_neox
20
+ from flash_attn.models.gptj import remap_state_dict_hf_gptj
21
+ from flash_attn.models.llama import remap_state_dict_hf_llama
22
+ from flash_attn.models.opt import remap_state_dict_hf_opt
23
+ from flash_attn.modules.block import Block, ParallelBlock
24
+ from flash_attn.modules.embedding import GPT2Embeddings, ParallelGPT2Embeddings
25
+ from flash_attn.modules.mha import MHA, ParallelMHA
26
+ from flash_attn.modules.mlp import (
27
+ FusedMLP,
28
+ GatedMlp,
29
+ Mlp,
30
+ ParallelFusedMLP,
31
+ ParallelGatedMlp,
32
+ ParallelMLP,
33
+ )
34
+ from flash_attn.ops.activations import sqrelu_fwd
35
+ from flash_attn.utils.distributed import (
36
+ all_gather,
37
+ all_gather_raw,
38
+ get_dim_for_local_rank,
39
+ sync_shared_params,
40
+ )
41
+ from flash_attn.utils.generation import GenerationMixin
42
+ from flash_attn.utils.pretrained import state_dict_from_pretrained
43
+
44
+ try:
45
+ from flash_attn.ops.fused_dense import ColumnParallelLinear
46
+ except ImportError:
47
+ ColumnParallelLinear = None
48
+
49
+ try:
50
+ from flash_attn.ops.triton.mlp import FusedDenseSqreluDense
51
+ except ImportError:
52
+ FusedDenseSqreluDense = None
53
+
54
+ try:
55
+ from flash_attn.ops.triton.layer_norm import layer_norm_fn, RMSNorm
56
+ except ImportError:
57
+ layer_norm_fn, RMSNorm = None, None
58
+
59
+ logger = logging.getLogger(__name__)
60
+
61
+
62
def create_mixer_cls(config, layer_idx=None, process_group=None, device=None, dtype=None):
    """Return a partially-applied constructor for this layer's attention mixer.

    Chooses ``MHA`` (single device) or ``ParallelMHA`` (tensor parallel) and
    bakes in every attention hyperparameter read from ``config``, so the block
    only needs to supply the embedding dimension when instantiating it.
    """
    factory_kwargs = {"device": device, "dtype": dtype}
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    # muP variant: scale the qk dot product by d instead of sqrt(d).
    attn_scale_power = 1.0 if getattr(config, "mup_scale_qk_dot_by_d", False) else 0.5
    if config.scale_attn_weights:
        softmax_scale = head_dim ** (-attn_scale_power)
    else:
        softmax_scale = 1.0
    softmax_scale *= getattr(config, "mup_attn_multiplier", 1.0)
    if config.scale_attn_by_inverse_layer_idx:
        # GPT-2 option: additionally divide attention scores by the 1-based layer index.
        assert layer_idx is not None
        softmax_scale /= float(layer_idx + 1)
    dwconv = getattr(config, "attn_dwconv", False)
    if dwconv:
        assert process_group is None, "TensorParallel MHA does not support dwconv yet"
    fused_bias_fc = getattr(config, "fused_bias_fc", False)
    if not fused_bias_fc:
        assert process_group is None, "TensorParallel MHA requires fused_bias_fc"
    # Common keyword arguments shared by the serial and the tensor-parallel class.
    mixer_kwargs = dict(
        num_heads=config.num_attention_heads,
        num_heads_kv=getattr(config, "n_head_kv", None),
        qkv_proj_bias=getattr(config, "qkv_proj_bias", True),
        out_proj_bias=getattr(config, "out_proj_bias", True),
        dropout=config.attn_pdrop,
        softmax_scale=softmax_scale,
        causal=True,
        layer_idx=layer_idx,
        rotary_emb_dim=int(getattr(config, "rotary_emb_fraction", 0.0) * head_dim),
        rotary_emb_base=getattr(config, "rotary_emb_base", 10000.0),
        rotary_emb_scale_base=getattr(config, "rotary_emb_scale_base", None),
        rotary_emb_interleaved=getattr(config, "rotary_emb_interleaved", False),
        use_alibi=getattr(config, "use_alibi", False),
        window_size=getattr(config, "window_size", (-1, -1)),
        use_flash_attn=getattr(config, "use_flash_attn", False),
    )
    if process_group is None:
        mha_cls = MHA
        mixer_kwargs.update(fused_bias_fc=fused_bias_fc, dwconv=dwconv)
    else:
        mha_cls = ParallelMHA
        mixer_kwargs.update(
            process_group=process_group,
            sequence_parallel=getattr(config, "sequence_parallel", True),
        )
    return partial(mha_cls, **mixer_kwargs, **factory_kwargs)
121
+
122
+
123
def create_mlp_cls(config, layer_idx=None, process_group=None, device=None, dtype=None):
    """Return a partially-applied constructor for this layer's MLP.

    Depending on ``config`` this is a plain ``Mlp``, a ``GatedMlp`` (for
    glu/swiglu/geglu activations), a ``FusedMLP``, or a ``FusedDenseSqreluDense``;
    the ``Parallel*`` variant is chosen whenever ``process_group`` is given.
    """
    factory_kwargs = {"device": device, "dtype": dtype}
    mlp_fc1_bias = getattr(config, "mlp_fc1_bias", True)
    mlp_fc2_bias = getattr(config, "mlp_fc2_bias", True)
    fused_mlp = getattr(config, "fused_mlp", False)
    if fused_mlp:
        # The fused kernel only implements these activations.
        assert config.activation_function in [
            "gelu_new",
            "gelu_fast",
            "gelu_approx",
            "gelu_pytorch_tanh",
            "relu",
            "sqrelu",
        ]
    fused_dense_sqrelu_dense = getattr(config, "fused_dense_sqrelu_dense", False)
    if fused_dense_sqrelu_dense:
        assert config.activation_function == "sqrelu", (
            "fused_dense_sqrelu_dense only " "supports approximate activation_function sqrelu"
        )
    # The two fused variants are mutually exclusive.
    assert not (fused_dense_sqrelu_dense and fused_mlp)
    if not fused_mlp and not fused_dense_sqrelu_dense:
        # Unfused path: plain Mlp or GatedMlp with a torch activation callable.
        assert config.activation_function in [
            "gelu",
            "gelu_new",
            "gelu_fast",
            "gelu_approx",
            "gelu_pytorch_tanh",
            "relu",
            "sqrelu",
            "glu",
            "swiglu",
            "geglu",
        ]
        if config.activation_function in ["glu", "swiglu", "geglu"]:
            # Gated MLPs: the activation is the gate nonlinearity.
            activation = (
                F.sigmoid
                if config.activation_function == "glu"
                else (F.silu if config.activation_function == "swiglu" else F.gelu)
            )
            mlp_cls = GatedMlp if process_group is None else ParallelGatedMlp
            parallel_kwargs = (
                {
                    "process_group": process_group,
                    "sequence_parallel": getattr(config, "sequence_parallel", True),
                }
                if process_group is not None
                else {}
            )
            mlp_multiple_of = getattr(config, "mlp_multiple_of", 128)
            mlp_cls = partial(
                mlp_cls,
                hidden_features=config.n_inner,
                activation=activation,
                bias1=mlp_fc1_bias,
                bias2=mlp_fc2_bias,
                multiple_of=mlp_multiple_of,
                **parallel_kwargs,
                **factory_kwargs,
            )
        else:
            if config.activation_function == "relu":
                activation = partial(F.relu, inplace=True)
            elif config.activation_function == "sqrelu":
                activation = sqrelu_fwd
            else:
                # All "gelu_*" variants map to tanh-approximated GELU; plain "gelu" is exact.
                approximate = (
                    "tanh"
                    if config.activation_function
                    in ["gelu_new", "gelu_fast", "gelu_approx", "gelu_pytorch_tanh"]
                    else "none"
                )
                activation = partial(F.gelu, approximate=approximate)
            mlp_cls = Mlp if process_group is None else ParallelMLP
            parallel_kwargs = (
                {
                    "process_group": process_group,
                    "sequence_parallel": getattr(config, "sequence_parallel", True),
                }
                if process_group is not None
                else {}
            )
            mlp_cls = partial(
                mlp_cls,
                hidden_features=config.n_inner,
                activation=activation,
                bias1=mlp_fc1_bias,
                bias2=mlp_fc2_bias,
                **parallel_kwargs,
                **factory_kwargs,
            )
    else:
        mlp_checkpoint_lvl = getattr(config, "mlp_checkpoint_lvl", 0)
        # mlp_checkpoint_lvl could be a list, which contains the checkpoint_lvl for each layer
        if isinstance(mlp_checkpoint_lvl, Sequence):
            assert layer_idx is not None
            mlp_checkpoint_lvl = mlp_checkpoint_lvl[layer_idx]
        if fused_mlp:
            if FusedMLP is None:
                raise ImportError("fused_dense is not installed")
            # The fused kernel takes the activation by name, not as a callable.
            activation = (
                "gelu_approx"
                if config.activation_function
                in ["gelu_new", "gelu_fast", "gelu_approx", "gelu_pytorch_tanh"]
                else config.activation_function
            )
            mlp_cls = FusedMLP if process_group is None else ParallelFusedMLP
            parallel_kwargs = (
                {
                    "process_group": process_group,
                    "sequence_parallel": getattr(config, "sequence_parallel", True),
                }
                if process_group is not None
                else {}
            )
            mlp_cls = partial(
                mlp_cls,
                hidden_features=config.n_inner,
                activation=activation,
                checkpoint_lvl=mlp_checkpoint_lvl,
                bias1=mlp_fc1_bias,
                bias2=mlp_fc2_bias,
                **parallel_kwargs,
                **factory_kwargs,
            )
        elif fused_dense_sqrelu_dense:
            if process_group is not None:
                assert fused_mlp, "Tensor Parallel is not implemented for FusedDenseSqreluDense"
            assert FusedDenseSqreluDense is not None
            mlp_cls = partial(
                FusedDenseSqreluDense,
                hidden_features=config.n_inner,
                checkpoint_lvl=mlp_checkpoint_lvl,
                **factory_kwargs,
            )
        else:
            raise RuntimeError("MLP type not supported")
    return mlp_cls
260
+
261
+
262
def create_block(config, layer_idx=None, process_group=None, device=None, dtype=None):
    """Construct one transformer block (``Block`` or ``ParallelBlock``) for ``layer_idx``."""
    factory_kwargs = {"device": device, "dtype": dtype}
    sequence_parallel = getattr(config, "sequence_parallel", True)
    mixer_cls = create_mixer_cls(config, layer_idx, process_group=process_group, **factory_kwargs)
    mlp_cls = create_mlp_cls(config, layer_idx, process_group=process_group, **factory_kwargs)
    use_rms_norm = getattr(config, "rms_norm", False)
    norm_cls = partial(
        RMSNorm if use_rms_norm else nn.LayerNorm,
        eps=config.layer_norm_epsilon,
        **factory_kwargs,
    )
    # TD [2022-07-30]: Force residual in fp32, seems to make fp16 training more stable
    residual_in_fp32 = getattr(config, "residual_in_fp32", False)
    # Because dropout/add/norm are reordered, the first block's "resid" dropout is
    # actually the embedding dropout probability.
    is_first_layer = layer_idx is not None and layer_idx <= 0
    resid_dropout1 = config.embd_pdrop if is_first_layer else config.resid_pdrop
    prenorm = getattr(config, "prenorm", True)
    # Keyword arguments shared by both block flavors.
    common_kwargs = dict(
        norm_cls=norm_cls,
        resid_dropout1=resid_dropout1,
        resid_dropout2=config.resid_pdrop,
        fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False),
        residual_in_fp32=residual_in_fp32,
        sequence_parallel=sequence_parallel and process_group is not None,
        mark_shared_params=process_group is not None,
    )
    if getattr(config, "parallel_block", False):
        # GPT-J / GPT-NeoX style: attention and MLP run in parallel, prenorm only.
        assert prenorm
        block = ParallelBlock(
            config.hidden_size,
            mixer_cls,
            mlp_cls,
            tied_norm=getattr(config, "parallel_block_tied_norm", False),
            **common_kwargs,
        )
    else:
        block = Block(
            config.hidden_size,
            mixer_cls,
            mlp_cls,
            prenorm=prenorm,
            **common_kwargs,
        )
    block.layer_idx = layer_idx
    return block
309
+
310
+
311
class GPTPreTrainedModel(nn.Module):
    """Base class for the GPT models in this file.

    Validates the config type and provides ``from_pretrained``, which downloads
    a HF checkpoint, remaps its state_dict to this implementation's layout, and
    loads it (optionally sharded for tensor parallelism).
    """

    def __init__(self, config, *inputs, **kwargs):
        super().__init__()
        if not isinstance(config, GPT2Config):
            raise ValueError(
                "Parameter config in `{}(config)` should be an instance of class `GPT2Config`. "
                "To create a model from a Google pretrained model use "
                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                    self.__class__.__name__, self.__class__.__name__
                )
            )
        self.config = config

    @classmethod
    def from_pretrained(
        cls,
        model_name,
        config,
        *args,
        strict=True,
        device=None,
        dtype=None,
        world_size=1,
        rank=0,
        **kwargs,
    ):
        """
        Instantiate a GPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
        Download and cache the pre-trained model file if needed.
        """
        # Instantiate the (randomly initialized) model first.
        model = cls(config, *args, device=device, dtype=dtype, **kwargs)
        # Load the checkpoint on CPU: the model already lives on the target device
        # and we don't want the raw state_dict taking up extra GPU memory.
        state_dict = state_dict_from_pretrained(model_name, device="cpu", dtype=dtype)
        # Dispatch to the matching HF -> flash_attn state_dict remapper by name prefix.
        remappers = (
            (("gpt2",), remap_state_dict_hf_gpt2),
            (("facebook/opt",), remap_state_dict_hf_opt),
            (("EleutherAI/gpt-j-", "togethercomputer/GPT-JT-"), remap_state_dict_hf_gptj),
            (
                (
                    "EleutherAI/gpt-neox-",
                    "EleutherAI/pythia-",
                    "togethercomputer/RedPajama-INCITE-",
                ),
                remap_state_dict_hf_gpt_neox,
            ),
            (("tiiuae/falcon-",), remap_state_dict_hf_falcon),
            (("meta-llama/Llama-",), remap_state_dict_hf_llama),
            (("bigcode/", "WizardLM/"), remap_state_dict_hf_bigcode),
        )
        for prefixes, remap_fn in remappers:
            if model_name.startswith(prefixes):
                state_dict = remap_fn(state_dict, config)
                break
        else:
            raise NotImplementedError(f"Model {model_name} not supported")
        if world_size > 1:
            # Keep only this rank's shard of each tensor-parallel parameter.
            state_dict = shard_state_dict_tp(state_dict, config, world_size, rank)
        load_return = model.load_state_dict(state_dict, strict=strict)
        logger.info(load_return)
        return model
377
+
378
+
379
+ # https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
380
def _init_weights(
    module, n_layer, initializer_range=0.02, mup_width_scale=1.0, rescale_prenorm_residual=True
):
    """Initialize one module GPT-2 style (intended for ``model.apply``).

    Linear weights get N(0, initializer_range * sqrt(mup_width_scale)) and a
    ``_optim`` learning-rate-multiplier hint for muP; biases are zeroed.
    Embeddings get N(0, initializer_range). When ``rescale_prenorm_residual``
    is set, residual-output projections (``out_proj.weight``, ``fc2.weight``)
    are re-drawn with std shrunk by sqrt(2 * n_layer), per the GPT-2 paper.
    """
    mup_init_scale = math.sqrt(mup_width_scale)
    if isinstance(module, nn.Linear):
        nn.init.normal_(module.weight, std=initializer_range * mup_init_scale)
        # Attach (or update in place) the muP optimizer hint on the weight tensor.
        optim_cfg = getattr(module.weight, "_optim", {})
        optim_cfg["lr_multiplier"] = mup_width_scale
        module.weight._optim = optim_cfg
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, std=initializer_range)

    if rescale_prenorm_residual:
        # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
        # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
        # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
        # > -- GPT-2 :: https://openai.com/blog/better-language-models/
        #
        # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
        residual_std = initializer_range * mup_init_scale / math.sqrt(2 * n_layer)
        for name, param in module.named_parameters():
            # There are 2 residual projections (attn out_proj, MLP fc2) per block.
            if name in ("out_proj.weight", "fc2.weight"):
                nn.init.normal_(param, mean=0.0, std=residual_std)
407
+
408
+
409
class GPTModel(GPTPreTrainedModel):
    """Decoder-only transformer backbone: token (+ optional position) embeddings,
    a stack of transformer blocks, and — for pre-norm models — a final
    dropout + layer norm.

    Supports tensor parallelism when ``process_group`` is given, optionally with
    sequence parallelism (``config.sequence_parallel``, default True).
    """

    def __init__(self, config: GPT2Config, process_group=None, device=None, dtype=None):
        super().__init__(config)
        factory_kwargs = {"device": device, "dtype": dtype}
        self.process_group = process_group
        self.sequence_parallel = getattr(config, "sequence_parallel", True)
        assert config.activation_function in [
            "gelu",
            "gelu_new",
            "gelu_fast",
            "gelu_approx",
            "gelu_pytorch_tanh",
            "relu",
            "sqrelu",
            "glu",
            "swiglu",
            "geglu",
        ]
        # Pad the vocabulary so it is a multiple of pad_vocab_size_multiple
        # (needed e.g. for even sharding under tensor parallelism).
        pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
        vocab_size = (
            math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
        )
        # muP: optional scalar applied to the embedding output in forward().
        self.embeddings_multiplier = getattr(config, "mup_embeddings_multiplier", 1.0)
        # TD [2022-07-30]: Force residual in fp32, seems to make fp16 training more stable
        self.residual_in_fp32 = getattr(config, "residual_in_fp32", False)
        # These 2 options are for OPT-350m
        self.prenorm = getattr(config, "prenorm", True)
        use_rms_norm = getattr(config, "rms_norm", False)
        word_embed_proj_dim = getattr(config, "word_embed_proj_dim", None)
        # For GPT-J, GPT-NeoX
        self.parallel_block = getattr(config, "parallel_block", False)

        if process_group is None:
            self.embeddings = GPT2Embeddings(
                config.hidden_size,
                vocab_size,
                config.max_position_embeddings,
                word_embed_proj_dim=word_embed_proj_dim,
                **factory_kwargs,
            )
        else:
            self.embeddings = ParallelGPT2Embeddings(
                config.hidden_size,
                vocab_size,
                config.max_position_embeddings,
                process_group=process_group,
                sequence_parallel=self.sequence_parallel,
                **factory_kwargs,
            )

        # We change the order of dropout, residual and layer norm:
        # Instead of LN -> Attn / MLP -> Dropout -> Add, we do:
        # Dropout -> Add -> LN -> Attn / MLP, returning both the residual branch (output of Add) and
        # the main branch (output of MLP). The model definition is unchanged, but the mapping of the
        # nn.Dropout probabilities are changed.
        # This is for performance reason: we can fuse dropout + add + layer_norm.
        self.layers = nn.ModuleList(
            [
                create_block(config, layer_idx=i, process_group=process_group, **factory_kwargs)
                for i in range(config.num_hidden_layers)
            ]
        )
        rotary_emb_fraction = getattr(config, "rotary_emb_fraction", 0.0)
        if rotary_emb_fraction > 0.0:  # Tie all the RotaryEmbedding modules to share the same cos/sin cache
            for layer in self.layers[1:]:
                layer.mixer.rotary_emb = self.layers[0].mixer.rotary_emb

        self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
        if self.fused_dropout_add_ln:
            if layer_norm_fn is None:
                raise ImportError("Triton is not installed")
        if self.prenorm:
            # Final dropout + norm applied after the last block (pre-norm models only).
            self.drop_f = nn.Dropout(config.resid_pdrop)
            norm_cls = nn.LayerNorm if not use_rms_norm else RMSNorm
            self.ln_f = norm_cls(
                config.hidden_size, eps=config.layer_norm_epsilon, **factory_kwargs
            )
        if process_group is not None:
            for p in self.ln_f.parameters():
                # Mark the norm parameters as "shared_params" so that we sync their values at init.
                p._shared_params = True
                # Mark the norm params as "sequence_parallel" so we run all-reduce on their grads.
                if self.sequence_parallel:
                    p._sequence_parallel = True

        self.apply(
            partial(
                _init_weights,
                n_layer=config.num_hidden_layers,
                initializer_range=config.initializer_range,
                mup_width_scale=getattr(config, "mup_width_scale", 1.0),
            )
        )
        self.tie_weights()

    def tie_weights(self):
        # Under tensor parallelism, make the shared params (e.g. final norm)
        # identical across ranks at init.
        if self.process_group is not None:
            sync_shared_params(self, self.process_group)

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        # One KV-cache entry per layer, keyed by the layer index.
        return {
            i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
            for i, layer in enumerate(self.layers)
        }

    def forward(self, input_ids, position_ids=None, inference_params=None):
        """Run the backbone and return the final hidden states.

        input_ids: (batch, seqlen) int tensor.
        position_ids: optional explicit positions for the position embeddings.
        inference_params: KV-cache state for incremental decoding, or None.
        """
        # If using Tensor Parallel with sequence parallel, we combine the batch and the seqlen
        # dimensions so that we can split on it easily, in case of small batch size.
        # Only the attention layers need to know the seqlen.
        embedding_kwargs = (
            {"combine_batch_seqlen_dim": True}
            if self.process_group is not None and self.sequence_parallel
            else {}
        )
        hidden_states = self.embeddings(input_ids, position_ids=position_ids, **embedding_kwargs)
        if self.embeddings_multiplier != 1.0:
            hidden_states = hidden_states * self.embeddings_multiplier
        if self.parallel_block:
            # Second stream for GPT-J-style parallel attention + MLP blocks.
            hidden_states2 = None
        residual = None
        mixer_kwargs = (
            {"seqlen": input_ids.shape[1]}
            if self.process_group is not None and self.sequence_parallel
            else {}
        )
        if inference_params is not None:
            mixer_kwargs["inference_params"] = inference_params
        for layer in self.layers:
            if self.prenorm:
                if not self.parallel_block:
                    hidden_states, residual = layer(
                        hidden_states, residual, mixer_kwargs=mixer_kwargs
                    )
                else:
                    hidden_states, hidden_states2, residual = layer(
                        hidden_states, hidden_states2, residual, mixer_kwargs=mixer_kwargs
                    )
            else:
                hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
        if self.prenorm:
            if not self.fused_dropout_add_ln:
                dropped = self.drop_f(hidden_states)
                if not self.parallel_block:
                    residual = (dropped + residual) if residual is not None else dropped
                else:
                    # Parallel blocks emit two streams; both are dropped and summed
                    # into the residual before the final norm.
                    dropped2 = self.drop_f(hidden_states2)
                    residual = (
                        (residual + dropped + dropped2)
                        if residual is not None
                        else dropped + dropped2
                    )
                hidden_states = self.ln_f(residual.to(dtype=self.ln_f.weight.dtype))
            else:
                # Set prenorm=False here since we don't need the residual
                hidden_states = layer_norm_fn(
                    hidden_states,
                    self.ln_f.weight,
                    self.ln_f.bias,
                    residual=residual,
                    x1=None if not self.parallel_block else hidden_states2,
                    eps=self.ln_f.eps,
                    dropout_p=self.drop_f.p if self.training else 0.0,
                    prenorm=False,
                    is_rms_norm=isinstance(self.ln_f, RMSNorm)
                )
        return hidden_states
575
+
576
+
577
class GPTLMHeadModel(GPTPreTrainedModel, GenerationMixin):
    """GPT backbone plus a (possibly tied / column-parallel) language-model head.

    ``GenerationMixin`` supplies the generation loop; this class supplies
    ``forward`` returning ``CausalLMOutput(logits=...)``.
    """

    def __init__(self, config: GPT2Config, process_group=None, device=None, dtype=None):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(config)
        self.process_group = process_group
        self.transformer = GPTModel(config, process_group=process_group, **factory_kwargs)
        self.tie_word_embeddings = getattr(config, "tie_word_embeddings", True)
        lm_head_bias = getattr(config, "lm_head_bias", False)
        # Pad the vocabulary the same way GPTModel does so the head matches the embeddings.
        pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
        vocab_size = (
            math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
        )
        # This option is for OPT-350m
        word_embed_proj_dim = getattr(config, "word_embed_proj_dim", None)
        embed_dim = config.n_embd if word_embed_proj_dim is None else word_embed_proj_dim
        if word_embed_proj_dim is not None:
            self.project_out = nn.Linear(config.n_embd, embed_dim, bias=False, **factory_kwargs)
        else:
            self.project_out = None
        # muP: scale applied to the hidden states right before the LM head.
        mup_width_scale = getattr(config, "mup_width_scale", 1.0)
        mup_output_multiplier = getattr(config, "mup_output_multiplier", 1.0)
        self.output_scale = mup_output_multiplier * mup_width_scale
        if process_group is None:
            self.lm_head = nn.Linear(embed_dim, vocab_size, bias=lm_head_bias, **factory_kwargs)
        else:
            if ColumnParallelLinear is None:
                raise ImportError("fused_dense_lib is not installed")
            self.lm_head = ColumnParallelLinear(
                embed_dim,
                vocab_size,
                process_group,
                bias=lm_head_bias,
                sequence_parallel=getattr(config, "sequence_parallel", True),
                **factory_kwargs,
            )
        # If set, L2-normalize the head weight before computing logits.
        self.norm_head = getattr(config, "norm_head", False)
        # Initialize weights and apply final processing
        self.apply(
            partial(
                _init_weights,
                n_layer=config.num_hidden_layers,
                initializer_range=config.initializer_range,
                mup_width_scale=mup_width_scale,
            )
        )
        self.tie_weights()

    def tie_weights(self):
        # Share the LM head weight with the input word embeddings (GPT-2 style).
        if self.tie_word_embeddings:
            self.lm_head.weight = self.transformer.embeddings.word_embeddings.weight
        if self.process_group is not None:
            sync_shared_params(self, self.process_group)

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        # Delegates to the backbone: one KV-cache entry per layer.
        return self.transformer.allocate_inference_cache(
            batch_size, max_seqlen, dtype=dtype, **kwargs
        )

    def forward(self, input_ids, position_ids=None, inference_params=None, num_last_tokens=0):
        """
        input_ids: (batch, seqlen) int tensor
        inference_params: for generation. Adapted from Megatron-LM (and Apex)
        https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470
        num_last_tokens: if > 0, only return the logits for the last n tokens
        """
        assert (
            input_ids.ndim == 2
        ), f"Expected `input_ids` to have shape [b, slen], but got shape {input_ids.shape}"
        b, slen = input_ids.shape
        hidden_states = self.transformer(
            input_ids, position_ids=position_ids, inference_params=inference_params
        )
        if inference_params is not None:
            assert hidden_states.ndim == 3, "sequence_parallel is not supported in generation mode"
        if num_last_tokens > 0:
            hidden_states = hidden_states[:, -num_last_tokens:]
        if self.project_out is not None:
            hidden_states = self.project_out(hidden_states)
        if self.output_scale != 1.0:
            hidden_states = hidden_states * self.output_scale
        if not self.norm_head:
            lm_logits = self.lm_head(hidden_states)
        else:
            # Normalized-head variant: logits use the L2-normalized head weight.
            lm_head_weight = F.normalize(self.lm_head.weight)
            if isinstance(self.lm_head, ColumnParallelLinear) and self.lm_head.sequence_parallel:
                # Sequence-parallel input is sharded along seqlen; gather it first.
                hidden_states = all_gather(hidden_states, self.lm_head.process_group)
            lm_logits = F.linear(hidden_states, lm_head_weight, bias=self.lm_head.bias)
        # During inference, we want the full logit for sampling
        if isinstance(self.lm_head, ColumnParallelLinear) and inference_params is not None:
            lm_logits, _ = all_gather_raw(lm_logits, self.lm_head.process_group)
            lm_logits = rearrange(lm_logits, "(n b) ... d -> b ... (n d)", b=b)
        CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
        return CausalLMOutput(logits=lm_logits)

    def load_state_dict(self, state_dict, strict=True):
        # Remapping from our checkpoints that used a different ordering of layers in the block
        # Previous: Attn / MLP -> Dropout -> Add -> LN
        # Current: Dropout -> Add -> LN -> Attn / MLP
        if "transformer.ln_0.weight" in state_dict:
            n_layers = len(self.transformer.layers)
            # The last layer's norm2 becomes the final norm ln_f.
            ln_weight = state_dict.pop(f"transformer.layers.{n_layers - 1}.norm2.weight")
            ln_bias = state_dict.pop(f"transformer.layers.{n_layers - 1}.norm2.bias")
            state_dict["transformer.ln_f.weight"] = ln_weight
            state_dict["transformer.ln_f.bias"] = ln_bias
            # Shift each norm one position later: layer l's norm1 -> norm2, and
            # layer l-1's norm2 -> layer l's norm1. Iterate in reverse so pops
            # happen before their targets are overwritten.
            for l in reversed(range(n_layers)):
                ln_weight = state_dict.pop(f"transformer.layers.{l}.norm1.weight")
                ln_bias = state_dict.pop(f"transformer.layers.{l}.norm1.bias")
                state_dict[f"transformer.layers.{l}.norm2.weight"] = ln_weight
                state_dict[f"transformer.layers.{l}.norm2.bias"] = ln_bias
                if l > 0:
                    ln_weight = state_dict.pop(f"transformer.layers.{l - 1}.norm2.weight")
                    ln_bias = state_dict.pop(f"transformer.layers.{l - 1}.norm2.bias")
                    state_dict[f"transformer.layers.{l}.norm1.weight"] = ln_weight
                    state_dict[f"transformer.layers.{l}.norm1.bias"] = ln_bias
            # The old pre-stack norm ln_0 becomes layer 0's norm1.
            ln_weight = state_dict.pop("transformer.ln_0.weight")
            ln_bias = state_dict.pop("transformer.ln_0.bias")
            state_dict[f"transformer.layers.0.norm1.weight"] = ln_weight
            state_dict[f"transformer.layers.0.norm1.bias"] = ln_bias
        return super().load_state_dict(state_dict, strict=strict)
696
+
697
+
698
def shard_state_dict_tp(state_dict, config, world_size, rank):
    """Convert the state_dict of a standard GPT model to the state_dict of a GPT model
    with tensor parallel.

    This function modifies state_dict in place.
    """
    pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
    vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
    assert vocab_size % world_size == 0
    assert config.hidden_size % world_size == 0
    inner_dim = config.n_inner if config.n_inner is not None else 4 * config.hidden_size
    assert inner_dim % world_size == 0

    n_head = config.n_head
    # n_head_kv < n_head means multi-query / grouped-query attention.
    n_head_kv = getattr(config, "n_head_kv", n_head)

    embed_dim = config.hidden_size
    head_dim = embed_dim // n_head

    def shard_first_dim(state_dict, key):
        # Keep this rank's contiguous slice along dim 0 (column-parallel weights).
        if key in state_dict:
            x = state_dict[key]
            dim = x.shape[0] // world_size
            state_dict[key] = x[rank * dim : (rank + 1) * dim]

    def shard_last_dim(state_dict, key, multiple_of=1):
        # Keep this rank's slice along the last dim (row-parallel weights);
        # per-rank sizes may be uneven but are kept a multiple of `multiple_of`.
        if key in state_dict:
            x = state_dict[key]
            dim_each_rank = [
                get_dim_for_local_rank(x.size(-1), world_size, local_rank, multiple_of)
                for local_rank in range(world_size)
            ]
            beg, end = tuple(sum(dim_each_rank[:pos]) for pos in (rank, rank + 1))
            state_dict[key] = x[..., beg:end]

    def shard_gatedmlp_fc1_dim(state_dict, key):
        # Gated-MLP fc1 stacks [gate; value] along dim 0: shard each half
        # separately so every rank keeps matching gate/value slices.
        if key in state_dict:
            x = state_dict[key]
            dim = x.shape[0] // world_size // 2
            state_dict[key] = rearrange(
                rearrange(x, "(two o) ... -> two o ...", two=2)[:, rank * dim : (rank + 1) * dim],
                "two o ... -> (two o) ...",
            )

    def shard_qkv_headdim(state_dict, key):
        # Shard the packed Wqkv tensor by attention head, keeping this rank's
        # q heads and (for MQA/GQA) its kv heads.
        if key in state_dict:
            n_head_each_rank = [
                get_dim_for_local_rank(n_head, world_size, local_rank)
                for local_rank in range(world_size)
            ]
            n_head_kv_each_rank = [
                get_dim_for_local_rank(n_head_kv, world_size, local_rank)
                for local_rank in range(world_size)
            ]

            beg_n_head = sum(n_head_each_rank[:rank])
            end_n_head = sum(n_head_each_rank[: rank + 1])

            beg_n_head_kv = sum(n_head_kv_each_rank[:rank])
            end_n_head_kv = sum(n_head_kv_each_rank[: rank + 1])

            if n_head_kv == n_head:
                # Standard MHA: Wqkv is [q; k; v] stacked as (3, embed_dim); the
                # same head slice applies to all three.
                x = rearrange(state_dict[key], "(three d) ... -> three d ...", three=3)
                state_dict[key] = rearrange(
                    x[:, beg_n_head * head_dim : end_n_head * head_dim],
                    "three d ... -> (three d) ...",
                )
            else:
                # MQA/GQA: layout is [q heads | k heads | v heads]; slice each
                # section independently, then re-pack.
                x = rearrange(
                    state_dict[key],
                    "(nheadqkv headdim) ... -> nheadqkv headdim ...",
                    nheadqkv=n_head + 2 * n_head_kv,
                )
                state_dict[key] = rearrange(
                    torch.cat(
                        [
                            x[beg_n_head:end_n_head],
                            x[n_head + beg_n_head_kv : n_head + end_n_head_kv],
                            x[
                                n_head
                                + n_head_kv
                                + beg_n_head_kv : n_head
                                + n_head_kv
                                + end_n_head_kv
                            ],
                        ],
                        dim=0,
                    ),
                    "nheadqkv headdim ... -> (nheadqkv headdim) ...",
                )

    shard_first_dim(state_dict, "transformer.embeddings.word_embeddings.weight")
    if "lm_head.weight" in state_dict:
        shard_first_dim(state_dict, "lm_head.weight")
    if "transformer.embeddings.position_embeddings.weight" in state_dict:
        shard_last_dim(state_dict, "transformer.embeddings.position_embeddings.weight")
    for i in range(config.num_hidden_layers):
        shard_qkv_headdim(state_dict, f"transformer.layers.{i}.mixer.Wqkv.weight")
        shard_qkv_headdim(state_dict, f"transformer.layers.{i}.mixer.Wqkv.bias")
        shard_last_dim(
            state_dict, f"transformer.layers.{i}.mixer.out_proj.weight", multiple_of=head_dim
        )
        # Row-parallel biases are only added on rank 0 (after the all-reduce).
        if rank != 0:
            state_dict.pop(f"transformer.layers.{i}.mixer.out_proj.bias", None)
        if config.activation_function in ["glu", "swiglu", "geglu"]:
            shard_gatedmlp_fc1_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.weight")
            shard_gatedmlp_fc1_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.bias")
        else:
            shard_first_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.weight")
            shard_first_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.bias")
        shard_last_dim(state_dict, f"transformer.layers.{i}.mlp.fc2.weight")
        if rank != 0:
            state_dict.pop(f"transformer.layers.{i}.mlp.fc2.bias", None)
    return state_dict
812
+
813
+
814
+ def combine_state_dicts_tp(state_dicts: List[Dict[str, torch.Tensor]], config: GPT2Config):
815
+ """Convert the list of sharded state_dict of a GPT model with tensor parallel to
816
+ the state_dict of a standard GPT model.
817
+
818
+ This function is meant to be the "reverse" of shard_state_dict_tp.
819
+
820
+ Precondition:
821
+ - state_dicts should be ordered in the same way as the shards were created.
822
+ """
823
+ world_size = len(state_dicts)
824
+ keys = state_dicts[0].keys()
825
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
826
+ vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
827
+ assert vocab_size % world_size == 0
828
+ assert config.hidden_size % world_size == 0
829
+ inner_dim = config.n_inner if config.n_inner is not None else 4 * config.hidden_size
830
+ assert inner_dim % world_size == 0
831
+ assert config.hidden_size % config.n_head == 0
832
+ headdim = config.hidden_size // config.n_head
833
+
834
+ # Sometimes the word embeddings are sharded on the 0th dim, sometimes on the 1st dim.
835
+ # vocab_size // world_size coordinates are nonzero.
836
+ def combine_word_embeddings(state_dicts, state_dict, key):
837
+ dim = 0 if state_dicts[0][key].shape[0] == vocab_size // world_size else 1
838
+ state_dict[key] = torch.cat([s[key] for s in state_dicts], dim=dim)
839
+
840
+ def combine_dim(state_dicts, state_dict, key, dim=-1):
841
+ if key in state_dict:
842
+ state_dict[key] = torch.cat([s[key] for s in state_dicts], dim=dim)
843
+
844
+ def combine_qkv_headdim(state_dicts, state_dict, key):
845
+ n_head = config.n_head
846
+ n_head_kv = getattr(config, "n_head_kv", n_head)
847
+ if key in state_dict:
848
+ if n_head_kv == n_head:
849
+ xs = [
850
+ rearrange(s[key], "(three d) ... -> three d ...", three=3) for s in state_dicts
851
+ ]
852
+ state_dict[key] = rearrange(torch.cat(xs, dim=1), "three d ... -> (three d) ...")
853
+ else:
854
+ n_head_each_rank = [
855
+ get_dim_for_local_rank(n_head, world_size, local_rank)
856
+ for local_rank in range(world_size)
857
+ ]
858
+ n_head_kv_each_rank = [
859
+ get_dim_for_local_rank(n_head_kv, world_size, local_rank)
860
+ for local_rank in range(world_size)
861
+ ]
862
+ xs = [
863
+ rearrange(
864
+ s[key],
865
+ "(nheadqkv headdim) ... -> nheadqkv headdim ...",
866
+ nheadqkv=rank_n_head + 2 * rank_n_head_kv,
867
+ headdim=headdim,
868
+ )
869
+ for s, rank_n_head, rank_n_head_kv in zip(
870
+ state_dicts, n_head_each_rank, n_head_kv_each_rank
871
+ )
872
+ ]
873
+ wq = torch.cat([x[: n_head_each_rank[rank]] for rank, x in enumerate(xs)], dim=0)
874
+ wk = torch.cat(
875
+ [
876
+ x[
877
+ n_head_each_rank[rank] : n_head_each_rank[rank]
878
+ + n_head_kv_each_rank[rank]
879
+ ]
880
+ for rank, x in enumerate(xs)
881
+ ],
882
+ dim=0,
883
+ )
884
+ wv = torch.cat(
885
+ [
886
+ x[n_head_each_rank[rank] + n_head_kv_each_rank[rank] :]
887
+ for rank, x in enumerate(xs)
888
+ ],
889
+ dim=0,
890
+ )
891
+ wqkv = torch.cat(
892
+ [wq, wk, wv],
893
+ dim=0,
894
+ )
895
+ state_dict[key] = rearrange(
896
+ wqkv,
897
+ "nheadqkv headdim ... -> (nheadqkv headdim) ...",
898
+ )
899
+
900
+ def combine_gated_mlp(state_dicts, state_dict, key):
901
+ if key in state_dict:
902
+ xs = [rearrange(s[key], "(two d) ... -> two d ...", two=2) for s in state_dicts]
903
+ state_dict[key] = rearrange(torch.cat(xs, dim=1), "two d ... -> (two d) ...")
904
+
905
+ state_dict = state_dicts[0].copy() # don't modify state_dict[0] inplace
906
+ combine_word_embeddings(
907
+ state_dicts, state_dict, "transformer.embeddings.word_embeddings.weight"
908
+ )
909
+ if "lm_head.weight" in state_dict:
910
+ combine_word_embeddings(state_dicts, state_dict, "lm_head.weight")
911
+ if "transformer.embeddings.position_embeddings.weight" in state_dict:
912
+ combine_dim(
913
+ state_dicts, state_dict, "transformer.embeddings.position_embeddings.weight", -1
914
+ )
915
+ mlp_combine_fn = (
916
+ combine_gated_mlp
917
+ if config.activation_function in ["glu", "swiglu", "geglu"]
918
+ else partial(combine_dim, dim=0)
919
+ )
920
+ for i in range(config.num_hidden_layers):
921
+ combine_qkv_headdim(state_dicts, state_dict, f"transformer.layers.{i}.mixer.Wqkv.weight")
922
+ combine_qkv_headdim(state_dicts, state_dict, f"transformer.layers.{i}.mixer.Wqkv.bias")
923
+ combine_dim(state_dicts, state_dict, f"transformer.layers.{i}.mixer.out_proj.weight", -1)
924
+ mlp_combine_fn(state_dicts, state_dict, f"transformer.layers.{i}.mlp.fc1.weight")
925
+ combine_dim(state_dicts, state_dict, f"transformer.layers.{i}.mlp.fc1.bias", 0)
926
+ combine_dim(state_dicts, state_dict, f"transformer.layers.{i}.mlp.fc2.weight", -1)
927
+ return state_dict
928
+
929
+
930
+ def remap_state_dict_hf_gpt2(state_dict, config):
931
+ # Word embedding and position embedding
932
+ def key_mapping_pos_emb(key):
933
+ return re.sub(r"^wpe.", "transformer.embeddings.position_embeddings.", key)
934
+
935
+ state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items())
936
+ word_embeddings = state_dict.pop("wte.weight")
937
+ # It's possible that vocab_size is padded to be a multiple of 8, for example.
938
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
939
+ vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
940
+ state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
941
+ word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
942
+ )
943
+ state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
944
+
945
+ # LayerNorm
946
+ def key_mapping_ln(key):
947
+ key = re.sub(r"^ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
948
+ key = re.sub(r"^h.(\d+).ln_(1|2).(weight|bias)", r"transformer.layers.\1.norm\2.\3", key)
949
+ return key
950
+
951
+ state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
952
+
953
+ # MLP
954
+ for d in range(config.num_hidden_layers):
955
+ W1 = state_dict.pop(f"h.{d}.mlp.c_fc.weight")
956
+ state_dict[f"transformer.layers.{d}.mlp.fc1.weight"] = W1.t()
957
+ W2 = state_dict.pop(f"h.{d}.mlp.c_proj.weight")
958
+ state_dict[f"transformer.layers.{d}.mlp.fc2.weight"] = W2.t()
959
+
960
+ def key_mapping_mlp(key):
961
+ key = re.sub(r"^h.(\d+).mlp.c_fc.bias", r"transformer.layers.\1.mlp.fc1.bias", key)
962
+ key = re.sub(r"^h.(\d+).mlp.c_proj.bias", r"transformer.layers.\1.mlp.fc2.bias", key)
963
+ return key
964
+
965
+ state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
966
+
967
+ # Attention
968
+ for d in range(config.num_hidden_layers):
969
+ state_dict.pop(f"h.{d}.attn.bias", None) # We don't store this bias
970
+ Wqkv = state_dict.pop(f"h.{d}.attn.c_attn.weight")
971
+ state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = Wqkv.t()
972
+ Wout = state_dict.pop(f"h.{d}.attn.c_proj.weight")
973
+ state_dict[f"transformer.layers.{d}.mixer.out_proj.weight"] = Wout.t()
974
+
975
+ def key_mapping_attn(key):
976
+ key = re.sub(r"^h.(\d+).attn.c_attn.bias", r"transformer.layers.\1.mixer.Wqkv.bias", key)
977
+ key = re.sub(
978
+ r"^h.(\d+).attn.c_proj.bias", r"transformer.layers.\1.mixer.out_proj.bias", key
979
+ )
980
+ return key
981
+
982
+ state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
983
+
984
+ return state_dict
985
+
986
+
987
+ def remap_state_dict_megatron(state_dict, config):
988
+ def key_mapping_transformer(key):
989
+ key = re.sub(r"^language_model.encoder.", "transformer.", key)
990
+ key = re.sub(r"^language_model.", "transformer.", key)
991
+ return key
992
+
993
+ state_dict = OrderedDict((key_mapping_transformer(k), v) for k, v in state_dict.items())
994
+
995
+ # Word embedding and position embedding
996
+ def key_mapping_pos_emb(key):
997
+ return re.sub(r"^wpe.", "transformer.embeddings.position_embeddings.", key)
998
+
999
+ state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items())
1000
+ word_embeddings = state_dict.pop("transformer.embedding.word_embeddings.weight")
1001
+ # It's possible that vocab_size is padded to be a multiple of 8, for example.
1002
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
1003
+ vocab_size = (
1004
+ math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple) * pad_vocab_size_multiple
1005
+ )
1006
+ state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
1007
+ word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
1008
+ )
1009
+ state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
1010
+
1011
+ # LayerNorm
1012
+ def key_mapping_ln(key):
1013
+ key = re.sub(r"^transformer.final_layernorm.(weight|bias)", r"transformer.ln_f.\1", key)
1014
+ key = re.sub(
1015
+ r"^transformer.layers.(\d+).input_layernorm.(weight|bias)",
1016
+ r"transformer.layers.\1.norm1.\2",
1017
+ key,
1018
+ )
1019
+ key = re.sub(
1020
+ r"^transformer.layers.(\d+).post_attention_layernorm.(weight|bias)",
1021
+ r"transformer.layers.\1.norm2.\2",
1022
+ key,
1023
+ )
1024
+ return key
1025
+
1026
+ state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
1027
+
1028
+ # MLP
1029
+ def key_mapping_mlp(key):
1030
+ key = re.sub(
1031
+ r"^transformer.layers.(\d+).mlp.dense_h_to_4h.(weight|bias)",
1032
+ r"transformer.layers.\1.mlp.fc1.\2",
1033
+ key,
1034
+ )
1035
+ key = re.sub(
1036
+ r"^transformer.layers.(\d+).mlp.dense_4h_to_h.(weight|bias)",
1037
+ r"transformer.layers.\1.mlp.fc2.\2",
1038
+ key,
1039
+ )
1040
+ return key
1041
+
1042
+ state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
1043
+
1044
+ # Attention
1045
+ def key_mapping_attn(key):
1046
+ key = re.sub(
1047
+ r"^transformer.layers.(\d+).self_attention.rotary_emb.inv_freq",
1048
+ r"transformer.layers.\1.mixer.rotary_emb.inv_freq",
1049
+ key,
1050
+ )
1051
+ key = re.sub(
1052
+ r"^transformer.layers.(\d+).self_attention.query_key_value.(weight|bias)",
1053
+ r"transformer.layers.\1.mixer.Wqkv.\2",
1054
+ key,
1055
+ )
1056
+ key = re.sub(
1057
+ r"^transformer.layers.(\d+).self_attention.dense.(weight|bias)",
1058
+ r"transformer.layers.\1.mixer.out_proj.\2",
1059
+ key,
1060
+ )
1061
+ return key
1062
+
1063
+ state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
1064
+ # Megatron stores Wqkv as ((nheads 3 headdim), hidden_dim)
1065
+ # while we store Wqkv as ((3 nheads headdim), hidden_dim)
1066
+ headdim = config.hidden_size // config.num_attention_heads
1067
+ for d in range(config.num_hidden_layers):
1068
+ Wqkv = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.weight")
1069
+ state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = rearrange(
1070
+ Wqkv,
1071
+ "(nheads three headdim) ... -> (three nheads headdim) ...",
1072
+ three=3,
1073
+ headdim=headdim,
1074
+ )
1075
+ bqkv = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.bias")
1076
+ state_dict[f"transformer.layers.{d}.mixer.Wqkv.bias"] = rearrange(
1077
+ bqkv, "(nheads three headdim) -> (three nheads headdim)", three=3, headdim=headdim
1078
+ )
1079
+
1080
+ return state_dict
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/gpt_neox.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, Tri Dao.
2
+
3
+ import math
4
+ import re
5
+ from collections import OrderedDict
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from einops import rearrange
10
+ from transformers import GPT2Config, GPTNeoXConfig
11
+
12
+
13
+ def remap_state_dict_hf_gpt_neox(state_dict, config):
14
+ def key_mapping_layers(key):
15
+ return re.sub(r"^gpt_neox.", "transformer.", key)
16
+
17
+ state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
18
+ # Word embedding
19
+ def key_mapping_emb(key):
20
+ return re.sub(r"^transformer.embed_in.", "transformer.embeddings.word_embeddings.", key)
21
+
22
+ state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
23
+ word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
24
+ # It's possible that vocab_size is padded to be a multiple of 8, for example.
25
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
26
+ vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
27
+ state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
28
+ word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
29
+ )
30
+ if getattr(config, "tie_word_embeddings", False):
31
+ state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
32
+ else:
33
+ output_embeddings = state_dict.pop("embed_out.weight")
34
+ # It's possible that vocab_size is padded to be a multiple of 8, for example.
35
+ state_dict["lm_head.weight"] = F.pad(
36
+ output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
37
+ )
38
+
39
+ # LayerNorm
40
+ def key_mapping_ln(key):
41
+ key = re.sub(r"^transformer.final_layer_norm.", r"transformer.ln_f.", key)
42
+ key = re.sub(
43
+ r"^transformer.layers.(\d+).input_layernorm.", r"transformer.layers.\1.norm1.", key
44
+ )
45
+ key = re.sub(
46
+ r"^transformer.layers.(\d+).post_attention_layernorm.",
47
+ r"transformer.layers.\1.norm2.",
48
+ key,
49
+ )
50
+ return key
51
+
52
+ state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
53
+
54
+ # MLP
55
+ def key_mapping_mlp(key):
56
+ key = re.sub(
57
+ r"^transformer.layers.(\d+).mlp.dense_h_to_4h.", r"transformer.layers.\1.mlp.fc1.", key
58
+ )
59
+ key = re.sub(
60
+ r"^transformer.layers.(\d+).mlp.dense_4h_to_h.", r"transformer.layers.\1.mlp.fc2.", key
61
+ )
62
+ return key
63
+
64
+ state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
65
+
66
+ # Attention
67
+ for l in range(config.n_layer):
68
+ # We don't store these biases
69
+ state_dict.pop(f"transformer.layers.{l}.attention.bias")
70
+ state_dict.pop(f"transformer.layers.{l}.attention.masked_bias")
71
+ # We don't store these
72
+ state_dict.pop(f"transformer.layers.{l}.attention.rotary_emb.inv_freq", None)
73
+ # GPT-NeoX stores Wqkv as ((nheads 3 headdim), hidden_dim)
74
+ # while we store Wqkv as ((3 nheads headdim), hidden_dim)
75
+ headdim = config.hidden_size // config.num_attention_heads
76
+ Wqkv = state_dict.pop(f"transformer.layers.{l}.attention.query_key_value.weight")
77
+ state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = rearrange(
78
+ Wqkv,
79
+ "(nheads three headdim) ... -> (three nheads headdim) ...",
80
+ three=3,
81
+ headdim=headdim,
82
+ )
83
+ bqkv = state_dict.pop(f"transformer.layers.{l}.attention.query_key_value.bias")
84
+ state_dict[f"transformer.layers.{l}.mixer.Wqkv.bias"] = rearrange(
85
+ bqkv, "(nheads three headdim) -> (three nheads headdim)", three=3, headdim=headdim
86
+ )
87
+
88
+ def key_mapping_attn(key):
89
+ key = re.sub(
90
+ r"^transformer.layers.(\d+).attention.dense.",
91
+ r"transformer.layers.\1.mixer.out_proj.",
92
+ key,
93
+ )
94
+ return key
95
+
96
+ state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
97
+
98
+ return state_dict
99
+
100
+
101
+ def gpt_neox_config_to_gpt2_config(gpt_neox_config: GPTNeoXConfig) -> GPT2Config:
102
+ assert gpt_neox_config.rotary_emb_base == 10000
103
+ return GPT2Config(
104
+ vocab_size=gpt_neox_config.vocab_size,
105
+ n_positions=0, # No absolute position embedding
106
+ n_embd=gpt_neox_config.hidden_size,
107
+ n_layer=gpt_neox_config.num_hidden_layers,
108
+ n_head=gpt_neox_config.num_attention_heads,
109
+ n_inner=gpt_neox_config.intermediate_size,
110
+ activation_function=gpt_neox_config.hidden_act,
111
+ resid_pdrop=0.0, # No dropout
112
+ embd_pdrop=0.0,
113
+ attn_pdrop=0.0,
114
+ layer_norm_epsilon=gpt_neox_config.layer_norm_eps,
115
+ initializer_range=gpt_neox_config.initializer_range,
116
+ bos_token_id=gpt_neox_config.bos_token_id,
117
+ eos_token_id=gpt_neox_config.eos_token_id,
118
+ # These are new arguments not in the original GPT2Config
119
+ prenorm=True,
120
+ parallel_block=gpt_neox_config.use_parallel_residual,
121
+ parallel_block_tied_norm=False,
122
+ rotary_emb_fraction=gpt_neox_config.rotary_pct,
123
+ tie_word_embeddings=gpt_neox_config.tie_word_embeddings,
124
+ )
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/gptj.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, Tri Dao.
2
+
3
+ import math
4
+ import re
5
+ from collections import OrderedDict
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from transformers import GPT2Config, GPTJConfig
10
+
11
+
12
+ def remap_state_dict_hf_gptj(state_dict, config):
13
+ def key_mapping_layers(key):
14
+ return re.sub(r"^transformer.h.", "transformer.layers.", key)
15
+
16
+ state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
17
+ # Word embedding
18
+ def key_mapping_emb(key):
19
+ return re.sub(r"^transformer.wte.", "transformer.embeddings.word_embeddings.", key)
20
+
21
+ state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
22
+ word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
23
+ # It's possible that vocab_size is padded to be a multiple of 8, for example.
24
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
25
+ vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
26
+ state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
27
+ word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
28
+ )
29
+ if getattr(config, "tie_word_embeddings"):
30
+ state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
31
+ else:
32
+ output_embeddings = state_dict.pop("lm_head.weight")
33
+ # It's possible that vocab_size is padded to be a multiple of 8, for example.
34
+ state_dict["lm_head.weight"] = F.pad(
35
+ output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
36
+ )
37
+ output_embeddings_bias = state_dict.pop("lm_head.bias")
38
+ state_dict["lm_head.bias"] = F.pad(
39
+ output_embeddings_bias, (0, vocab_size - output_embeddings_bias.shape[0])
40
+ )
41
+
42
+ # LayerNorm
43
+ def key_mapping_ln(key):
44
+ return re.sub(r"^transformer.layers.(\d+).ln_1.", r"transformer.layers.\1.norm1.", key)
45
+
46
+ state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
47
+
48
+ # MLP
49
+ def key_mapping_mlp(key):
50
+ key = re.sub(
51
+ r"^transformer.layers.(\d+).mlp.fc_in.", r"transformer.layers.\1.mlp.fc1.", key
52
+ )
53
+ key = re.sub(
54
+ r"^transformer.layers.(\d+).mlp.fc_out.", r"transformer.layers.\1.mlp.fc2.", key
55
+ )
56
+ return key
57
+
58
+ state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
59
+
60
+ # Attention
61
+ for l in range(config.n_layer):
62
+ Wq = state_dict.pop(f"transformer.layers.{l}.attn.q_proj.weight")
63
+ Wk = state_dict.pop(f"transformer.layers.{l}.attn.k_proj.weight")
64
+ Wv = state_dict.pop(f"transformer.layers.{l}.attn.v_proj.weight")
65
+ state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0)
66
+ # We don't store these biases
67
+ state_dict.pop(f"transformer.layers.{l}.attn.bias")
68
+ state_dict.pop(f"transformer.layers.{l}.attn.masked_bias")
69
+
70
+ def key_mapping_attn(key):
71
+ return re.sub(
72
+ r"^transformer.layers.(\d+).attn.out_proj.",
73
+ r"transformer.layers.\1.mixer.out_proj.",
74
+ key,
75
+ )
76
+
77
+ state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
78
+
79
+ return state_dict
80
+
81
+
82
+ def gptj_config_to_gpt2_config(gptj_config: GPTJConfig) -> GPT2Config:
83
+ headdim = gptj_config.n_embd // gptj_config.n_head
84
+ return GPT2Config(
85
+ vocab_size=gptj_config.vocab_size,
86
+ n_positions=0, # No absolute position embedding
87
+ n_embd=gptj_config.n_embd,
88
+ n_layer=gptj_config.n_layer,
89
+ n_head=gptj_config.n_head,
90
+ n_inner=gptj_config.n_inner,
91
+ activation_function=gptj_config.activation_function,
92
+ resid_pdrop=gptj_config.resid_pdrop,
93
+ embd_pdrop=gptj_config.embd_pdrop,
94
+ attn_pdrop=gptj_config.attn_pdrop,
95
+ layer_norm_epsilon=gptj_config.layer_norm_epsilon,
96
+ initializer_range=gptj_config.initializer_range,
97
+ bos_token_id=gptj_config.bos_token_id,
98
+ eos_token_id=gptj_config.eos_token_id,
99
+ # These are new arguments not in the original GPT2Config
100
+ prenorm=True,
101
+ parallel_block=True,
102
+ parallel_block_tied_norm=True,
103
+ rotary_emb_fraction=gptj_config.rotary_dim / headdim,
104
+ rotary_emb_interleaved=True,
105
+ tie_word_embeddings=False,
106
+ qkv_proj_bias=False,
107
+ out_proj_bias=False,
108
+ lm_head_bias=True,
109
+ )
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/llama.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, Tri Dao.
2
+
3
+ import json
4
+ import math
5
+ import os
6
+ import re
7
+ from collections import OrderedDict
8
+ from pathlib import Path
9
+ from typing import Dict, List, Union
10
+
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from sentencepiece import SentencePieceProcessor
14
+ from transformers import GPT2Config, LlamaConfig
15
+
16
+ from einops import rearrange
17
+
18
+
19
+ def remap_state_dict_meta_llama(
20
+ state_dict: Dict[str, torch.Tensor], config: GPT2Config
21
+ ) -> Dict[str, torch.Tensor]:
22
+ """Convert the state_dict in Meta format to standard GPT format.
23
+
24
+ This function modifies state_dict in place.
25
+ """
26
+
27
+ def key_mapping_layers(key):
28
+ return f"transformer.{key}" if not key.startswith("output.") else key
29
+
30
+ state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())
31
+
32
+ # Word embedding
33
+ def key_mapping_emb(key):
34
+ return re.sub(
35
+ r"^transformer.tok_embeddings.", "transformer.embeddings.word_embeddings.", key
36
+ )
37
+
38
+ state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
39
+ word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
40
+ # It's possible that vocab_size is padded to be a multiple of 8, for example.
41
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
42
+ vocab_size = (
43
+ math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple) * pad_vocab_size_multiple
44
+ )
45
+ state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
46
+ word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
47
+ )
48
+ if getattr(config, "tie_word_embeddings"):
49
+ state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
50
+ else:
51
+ output_embeddings = state_dict.pop("output.weight")
52
+ # Need to recompute vocab_size since LLaMa shards the word embeddings and output embeddings
53
+ # differently.
54
+ vocab_size = (
55
+ math.ceil(output_embeddings.shape[0] / pad_vocab_size_multiple)
56
+ * pad_vocab_size_multiple
57
+ )
58
+ # It's possible that vocab_size is padded to be a multiple of 8, for example.
59
+ state_dict["lm_head.weight"] = F.pad(
60
+ output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
61
+ )
62
+
63
+ # LayerNorm
64
+ def key_mapping_ln(key):
65
+ key = re.sub(r"^transformer.norm.", r"transformer.ln_f.", key)
66
+ key = re.sub(
67
+ r"^transformer.layers.(\d+).attention_norm.",
68
+ r"transformer.layers.\1.norm1.",
69
+ key,
70
+ )
71
+ key = re.sub(r"^transformer.layers.(\d+).ffn_norm.", r"transformer.layers.\1.norm2.", key)
72
+ return key
73
+
74
+ state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
75
+
76
+ # MLP
77
+ for l in range(config.n_layer):
78
+ w1 = state_dict.pop(f"transformer.layers.{l}.feed_forward.w1.weight")
79
+ w3 = state_dict.pop(f"transformer.layers.{l}.feed_forward.w3.weight")
80
+ # Our ordering is different
81
+ state_dict[f"transformer.layers.{l}.mlp.fc1.weight"] = torch.cat([w3, w1], dim=0)
82
+
83
+ def key_mapping_mlp(key):
84
+ return re.sub(
85
+ r"^transformer.layers.(\d+).feed_forward.w2.",
86
+ r"transformer.layers.\1.mlp.fc2.",
87
+ key,
88
+ )
89
+
90
+ state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
91
+
92
+ # Attention
93
+ for l in range(config.n_layer):
94
+ Wq = state_dict.pop(f"transformer.layers.{l}.attention.wq.weight")
95
+ Wk = state_dict.pop(f"transformer.layers.{l}.attention.wk.weight")
96
+ Wv = state_dict.pop(f"transformer.layers.{l}.attention.wv.weight")
97
+ state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0)
98
+ # We don't store these
99
+ state_dict.pop(f"transformer.layers.{l}.attention.inner_attention.rope.freqs", None)
100
+
101
+ def key_mapping_attn(key):
102
+ return re.sub(
103
+ r"^transformer.layers.(\d+).attention.wo.",
104
+ r"transformer.layers.\1.mixer.out_proj.",
105
+ key,
106
+ )
107
+
108
+ state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
109
+
110
+ state_dict.pop("transformer.rope.freqs", None)
111
+
112
+ return state_dict
113
+
114
+
115
+ def remap_state_dict_hf_llama(
116
+ state_dict: Dict[str, torch.Tensor], config: GPT2Config
117
+ ) -> Dict[str, torch.Tensor]:
118
+ """Convert the state_dict in Hugging Face format to standard GPT format.
119
+
120
+ This function modifies state_dict in place.
121
+ """
122
+
123
+ # Embedding
124
+ def key_mapping_emb(key):
125
+ return re.sub(r"^model.embed_tokens.", "transformer.embeddings.word_embeddings.", key)
126
+
127
+ state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
128
+ word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
129
+ # It's possible that vocab_size is padded to be a multiple of 8, for example.
130
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
131
+ vocab_size = (
132
+ math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple) * pad_vocab_size_multiple
133
+ )
134
+ state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
135
+ word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
136
+ )
137
+
138
+ # LM head
139
+ if getattr(config, "tie_word_embeddings"):
140
+ state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]
141
+ else:
142
+ output_embeddings = state_dict.pop("lm_head.weight")
143
+ # Need to recompute vocab_size since LLaMa shards the word embeddings and output embeddings
144
+ # differently.
145
+ vocab_size = (
146
+ math.ceil(output_embeddings.shape[0] / pad_vocab_size_multiple)
147
+ * pad_vocab_size_multiple
148
+ )
149
+ # It's possible that vocab_size is padded to be a multiple of 8, for example.
150
+ state_dict["lm_head.weight"] = F.pad(
151
+ output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
152
+ )
153
+
154
+ # MLP
155
+ for l in range(config.n_layer):
156
+ # Fusing weights this way based on difference in the following:
157
+ # https://github.com/huggingface/transformers/blob/b42010bb1d3cbf262d27e0a328661885be46dfdb/src/transformers/models/llama/modeling_llama.py#L220
158
+ # https://github.com/Dao-AILab/flash-attention/blob/c60851a8253257eb970e06a022c82517a8033e8c/flash_attn/modules/mlp.py#L115
159
+ w1 = state_dict.pop(f"model.layers.{l}.mlp.gate_proj.weight")
160
+ w3 = state_dict.pop(f"model.layers.{l}.mlp.up_proj.weight")
161
+ state_dict[f"transformer.layers.{l}.mlp.fc1.weight"] = torch.cat([w3, w1], dim=0)
162
+
163
+ def key_mapping_mlp(key):
164
+ return re.sub(
165
+ r"^model.layers.(\d+).mlp.down_proj.",
166
+ r"transformer.layers.\1.mlp.fc2.",
167
+ key,
168
+ )
169
+
170
+ state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
171
+
172
+ # LayerNorm
173
+ def key_mapping_ln(key):
174
+ key = re.sub(r"^model.norm.", r"transformer.ln_f.", key)
175
+ key = re.sub(
176
+ r"^model.layers.(\d+).input_layernorm.",
177
+ r"transformer.layers.\1.norm1.",
178
+ key,
179
+ )
180
+ key = re.sub(
181
+ r"^model.layers.(\d+).post_attention_layernorm.",
182
+ r"transformer.layers.\1.norm2.",
183
+ key,
184
+ )
185
+ return key
186
+
187
+ state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
188
+
189
+ def inv_permute(w):
190
+ # Inverse of permute implemented in:
191
+ # https://github.com/huggingface/transformers/blob/b42010bb1d3cbf262d27e0a328661885be46dfdb/src/transformers/models/llama/convert_llama_weights_to_hf.py#L114
192
+ return rearrange(
193
+ w, "(h two d) n -> (h d two) n", d=config.n_embd // config.n_head // 2, two=2
194
+ )
195
+
196
+ # Attention
197
+ for l in range(config.n_layer):
198
+ Wq = state_dict.pop(f"model.layers.{l}.self_attn.q_proj.weight")
199
+ Wk = state_dict.pop(f"model.layers.{l}.self_attn.k_proj.weight")
200
+ Wv = state_dict.pop(f"model.layers.{l}.self_attn.v_proj.weight")
201
+
202
+ state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat(
203
+ [inv_permute(Wq), inv_permute(Wk), Wv], dim=0
204
+ )
205
+ # We don't store these
206
+ state_dict.pop(f"model.layers.{l}.self_attn.rotary_emb.inv_freq", None)
207
+
208
+ def key_mapping_attn(key):
209
+ return re.sub(
210
+ r"^model.layers.(\d+).self_attn.o_proj.",
211
+ r"transformer.layers.\1.mixer.out_proj.",
212
+ key,
213
+ )
214
+
215
+ state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
216
+ return state_dict
217
+
218
+
219
+ def inv_remap_state_dict_hf_llama(
220
+ state_dict: Dict[str, torch.Tensor], config: GPT2Config
221
+ ) -> Dict[str, torch.Tensor]:
222
+ """Convert the state_dict in standard GPT format to Hugging Face format.
223
+
224
+ This function is meant to be the inverse of remap_state_dict_hf_llama, up to a
225
+ multiplier pad in the embedding and lm_head. That is if the original embedding
226
+ isn't a multiple of pad_vocab_size_multiple, then
227
+ inv_remap_state_dict_hf_llama(remap_state_dict_hf_llama(state_dict)) != state_dict.
228
+
229
+ This function modifies state_dict in place.
230
+ """
231
+
232
+ # Embedding
233
+ def key_mapping_emb(key):
234
+ return re.sub(r"^transformer.embeddings.word_embeddings.", "model.embed_tokens.", key)
235
+
236
+ state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
237
+ word_embeddings = state_dict.pop("model.embed_tokens.weight")
238
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
239
+ vocab_size = (
240
+ math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple) * pad_vocab_size_multiple
241
+ )
242
+ state_dict["model.embed_tokens.weight"] = F.pad(
243
+ word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
244
+ )
245
+
246
+ # LM head
247
+ if getattr(config, "tie_word_embeddings"):
248
+ state_dict["lm_head.weight"] = state_dict["model.embed_tokens.weight"]
249
+ else:
250
+ output_embeddings = state_dict.pop("lm_head.weight")
251
+ vocab_size = (
252
+ math.ceil(output_embeddings.shape[0] / pad_vocab_size_multiple)
253
+ * pad_vocab_size_multiple
254
+ )
255
+ state_dict["lm_head.weight"] = F.pad(
256
+ output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0])
257
+ )
258
+
259
+ # MLP
260
+ for l in range(config.n_layer):
261
+ w3, w1 = torch.chunk(
262
+ state_dict.pop(f"transformer.layers.{l}.mlp.fc1.weight"), chunks=2, dim=0
263
+ )
264
+ state_dict[f"model.layers.{l}.mlp.gate_proj.weight"] = w1
265
+ state_dict[f"model.layers.{l}.mlp.up_proj.weight"] = w3
266
+
267
+ def key_mapping_mlp(key):
268
+ return re.sub(
269
+ r"^transformer.layers.(\d+).mlp.fc2.",
270
+ r"model.layers.\1.mlp.down_proj.",
271
+ key,
272
+ )
273
+
274
+ state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
275
+
276
+ # LayerNorm
277
+ def key_mapping_ln(key):
278
+ key = re.sub(r"^transformer.ln_f.", r"model.norm.", key)
279
+ key = re.sub(
280
+ r"^transformer.layers.(\d+).norm1.",
281
+ r"model.layers.\1.input_layernorm.",
282
+ key,
283
+ )
284
+ key = re.sub(
285
+ r"^transformer.layers.(\d+).norm2.",
286
+ r"model.layers.\1.post_attention_layernorm.",
287
+ key,
288
+ )
289
+ return key
290
+
291
+ state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
292
+
293
+ def permute(w):
294
+ return rearrange(
295
+ w, "(h d two) n -> (h two d) n", d=config.n_embd // config.n_head // 2, two=2
296
+ )
297
+
298
+ n_head = config.n_head
299
+ n_head_kv = getattr(config, "n_head_kv", n_head)
300
+
301
+ embed_dim = config.hidden_size
302
+ head_dim = embed_dim // n_head
303
+
304
+ q_dim = n_head * head_dim
305
+ k_dim = v_dim = n_head_kv * head_dim
306
+
307
+ # Attention
308
+ for l in range(config.n_layer):
309
+ Wqkv = state_dict.pop(f"transformer.layers.{l}.mixer.Wqkv.weight")
310
+ Wq = Wqkv[:q_dim]
311
+ Wk = Wqkv[q_dim : q_dim + k_dim]
312
+ Wv = Wqkv[q_dim + k_dim : q_dim + k_dim + v_dim]
313
+ state_dict[f"model.layers.{l}.self_attn.q_proj.weight"] = permute(Wq)
314
+ state_dict[f"model.layers.{l}.self_attn.k_proj.weight"] = permute(Wk)
315
+ state_dict[f"model.layers.{l}.self_attn.v_proj.weight"] = Wv
316
+ state_dict.pop(f"transformer.layers.{l}.attention.inner_attention.rope.freqs", None)
317
+
318
+ def key_mapping_attn(key):
319
+ return re.sub(
320
+ r"^transformer.layers.(\d+).mixer.out_proj.",
321
+ r"model.layers.\1.self_attn.o_proj.",
322
+ key,
323
+ )
324
+
325
+ state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
326
+ return state_dict
327
+
328
+
329
def config_from_meta_checkpoint(
    checkpoint_path: Union[str, os.PathLike], model_name: str
) -> LlamaConfig:
    """Build a LlamaConfig from a Meta-format checkpoint's ``params.json``.

    The MLP hidden size and the vocab size are not stored directly in
    ``params.json``; both are reconstructed here.
    """
    ckpt_dir = Path(checkpoint_path) / model_name
    with open(ckpt_dir / "params.json") as f:
        params = json.load(f)
    config = LlamaConfig(
        hidden_size=params["dim"],
        intermediate_size=None,
        num_attention_heads=params["n_heads"],
        num_hidden_layers=params["n_layers"],
        rms_norm_eps=params["norm_eps"],
        num_key_value_heads=params.get("n_kv_heads", None),
    )
    multiple_of = params.get("multiple_of", 1)
    ffn_dim_multiplier = params.get("ffn_dim_multiplier", None)

    # Reconstruct the MLP hidden dim exactly as the Meta reference code does:
    # start from 4*d, scale by 2/3, apply the optional custom multiplier, then
    # round up to a multiple of `multiple_of`.
    # https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/model.py#L195-L224
    ffn_hidden = int(2 * (4 * config.hidden_size) / 3)
    if ffn_dim_multiplier is not None:
        ffn_hidden = int(ffn_dim_multiplier * ffn_hidden)
    ffn_hidden = multiple_of * ((ffn_hidden + multiple_of - 1) // multiple_of)
    config.intermediate_size = ffn_hidden

    if "rope_theta" in params:
        config.rotary_emb_base = params["rope_theta"]

    # Vocab size is sadly not specified in `params.json`; default to 32000 but
    # prefer the tokenizer's own count when available (some CodeLlama variants
    # use 32016).
    config.vocab_size = 32000
    tokenizer = ckpt_dir / "tokenizer.model"
    if tokenizer.is_file():
        config.vocab_size = SentencePieceProcessor(str(tokenizer)).vocab_size()
    return config
366
+
367
+
368
def config_from_hf_checkpoint(
    checkpoint_path: Union[str, os.PathLike], model_name: str
) -> LlamaConfig:
    """Load a LlamaConfig from a converted HF checkpoint directory ``{model_name}-hf``."""
    config_file = Path(checkpoint_path) / f"{model_name}-hf" / "config.json"
    return LlamaConfig.from_pretrained(config_file)
372
+
373
+
374
def config_from_checkpoint(
    checkpoint_path: Union[str, os.PathLike], model_name: str, checkpoint_format="meta"
) -> LlamaConfig:
    """Dispatch config loading based on checkpoint format ("meta" or HF)."""
    loader = (
        config_from_meta_checkpoint if checkpoint_format == "meta" else config_from_hf_checkpoint
    )
    return loader(checkpoint_path, model_name)
381
+
382
+
383
def state_dicts_from_checkpoint(
    checkpoint_path: Union[str, os.PathLike], model_name: str
) -> List[dict]:
    """Load every ``consolidated.*.pth`` shard of a Meta checkpoint, in shard order.

    Sorting the paths is required: shards are ordered, and loading them out of
    order would assign the wrong weights.
    """
    shard_paths = sorted((Path(checkpoint_path) / model_name).glob("consolidated.*.pth"))
    return [torch.load(shard, map_location="cpu") for shard in shard_paths]
391
+
392
+
393
def llama_config_to_gpt2_config(llama_config: LlamaConfig) -> GPT2Config:
    """Translate a LlamaConfig into the extended GPT2Config this repo's GPT model consumes."""
    gpt2_kwargs = {
        "vocab_size": llama_config.vocab_size,
        "n_positions": 0,  # No absolute position embedding
        "n_embd": llama_config.hidden_size,
        "n_layer": llama_config.num_hidden_layers,
        "n_head": llama_config.num_attention_heads,
        "n_inner": llama_config.intermediate_size,
        "activation_function": "swiglu",  # Hardcode since HF calls it 'silu'
        # Llama doesn't have dropout, idk if it's because they only release the inference code
        "resid_pdrop": 0.0,
        "embd_pdrop": 0.0,
        "attn_pdrop": 0.0,
        "layer_norm_epsilon": llama_config.rms_norm_eps,
        "initializer_range": llama_config.initializer_range,
        "bos_token_id": llama_config.bos_token_id,
        "eos_token_id": llama_config.eos_token_id,
        # The remaining arguments are new (not in the original GPT2Config)
        "pad_token_id": llama_config.pad_token_id,  # Idk if this does anything
        "rms_norm": True,
        "rotary_emb_fraction": 1.0,
        "rotary_emb_interleaved": True,
        "tie_word_embeddings": False,
        "qkv_proj_bias": False,
        "out_proj_bias": False,
        "mlp_fc1_bias": False,
        "mlp_fc2_bias": False,
        "rotary_emb_base": getattr(llama_config, "rotary_emb_base", 10000.0),
        "n_head_kv": llama_config.num_key_value_heads,
    }
    return GPT2Config(**gpt2_kwargs)
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/opt.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, Tri Dao.
2
+
3
+ import math
4
+ import re
5
+ from collections import OrderedDict
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from transformers import GPT2Config, OPTConfig
10
+
11
+
12
def remap_state_dict_hf_opt(state_dict, config):
    """Remap a HuggingFace OPT state dict to this repo's GPT layout.

    Renames decoder/embedding/layernorm/MLP keys, drops the two rows OPT
    reserves at the start of the position embedding, pads the word embedding
    to a multiple of ``config.pad_vocab_size_multiple``, ties ``lm_head`` to
    the (padded) word embedding, and fuses the per-layer q/k/v projections
    into a single ``Wqkv``. Keys are rewritten in passes; the pass order
    matters because later regexes match keys produced by earlier passes.
    """
    def key_mapping_model(key):
        key = re.sub(r"^model.decoder.", "transformer.", key)
        # The OPT-350m model uses '^decoder' instead of '^model.decoder'
        key = re.sub(r"^decoder.", "transformer.", key)
        return key

    state_dict = OrderedDict((key_mapping_model(k), v) for k, v in state_dict.items())

    # Word embedding and position embedding
    def key_mapping_emb(key):
        key = re.sub(r"^transformer.embed_tokens.", "transformer.embeddings.word_embeddings.", key)
        # The OPT-350m model has project_in and project_out
        key = re.sub(r"^transformer.project_in.", "transformer.embeddings.project_in.", key)
        key = re.sub(r"^transformer.project_out.", "project_out.", key)
        key = re.sub(
            r"^transformer.embed_positions.", "transformer.embeddings.position_embeddings.", key
        )
        return key

    state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items())
    # OPT uses the first 2 indices of pos_emb for padding tokens
    pos_embeddings = state_dict.pop("transformer.embeddings.position_embeddings.weight")
    state_dict["transformer.embeddings.position_embeddings.weight"] = pos_embeddings[2:]
    word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
    # It's possible that vocab_size is padded to be a multiple of 8, for example.
    pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
    vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
    state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
        word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
    )
    # lm_head shares the (padded) word-embedding weight
    state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]

    # LayerNorm
    def key_mapping_ln(key):
        key = re.sub(r"^transformer.final_layer_norm.", r"transformer.ln_f.", key)
        # The OPT-175B checkpoint calls this 'decoder.layer_norm' instead of 'decoder.final_layer_norm'
        key = re.sub(r"^transformer.layer_norm.", r"transformer.ln_f.", key)
        key = re.sub(
            r"^transformer.layers.(\d+).self_attn_layer_norm.", r"transformer.layers.\1.norm1.", key
        )
        key = re.sub(
            r"^transformer.layers.(\d+).final_layer_norm.", r"transformer.layers.\1.norm2.", key
        )
        return key

    state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())

    # MLP
    def key_mapping_mlp(key):
        return re.sub(
            r"^transformer.layers.(\d+).fc(1|2).", r"transformer.layers.\1.mlp.fc\2.", key
        )

    state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())

    # Attention: fuse separate q/k/v projections into a single Wqkv (q, k, v
    # stacked along the output dimension, in that order).
    for l in range(config.n_layer):
        Wq = state_dict.pop(f"transformer.layers.{l}.self_attn.q_proj.weight")
        Wk = state_dict.pop(f"transformer.layers.{l}.self_attn.k_proj.weight")
        Wv = state_dict.pop(f"transformer.layers.{l}.self_attn.v_proj.weight")
        bq = state_dict.pop(f"transformer.layers.{l}.self_attn.q_proj.bias")
        bk = state_dict.pop(f"transformer.layers.{l}.self_attn.k_proj.bias")
        bv = state_dict.pop(f"transformer.layers.{l}.self_attn.v_proj.bias")
        state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0)
        state_dict[f"transformer.layers.{l}.mixer.Wqkv.bias"] = torch.cat([bq, bk, bv], dim=0)

    def key_mapping_attn(key):
        return re.sub(
            r"^transformer.layers.(\d+).self_attn.out_proj.",
            r"transformer.layers.\1.mixer.out_proj.",
            key,
        )

    state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())

    return state_dict
88
+
89
+
90
def opt_config_to_gpt2_config(opt_config: OPTConfig) -> GPT2Config:
    """Translate an OPTConfig into the extended GPT2Config this repo's GPT model consumes."""
    assert opt_config.layerdrop == 0.0
    assert opt_config.layer_norm_elementwise_affine
    # Only record a projection dim when it actually differs from the hidden size
    # (used by OPT-350m's project_in / project_out).
    proj_dim = opt_config.word_embed_proj_dim
    word_embed_proj_dim = None if proj_dim == opt_config.hidden_size else proj_dim
    return GPT2Config(
        vocab_size=opt_config.vocab_size,
        n_positions=opt_config.max_position_embeddings,
        n_embd=opt_config.hidden_size,
        n_layer=opt_config.num_hidden_layers,
        n_head=opt_config.num_attention_heads,
        n_inner=opt_config.ffn_dim,
        activation_function=opt_config.activation_function,
        resid_pdrop=opt_config.dropout,
        # HF's implementation of OPT doesn't seem to have embedding dropout
        embd_pdrop=opt_config.dropout,
        attn_pdrop=opt_config.attention_dropout,
        initializer_range=opt_config.init_std,
        bos_token_id=opt_config.bos_token_id,
        eos_token_id=opt_config.eos_token_id,
        # These are new arguments not in the original GPT2Config
        prenorm=opt_config.do_layer_norm_before,
        word_embed_proj_dim=word_embed_proj_dim,
    )
.venv/lib/python3.11/site-packages/xformers/_flash_attn/models/vit.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, Tri Dao.
2
+ # Inspired by / adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
3
+ import math
4
+ import re
5
+ from collections import OrderedDict
6
+ from copy import deepcopy
7
+ from functools import partial
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from einops import rearrange
13
+ from timm.models.helpers import named_apply
14
+ from torch.nn.init import trunc_normal_
15
+ from torchvision.ops import StochasticDepth
16
+
17
+ from flash_attn.layers.patch_embed import PatchEmbed
18
+ from flash_attn.modules.block import Block
19
+ from flash_attn.modules.mha import MHA
20
+ from flash_attn.modules.mlp import FusedMLP, Mlp
21
+
22
+ try:
23
+ from flash_attn.ops.triton.layer_norm import layer_norm_fn
24
+ except ImportError:
25
+ layer_norm_fn = None
26
+
27
+
28
def create_mixer_cls(
    num_heads, qkv_bias, attn_drop, use_flash_attn, fused_bias_fc, cross_attn=False
):
    """Return a partial MHA constructor with the attention options bound."""
    return partial(
        MHA,
        num_heads=num_heads,
        cross_attn=cross_attn,
        qkv_proj_bias=qkv_bias,
        dropout=attn_drop,
        fused_bias_fc=fused_bias_fc,
        use_flash_attn=use_flash_attn,
    )
41
+
42
+
43
def create_mlp_cls(embed_dim, mlp_ratio, act_layer, fused_mlp):
    """Return a partial MLP constructor (fused or plain) with the hidden size bound."""
    hidden_features = int(embed_dim * mlp_ratio)
    if fused_mlp:
        return partial(FusedMLP, hidden_features=hidden_features)
    # Plain MLP takes an activation module instance; FusedMLP does not.
    return partial(Mlp, hidden_features=hidden_features, activation=act_layer())
50
+
51
+
52
def create_block(
    embed_dim,
    num_heads,
    mlp_ratio,
    qkv_bias,
    drop_rate,
    attn_drop_rate,
    drop_path1,
    drop_path2,
    norm_layer,
    act_layer,
    use_flash_attn,
    fused_bias_fc,
    fused_mlp,
    fused_dropout_add_ln,
    layer_idx=None,
    n_layer=None,
    last_layer_subset=False,
):
    """Assemble one prenorm transformer Block (attention mixer + MLP) for the ViT.

    When ``last_layer_subset`` is set and this is the final layer, the mixer is
    built as cross-attention so only a subset of tokens (the CLS token) is used
    as the query.
    """
    use_cross_attn = last_layer_subset and layer_idx == n_layer - 1
    mixer_factory = create_mixer_cls(
        num_heads,
        qkv_bias,
        attn_drop_rate,
        use_flash_attn,
        fused_bias_fc,
        cross_attn=use_cross_attn,
    )
    mlp_factory = create_mlp_cls(embed_dim, mlp_ratio, act_layer, fused_mlp)
    # TD [2022-10-15]: Force residual in fp32 in case of DeepSpeed
    return Block(
        embed_dim,
        mixer_factory,
        mlp_factory,
        norm_cls=norm_layer,
        prenorm=True,
        resid_dropout1=drop_rate,
        resid_dropout2=drop_rate,
        drop_path1=drop_path1,
        drop_path2=drop_path2,
        fused_dropout_add_ln=fused_dropout_add_ln,
        residual_in_fp32=True,
    )
95
+
96
+
97
class VisionTransformer(nn.Module):
    """Vision Transformer
    A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
        - https://arxiv.org/abs/2010.11929
    """

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        num_classes=1000,
        global_pool="token",
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        init_values=None,
        class_token=True,
        no_embed_class=False,
        pre_norm=False,
        fc_norm=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        weight_init="",
        embed_layer=PatchEmbed,
        norm_layer=None,
        act_layer=None,
        use_flash_attn=False,
        fused_bias_fc=False,
        fused_mlp=False,
        fused_dropout_add_ln=False,
    ):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            num_classes (int): number of classes for classification head
            global_pool (str): type of global pooling for final sequence (default: 'token')
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            init_values: (float): layer-scale init values
            class_token (bool): use class token
            fc_norm (Optional[bool]): pre-fc norm after pool, set if global_pool == 'avg' if None (default: None)
            drop_rate (float): dropout rate
            attn_drop_rate (float): attention dropout rate
            drop_path_rate (float): stochastic depth rate
            weight_init (str): weight init scheme
            embed_layer (nn.Module): patch embedding layer
            norm_layer: (nn.Module): normalization layer
            act_layer: (nn.Module): MLP activation layer
            use_flash_attn (bool): use FlashAttention in the attention mixer
            fused_bias_fc (bool): use the fused bias+GEMM kernels
            fused_mlp (bool): use the fused MLP kernels
            fused_dropout_add_ln (bool): fuse dropout + residual add + layernorm (requires Triton)
        """
        super().__init__()
        # Only a subset of the timm feature matrix is supported; assert the rest away.
        assert global_pool == "token", "Only support pooling with CLS token"
        assert class_token
        assert init_values is None, "LayerScale is not supported yet"
        assert weight_init == ""
        assert fc_norm is None
        # pre_norm seems redundant, as there's a LayerNorm right at the start of each block, idk
        assert not pre_norm
        use_fc_norm = global_pool == "avg" if fc_norm is None else fc_norm
        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        act_layer = act_layer or nn.GELU

        self.num_classes = num_classes
        self.global_pool = global_pool
        self.num_features = (
            self.embed_dim
        ) = embed_dim  # num_features for consistency with other models
        self.num_prefix_tokens = 1 if class_token else 0
        self.no_embed_class = no_embed_class

        # Only this repo's PatchEmbed knows about fused_bias_fc; a custom
        # embed_layer gets only the standard kwargs.
        patch_embed_extra_kwargs = (
            {"fused_bias_fc": fused_bias_fc} if embed_layer is PatchEmbed else {}
        )
        self.patch_embed = embed_layer(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            bias=not pre_norm,  # disable bias if pre-norm is used (e.g. CLIP)
            **patch_embed_extra_kwargs,
        )
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None
        embed_len = num_patches if no_embed_class else num_patches + self.num_prefix_tokens
        self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02)

        dpr = [
            x.item() for x in torch.linspace(0, drop_path_rate, depth)
        ]  # stochastic depth decay rule

        # We change the order of dropout, residual and layer norm:
        # Instead of LN -> Attn / MLP -> Dropout -> Add, we do:
        # Dropout -> Add -> LN -> Attn / MLP, returning both the residual branch (output of Add) and
        # the main branch (output of MLP). The model definition is unchanged, but the mapping of the
        # nn.Dropout probabilities are changed.
        # This is for performance reason: we can fuse dropout + add + layer_norm.
        self.blocks = nn.ModuleList(
            [
                create_block(
                    embed_dim,
                    num_heads,
                    mlp_ratio,
                    qkv_bias,
                    drop_rate,
                    attn_drop_rate,
                    drop_path1=dpr[i - 1] if i > 0 else 0.0,
                    drop_path2=dpr[i],
                    norm_layer=norm_layer,
                    act_layer=act_layer,
                    use_flash_attn=use_flash_attn,
                    fused_bias_fc=fused_bias_fc,
                    fused_mlp=fused_mlp,
                    fused_dropout_add_ln=fused_dropout_add_ln,
                    layer_idx=i,
                    n_layer=depth,
                    last_layer_subset=(global_pool == "token"),
                )
                for i in range(depth)
            ]
        )

        # Final dropout / drop-path / norm applied after the last block
        # (mirrors the per-block reordering described above).
        self.dropout = nn.Dropout(p=drop_rate)
        self.drop_path = StochasticDepth(p=dpr[-1], mode="row")
        self.norm = norm_layer(embed_dim)

        self.fused_dropout_add_ln = fused_dropout_add_ln
        if self.fused_dropout_add_ln and layer_norm_fn is None:
            raise ImportError("Triton is not installed")

        # Classifier Head
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        self.init_weights(weight_init)

    def init_weights(self, mode=""):
        """Initialize pos_embed / cls_token and apply timm's Linear init to submodules."""
        assert mode == ""
        trunc_normal_(self.pos_embed, std=0.02)
        if self.cls_token is not None:
            nn.init.normal_(self.cls_token, std=1e-6)
        named_apply(init_weights_vit_timm, self)

    def _init_weights(self, m):
        # this fn left here for compat with downstream users
        init_weights_vit_timm(m)

    @torch.jit.ignore
    def no_weight_decay(self):
        # Parameters excluded from weight decay by downstream optimizers.
        return {"pos_embed", "cls_token"}

    def _pos_embed(self, x):
        """Add position embedding and (optionally) prepend the CLS token."""
        if self.no_embed_class:
            # deit-3, updated JAX (big vision)
            # position embedding does not overlap with class token, add then concat
            x = x + self.pos_embed
            if self.cls_token is not None:
                x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        else:
            # original timm, JAX, and deit vit impl
            # pos_embed has entry for class token, concat then add
            if self.cls_token is not None:
                x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
            x = x + self.pos_embed
        return x

    def forward_features(self, x, all_tokens=True):
        """
        If all_tokens==False and self.global_pool == 'token', we only return the features for the
        cls token.
        """
        x = self.patch_embed(x)
        hidden_states = self._pos_embed(x)
        residual = None
        if self.global_pool != "token" or all_tokens:
            for block in self.blocks:
                hidden_states, residual = block(hidden_states, residual)
        else:
            for block in self.blocks[:-1]:
                hidden_states, residual = block(hidden_states, residual)
            # For the last layer, we only want the 1st token of the output. So we do cross-attention
            # where the query is the 1st token and the key/value is the whole sequence.
            hidden_states, residual = self.blocks[-1](
                hidden_states, residual, mixer_subset=slice(0, 1)
            )
        if not self.fused_dropout_add_ln:
            residual = self.drop_path(self.dropout(hidden_states)) + residual
            hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype))
        else:
            # Fused path: dropout + residual-add + layernorm in one Triton kernel.
            if self.drop_path.p == 0 or not self.training:
                rowscale = None
            else:
                # Per-row stochastic-depth scaling passed into the fused kernel.
                rowscale = self.drop_path(
                    torch.ones(
                        hidden_states.shape[:-1],
                        device=hidden_states.device,
                        dtype=hidden_states.dtype,
                    )
                )
            # Set prenorm=False here since we don't need the residual
            hidden_states = layer_norm_fn(
                hidden_states,
                self.norm.weight,
                self.norm.bias,
                residual=residual,
                eps=self.norm.eps,
                dropout_p=self.dropout.p if self.training else 0.0,
                rowscale=rowscale,
                prenorm=False,
            )
        return hidden_states

    def forward_head(self, x, pre_logits: bool = False):
        """Pool the sequence (CLS token or mean) and apply the classifier head."""
        if self.global_pool:
            x = x[:, self.num_prefix_tokens :].mean(dim=1) if self.global_pool == "avg" else x[:, 0]
        return x if pre_logits else self.head(x)

    def forward(self, x):
        x = self.forward_features(x, all_tokens=False)
        x = self.forward_head(x)
        return x

    def load_state_dict(self, state_dict, strict=True):
        """Load a timm-style checkpoint, remapping keys to this implementation's layout."""
        patch_embed_weight = state_dict["patch_embed.proj.weight"]
        if patch_embed_weight.dim() == 4:
            # convert from Conv2d to Linear
            state_dict["patch_embed.proj.weight"] = rearrange(
                patch_embed_weight, "o c h w -> o (c h w)"
            )

        def key_mapping_attn(key):
            key = re.sub(r"^blocks.(\d+).attn.qkv.", r"blocks.\1.mixer.Wqkv.", key)
            key = re.sub(r"^blocks.(\d+).attn.proj.", r"blocks.\1.mixer.out_proj.", key)
            return key

        state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
        n_layer = len(self.blocks)
        # Convert from Wqkv to Wq and Wkv for cross attention (last layer)
        if (
            self.blocks[-1].mixer.cross_attn
            and f"blocks.{n_layer - 1}.mixer.Wqkv.weight" in state_dict
        ):
            Wqkv = state_dict.pop(f"blocks.{n_layer - 1}.mixer.Wqkv.weight")
            bqkv = state_dict.pop(f"blocks.{n_layer - 1}.mixer.Wqkv.bias")
            state_dict[f"blocks.{n_layer - 1}.mixer.Wq.weight"] = Wqkv[: self.embed_dim]
            state_dict[f"blocks.{n_layer - 1}.mixer.Wkv.weight"] = Wqkv[self.embed_dim :]
            state_dict[f"blocks.{n_layer - 1}.mixer.Wq.bias"] = bqkv[: self.embed_dim]
            state_dict[f"blocks.{n_layer - 1}.mixer.Wkv.bias"] = bqkv[self.embed_dim :]
        return super().load_state_dict(state_dict, strict=strict)
354
+
355
+
356
def init_weights_vit_timm(module: nn.Module, name: str = ""):
    """ViT weight initialization, original timm impl (for reproducibility)."""
    if not isinstance(module, nn.Linear):
        # Non-Linear modules delegate to their own initializer when they have one.
        if hasattr(module, "init_weights"):
            module.init_weights()
        return
    trunc_normal_(module.weight, std=0.02)
    if module.bias is not None:
        nn.init.zeros_(module.bias)
364
+
365
+
366
def vit_base_patch16_224(pretrained=False, **kwargs):
    """ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
    """
    # Pretrained weights are not wired up for this variant.
    assert not pretrained
    return VisionTransformer(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
.venv/lib/python3.11/site-packages/xformers/_flash_attn/ops/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/xformers/_flash_attn/ops/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (197 Bytes). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/ops/__pycache__/activations.cpython-311.pyc ADDED
Binary file (6.86 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/ops/__pycache__/fused_dense.cpython-311.pyc ADDED
Binary file (30.4 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/ops/__pycache__/layer_norm.cpython-311.pyc ADDED
Binary file (22.6 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/ops/__pycache__/rms_norm.cpython-311.pyc ADDED
Binary file (5.19 kB). View file
 
.venv/lib/python3.11/site-packages/xformers/_flash_attn/ops/activations.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/model/layers/activations.py
2
+ import math
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ # 1/sqrt(2*pi)-> 0.3989423
9
+ # 1/sqrt(2) -> 0.70710678
10
+ # sqrt(2/pi) -> 0.79788456
11
+
12
+ # this function is tanh approximation of gelu
13
+ # actual gelu is:
14
+ # x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
15
@torch.jit.script
def bias_gelu(y, bias):
    """Tanh approximation of GELU applied to ``y + bias``.

    Exact GELU is x * 0.5 * (1 + erf(x * 0.70710678)); this uses the cheaper
    tanh form. Result is cast back to ``y``'s dtype.
    """
    x = bias + y
    return (x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))).to(dtype=y.dtype)


@torch.jit.script
def bias_gelu_back(g, y, bias):
    """Backward of the tanh-approximate GELU above.

    Assume that y has shape (B, D) and bias has shape (D). Returns the pair
    (grad wrt y, grad wrt bias); the bias gradient is the input gradient
    summed over the batch dimension.
    """
    x = bias + y
    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (
        1 + tanh_out
    )
    grad_y = ff * g
    return grad_y.to(dtype=y.dtype), grad_y.sum(dim=(0), dtype=bias.dtype)


class GeLUFunction(torch.autograd.Function):
    """Autograd wrapper around the fused bias + tanh-GELU functions."""

    @staticmethod
    def forward(ctx, input, bias):
        ctx.save_for_backward(input, bias)
        return bias_gelu(input, bias)

    @staticmethod
    def backward(ctx, grad_output):
        input, bias = ctx.saved_tensors
        # bias_gelu_back returns a (grad_input, grad_bias) pair; unpack it so
        # autograd receives exactly one gradient per forward input. (The
        # previous ``return tmp, tmp`` handed the whole tuple back twice,
        # which crashes in the backward pass.)
        grad_input, grad_bias = bias_gelu_back(grad_output, input, bias)
        return grad_input, grad_bias


bias_gelu_impl = GeLUFunction.apply
52
+
53
+ # this function is tanh approximation of gelu
54
+ # actual gelu is:
55
+ # x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
56
@torch.jit.script
def gelu_fwd(x):
    """Tanh approximation of GELU (exact form uses erf); cast back to x's dtype."""
    inner = 0.79788456 * x * (1 + 0.044715 * x * x)
    return (x * 0.5 * (1.0 + torch.tanh(inner))).to(dtype=x.dtype)


@torch.jit.script
def gelu_bwd(g, x):
    """Gradient of the tanh-approximate GELU, multiplied by the upstream grad g."""
    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
    sech_term = (1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)
    ff = 0.5 * x * sech_term + 0.5 * (1 + tanh_out)
    return (ff * g).to(dtype=x.dtype)


class FastGeLUFunction(torch.autograd.Function):
    """Autograd wrapper for the tanh-approximate GELU (no bias)."""

    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        return gelu_fwd(input)

    @staticmethod
    def backward(ctx, grad_output):
        (input,) = ctx.saved_tensors
        return gelu_bwd(grad_output, input)


fast_gelu_impl = FastGeLUFunction.apply
89
+
90
+
91
@torch.jit.script
def relu_bwd(g, x):
    """Backward of ReLU: pass the upstream grad where x >= 0, zero elsewhere."""
    mask = x >= 0
    return torch.where(mask, g, 0.0).to(dtype=x.dtype)
94
+
95
+
96
@torch.jit.script
def sqrelu_fwd(x):
    """Squared ReLU: relu(x)**2, cast back to x's dtype."""
    relu_x = F.relu(x)
    squared = relu_x * relu_x
    return squared.to(dtype=x.dtype)


@torch.jit.script
def sqrelu_bwd(g, x):
    """Backward of squared ReLU: d/dx relu(x)^2 = 2 * relu(x), times upstream grad g."""
    grad = 2.0 * g * F.relu(x)
    return grad.to(dtype=x.dtype)
105
+
106
+
107
# CUDA jiterator kernels for SwiGLU: out = silu(x) * y, i.e. x * sigmoid(x) * y.
# The C++ code strings are compiled elementwise at first call (CUDA tensors only).
swiglu_fwd_codestring = """
template <typename T> T swiglu_fwd(T x, T y) {
    return float(x) * float(y) / (1.0f + ::exp(-float(x)));
}
"""
swiglu_bwd_codestring = """
template <typename T> T swiglu_bwd(T x, T y, T g, T& dx, T& dy) {
    float x_sigmoid = 1.0f / (1.0f + ::exp(-float(x)));
    dx = x_sigmoid * (1 + float(x) * (1.0f - x_sigmoid)) * float(g) * float(y);
    dy = float(x) * x_sigmoid * float(g);
}
"""
swiglu_fwd = torch.cuda.jiterator._create_jit_fn(swiglu_fwd_codestring)
swiglu_bwd = torch.cuda.jiterator._create_multi_output_jit_fn(swiglu_bwd_codestring, num_outputs=2)


class SwiGLUFunction(torch.autograd.Function):
    """Autograd wrapper around the jiterator SwiGLU kernels.

    NOTE(review): jiterator kernels run on CUDA tensors only — presumably all
    callers pass GPU tensors; verify before using on CPU.
    """

    @staticmethod
    def forward(ctx, x, y):
        ctx.save_for_backward(x, y)
        return swiglu_fwd(x, y)

    @staticmethod
    def backward(ctx, dout):
        x, y = ctx.saved_tensors
        # swiglu_bwd produces (dx, dy) in one fused kernel launch.
        return swiglu_bwd(x, y, dout)


swiglu = SwiGLUFunction.apply
.venv/lib/python3.11/site-packages/xformers/_flash_attn/ops/fused_dense.py ADDED
@@ -0,0 +1,688 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023, Tri Dao.
2
+ # Inspired by https://github.com/NVIDIA/apex/blob/master/apex/fused_dense/fused_dense.py
3
+ # We make it work with pytorch amp and with bfloat16.
4
+ # The TensorParallel linear modules are inspired by https://github.com/NVIDIA/apex/blob/master/apex/transformer/tensor_parallel/layers.py
5
+ from functools import partial
6
+ from typing import Optional
7
+
8
+ # import fused_dense_cuda # from apex
9
+ import fused_dense_lib as fused_dense_cuda
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from torch import Tensor
14
+ from torch.cuda.amp import custom_bwd, custom_fwd
15
+ from torch.distributed import ProcessGroup
16
+
17
+ from flash_attn.ops.activations import gelu_bwd, relu_bwd, sqrelu_bwd, sqrelu_fwd
18
+ from flash_attn.utils.distributed import (
19
+ all_gather_raw,
20
+ all_reduce,
21
+ all_reduce_raw,
22
+ reduce_scatter,
23
+ reduce_scatter_raw,
24
+ )
25
+
26
+
27
class FusedDenseFunc(torch.autograd.Function):
    # Linear layer (y = x @ W^T + b) whose backward uses the fused cuBLASLt
    # wgrad kernel (fused_dense_cuda.linear_bias_wgrad) to compute dW and db
    # in one call, and overlaps tensor-parallel collectives with compute via
    # async handles.

    @staticmethod
    @custom_fwd
    def forward(
        ctx, x, weight, bias, return_residual=False, process_group=None, sequence_parallel=True
    ):
        """
        If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
        with sequence parallelism: we do an all_gather_raw of x before doing the matmul.
        """
        ctx.compute_weight_gradient = weight.requires_grad
        ctx.return_residual = return_residual
        ctx.process_group = process_group
        ctx.sequence_parallel = sequence_parallel

        if torch.is_autocast_enabled():
            x = x.to(dtype=torch.get_autocast_gpu_dtype())
        x = x.contiguous()
        if process_group is not None and sequence_parallel:
            # We want to kick off the all_gather early, before weight dtype conversion
            total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
        else:
            total_x = x

        if torch.is_autocast_enabled():
            weight = weight.to(dtype=torch.get_autocast_gpu_dtype())
            bias = bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None
        weight = weight.contiguous()
        if process_group is not None and sequence_parallel:
            # Matmul needs the fully gathered input; wait for the async gather.
            handle_x.wait()
        batch_shape, n = total_x.shape[:-1], total_x.shape[-1]
        batch_dim = batch_shape.numel()
        # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174
        if min(batch_dim, n, *weight.shape) > 65535 * 32:
            raise RuntimeError("fused_dense only supports matrix dims <= 2M")
        output = F.linear(total_x, weight, bias)
        if ctx.compute_weight_gradient:
            # Save the un-gathered x (not total_x): backward re-gathers it,
            # trading communication for activation memory.
            ctx.save_for_backward(x, weight)
        else:
            ctx.save_for_backward(weight)
        return output if not return_residual else (output, x)

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_output, *args):
        # When return_residual=True, *args carries the gradient flowing into
        # the residual copy of x; it is accumulated into grad_input via addmm.
        grad_output = grad_output.contiguous()
        if ctx.return_residual:
            (grad_input,) = args
            grad_input = grad_input.contiguous()
        process_group = ctx.process_group
        sequence_parallel = ctx.sequence_parallel
        if ctx.compute_weight_gradient:
            x, weight = ctx.saved_tensors
            if process_group is not None and sequence_parallel:
                # Re-gather x asynchronously; only awaited right before wgrad.
                total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
            else:
                total_x = x
        else:
            (weight,) = ctx.saved_tensors
            total_x = None
        batch_shape = grad_output.shape[:-1]
        batch_dim = batch_shape.numel()
        grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
        if ctx.needs_input_grad[0]:
            if not ctx.return_residual:
                grad_input = F.linear(grad_output, weight.t())
            else:
                # Fuse the residual gradient add with dX = dY @ W.
                grad_input = torch.addmm(
                    grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight
                )
            grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1])
            if process_group is not None:
                # Async reduce of dX overlaps with the wgrad computation below.
                reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw
                grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True)
        else:
            grad_input = None
        if ctx.needs_input_grad[1]:
            assert ctx.compute_weight_gradient
            if process_group is not None and sequence_parallel:
                handle_x.wait()
            # Fused kernel computes dW and (optionally) db in a single call.
            grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad(
                total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2]
            )
        else:
            grad_weight = None
            grad_bias = grad_output if ctx.needs_input_grad[2] else None
        if process_group is not None and ctx.needs_input_grad[0]:
            handle_grad_input.wait()
        # One gradient per forward arg; the last three (return_residual,
        # process_group, sequence_parallel) are non-tensors.
        return grad_input, grad_weight, grad_bias, None, None, None
116
+
117
+
118
def fused_dense_func(
    x: Tensor,
    weight: Tensor,
    bias: Optional[Tensor] = None,
    return_residual: bool = False,
    process_group: Optional[ProcessGroup] = None,
    sequence_parallel: bool = True,
):
    """Dispatch to FusedDenseFunc when the fused CUDA path applies, else F.linear.

    The fused path requires every tensor on CUDA and an fp16/bf16 dtype
    (or fp32 under autocast). The plain fallback cannot do tensor-parallel
    communication, so process_group must be None there.
    """
    dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or (
        x.dtype == torch.float32 and torch.is_autocast_enabled()
    )
    all_on_gpu = x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda)
    if all_on_gpu and dtype_eligible:
        return FusedDenseFunc.apply(
            x, weight, bias, return_residual, process_group, sequence_parallel
        )
    # Fallback: ordinary linear, no tensor parallelism available.
    assert process_group is None
    out = F.linear(x, weight, bias)
    if return_residual:
        return out, x
    return out
137
+
138
+
139
class FusedDense(nn.Linear):
    """Drop-in ``nn.Linear`` that routes its forward through ``fused_dense_func``."""

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        return_residual: bool = False,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__(in_features, out_features, bias=bias, device=device, dtype=dtype)
        # When True, forward returns (output, x) so the caller can fuse the
        # residual-connection backward with the linear backward.
        self.return_residual = return_residual

    def forward(self, x, process_group=None):
        """Apply the linear layer.

        If process_group is not None, we're doing Tensor Parallel with
        sequence parallelism: x is all-gathered before the matmul.
        """
        return fused_dense_func(
            x,
            self.weight,
            self.bias,
            return_residual=self.return_residual,
            process_group=process_group,
        )
164
+
165
+
166
class ColumnParallelLinear(nn.Linear):
    """Linear layer whose output dimension is sharded across a process group.

    ``out_features`` is split (counted in units of ``multiple_of``) as evenly
    as possible over the ranks; the first few ranks absorb the remainder.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        process_group: ProcessGroup,
        bias: bool = True,
        sequence_parallel=True,
        multiple_of=1,
        device=None,
        dtype=None,
    ) -> None:
        world_size = torch.distributed.get_world_size(process_group)
        rank = torch.distributed.get_rank(process_group)
        if out_features % multiple_of:
            raise ValueError(f"out_features ({out_features}) must be a multiple of {multiple_of}")
        # Split the units across ranks; the first `remainder` ranks each get
        # one extra unit so the shards differ in size by at most one unit.
        quotient, remainder = divmod(out_features // multiple_of, world_size)
        local_multiple = quotient + int(rank < remainder)
        super().__init__(
            in_features, local_multiple * multiple_of, bias=bias, device=device, dtype=dtype
        )
        self.process_group = process_group
        self.sequence_parallel = sequence_parallel

    def forward(self, x):
        """Local matmul on this rank's output shard.

        If sequence_parallel, x is all-gathered first; otherwise the input is
        assumed to be already gathered.
        """
        return fused_dense_func(
            x,
            self.weight,
            self.bias,
            process_group=self.process_group,
            sequence_parallel=self.sequence_parallel,
        )
204
+
205
+
206
class RowParallelLinear(nn.Linear):
    """Linear layer whose input dimension is sharded across a process group.

    Only rank 0 holds a bias, so it is added exactly once after the partial
    outputs are reduced across ranks.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        process_group: ProcessGroup,
        bias: bool = True,
        sequence_parallel=True,
        multiple_of=1,
        device=None,
        dtype=None,
    ) -> None:
        world_size = torch.distributed.get_world_size(process_group)
        rank = torch.distributed.get_rank(process_group)
        if in_features % multiple_of:
            raise ValueError(f"in_features ({in_features}) must be a multiple of {multiple_of}")
        # Split the units across ranks; the first `remainder` ranks each get
        # one extra unit so the shards differ in size by at most one unit.
        quotient, remainder = divmod(in_features // multiple_of, world_size)
        local_multiple = quotient + int(rank < remainder)
        # A bias on every rank would be summed world_size times by the
        # reduction, so only rank 0 keeps one.
        super().__init__(
            local_multiple * multiple_of,
            out_features,
            bias=bias and rank == 0,
            device=device,
            dtype=dtype,
        )
        self.process_group = process_group
        self.sequence_parallel = sequence_parallel

    def forward(self, x):
        """Matmul on this rank's input shard, then reduce the partial result.

        With sequence parallelism the reduction is a reduce_scatter;
        otherwise an all_reduce.
        """
        partial_out = fused_dense_func(x, self.weight, self.bias)
        if self.sequence_parallel:
            return reduce_scatter(partial_out, self.process_group)
        return all_reduce(partial_out, self.process_group)
247
+
248
+
249
class FusedMLPFunc(torch.autograd.Function):
    # Fused two-layer MLP (fc1 -> activation -> fc2) with optional activation
    # checkpointing (checkpoint_lvl), a cuBLASLt heuristic selector
    # (heuristic == -1 means unfused GEMM + activation kernels), and
    # tensor-parallel collectives overlapped with compute via async handles.

    @staticmethod
    @custom_fwd
    def forward(
        ctx,
        x,
        weight1,
        bias1,
        weight2,
        bias2,
        activation="gelu_approx",
        save_pre_act=True,
        return_residual=False,
        checkpoint_lvl=0,
        heuristic=0,
        process_group=None,
        sequence_parallel=True,
    ):
        """
        If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
        with sequence parallelism: we do an all_gather of x before doing the matmul.
        If sequence_parallel=False, then the input is already gathered.

        checkpoint_lvl:
        0: no recomputation in the bwd
        1: recompute gelu_out / relu_out in the bwd
        2: recompute pre_act and gelu_out / relu_out in the bwd
        """
        assert -1 <= heuristic <= 4
        assert activation in ["gelu_approx", "relu", "sqrelu"]
        if activation == "sqrelu":
            # sqrelu only has the unfused (separate-kernel) path.
            assert heuristic == -1
        if not save_pre_act:
            # Nothing saved => everything must be recomputed in backward.
            checkpoint_lvl = 2
        assert checkpoint_lvl in [0, 1, 2]
        ctx.return_residual = return_residual
        ctx.process_group = process_group
        ctx.sequence_parallel = sequence_parallel
        ctx.checkpoint_lvl = checkpoint_lvl
        ctx.activation = activation
        ctx.heuristic = heuristic

        if torch.is_autocast_enabled():
            x = x.to(dtype=torch.get_autocast_gpu_dtype())
        x = x.contiguous()
        if process_group is not None and sequence_parallel:
            # We want to kick off the all_gather early, before weight dtype conversion
            total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
        else:
            total_x = x

        if torch.is_autocast_enabled():
            dtype = torch.get_autocast_gpu_dtype()
            weight1, weight2 = [a.to(dtype=dtype) for a in [weight1, weight2]]
            bias1 = bias1.to(dtype=dtype) if bias1 is not None else None
            bias2 = bias2.to(dtype=dtype) if bias2 is not None else None
        weight1 = weight1.contiguous()
        bias1 = bias1.contiguous() if bias1 is not None else None
        weight2 = weight2.contiguous()
        bias2 = bias2.contiguous() if bias2 is not None else None
        if process_group is not None and sequence_parallel:
            handle_x.wait()
        batch_shape, n = total_x.shape[:-1], total_x.shape[-1]
        batch_dim = batch_shape.numel()
        # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174
        if min(batch_dim, n, *weight1.shape, *weight2.shape) > 65535 * 32:
            raise RuntimeError("fused_dense only supports matrix dims <= 2M")
        if heuristic == -1:
            # Unfused path: plain GEMM then a (jit-fused) activation kernel.
            pre_act = F.linear(total_x, weight1, bias1)
            activation_fn = (
                partial(F.gelu, approximate="tanh")
                if activation == "gelu_approx"
                else (sqrelu_fwd if activation == "sqrelu" else F.relu)
            )
            with torch.jit.fuser("fuser2"):
                output1 = activation_fn(pre_act)
            # This is before adding bias1
            # pre_act = F.linear(total_x.reshape(batch_dim, n), weight1)
            # with torch.jit.fuser('fuser2'):
            #     output1 = bias_gelu(pre_act, bias1)
        else:
            # Fused GEMM + bias + activation epilogue (cuBLASLt).
            is_gelu = activation == "gelu_approx"
            output1, *rest = fused_dense_cuda.linear_act_forward(
                total_x.reshape(batch_dim, n), weight1, bias1, is_gelu, save_pre_act, heuristic
            )
            if save_pre_act:
                pre_act = rest[0]
        output2 = F.linear(output1, weight2, bias2)
        if checkpoint_lvl == 0 or (checkpoint_lvl == 1 and activation == "relu"):
            # For RELU the pre_act is very small (just a bit-mask) so we just save it
            ctx.save_for_backward(x, weight1, weight2, pre_act, output1)
        elif checkpoint_lvl == 1:
            ctx.save_for_backward(x, weight1, weight2, pre_act)
        elif checkpoint_lvl == 2:
            ctx.save_for_backward(x, weight1, weight2, bias1)
        output2 = output2.reshape(*batch_shape, output2.shape[-1])
        return output2 if not return_residual else (output2, x)

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_output, *args):
        # Recomputes whatever the chosen checkpoint_lvl did not save, then
        # computes gradients for x, weight1/bias1, weight2/bias2.
        grad_output = grad_output.contiguous()
        checkpoint_lvl = ctx.checkpoint_lvl
        activation = ctx.activation
        activation_fn = (
            partial(F.gelu, approximate="tanh")
            if activation == "gelu_approx"
            else (sqrelu_fwd if activation == "sqrelu" else F.relu)
        )
        if ctx.return_residual:
            # Gradient flowing into the residual copy of x.
            (grad_input,) = args
            grad_input = grad_input.contiguous()
        process_group = ctx.process_group
        sequence_parallel = ctx.sequence_parallel
        x, weight1, weight2, *rest = ctx.saved_tensors
        if process_group is None or not sequence_parallel:
            total_x = x
        batch_shape = grad_output.shape[:-1]
        batch_dim = batch_shape.numel()
        if checkpoint_lvl in [0, 1]:
            if process_group is not None and sequence_parallel:
                # Async re-gather; awaited just before the wgrad for weight1.
                total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
            if checkpoint_lvl == 0 or (checkpoint_lvl == 1 and activation == "relu"):
                pre_act, output1 = rest
            elif checkpoint_lvl == 1:
                (pre_act,) = rest
                with torch.jit.fuser("fuser2"):
                    output1 = activation_fn(pre_act)
        elif checkpoint_lvl == 2:
            (bias1,) = rest
            if process_group is not None and sequence_parallel:
                # Synchronous gather: the recomputation below needs total_x now.
                total_x, _ = all_gather_raw(x, process_group)
            if ctx.heuristic == -1:
                pre_act = F.linear(total_x, weight1, bias1)
                with torch.jit.fuser("fuser2"):
                    output1 = activation_fn(pre_act)
            else:
                output1, pre_act = fused_dense_cuda.linear_act_forward(
                    total_x.reshape(batch_dim, total_x.shape[-1]),
                    weight1,
                    bias1,
                    activation == "gelu_approx",
                    True,
                    ctx.heuristic,
                )

        grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1])
        output1 = output1.reshape(batch_dim, output1.shape[-1])
        pre_act = pre_act.reshape(batch_dim, pre_act.shape[-1])
        if ctx.needs_input_grad[3]:
            # dW2 and (optionally) db2 in one fused call.
            grad_weight2, grad_bias2 = fused_dense_cuda.linear_bias_wgrad(
                output1, grad_output, ctx.needs_input_grad[4]
            )
        else:
            grad_weight2 = None
            grad_bias2 = grad_output if ctx.needs_input_grad[4] else None
        if ctx.heuristic == -1:
            # grad_pre_act = matmul_dgelu(grad_output, weight2, pre_act)
            grad_output1 = F.linear(grad_output, weight2.t())
            activation_grad_fn = (
                gelu_bwd
                if activation == "gelu_approx"
                else (sqrelu_bwd if activation == "sqrelu" else relu_bwd)
            )
            with torch.jit.fuser("fuser2"):
                grad_pre_act = activation_grad_fn(grad_output1, pre_act)
        else:
            # The cublasLt epilogue has to compute both gelu/relu grad and bias grad, we can't
            # just compute gelu/relu grad
            grad_pre_act, grad_bias1 = fused_dense_cuda.bias_act_linear_dgrad_bgrad(
                weight2, grad_output, pre_act, activation == "gelu_approx", ctx.heuristic
            )
            if not ctx.needs_input_grad[2]:
                grad_bias1 = None
        if ctx.needs_input_grad[0]:
            if not ctx.return_residual:
                grad_input = F.linear(grad_pre_act, weight1.t())
            else:
                # Fuse the residual gradient add with dX = d(pre_act) @ W1.
                grad_input = torch.addmm(
                    grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_pre_act, weight1
                )
            grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1])
            if process_group is not None:
                # Async reduce of dX overlaps with weight1's wgrad below.
                reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw
                grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True)
        else:
            grad_input = None
        if ctx.heuristic == -1:
            if ctx.needs_input_grad[1]:
                if process_group is not None and sequence_parallel and checkpoint_lvl != 2:
                    # lvl 2 already gathered synchronously; otherwise await it.
                    handle_x.wait()
                grad_weight1, grad_bias1 = fused_dense_cuda.linear_bias_wgrad(
                    total_x.reshape(batch_dim, total_x.shape[-1]),
                    grad_pre_act,
                    ctx.needs_input_grad[2],
                )
            else:
                grad_weight1 = None
                grad_bias1 = grad_pre_act if ctx.needs_input_grad[2] else None
        else:
            if ctx.needs_input_grad[1]:
                if process_group is not None and sequence_parallel and checkpoint_lvl != 2:
                    handle_x.wait()
                # grad_bias1 was already produced by bias_act_linear_dgrad_bgrad.
                grad_weight1 = F.linear(
                    grad_pre_act.t(), total_x.reshape(batch_dim, total_x.shape[-1]).t()
                )
            else:
                grad_weight1 = None
        if process_group is not None and ctx.needs_input_grad[0]:
            handle_grad_input.wait()
        # One gradient slot per forward arg; the trailing Nones cover the
        # seven non-tensor arguments (activation ... sequence_parallel).
        return (
            grad_input,
            grad_weight1,
            grad_bias1,
            grad_weight2,
            grad_bias2,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
473
+
474
+
475
def fused_mlp_func(
    x: Tensor,
    weight1: Tensor,
    weight2: Tensor,
    bias1: Optional[Tensor] = None,
    bias2: Optional[Tensor] = None,
    activation: str = "gelu_approx",
    save_pre_act: bool = True,
    return_residual: bool = False,
    checkpoint_lvl: int = 0,
    heuristic: int = 0,
    process_group: Optional[ProcessGroup] = None,
    sequence_parallel: bool = True,
):
    """Two-layer MLP (linear -> activation -> linear), fused on CUDA when eligible.

    Dispatches to FusedMLPFunc when all tensors are on CUDA, the dtype is
    fp16/bf16 (or fp32 under autocast), and the hidden dim satisfies the
    alignment the fused kernel requires when saving the pre-activation.
    Otherwise falls back to plain PyTorch ops; the fallback does not support
    tensor parallelism, so process_group must be None there.
    """
    assert activation in ["gelu_approx", "relu", "sqrelu"]
    dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or (
        x.dtype == torch.float32 and torch.is_autocast_enabled()
    )
    # If we save pre-activation, dimension must be divisible by 128 (relu) or 8 (gelu)
    dim_eligible = not save_pre_act or (x.shape[-1] % (128 if activation == "relu" else 8) == 0)
    if (
        x.is_cuda
        and weight1.is_cuda
        and weight2.is_cuda
        and (bias1 is None or bias1.is_cuda)
        and (bias2 is None or bias2.is_cuda)
        and dtype_eligible
        and dim_eligible
    ):
        return FusedMLPFunc.apply(
            x,
            weight1,
            bias1,
            weight2,
            bias2,
            activation,
            save_pre_act,
            return_residual,
            checkpoint_lvl,
            heuristic,
            process_group,
            sequence_parallel,
        )
    else:
        assert process_group is None
        pre_act = F.linear(x, weight1, bias1)
        # Bug fix: the fallback previously applied plain ReLU when
        # activation == "sqrelu", silently computing the wrong activation.
        # Use squared ReLU (relu(x)^2) to match the fused path.
        if activation == "gelu_approx":
            output1 = F.gelu(pre_act, approximate="tanh")
        elif activation == "sqrelu":
            output1 = F.relu(pre_act) ** 2
        else:
            output1 = F.relu(pre_act, inplace=True)
        output2 = F.linear(output1, weight2, bias2)
        return output2 if not return_residual else (output2, x)
529
+
530
+
531
class FusedMLP(nn.Module):
    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        bias1=True,
        bias2=True,
        activation="gelu_approx",
        return_residual=False,
        checkpoint_lvl=0,
        heuristic="auto",
        device=None,
        dtype=None,
    ):
        """Two-layer MLP that runs through ``fused_mlp_func``.

        If process_group is passed to forward, we're doing Tensor Parallel with
        sequence parallelism: x is all-gathered before matmul-activation-matmul
        and the output is reduce-scattered at the end.

        checkpoint_lvl (increasing lvl means slower but more memory saving):
            0: no recomputation in the bwd
            1: recompute gelu_out in the bwd
            2: recompute pre_act and gelu_out in the bwd
        heuristic:
            -1: don't fuse gemm + gelu (separate kernel)
            0..4: use this heuristic for the algo section in the fused gemm + gelu
            'auto': heuristic will be picked automatically:
                For CUDA >= 11.8, we set heuristic=0 for both fp16 and bf16 for best perf.
                For CUDA <= 11.7, we set heuristic=1 for fp16 and heuristic=-1 for bf16.
                For H100, we set heuristic=-1 for both fp16 and bf16 as the fused
                cuBlasLt implementation is slower than the unfused version.
        return_residual: whether to return the input x along with the output. This is for
            performance reason: for post-norm architecture, returning the input allows us
            to fuse the backward of nn.Linear with the residual connection.
        """
        assert checkpoint_lvl in [0, 1, 2]
        assert activation in ["gelu_approx", "relu", "sqrelu"]
        super().__init__()
        factory_kwargs = {"device": device, "dtype": dtype}
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features * 4
        self.activation = activation
        self.return_residual = return_residual
        self.checkpoint_lvl = checkpoint_lvl
        # sqrelu only supports the unfused path, so force heuristic=-1.
        self.heuristic = -1 if activation == "sqrelu" else heuristic
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1, **factory_kwargs)
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)

    def _resolve_heuristic(self, dtype):
        # Pick the cuBlasLt algo heuristic when self.heuristic == "auto".
        if self.activation != "gelu_approx":
            return 0
        if torch.cuda.get_device_capability("cuda") == (9, 0):
            # On H100 the fused kernel is slower than the unfused path.
            return -1
        cuda_ver = tuple(map(int, torch.version.cuda.split(".")))
        if cuda_ver >= (11, 8):
            return 0
        return 1 if dtype == torch.float16 else -1

    def forward(self, x, process_group=None):
        dtype = torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else x.dtype
        heuristic = (
            self._resolve_heuristic(dtype) if self.heuristic == "auto" else self.heuristic
        )
        out = fused_mlp_func(
            x,
            self.fc1.weight,
            self.fc2.weight,
            self.fc1.bias,
            self.fc2.bias,
            activation=self.activation,
            save_pre_act=self.training,
            return_residual=self.return_residual,
            checkpoint_lvl=self.checkpoint_lvl,
            heuristic=heuristic,
            process_group=process_group,
        )
        if self.return_residual:
            out, x = out
        if process_group is not None:
            out = reduce_scatter(out, process_group)
        return out if not self.return_residual else (out, x)
611
+
612
+
613
class ParallelFusedMLP(nn.Module):
    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        activation="gelu_approx",
        process_group: ProcessGroup = None,
        bias1=True,
        bias2=True,
        sequence_parallel=True,
        checkpoint_lvl=0,
        heuristic="auto",
        device=None,
        dtype=None,
    ):
        """Tensor-parallel two-layer MLP (column-parallel fc1, row-parallel fc2).

        process_group is required. We're doing Tensor Parallel with sequence
        parallelism: x is all-gathered before matmul-activation-matmul, and the
        output is reduce-scattered (or all-reduced) at the end.

        checkpoint_lvl (increasing lvl means slower but more memory saving):
            0: no recomputation in the bwd
            1: recompute gelu_out in the bwd
            2: recompute pre_act and gelu_out in the bwd
        heuristic:
            -1: don't fuse gemm + gelu (separate kernel)
            0..4: use this heuristic for the algo section in the fused gemm + gelu
            'auto': heuristic will be picked automatically:
                For CUDA >= 11.8, we set heuristic=0 for both fp16 and bf16 for best perf.
                For CUDA <= 11.7, we set heuristic=1 for fp16 and heuristic=-1 for bf16.
        """
        assert checkpoint_lvl in [0, 1, 2]
        assert activation in ["gelu_approx", "relu", "sqrelu"]
        assert process_group is not None
        super().__init__()
        factory_kwargs = {"device": device, "dtype": dtype}
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features * 4
        self.activation = activation
        self.process_group = process_group
        self.sequence_parallel = sequence_parallel
        self.checkpoint_lvl = checkpoint_lvl
        # sqrelu only supports the unfused path, so force heuristic=-1.
        self.heuristic = -1 if activation == "sqrelu" else heuristic
        self.fc1 = ColumnParallelLinear(
            in_features, hidden_features, process_group, bias=bias1, **factory_kwargs
        )
        self.fc2 = RowParallelLinear(
            hidden_features, out_features, process_group, bias=bias2, **factory_kwargs
        )

    def _resolve_heuristic(self, dtype):
        # Pick the cuBlasLt algo heuristic when self.heuristic == "auto".
        # Note: unlike FusedMLP, there is no H100 special case here.
        if self.activation != "gelu_approx":
            return 0
        cuda_ver = tuple(map(int, torch.version.cuda.split(".")))
        if cuda_ver >= (11, 8):
            return 0
        return 1 if dtype == torch.float16 else -1

    def forward(self, x):
        dtype = torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else x.dtype
        heuristic = (
            self._resolve_heuristic(dtype) if self.heuristic == "auto" else self.heuristic
        )
        out = fused_mlp_func(
            x,
            self.fc1.weight,
            self.fc2.weight,
            self.fc1.bias,
            self.fc2.bias,
            activation=self.activation,
            save_pre_act=self.training,
            checkpoint_lvl=self.checkpoint_lvl,
            heuristic=heuristic,
            process_group=self.process_group,
            sequence_parallel=self.sequence_parallel,
        )
        if self.sequence_parallel:
            return reduce_scatter(out, self.process_group)
        return all_reduce(out, self.process_group)