mansaripo commited on
Commit
b0fd683
·
verified ·
1 Parent(s): 4d9bee1

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CloverLMForCausalLM"
4
+ ],
5
+ "attn_backend": "flash2",
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_cloverlm.CloverLMConfig",
8
+ "AutoModelForCausalLM": "modeling_cloverlm.CloverLMForCausalLM",
9
+ "AutoTokenizer": [
10
+ "tokenization_cloverlm.CloverLMTokenizer",
11
+ null
12
+ ]
13
+ },
14
+ "d_head": 128,
15
+ "heads": 28,
16
+ "max_context": 1024,
17
+ "model_type": "cloverlm",
18
+ "num_blocks": 29,
19
+ "num_hidden_layers": 29,
20
+ "quartet_2_impl": "pseudoquant",
21
+ "ratio": 4,
22
+ "scale_type": "1/sqrt(d)",
23
+ "transformers_version": "5.3.0",
24
+ "vocab_size": 32000,
25
+ "weight_tying": true
26
+ }
configuration_cloverlm.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
+
4
class CloverLMConfig(PretrainedConfig):
    """HF configuration for CloverLM.

    The field names mirror the research model's constructor arguments
    (heads, d_head, ratio, scale_type, ...); `num_hidden_layers` is kept
    as a duplicate of `num_blocks` so generic transformers utilities that
    read `num_hidden_layers` see the model depth.
    """

    model_type = "cloverlm"

    def __init__(
        self,
        vocab_size=32000,               # tokenizer vocabulary size
        num_blocks=4,                   # number of transformer blocks
        heads=6,                        # attention heads; model dim = heads * d_head
        d_head=128,                     # per-head dimension
        ratio=3,                        # GQA ratio (heads per KV group)
        scale_type="1/sqrt(d)",         # attention-logit scaling scheme
        max_context=1024,               # maximum sequence length
        quartet_2_impl="pseudoquant",   # quantization implementation tag — TODO confirm semantics
        weight_tying=True,              # tie input embedding with LM head
        attn_backend="pytorch",         # SDPA backend identifier
        **kwargs,
    ):
        self.num_blocks = num_blocks
        # Alias so depth is visible under the conventional HF attribute name.
        self.num_hidden_layers = num_blocks
        self.heads = heads
        self.d_head = d_head
        self.ratio = ratio
        self.scale_type = scale_type
        self.max_context = max_context
        self.quartet_2_impl = quartet_2_impl
        self.weight_tying = weight_tying
        self.attn_backend = attn_backend
        super().__init__(vocab_size=vocab_size, **kwargs)
exp_mlp.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
def sphere_norm(X, dim=-1):
    """L2-normalize X along `dim`, projecting it onto the unit hypersphere."""
    projected = torch.nn.functional.normalize(X, dim=dim)
    return projected
7
+
8
class SphereNorm(torch.nn.Module):
    """Module form of sphere normalization: L2-normalize along `dim`."""

    def __init__(self, dim=-1):
        super().__init__()
        self.dim = dim

    def forward(self, X):
        # Inlined sphere_norm: project onto the unit hypersphere along self.dim.
        return torch.nn.functional.normalize(X, dim=self.dim)
18
+
19
+ def get_norm(enable, norm_type, d, bias):
20
+ if enable:
21
+ if norm_type=="layer":
22
+ norm = torch.nn.LayerNorm(d, bias=bias)
23
+ elif norm_type=="rms_learned":
24
+ norm = torch.nn.RMSNorm(d, elementwise_affine=True)
25
+ elif norm_type=="rms_const":
26
+ norm = torch.nn.RMSNorm(d, elementwise_affine=False)
27
+ elif norm_type=="sphere":
28
+ norm = SphereNorm(dim=-1)
29
+ else:
30
+ norm = None
31
+
32
+ return norm
33
+
34
class ReLU2(torch.nn.Module):
    """Squared ReLU activation: max(x, 0)**2."""

    def forward(self, x):
        return x.clamp_min(0) ** 2
39
+
40
class Abs(torch.nn.Module):
    """Absolute-value activation."""

    def forward(self, x):
        return torch.abs(x)
45
+
46
class GLU(torch.nn.Module):
    """Gated linear unit: y = act(W_gate x) * (W_proj x).

    `quartet` / `fake_quartet` select the quantized linear implementation;
    plain torch.nn.Linear is used when both are False.

    BUG FIX: the HF export stripped the `quartet2` import but left the code
    that references it (`pass  # quartet2 not available in HF mode`), so
    `quartet=True` (the default) crashed with NameError. We now import
    `quartet2` lazily and raise an explicit, actionable ImportError.
    """

    def __init__(self, d0, d1, bias=True, act=torch.nn.ReLU(), quartet=True, fake_quartet=False):
        super().__init__()

        self.d0 = d0
        self.d1 = d1
        self.bias = bias
        self.act = act
        self.quartet = quartet
        self.fake_quartet = fake_quartet

        if quartet:
            try:
                import quartet2
            except ImportError as e:
                raise ImportError(
                    "quartet=True requires the 'quartet2' package, which is not "
                    "bundled with this HF export; pass quartet=False (or fake_quartet=True)."
                ) from e
            self.gate = torch.nn.Sequential(quartet2.linear.Quartet_II_linear(d0, d1, bias), act)
            self.proj = quartet2.linear.Quartet_II_linear(d0, d1, bias)
        elif fake_quartet:
            from . import fake_quartet as fq
            self.gate = torch.nn.Sequential(fq.FakeQuartetLinear(d0, d1, bias), act)
            self.proj = fq.FakeQuartetLinear(d0, d1, bias)
        else:
            self.gate = torch.nn.Sequential(torch.nn.Linear(d0, d1, bias), act)
            self.proj = torch.nn.Linear(d0, d1, bias)

    def forward(self, x):
        # Elementwise product of the activated gate branch and the linear branch.
        y = self.gate(x) * self.proj(x)
        return y
76
+
77
class MLP2L(torch.nn.Module):
    """Two-layer MLP: l1 (linear+act or GLU) -> optional norm -> dropout -> l2.

    `quartet` / `fake_quartet` select the quantized linear implementation.

    BUG FIX: as in GLU, the HF export left references to the stripped
    `quartet2` module behind a `pass` placeholder, so `quartet=True` (the
    default) crashed with NameError; `quartet2` is now imported lazily with
    a clear error message.
    """

    def __init__(self, d0, d1, d2, bias=True, act=torch.nn.ReLU(), dropout=0, l1_type="linear", norm_type="rms_learned", norm=False, quartet=True, fake_quartet=False):
        super().__init__()

        self.d0 = d0
        self.d1 = d1
        self.d2 = d2
        self.bias = bias
        self.act = act
        self.dropout = dropout
        self.l1_type = l1_type
        self.norm_type = norm_type

        if l1_type=="linear":
            if quartet:
                try:
                    import quartet2
                except ImportError as e:
                    raise ImportError(
                        "quartet=True requires the 'quartet2' package, which is not "
                        "bundled with this HF export; pass quartet=False (or fake_quartet=True)."
                    ) from e
                self.l1 = torch.nn.Sequential(quartet2.linear.Quartet_II_linear(d0, d1, bias), act)
            elif fake_quartet:
                from . import fake_quartet as fq
                self.l1 = torch.nn.Sequential(fq.FakeQuartetLinear(d0, d1, bias), act)
            else:
                self.l1 = torch.nn.Sequential(torch.nn.Linear(d0, d1, bias), act)
        elif l1_type=="glu":
            self.l1 = GLU(d0, d1, bias, act, quartet, fake_quartet)

        # Optional normalization on the hidden activation (None when norm=False).
        self.norm = get_norm(norm, norm_type, d1, bias)

        if quartet:
            try:
                import quartet2
            except ImportError as e:
                raise ImportError(
                    "quartet=True requires the 'quartet2' package, which is not "
                    "bundled with this HF export; pass quartet=False (or fake_quartet=True)."
                ) from e
            self.l2 = quartet2.linear.Quartet_II_linear(d1, d2, bias)
        elif fake_quartet:
            from . import fake_quartet as fq
            self.l2 = fq.FakeQuartetLinear(d1, d2, bias)
        else:
            self.l2 = torch.nn.Linear(d1, d2, bias)

    def forward(self, x):
        a1 = self.l1(x)
        if self.norm: a1 = self.norm(a1)
        a1 = torch.nn.functional.dropout(a1, p=self.dropout, training=self.training)

        y = self.l2(a1)

        return y
121
+
122
class MLP3L(torch.nn.Module):
    """Three-layer MLP: (linear -> act -> dropout) x2 -> linear."""

    def __init__(self, d0, d1, d2, d3, bias=True, act=torch.nn.ReLU(), dropout=0):
        super().__init__()

        self.d0 = d0
        self.d1 = d1
        self.d2 = d2
        self.d3 = d3
        self.bias = bias
        self.act = act
        self.dropout = dropout

        # Layers created in the same order as declared dimensions.
        self.l1 = torch.nn.Linear(d0, d1, bias)
        self.l2 = torch.nn.Linear(d1, d2, bias)
        self.l3 = torch.nn.Linear(d2, d3, bias)

    def forward(self, x):
        hidden = x
        # Two hidden layers share the same act/dropout treatment.
        for layer in (self.l1, self.l2):
            hidden = self.act(layer(hidden))
            hidden = torch.nn.functional.dropout(hidden, p=self.dropout, training=self.training)
        return self.l3(hidden)
150
+
151
class MLP3L_image(torch.nn.Module):
    """Flatten a res x res image and classify it with a 3-layer MLP."""

    def __init__(self, res=28, d1=16, d2=16, dropout=0, classes=10):
        super().__init__()

        self.res = res
        self.d1 = d1
        self.d2 = d2
        self.dropout = dropout
        self.classes = classes

        # Input width is the flattened pixel count.
        self.mlp = MLP3L(res * res, d1, d2, classes, dropout=dropout)

    def forward(self, x):
        # Collapse the last three dims (channel, height, width) into features.
        flat = x.flatten(start_dim=-3, end_dim=-1)
        return self.mlp(flat)
exp_transformer.py ADDED
@@ -0,0 +1,696 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import exp_mlp as mlp
3
+ from math import sqrt
4
+ import math
5
+
6
# Valid values for the corresponding Transformer/MHSA constructor arguments:
# attention-logit scalings, positional-encoding schemes, SDPA backends, and
# normalization-layer kinds (consumed by exp_mlp.get_norm).
SCALE_TYPES = ["1/sqrt(d)", "1/d"]
POS_TYPES = ["learned", "sinusoidal", "rope", "alibi"]
BACKENDS = ["pytorch", "flash2", "flash3", "flash4", "flex", "cudnn"]
NORM_TYPES = ["layer", "rms_learned", "rms_const", "sphere"]
10
+
11
def get_causal(context):
    """Boolean lower-triangular (context, context) causal mask (True = attend)."""
    return torch.ones((context, context), dtype=torch.bool).tril()
17
+
18
def get_sinusoidal(context, d, base=1024):
    """Sinusoidal positional encoding of shape (context, d).

    Even feature indices carry sin(pos * w_j), odd indices cos(pos * w_j),
    with w_j = base**(-2j/d).
    """
    positions = torch.arange(0., context)
    frequencies = 1 / base ** (2 * torch.arange(0., d // 2) / d)

    # (context, d//2) phase matrix: outer product of positions and frequencies.
    phases = torch.outer(positions, frequencies)

    encoding = torch.empty((context, d))
    encoding[:, 0::2] = torch.sin(phases)
    encoding[:, 1::2] = torch.cos(phases)

    return encoding
36
+
37
def get_rope(context, d, *, device, base=1024):
    """RoPE cos/sin table of shape (2, context, d).

    rope[0] holds cosines, rope[1] sines; each (context, d//2) phase column
    is duplicated (repeat_interleave) so consecutive feature pairs share an
    angle, matching apply_rope's pairwise rotation.
    """
    positions = torch.arange(0., context, device=device, dtype=torch.float32)
    frequencies = 1 / base ** (2 * torch.arange(0., d // 2, device=device, dtype=torch.float32) / d)

    # (context, d//2) rotation angles.
    phases = torch.outer(positions, frequencies)

    # Expand each angle to its feature pair -> (context, d).
    cos = torch.cos(phases).repeat_interleave(repeats=2, dim=1)
    sin = torch.sin(phases).repeat_interleave(repeats=2, dim=1)

    # (2, context, d)
    return torch.stack((cos, sin))
60
+
61
# (batches*)context*d
def apply_rope(X, rope):
    """Apply a rotary positional embedding table (from get_rope) to X."""
    # Pairwise 90-degree rotation: (x0, x1) -> (-x1, x0).
    rotated = torch.empty_like(X)
    rotated[..., 0::2] = -X[..., 1::2]
    rotated[..., 1::2] = X[..., 0::2]

    cos, sin = rope[0], rope[1]  # each (context, d)

    out = X * cos + rotated * sin
    return out.to(X.dtype)
74
+
75
def get_m(heads, base=2, exp=8):
    """Per-head ALiBi slopes: base**(-exp*h/heads) for h = 1..heads."""
    head_indices = torch.arange(1, heads + 1)
    return base ** ((-exp / heads) * head_indices)
79
+
80
def get_alibi(heads, context):
    """ALiBi bias tensor of shape (heads, context, context): -|i-j| * slope."""
    # (1, context, 1) query positions and their (1, 1, context) transpose.
    i = torch.arange(0, context)[None, :, None]
    j = i.mT
    # (heads, 1, 1) per-head slopes broadcast over the position grid.
    slopes = get_m(heads)[:, None, None]

    return -(i - j).abs() * slopes
91
+
92
def get_swa(context, window):
    """Boolean sliding-window mask: True where |i - j| <= window."""
    offsets = torch.arange(0, context)
    return (offsets[:, None] - offsets[None, :]).abs() <= window
101
+
102
# (batches*)heads/groups*context*d_head
def sdpa_pytorch(Q, K, V, causal=None, alibi=None, swa=None, scale=None, return_A=False):
    """Reference scaled-dot-product attention with GQA, masks, and ALiBi.

    Returns Y, or (Y, raw logits, masked/scaled logits, softmax weights)
    when return_A is True.
    """
    if scale is None:
        scale = 1 / sqrt(Q.shape[-1])

    # Expand grouped K/V so each query head has its own K/V copy (GQA).
    # PyTorch only broadcasts when the operation is not defined otherwise;
    # matmul does not broadcast the head dimension here.
    group_ratio = Q.shape[-3] // K.shape[-3]
    K = K.repeat_interleave(repeats=group_ratio, dim=-3)
    V = V.repeat_interleave(repeats=group_ratio, dim=-3)

    # (batches*)heads*context*context raw attention logits.
    A__ = Q @ K.mT

    # A tensor-valued scale (1, heads, 1, 1) may broadcast in a batch dim;
    # reshape back to the logits' original shape.
    A_ = (scale * A__).reshape(A__.shape)

    if alibi is not None:
        A_ = A_ + alibi
    if causal is not None:
        A_.masked_fill_(~causal, -float("inf"))
    if swa is not None:
        A_.masked_fill_(~swa, -float("inf"))

    A = torch.softmax(A_, dim=-1)

    # (batches*)heads*context*d_head
    Y = A @ V

    if return_A:
        return Y, A__, A_, A
    return Y
140
+
141
# (batches*)heads/groups*context*d_head
def sdpa_flash(Q, K, V, causal=False, alibi=None, swa=None, scale=None, backend="flash2"):
    """Scaled-dot-product attention via FlashAttention 2/3/4.

    `alibi` is a per-head slope vector (flash2 only), `swa` a (left, right)
    window tuple, `scale` a float or a per-head tensor (folded into Q).

    BUG FIX: the ALiBi warning was a plain string literal, so "{backend}"
    was printed verbatim; it is now an f-string.
    """
    if (alibi is not None) and backend != "flash2":
        print(f"\x1b[93;3m[WARNING]: backend={backend} does not support ALiBi. Hence, we force backend=flash2.\x1b[0m")
        backend = "flash2"

    # FlashAttention only supports float scale
    if isinstance(scale, torch.Tensor):
        Q_shape = Q.shape
        # batches*heads*context*d_head (tensor scale may add a batch dim)
        Q = scale*Q
        # (batches*)heads*context*d_head
        Q = Q.reshape(Q_shape)

        scale = 1

    # FlashAttention2 only supports BF16 and FP16
    if Q.dtype in [torch.bfloat16, torch.float16]:
        dtype = Q.dtype
    else:
        dtype = torch.bfloat16

    heads = Q.shape[-3]
    groups = K.shape[-3]
    context = Q.shape[-2]
    d_head = Q.shape[-1]

    # CAUTION: FlashAttention expects batches*context*heads/groups*d_head
    Q = Q.movedim(-3,-2).reshape(-1,context,heads,d_head)
    K = K.movedim(-3,-2).reshape(-1,context,groups,d_head)
    V = V.movedim(-3,-2).reshape(-1,context,groups,d_head)

    if swa is None:
        # (-1, -1) disables the sliding window in flash_attn.
        swa = (-1,-1)

    if backend=="flash2":
        import flash_attn
        Y = flash_attn.flash_attn_func(Q.to(dtype), K.to(dtype), V.to(dtype), causal=causal, alibi_slopes=alibi, window_size=swa, softmax_scale=scale)
    elif backend=="flash3":
        import flash_attn_interface
        Y = flash_attn_interface.flash_attn_func(Q.to(dtype), K.to(dtype), V.to(dtype), causal=causal, window_size=swa, softmax_scale=scale)
    elif backend=="flash4":
        import flash_attn.cute
        # FlashAttention4 returns (out, lse)
        Y = flash_attn.cute.flash_attn_func(Q.to(dtype), K.to(dtype), V.to(dtype), causal=causal, window_size=swa, softmax_scale=scale)[0]

    Y = Y.to(Q.dtype)

    # Restore the shape to: (batches*)heads*context*d_head
    Y = Y.movedim(-3,-2).squeeze(0)

    return Y
193
+
194
# (batches*)heads/groups*context*d_head
def sdpa_flex():
    # TODO: FlexAttention backend not implemented; selecting backend="flex"
    # currently yields None from sdpa_wrapper.
    return None
197
+
198
# (batches*)heads/groups*context*d_head
def sdpa_cudnn():
    # TODO: cuDNN backend not implemented; selecting backend="cudnn"
    # currently yields None from sdpa_wrapper.
    return None
201
+
202
def sdpa_wrapper(Q, K, V, causal=None, alibi=None, swa=None, scale=None, return_A=False, backend="flash2"):
    """Dispatch SDPA to the selected backend (see BACKENDS).

    Only the pytorch backend honors return_A; an unrecognized backend
    falls through and returns None.
    """
    if backend == "pytorch":
        return sdpa_pytorch(Q, K, V, causal, alibi, swa, scale, return_A)
    if backend in {"flash2", "flash3", "flash4"}:
        return sdpa_flash(Q, K, V, causal, alibi, swa, scale, backend)
    if backend == "flex":
        return sdpa_flex()
    if backend == "cudnn":
        return sdpa_cudnn()
211
+
212
def test_sdpa():
    """Smoke-test the flash backends against the pytorch reference on CUDA.

    Requires a GPU plus the flash_attn / flash_attn_interface packages;
    prints a green check per feature combination. ALiBi cases only compare
    flash2 (flash3/4 do not support ALiBi).
    """
    batches = 32
    heads = 12
    context = 1024
    d_head = 64
    window = 256
    groups = 4
    dtype = torch.bfloat16

    print("\x1b[1mbfloat16\x1b[0m",end="")
    Q = torch.rand((batches, heads, context, d_head)).to("cuda:0", dtype)
    K = torch.rand((batches, heads, context, d_head)).to("cuda:0", dtype)
    V = torch.rand((batches, heads, context, d_head)).to("cuda:0", dtype)
    pytorch = sdpa_wrapper(Q, K, V, backend="pytorch")
    flash2 = sdpa_wrapper(Q, K, V, backend="flash2")
    torch.testing.assert_close(flash2, pytorch, check_dtype=False)
    flash3 = sdpa_wrapper(Q, K, V, backend="flash3")
    torch.testing.assert_close(flash3, pytorch, check_dtype=False)
    flash4 = sdpa_wrapper(Q, K, V, backend="flash4")
    torch.testing.assert_close(flash4, pytorch, check_dtype=False)
    print("\x1b[32m ✔\x1b[0m")

    print("\x1b[1mcausal\x1b[0m",end="")
    pytorch = sdpa_wrapper(Q, K, V, causal=get_causal(context).to("cuda:0"), backend="pytorch")
    flash2 = sdpa_wrapper(Q, K, V, causal=True, backend="flash2")
    torch.testing.assert_close(flash2, pytorch, check_dtype=False)
    flash3 = sdpa_wrapper(Q, K, V, causal=True, backend="flash3")
    torch.testing.assert_close(flash3, pytorch, check_dtype=False)
    flash4 = sdpa_wrapper(Q, K, V, causal=True, backend="flash4")
    torch.testing.assert_close(flash4, pytorch, check_dtype=False)
    print("\x1b[32m ✔\x1b[0m")

    print("\x1b[1malibi\x1b[0m",end="")
    pytorch = sdpa_wrapper(Q, K, V, alibi=get_alibi(heads,context).to("cuda:0",dtype), backend="pytorch")
    flash2 = sdpa_wrapper(Q, K, V, alibi=get_m(heads).to("cuda:0"), backend="flash2")
    torch.testing.assert_close(flash2, pytorch, check_dtype=False)
    # ALiBi not supported on FlashAttention3/4
    print("\x1b[32m ✔\x1b[0m")

    print("\x1b[1mswa\x1b[0m",end="")
    pytorch = sdpa_wrapper(Q, K, V, swa=get_swa(context,window).to("cuda:0"), backend="pytorch")
    flash2 = sdpa_wrapper(Q, K, V, swa=(window,window), backend="flash2")
    torch.testing.assert_close(flash2, pytorch, check_dtype=False)
    flash3 = sdpa_wrapper(Q, K, V, swa=(window,window), backend="flash3")
    torch.testing.assert_close(flash3, pytorch, check_dtype=False)
    flash4 = sdpa_wrapper(Q, K, V, swa=(window,window), backend="flash4")
    torch.testing.assert_close(flash4, pytorch, check_dtype=False)
    print("\x1b[32m ✔\x1b[0m")

    print("\x1b[1mcausal+alibi\x1b[0m",end="")
    pytorch = sdpa_wrapper(Q, K, V, causal=get_causal(context).to("cuda:0"), alibi=get_alibi(heads,context).to("cuda:0",dtype), backend="pytorch")
    flash2 = sdpa_wrapper(Q, K, V, causal=True, alibi=get_m(heads).to("cuda:0"), backend="flash2")
    torch.testing.assert_close(flash2, pytorch, check_dtype=False)
    # ALiBi not supported on FlashAttention3/4
    print("\x1b[32m ✔\x1b[0m")

    print("\x1b[1mcausal+swa\x1b[0m",end="")
    pytorch = sdpa_wrapper(Q, K, V, causal=get_causal(context).to("cuda:0"), swa=get_swa(context,window).to("cuda:0"), backend="pytorch")
    flash2 = sdpa_wrapper(Q, K, V, causal=True, swa=(window,window), backend="flash2")
    torch.testing.assert_close(flash2, pytorch, check_dtype=False)
    flash3 = sdpa_wrapper(Q, K, V, causal=True, swa=(window,window), backend="flash3")
    torch.testing.assert_close(flash3, pytorch, check_dtype=False)
    flash4 = sdpa_wrapper(Q, K, V, causal=True, swa=(window,window), backend="flash4")
    torch.testing.assert_close(flash4, pytorch, check_dtype=False)
    print("\x1b[32m ✔\x1b[0m")

    print("\x1b[1mGQA\x1b[0m",end="")
    Q = torch.rand((batches, heads, context, d_head)).to("cuda:0", dtype)
    K = torch.rand((batches, groups, context, d_head)).to("cuda:0", dtype)
    V = torch.rand((batches, groups, context, d_head)).to("cuda:0", dtype)
    pytorch = sdpa_wrapper(Q, K, V, backend="pytorch")
    flash2 = sdpa_wrapper(Q, K, V, backend="flash2")
    torch.testing.assert_close(flash2, pytorch, check_dtype=False)
    flash3 = sdpa_wrapper(Q, K, V, backend="flash3")
    torch.testing.assert_close(flash3, pytorch, check_dtype=False)
    flash4 = sdpa_wrapper(Q, K, V, backend="flash4")
    torch.testing.assert_close(flash4, pytorch, check_dtype=False)
    print("\x1b[32m ✔\x1b[0m")
290
+
291
class MHSA(torch.nn.Module):
    """Multi-head self-attention with GQA, optional QK-norm, and quantized linears.

    With qk_norm=True the softmax scale is a learned per-head parameter of
    shape (1, heads, 1, 1); otherwise it is a float chosen by scale_type.

    BUG FIX: the HF export stripped the `quartet2` import but left code
    referencing it behind a `pass` placeholder, so `quartet=True` (the
    default) crashed with NameError. `quartet2` is now imported lazily and
    missing installs raise a clear ImportError.
    """

    def __init__(self, heads, d_head, scale_type="1/sqrt(d)", ratio=1, qk_norm=True, quartet=True, fake_quartet=False):
        super().__init__()

        self.heads = heads
        self.d_head = d_head
        self.d = heads * d_head
        self.scale_type = scale_type
        self.ratio = ratio
        self.groups = heads//ratio
        self.d_KV = self.groups * d_head
        self.qk_norm = qk_norm
        if qk_norm:
            # (batches*)heads*context*d_head — learned per-head scale, init sqrt(d_head)
            scale = torch.full((1,heads,1,1), sqrt(d_head))
            self.scale = torch.nn.Parameter(scale)
        else:
            if scale_type=="1/sqrt(d)":
                self.scale = 1/sqrt(d_head)
            elif scale_type=="1/d":
                self.scale = 1/d_head
        self.quartet = quartet
        self.fake_quartet = fake_quartet

        # Packing QKV gives negligible speed gains, while not allowing GQA, hurting code clarity and having side effects with μP
        if quartet:
            try:
                import quartet2
            except ImportError as e:
                raise ImportError(
                    "quartet=True requires the 'quartet2' package, which is not "
                    "bundled with this HF export; pass quartet=False (or fake_quartet=True)."
                ) from e
            self.lq = quartet2.linear.Quartet_II_linear(self.d, self.d, bias=False)
            self.lk = quartet2.linear.Quartet_II_linear(self.d, self.d_KV, bias=False)
            self.lv = quartet2.linear.Quartet_II_linear(self.d, self.d_KV, bias=False)

            self.lo = quartet2.linear.Quartet_II_linear(self.d, self.d, bias=False)
        elif fake_quartet:
            from . import fake_quartet as fq
            self.lq = fq.FakeQuartetLinear(self.d, self.d, bias=False)
            self.lk = fq.FakeQuartetLinear(self.d, self.d_KV, bias=False)
            self.lv = fq.FakeQuartetLinear(self.d, self.d_KV, bias=False)

            self.lo = fq.FakeQuartetLinear(self.d, self.d, bias=False)
        else:
            self.lq = torch.nn.Linear(self.d, self.d, bias=False)
            self.lk = torch.nn.Linear(self.d, self.d_KV, bias=False)
            self.lv = torch.nn.Linear(self.d, self.d_KV, bias=False)

            self.lo = torch.nn.Linear(self.d, self.d, bias=False)

    # (batches*)context*d
    def forward(self, X, causal=None, rope=None, alibi=None, swa=None, return_A=False, backend="flash2"):
        # (batches*)context*d
        Q = self.lq(X)
        # (batches*)context*d_KV
        K = self.lk(X)
        V = self.lv(X)

        # (batches*)context*heads*d_head
        Q = Q.unflatten(dim=-1, sizes=(self.heads, self.d_head))
        # (batches*)context*groups*d_head
        K = K.unflatten(dim=-1, sizes=(self.groups, self.d_head))
        V = V.unflatten(dim=-1, sizes=(self.groups, self.d_head))

        # (batches*)heads*context*d_head
        Q = Q.movedim(-3,-2)
        # (batches*)groups*context*d_head
        K = K.movedim(-3,-2)
        V = V.movedim(-3,-2)

        if rope is not None:
            Q = apply_rope(Q,rope)
            K = apply_rope(K,rope)

        # QK-norm is applied after RoPE
        if self.qk_norm:
            Q = mlp.sphere_norm(Q)
            K = mlp.sphere_norm(K)

        # (batches*)heads*context*d_head
        if not return_A:
            Y = sdpa_wrapper(Q, K, V, causal, alibi, swa, self.scale, return_A, backend)
        else:
            Y, A__, A_, A = sdpa_wrapper(Q, K, V, causal, alibi, swa, self.scale, return_A, backend)
        # (batches*)context*heads*d_head
        Y = Y.movedim(-3,-2)
        # (batches*)context*d
        Y = Y.flatten(-2,-1)

        Y = self.lo(Y)

        if not return_A:
            return Y
        else:
            return Y, A__, A_, A
382
+
383
class Block(torch.nn.Module):
    """Pre/post-norm transformer block: MHSA + MLP, each with residual.

    The various *_norm flags select where normalization is applied (None
    when disabled, via mlp.get_norm). forward returns Z (block output),
    optionally the attention residual stream Y__ (return_res) and the
    attention matrices A__/A_/A (return_A, pytorch backend only).
    """
    def __init__(self, heads, d_head, scale_type="1/sqrt(d)", ratio=1, exp_factor=4, dropout=0, norm_type="rms_learned", bias=False, act=mlp.ReLU2(), l1_type="linear", pre_att_norm=False, qk_norm=True, out_att_norm=True, pre_mlp_norm=False, act_norm=False, out_mlp_norm=True, quartet=True, fake_quartet=False):
        super().__init__()

        self.heads = heads
        self.d_head = d_head
        self.d = heads * d_head
        self.scale_type = scale_type
        self.ratio = ratio
        self.groups = heads//ratio
        self.exp_factor = exp_factor
        # MLP hidden width = exp_factor * model dim.
        self.d_hidden = int(exp_factor*self.d)
        self.dropout = dropout
        self.norm_type = norm_type
        self.bias = bias
        self.act = act
        self.l1_type = l1_type

        self.mhsa = MHSA(heads, d_head, scale_type, ratio, qk_norm, quartet, fake_quartet)
        self.pre_att_norm = mlp.get_norm(pre_att_norm, norm_type, self.d, bias)
        self.out_att_norm = mlp.get_norm(out_att_norm, norm_type, self.d, bias)

        self.mlp = mlp.MLP2L(self.d, self.d_hidden, self.d, bias, act, dropout, l1_type, norm_type, act_norm, quartet, fake_quartet)
        self.pre_mlp_norm = mlp.get_norm(pre_mlp_norm, norm_type, self.d, bias)
        self.out_mlp_norm = mlp.get_norm(out_mlp_norm, norm_type, self.d, bias)

        self.quartet = quartet
        self.fake_quartet = fake_quartet

    def forward(self, X, causal=None, rope=None, alibi=None, swa=None, return_res=False, return_A=False, backend="flash2"):
        # Attention sub-layer (optionally pre-normed input).
        mhsa = self.mhsa(self.pre_att_norm(X) if self.pre_att_norm else X, causal, rope, alibi, swa, return_A, backend)
        if not return_A:
            Y = mhsa
        else:
            Y, A__, A_, A = mhsa

        if self.out_att_norm: Y = self.out_att_norm(Y)

        # Residual connection around attention (with dropout on the branch).
        Y_ = torch.nn.functional.dropout(Y, p=self.dropout, training=self.training)
        Y__ = X + Y_

        # MLP sub-layer (optionally pre-normed input).
        Z = self.mlp(self.pre_mlp_norm(Y__) if self.pre_mlp_norm else Y__)

        if self.out_mlp_norm: Z = self.out_mlp_norm(Z)

        # Residual connection around the MLP.
        Z_ = torch.nn.functional.dropout(Z, p=self.dropout, training=self.training)
        Z__ = Y__ + Z_

        if not return_res:
            if not return_A:
                return Z__
            else:
                return Z__, A__, A_, A
        else:
            if not return_A:
                return Z__, Y__
            else:
                return Z__, Y__, A__, A_, A
441
+
442
class Transformer(torch.nn.Module):
    """Decoder-style transformer LM: embedding -> blocks -> norm -> LM head.

    Supports several positional schemes (POS_TYPES), SDPA backends
    (BACKENDS), GQA via `ratio`, sliding-window attention via `window`,
    and optional weight tying between embedding and output head.
    NOTE(review): the "flex" backend branches reference causal_mod /
    alibi_mod / swa_mod, which are not defined in this file — presumably
    provided elsewhere; selecting backend="flex" as-is raises NameError.
    """
    def __init__(self, vocab_size=50304, num_blocks=12, heads=12, d_head=64, scale_type="1/sqrt(d)", ratio=1, is_causal=True, window=None, backend="flash2", exp_factor=4, dropout=0, pos_type="rope", max_context=128, norm_type="rms_learned", bias=False, act=mlp.ReLU2(), l1_type="linear", std=0.02, test=False, weight_tying=True, emb_norm=False, pre_att_norm=False, qk_norm=True, out_att_norm=True, pre_mlp_norm=False, act_norm=False, out_mlp_norm=True, out_norm=True, fix_norm=False, quartet=True, fake_quartet=False):
        super().__init__()

        self.vocab_size = vocab_size
        self.num_blocks = num_blocks
        self.heads = heads
        self.d_head = d_head
        self.d = heads * d_head
        self.scale_type = scale_type
        self.ratio = ratio
        self.groups = heads//ratio
        self.is_causal = is_causal
        self.window = window
        self.backend = backend
        self.exp_factor = exp_factor
        self.dropout = dropout
        self.pos_type = pos_type
        self.max_context = max_context
        self.norm_type = norm_type
        self.bias = bias
        self.act = act
        self.l1_type = l1_type
        self.weight_tying = weight_tying
        self.fix_norm = fix_norm
        self.quartet = quartet
        self.fake_quartet = fake_quartet

        self.emb = torch.nn.Embedding(vocab_size, self.d)

        self.emb_norm = mlp.get_norm(emb_norm, norm_type, self.d, bias)

        # Learned absolute positions (zero-initialized by init()).
        if pos_type == "learned":
            pos = torch.rand((max_context, self.d))
            self.pos = torch.nn.Parameter(pos)

        self.blocks = torch.nn.Sequential(*[Block(heads, d_head, scale_type, ratio, exp_factor, dropout, norm_type, bias, act, l1_type, pre_att_norm, qk_norm, out_att_norm, pre_mlp_norm, act_norm, out_mlp_norm, quartet, fake_quartet) for _ in range(num_blocks)])

        self.out_norm = mlp.get_norm(out_norm, norm_type, self.d, bias)

        self.linear = torch.nn.Linear(self.d, vocab_size, bias=False)

        # Share one weight matrix between embedding and LM head.
        if weight_tying: self.emb.weight = self.linear.weight

        self.init(std, test)

        if fake_quartet:
            # Keep norm/embedding modules in bf16 for the fake-quantized path.
            for m in self.modules():
                if isinstance(m, (torch.nn.LayerNorm, torch.nn.RMSNorm, torch.nn.Embedding)):
                    m.to(torch.bfloat16)

    def init(self, std=0.02, test=False):
        """Initialize parameters by parent-module type; print stats if `test`."""
        if test: print("\x1b[1m%36.36s %8.8s %8.8s %8.8s\x1b[0m" % ("parameter_name", "suffix", "mean", "std"))
        for parameter_name, parameter in self.named_parameters():
            parent_name, _, suffix = parameter_name.rpartition(".")
            parent = self.get_submodule(parent_name)

            if isinstance(parent, (torch.nn.Linear, torch.nn.Embedding)) and suffix=="weight":
                torch.nn.init.normal_(parameter, 0, std)
            elif isinstance(parent, (torch.nn.Linear, torch.nn.LayerNorm)) and suffix=="bias":
                torch.nn.init.zeros_(parameter)
            elif isinstance(parent, (torch.nn.LayerNorm, torch.nn.RMSNorm)) and suffix=="weight":
                torch.nn.init.ones_(parameter)
            else:
                # pos (learned positions, ndim 2)
                if parameter.ndim == 2:
                    torch.nn.init.zeros_(parameter)
                # scale (per-head QK-norm scale, ndim 4)
                elif parameter.ndim == 4:
                    torch.nn.init.constant_(parameter, sqrt(self.d_head))

            if test:
                print("%36.36s %8.8s %8.8s %8.8s\x1b[0m" % (parameter_name, suffix, "%f" % parameter.mean(), "%f" % parameter.std()))

    # (batches*)context
    def forward(self, ids, return_res=False, return_A=False):
        """Map token ids to logits; optionally collect residuals/attention."""
        context = ids.shape[-1]

        if return_A:
            # (batches*)num_blocks*heads*context*context
            A__ = torch.empty(*ids.shape[:-1], self.num_blocks, self.heads, context, context)
            A_ = torch.empty_like(A__)
            A = torch.empty_like(A__)

        # (batches*)context*d
        X = self.emb(ids)

        if return_res:
            res_in = X
            # (batches*)num_blocks*context*d
            res_att = torch.empty(*ids.shape[:-1], self.num_blocks, context, self.d)
            res_mlp = torch.empty(*ids.shape[:-1], self.num_blocks, context, self.d)

        # Recompute in every batch in case context changes
        if self.is_causal:
            if self.backend=="pytorch":
                causal = get_causal(context).to(ids.device)
            elif self.backend in {"flash2", "flash3", "flash4"}:
                causal = True
            elif self.backend=="flex":
                causal = causal_mod
            elif self.backend=="cudnn":
                # right_bound
                causal = 0
        else: causal = None

        if self.pos_type == "sinusoidal":
            pos = get_sinusoidal(context, self.d).to(ids.device)
            X = X + pos

        if self.pos_type == "learned":
            X = X + self.pos[:context,:]

        if self.pos_type == "rope":
            rope = get_rope(context, self.d_head, device=ids.device)
        else: rope = None

        if self.pos_type == "alibi":
            if self.backend=="pytorch":
                alibi = get_alibi(self.heads, context).to(ids.device)
            elif self.backend in {"flash2", "flash3", "flash4"}:
                alibi = get_m(self.heads).to(ids.device)
            elif self.backend=="flex":
                alibi = alibi_mod
            elif self.backend=="cudnn":
                alibi = True
        else: alibi = None

        if self.window is not None:
            if self.backend=="pytorch":
                swa = get_swa(context, self.window).to(ids.device)
            elif self.backend in {"flash2", "flash3", "flash4"}:
                swa = (self.window, self.window)
            elif self.backend=="flex":
                swa = swa_mod
            elif self.backend=="cudnn":
                # left_bound
                swa = self.window
        else: swa = None

        # Embedding norm is applied after positional encoding
        if self.emb_norm: X = self.emb_norm(X)

        X_ = torch.nn.functional.dropout(X, p=self.dropout, training=self.training)

        Y = X_
        for i, block in enumerate(self.blocks):
            if not return_res:
                if not return_A:
                    Y = block(Y, causal, rope, alibi, swa, return_res, return_A, self.backend)
                else:
                    Y, A__[...,i,:,:,:], A_[...,i,:,:,:], A[...,i,:,:,:] = block(Y, causal, rope, alibi, swa, return_res, return_A, self.backend)
            else:
                if not return_A:
                    Y, res_att[...,i,:,:] = block(Y, causal, rope, alibi, swa, return_res, return_A, self.backend)
                    res_mlp[...,i,:,:]= Y
                else:
                    Y, res_att[...,i,:,:], A__[...,i,:,:,:], A_[...,i,:,:,:], A[...,i,:,:,:] = block(Y, causal, rope, alibi, swa, return_res, return_A, self.backend)
                    res_mlp[...,i,:,:]= Y

        if self.out_norm: Y = self.out_norm(Y)

        # (batches*)context*vocab_size
        if self.fix_norm:
            # FixNorm: project LM-head rows onto the unit sphere before the matmul.
            Z = torch.nn.functional.linear(Y, mlp.sphere_norm(self.linear.weight))
        else:
            Z = self.linear(Y)

        if not return_res:
            if not return_A:
                return Z
            else:
                return Z, A__, A_, A
        else:
            if not return_A:
                return Z, res_in, res_att, res_mlp
            else:
                return Z, res_in, res_att, res_mlp, A__, A_, A
620
+
621
def get_attention_header(transformer):
    """Space-separated column header "block{b}.head{h}" for every block/head."""
    columns = [
        f"block{b}.head{h}"
        for b in range(transformer.num_blocks)
        for h in range(transformer.heads)
    ]
    return " ".join(columns)
632
+
633
def get_attention(W):
    """Format a (blocks, heads) matrix as space-separated "%.2f" values."""
    # rows->y, columns->x
    cells = [
        "%.2f" % W[b, h]
        for b in range(W.shape[0])
        for h in range(W.shape[1])
    ]
    return " ".join(cells)
645
+
646
def get_similarity_header(transformer):
    """Header "embedding block0 block1 ..." for similarity logging."""
    names = ["embedding"] + [f"block{b}" for b in range(transformer.num_blocks)]
    return " ".join(names)
656
+
657
def get_similarity(embeddings_x, embeddings_y):
    """Per-block cosine similarity between two (blocks, d) stacks, "%.2f"-joined."""
    values = [
        "%.2f" % torch.nn.functional.cosine_similarity(embeddings_x[b, :], embeddings_y[b, :], dim=0)
        for b in range(embeddings_x.shape[0])
    ]
    return " ".join(values)
667
+
668
def get_clustering_header(transformer):
    """Header of x/y columns per projection method, for embedding + each block."""
    methods = ("random", "pca", "mds", "tsne", "umap")
    prefixes = ["embedding"] + [f"block{b}" for b in range(transformer.num_blocks)]
    columns = [
        f"{prefix}.{method}.{axis}"
        for prefix in prefixes
        for method in methods
        for axis in ("x", "y")
    ]
    return " ".join(columns)
686
+
687
def get_clustering(random, pca, mds, tsne, umap):
    """Per-block "%f"-formatted (x, y) pairs for the five 2-D projections."""
    rows = []
    for b in range(random.shape[0]):
        coords = (random[b, 0], random[b, 1], pca[b, 0], pca[b, 1],
                  mds[b, 0], mds[b, 1], tsne[b, 0], tsne[b, 1],
                  umap[b, 0], umap[b, 1])
        rows.append("%f %f %f %f %f %f %f %f %f %f" % coords)
    return " ".join(rows)
fake_quartet.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from random import randint
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+ import triton
7
+ import triton.language as tl
8
+ from scipy.linalg import hadamard
9
+
10
+
11
+
12
def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device):
    """Build an orthonormal Hadamard matrix of size `group_size`.

    scipy's hadamard() returns ±1 entries; dividing by sqrt(group_size)
    makes the rows orthonormal, so H @ H.T == I.
    """
    scaled = hadamard(group_size) * group_size ** -0.5
    return torch.tensor(scaled, dtype=dtype, device=device, requires_grad=False)
19
+
20
+
21
def rerotate_hadamard(hadamard_matrix):
    """Randomly flip the sign of each column of the Hadamard matrix.

    Equivalent to hadamard_matrix @ diag(s) with s drawn uniformly from
    {-1, +1}; multiplying by ±1 is exact, so only signs change.
    """
    n = hadamard_matrix.size(0)
    flips = torch.randint(
        0, 2, (n,),
        device=hadamard_matrix.device,
        dtype=hadamard_matrix.dtype,
    ) * 2 - 1
    # Broadcasting over the last dim scales column j by flips[j],
    # which is exactly H @ diag(flips).
    return hadamard_matrix * flips
30
+
31
+
32
+
33
@triton.jit
def _rtn_fp4(x):
    # Round-to-nearest onto the FP4 (e2m1) magnitude grid
    # {0, 0.5, 1, 1.5, 2, 3, 4, 6}, keeping the sign separately.
    # Each threshold is the midpoint between two adjacent representable
    # magnitudes (e.g. 5 between 4 and 6), so the nested tl.where chain
    # selects the nearest grid value; ties at a midpoint round up.
    x_abs = tl.abs(x)
    x_sign = tl.where(x > 0, 1, -1)
    x_fp4_abs = tl.where(
        x_abs >= 5, 6,
        tl.where(x_abs >= 3.5, 4,
        tl.where(x_abs >= 2.5, 3,
        tl.where(x_abs >= 1.75, 2,
        tl.where(x_abs >= 1.25, 1.5,
        tl.where(x_abs >= 0.75, 1,
        tl.where(x_abs >= 0.25, 0.5,
        0.0)))))))
    return x_fp4_abs * x_sign
47
+
48
+
49
@triton.jit
def _get_scales(x, amax, val_max, scales_max):
    # Two-level scaling for grouped FP4:
    #   s_dec        — one global decode scale derived from the tensor abs-max;
    #                  falls back to 1.0 when amax == 0 (all-zero tensor).
    #   s_dec_b_e4m3 — per-group scale (group abs-max / val_max) expressed
    #                  relative to s_dec and itself quantized to FP8-E4M3;
    #                  zero group scales are replaced by 1.0 to avoid
    #                  division by zero downstream.
    s_dec = tl.where(amax == 0.0, 1.0, amax / scales_max / val_max)
    s_dec_b = tl.max(tl.abs(x), axis=-1, keep_dims=True) / val_max
    s_dec_b_e4m3 = (s_dec_b / s_dec).to(tl.float8e4nv).to(tl.float32)
    s_dec_b_e4m3 = tl.where(s_dec_b_e4m3 == 0, 1.0, s_dec_b_e4m3)
    return s_dec_b_e4m3, s_dec
56
+
57
+
58
@triton.jit
def _get_alt_scales(x, val_max, s_dec):
    # Alternate per-group scales boosted by 6/4: the group max then maps to
    # FP4 value 4 instead of 6. Used by the four_over_six candidate whose
    # reconstruction error is compared against the standard scaling.
    s_dec_b = tl.max(tl.abs(x), axis=-1, keep_dims=True) / val_max
    s_dec_b_e4m3 = (s_dec_b * (6 / 4) / s_dec).to(tl.float8e4nv).to(tl.float32)
    s_dec_b_e4m3 = tl.where(s_dec_b_e4m3 == 0, 1.0, s_dec_b_e4m3)
    return s_dec_b_e4m3
64
+
65
+
66
@triton.autotune(
    configs=[
        triton.Config({"BLOCK_SIZE": 64 * 32}),
        triton.Config({"BLOCK_SIZE": 128 * 32}),
        triton.Config({"BLOCK_SIZE": 256 * 32}),
        triton.Config({"BLOCK_SIZE": 512 * 32}),
    ],
    key=[],
)
@triton.jit
def _rtn_1x16s_fp4_kernel(
    x_ptr, amax_ptr, output_ptr,
    n_elements: tl.constexpr,
    scale_override: tl.constexpr,
    group_size: tl.constexpr,
    four_over_six: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
):
    # Round-to-nearest grouped FP4 fake quantization with FP8-E4M3 group
    # scales. Each program handles one BLOCK_SIZE chunk; with four_over_six
    # an alternate scaling (targeting FP4 value 4 instead of 6) is tried and
    # the lower-error candidate is kept per group.
    pid = tl.program_id(0)
    start_idx = pid * BLOCK_SIZE
    offsets = start_idx + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x_flat = tl.load(x_ptr + offsets, mask=mask, other=0.0)

    # View the chunk as rows of `group_size` elements sharing one scale.
    x_grouped = tl.reshape(x_flat, (BLOCK_SIZE // group_size, group_size))

    # Scale budget for the FP8-encoded group scales differs per mode.
    scales_max = 256.00 if four_over_six else 448.00
    val_max = 6.0 / scale_override
    amax = tl.load(amax_ptr)

    s_dec_b_e4m3, s_dec = _get_scales(x_grouped, amax, val_max, scales_max)
    x_scaled = x_grouped / (s_dec_b_e4m3 * s_dec)

    # Quantize, then immediately dequantize (fake quantization).
    x_fp4 = _rtn_fp4(x_scaled)
    x_dequantized = x_fp4 * (s_dec_b_e4m3 * s_dec)

    if not four_over_six:
        best_x_dequantized = x_dequantized
    else:
        # Alternate candidate with 6/4-boosted scales; choose per group by
        # squared reconstruction error (ties keep the standard scaling).
        alt_s_dec_b_e4m3 = _get_alt_scales(x_grouped, val_max, s_dec)
        alt_x_scaled = x_grouped / (alt_s_dec_b_e4m3 * s_dec)
        alt_x_fp4 = _rtn_fp4(alt_x_scaled)
        alt_x_dequantized = alt_x_fp4 * (alt_s_dec_b_e4m3 * s_dec)

        error_six = tl.sum((x_grouped - x_dequantized) * (x_grouped - x_dequantized), axis=-1, keep_dims=True)
        error_four = tl.sum((x_grouped - alt_x_dequantized) * (x_grouped - alt_x_dequantized), axis=-1, keep_dims=True)
        best_x_dequantized = tl.where(error_six <= error_four, x_dequantized, alt_x_dequantized)

    x_dequantized_flat = tl.reshape(best_x_dequantized, (BLOCK_SIZE,))
    tl.store(output_ptr + offsets, x_dequantized_flat, mask=mask)
116
+
117
+
118
@torch.compiler.disable()
def rtn_1x16s_fp4(x, scale_override: float, group_size: int, four_over_six: bool):
    """Fake-quantize `x` to grouped FP4 via the RTN Triton kernel.

    Returns a dequantized tensor with the same shape and dtype as `x`.
    The tensor-wide abs-max is computed eagerly and handed to the kernel
    as the global scale anchor.
    """
    flat = x.contiguous()
    result = torch.empty_like(flat)
    total = flat.numel()
    # One program per BLOCK_SIZE chunk; BLOCK_SIZE is picked by autotune.
    grid = lambda meta: (triton.cdiv(total, meta["BLOCK_SIZE"]),)
    _rtn_1x16s_fp4_kernel[grid](
        x_ptr=flat,
        amax_ptr=flat.abs().max(),
        output_ptr=result,
        n_elements=total,
        scale_override=scale_override,
        group_size=group_size,
        four_over_six=four_over_six,
    )
    return result
130
+
131
+
132
+
133
@triton.autotune(
    configs=[
        triton.Config({"BLOCK_SIZE": 64 * 32}),
        triton.Config({"BLOCK_SIZE": 128 * 32}),
        triton.Config({"BLOCK_SIZE": 256 * 32}),
        triton.Config({"BLOCK_SIZE": 512 * 32}),
    ],
    key=[],
)
@triton.jit
def _eden_1x16s_fp4_kernel(
    x_ptr, hadamard_matrix_ptr, current_amax_ptr, output_ptr, next_amax_ptr,
    n_elements: tl.constexpr,
    hadamard_dim: tl.constexpr,
    scale_override: tl.constexpr,
    group_size: tl.constexpr,
    seed: int,
    BLOCK_SIZE: tl.constexpr,
):
    # EDEN-style fake quantization used on the backward path:
    #   1. rotate rows of length `hadamard_dim` by the Hadamard matrix,
    #   2. quantize to grouped FP4 with FP8-E4M3 per-group scales,
    #   3. apply a per-row least-squares correction to the scales,
    #   4. stochastically round the corrected scales to FP8-E4M3,
    #   5. store the dequantized result.
    # It also records the post-rotation abs-max into next_amax_ptr so the
    # caller can use it as the (delayed) amax of the next step.
    pid = tl.program_id(0)
    start_idx = pid * BLOCK_SIZE
    offsets = start_idx + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x_flat = tl.load(x_ptr + offsets, mask=mask, other=0.0)

    # Load the full rotation matrix and rotate each row of the chunk.
    offsets_hadamard = tl.arange(0, hadamard_dim * hadamard_dim)
    hadamard_matrix = tl.load(hadamard_matrix_ptr + offsets_hadamard).reshape(hadamard_dim, hadamard_dim)
    x = tl.reshape(x_flat, (BLOCK_SIZE // hadamard_dim, hadamard_dim))
    x_had = tl.dot(x, hadamard_matrix)

    # Cross-program max of the rotated values (relaxed ordering suffices:
    # only the final max matters, not intermediate visibility).
    tl.atomic_max(next_amax_ptr, tl.max(tl.abs(x_had)).to(tl.float32), sem="relaxed")

    x_grouped = tl.reshape(x_had, (BLOCK_SIZE // group_size, group_size))

    # Global decode scale from the (possibly delayed) tensor abs-max.
    scales_max = 255.99
    val_max = 6.0 / scale_override
    amax = tl.load(current_amax_ptr)
    s_dec = tl.where(amax == 0.0, 1.0, amax / scales_max / val_max)

    # Per-group scale relative to s_dec, itself quantized to FP8-E4M3
    # (same scheme as _get_scales; inlined here).
    s_dec_b = tl.max(tl.abs(x_grouped), axis=-1, keep_dims=True) / val_max
    s_dec_b_e4m3 = (s_dec_b / s_dec).to(tl.float8e4nv).to(tl.float32)
    s_dec_b_e4m3 = tl.where(s_dec_b_e4m3 == 0, 1.0, s_dec_b_e4m3)
    x_scaled = x_grouped / (s_dec_b_e4m3 * s_dec)

    # Round-to-nearest onto the FP4 (e2m1) grid (inlined _rtn_fp4).
    x_scaled_abs = tl.abs(x_scaled)
    x_scaled_sign = tl.where(x_scaled > 0, 1, -1)
    x_fp4 = tl.where(
        x_scaled_abs >= 5, 6,
        tl.where(x_scaled_abs >= 3.5, 4,
        tl.where(x_scaled_abs >= 2.5, 3,
        tl.where(x_scaled_abs >= 1.75, 2,
        tl.where(x_scaled_abs >= 1.25, 1.5,
        tl.where(x_scaled_abs >= 0.75, 1,
        tl.where(x_scaled_abs >= 0.25, 0.5,
        0))))))) * x_scaled_sign

    # Per-row least-squares gain: argmin_c ||x - c*q||^2 = <x,x> / <x,q>;
    # guard against all-zero rows where the denominator vanishes.
    x_scaled = tl.reshape(x_scaled, (BLOCK_SIZE // hadamard_dim, hadamard_dim))
    x_fp4 = tl.reshape(x_fp4, (BLOCK_SIZE // hadamard_dim, hadamard_dim))

    num = tl.sum(x_scaled * x_scaled, axis=-1, keep_dims=True)
    denom = tl.sum(x_scaled * x_fp4, axis=-1, keep_dims=True)
    correction = tl.where(denom == 0.0, 1.0, num / denom)

    # Broadcast the row correction onto each group scale of that row.
    scales = tl.reshape(s_dec_b_e4m3, (BLOCK_SIZE // hadamard_dim, hadamard_dim // group_size))
    corrected_scales = tl.reshape(scales * correction, (BLOCK_SIZE // group_size, 1))

    # Stochastic rounding of the corrected scales between the two adjacent
    # representable FP8-E4M3 values, found by bit-level neighbor probing.
    bitscales = tl.cast(corrected_scales.to(tl.float8e4nv), tl.uint8, bitcast=True)
    prevscale = tl.cast((bitscales - 1), tl.float8e4nv, bitcast=True).to(tl.float32)
    currscale = tl.cast((bitscales), tl.float8e4nv, bitcast=True).to(tl.float32)
    nextscale = tl.cast((bitscales + 1), tl.float8e4nv, bitcast=True).to(tl.float32)

    up = tl.where(currscale > corrected_scales, currscale, nextscale)
    down = tl.where(currscale > corrected_scales, prevscale, currscale)
    prob_up = (corrected_scales - down) / (up - down)

    # Deterministic per-scale random stream: seeded by the host, offset by
    # the global scale index so programs never share draws.
    scale_start_idx = pid * (BLOCK_SIZE // group_size)
    scale_offsets = scale_start_idx + tl.arange(0, BLOCK_SIZE // group_size)
    sampled_prob = tl.rand(seed, scale_offsets).reshape(BLOCK_SIZE // group_size, 1)

    scales = tl.where(sampled_prob < prob_up, up, down)
    scales = tl.reshape(scales, (BLOCK_SIZE // group_size, 1))
    x_fp4 = tl.reshape(x_fp4, (BLOCK_SIZE // group_size, group_size))

    # Dequantize and store in the input's element dtype.
    x_dequantized = x_fp4 * scales * s_dec
    x_dequantized_flat = tl.reshape(x_dequantized, (BLOCK_SIZE,))
    tl.store(output_ptr + offsets, x_dequantized_flat.to(x_ptr.dtype.element_ty), mask=mask)
219
+
220
+
221
@torch.compiler.disable()
def eden_1x16s_fp4(x, hadamard_matrix, scale_override: float, group_size: int, current_amax):
    """Launch the EDEN FP4 fake-quantization kernel on `x`.

    Returns (dequantized tensor, next_amax) where next_amax is the abs-max
    of the Hadamard-rotated input, recorded by the kernel for delayed-amax
    reuse on the following step.
    """
    dim = hadamard_matrix.size(0)
    x_contig = x.contiguous()
    # The kernel right-multiplies rows by the matrix; pre-transpose once.
    rotation = hadamard_matrix.T.contiguous()
    result = torch.empty_like(x_contig)
    next_amax = torch.zeros_like(current_amax)
    # Host-side seed for the kernel's stochastic scale rounding.
    rng_seed = randint(0, 1_000_000)
    total = x_contig.numel()
    grid = lambda meta: (triton.cdiv(total, meta["BLOCK_SIZE"]),)
    _eden_1x16s_fp4_kernel[grid](
        x_ptr=x_contig,
        hadamard_matrix_ptr=rotation,
        current_amax_ptr=current_amax,
        output_ptr=result,
        next_amax_ptr=next_amax,
        n_elements=total,
        hadamard_dim=dim,
        scale_override=scale_override,
        group_size=group_size,
        seed=rng_seed,
    )
    return result, next_amax
239
+
240
+
241
+
242
class AmaxStorage:
    """Mutable holder for per-layer abs-max statistics of the backward pass.

    All fields start as None and are populated lazily the first time the
    corresponding tensor is quantized.
    NOTE(review): the 'weght_tht_amax' spelling is load-bearing — it is
    referenced by name elsewhere — so it must not be "fixed" here alone.
    """
    __slots__ = ("e_ht_amax", "weght_tht_amax", "e_tht_amax", "input_tht_amax")

    def __init__(self):
        # No statistics recorded yet.
        for slot in self.__slots__:
            setattr(self, slot, None)
250
+
251
+
252
+
253
class FakeQuartetFn(torch.autograd.Function):
    """FP4 quantization-aware linear op (pseudo-quant Quartet-II path).

    Forward: RTN fake-quantize activations and weights to grouped FP4, then
    run F.linear on the dequantized values. Backward: both gradient GEMMs
    run on EDEN-quantized operands (random Hadamard rotation + stochastic
    scale rounding).
    """
    # Number of elements sharing one FP4 group scale.
    group_size = 16
    forward_scale_override = 1.0
    # Empirical gain applied to the backward quantization grid.
    backward_scale_override = (17 / 16) * 0.93
    # Class-wide shared Hadamard matrix; created lazily by FakeQuartetLinear
    # and re-randomized (column sign flips) on every backward call.
    hadamard_matrix = None

    # NOTE(review): torch.compile wraps the staticmethod object here —
    # confirm the targeted PyTorch version supports this decorator order.
    @torch.compile(dynamic=False)
    @staticmethod
    def forward(ctx, input, weight, amax_storage, delayed_amax, disable_forward_quant, disable_backward_quant, four_over_six):
        # input: (batch, seq, in_dim); weight: (out_dim, in_dim).
        ctx.batch = input.shape[0]
        ctx.seq = input.shape[1]
        ctx.in_dim = weight.shape[1]
        ctx.out_dim = weight.shape[0]
        ctx.delayed_amax = delayed_amax
        ctx.amax_storage = amax_storage
        ctx.disable_backward_quant = disable_backward_quant

        if disable_forward_quant:
            input_fq = input
            weight_fq = weight
        else:
            # Round-to-nearest grouped FP4 on both GEMM operands.
            input_fq = rtn_1x16s_fp4(input, FakeQuartetFn.forward_scale_override, FakeQuartetFn.group_size, four_over_six)
            weight_fq = rtn_1x16s_fp4(weight, FakeQuartetFn.forward_scale_override, FakeQuartetFn.group_size, four_over_six)

        # Save the quantized operands so backward differentiates the same
        # values the forward actually multiplied.
        ctx.save_for_backward(input_fq, weight_fq)
        return F.linear(input_fq, weight_fq)

    @staticmethod
    def backward(ctx, grad_output):
        input_fq, weight_fq = ctx.saved_tensors
        dtype = grad_output.dtype
        # Collapse (batch, seq) into rows for the two gradient GEMMs.
        input_fq = input_fq.to(dtype).reshape(ctx.batch * ctx.seq, ctx.in_dim)
        weight_fq = weight_fq.to(dtype)
        grad_output = grad_output.reshape(ctx.batch * ctx.seq, ctx.out_dim)

        # Fresh random sign flip of the shared rotation matrix each step.
        FakeQuartetFn.hadamard_matrix = rerotate_hadamard(FakeQuartetFn.hadamard_matrix)

        if ctx.disable_backward_quant:
            # Exact (unquantized) gradients w.r.t. input and weight.
            grad_input = F.linear(grad_output, weight_fq.T, None).view(ctx.batch, ctx.seq, ctx.in_dim)
            grad_weight = F.linear(grad_output.T, input_fq.T, None)
            return grad_input, grad_weight, None, None, None, None, None

        had = FakeQuartetFn.hadamard_matrix.to(grad_output.dtype)
        bso = FakeQuartetFn.backward_scale_override
        gs = FakeQuartetFn.group_size

        # EW: grad_output @ weight^T
        # In delayed mode, reuse the amax the kernel recorded last step;
        # otherwise compute it from the freshly rotated tensor now.
        if ctx.amax_storage.e_ht_amax is None or not ctx.delayed_amax:
            ctx.amax_storage.e_ht_amax = (grad_output.reshape(-1, had.size(0)) @ had.T).abs().max().float()
        e_ht_fp4, ctx.amax_storage.e_ht_amax = eden_1x16s_fp4(grad_output, had, bso, gs, ctx.amax_storage.e_ht_amax)

        if ctx.amax_storage.weght_tht_amax is None or not ctx.delayed_amax:
            ctx.amax_storage.weght_tht_amax = (weight_fq.T.reshape(-1, had.size(0)) @ had.T).abs().max().float()
        weight_tht_fp4, ctx.amax_storage.weght_tht_amax = eden_1x16s_fp4(weight_fq.T, had, bso, gs, ctx.amax_storage.weght_tht_amax)

        # Both operands carry the same rotation, which cancels in the
        # product since the Hadamard matrix is orthogonal.
        grad_input = F.linear(e_ht_fp4, weight_tht_fp4, None).view(ctx.batch, ctx.seq, ctx.in_dim)

        # EtX: grad_output^T @ input
        if ctx.amax_storage.e_tht_amax is None or not ctx.delayed_amax:
            ctx.amax_storage.e_tht_amax = (grad_output.T.reshape(-1, had.size(0)) @ had.T).abs().max().float()
        e_tht_fp4, ctx.amax_storage.e_tht_amax = eden_1x16s_fp4(grad_output.T, had, bso, gs, ctx.amax_storage.e_tht_amax)

        if ctx.amax_storage.input_tht_amax is None or not ctx.delayed_amax:
            ctx.amax_storage.input_tht_amax = (input_fq.T.reshape(-1, had.size(0)) @ had.T).abs().max().float()
        input_tht_fp4, ctx.amax_storage.input_tht_amax = eden_1x16s_fp4(input_fq.T, had, bso, gs, ctx.amax_storage.input_tht_amax)

        grad_weight = F.linear(e_tht_fp4, input_tht_fp4, None)

        # One gradient slot per forward argument: tensors for (input, weight),
        # None for the five flag/storage arguments.
        return grad_input, grad_weight, None, None, None, None, None
322
+
323
+
324
+
325
class FakeQuartetLinear(torch.nn.Linear):
    """Drop-in nn.Linear whose matmuls route through FakeQuartetFn (FP4 QAT)."""

    def __init__(self, *args, hadamard_dim=32, delayed_amax=False,
                 disable_forward_quant=False, disable_backward_quant=False,
                 four_over_six=True, **kwargs):
        super().__init__(*args, **kwargs)
        self.hadamard_dim = hadamard_dim
        self.delayed_amax = delayed_amax
        self.disable_forward_quant = disable_forward_quant
        self.disable_backward_quant = disable_backward_quant
        self.four_over_six = four_over_six
        # Per-layer abs-max statistics consumed by the backward pass.
        self.amax_storage = AmaxStorage()

        # Lazily create the class-wide shared Hadamard matrix on first use.
        # NOTE(review): only the first-constructed layer's hadamard_dim takes
        # effect, and the device is hard-coded to "cuda" — confirm intended.
        if FakeQuartetFn.hadamard_matrix is None:
            FakeQuartetFn.hadamard_matrix = get_hadamard_matrix(
                self.hadamard_dim, dtype=torch.float32, device="cuda",
            )

    def forward(self, x):
        # Argument order must match FakeQuartetFn.forward (after ctx).
        return FakeQuartetFn.apply(
            x, self.weight, self.amax_storage,
            self.delayed_amax, self.disable_forward_quant,
            self.disable_backward_quant, self.four_over_six,
        )
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5802c11b6b024033386dba4cdff8665d48de19850e0e63c31686f44430ca870f
3
+ size 16563661264
modeling_cloverlm.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from math import sqrt
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from transformers import PreTrainedModel, GenerationMixin
8
+ from transformers.modeling_outputs import CausalLMOutputWithPast
9
+
10
+ from .configuration_cloverlm import CloverLMConfig
11
+ from .fake_quartet import FakeQuartetLinear
12
+
13
+
14
+
15
def _sphere_norm(X, dim=-1):
    """Project X onto the unit sphere along `dim` (L2 normalization).

    Matches F.normalize's default behavior: the norm is clamped to 1e-12,
    so all-zero vectors map to zero instead of NaN.
    """
    denom = X.norm(2, dim=dim, keepdim=True).clamp_min(1e-12)
    return X / denom
17
+
18
+
19
class _ReLU2(nn.Module):
    """Squared-ReLU activation: max(x, 0)^2."""

    def forward(self, x):
        clipped = F.relu(x)
        return clipped * clipped
22
+
23
+
24
def _make_linear(in_f, out_f, bias, quartet_2_impl):
    """Factory for the quantized linear layers used throughout the model.

    "pseudoquant" builds the local fake-quantization layer; "quartet2" uses
    the real Quartet-II kernel package (imported lazily so the dependency is
    only required when selected). Any other value raises ValueError.
    """
    if quartet_2_impl == "pseudoquant":
        return FakeQuartetLinear(in_f, out_f, bias)
    if quartet_2_impl == "quartet2":
        try:
            from quartet2.linear import Quartet_II_linear
        except ImportError as err:
            err.add_note("Quartet_II_linear import failed. Install the latest quartet2 from https://github.com/IST-DASLab/Quartet-II")
            raise err
        return Quartet_II_linear(in_f, out_f, bias)
    raise ValueError(f"Unsupported quartet_2_impl: {quartet_2_impl}")
37
+
38
+
39
def _build_rope(context, d_head, device):
    """Precompute RoPE cos/sin tables for `context` positions.

    Frequencies use base 1024; each of the d_head//2 frequencies is repeated
    twice along the feature dimension so it applies to an (even, odd) pair.
    Returns a tensor of shape (2, context, d_head): [0] = cos, [1] = sin.
    """
    positions = torch.arange(context, device=device, dtype=torch.float32)
    freq_idx = torch.arange(d_head // 2, device=device, dtype=torch.float32)
    inv_freq = 1.0 / (1024.0 ** (2.0 * freq_idx / d_head))
    angles = torch.outer(positions, inv_freq)
    cos = torch.cos(angles).repeat_interleave(2, dim=1)
    sin = torch.sin(angles).repeat_interleave(2, dim=1)
    return torch.stack((cos, sin))
47
+
48
+
49
def _apply_rope(X, rope):
    """Apply rotary position embedding to the last dimension of X.

    Each (even, odd) feature pair is rotated by the per-position angle:
    (x_e, x_o) -> (x_e*cos - x_o*sin, x_o*cos + x_e*sin), with cos/sin
    taken from rope[0]/rope[1]. The result is cast back to X's dtype.
    """
    rotated = torch.empty_like(X)
    rotated[..., 0::2] = -X[..., 1::2]
    rotated[..., 1::2] = X[..., 0::2]
    cos, sin = rope[0], rope[1]
    return (X * cos + rotated * sin).to(X.dtype)
54
+
55
+
56
+
57
class _MLP(nn.Module):
    """Two-layer feed-forward block: linear -> ReLU^2 -> linear, no biases."""

    def __init__(self, d, d_hidden, quartet_2_impl):
        super().__init__()
        # Up-projection fused with the squared-ReLU activation.
        self.l1 = nn.Sequential(_make_linear(d, d_hidden, False, quartet_2_impl), _ReLU2())
        # Down-projection back to the model width.
        self.l2 = _make_linear(d_hidden, d, False, quartet_2_impl)

    def forward(self, x):
        hidden = self.l1(x)
        return self.l2(hidden)
66
+
67
+
68
+
69
class MHSA(nn.Module):
    """Multi-head self-attention with grouped KV heads (GQA), RoPE, and
    unit-sphere QK normalization with a learned per-head scale."""

    def __init__(self, heads, d_head, ratio, quartet_2_impl):
        super().__init__()
        self.heads = heads
        self.d_head = d_head
        self.d = heads * d_head
        # GQA: `ratio` query heads share one KV head.
        self.groups = heads // ratio
        d_kv = self.groups * d_head

        self.lq = _make_linear(self.d, self.d, False, quartet_2_impl)
        self.lk = _make_linear(self.d, d_kv, False, quartet_2_impl)
        self.lv = _make_linear(self.d, d_kv, False, quartet_2_impl)
        self.lo = _make_linear(self.d, self.d, False, quartet_2_impl)

        # Learned per-head temperature, initialized to sqrt(d_head) so that
        # scale * unit-norm(Q) recovers the usual dot-product magnitude.
        self.scale = nn.Parameter(torch.full((1, heads, 1, 1), sqrt(d_head)))

    def forward(self, X, rope, attn_backend):
        # NOTE(review): B is computed but never used below.
        B = X.shape[0] if X.dim() == 3 else 1
        ctx = X.shape[-2]

        # (..., ctx, n*d_head) -> (..., n, ctx, d_head) for n heads/groups.
        Q = self.lq(X).unflatten(-1, (self.heads, self.d_head)).movedim(-3, -2)
        K = self.lk(X).unflatten(-1, (self.groups, self.d_head)).movedim(-3, -2)
        V = self.lv(X).unflatten(-1, (self.groups, self.d_head)).movedim(-3, -2)

        # Rotate, then normalize Q and K to unit length (cosine attention).
        Q = _apply_rope(Q, rope)
        K = _apply_rope(K, rope)
        Q = _sphere_norm(Q)
        K = _sphere_norm(K)

        # Apply the learned temperature; the reshape restores Q's exact
        # shape after broadcasting against the (1, heads, 1, 1) parameter.
        Q_shape = Q.shape
        Q = self.scale * Q
        Q = Q.reshape(Q_shape)

        if attn_backend == "pytorch":
            # SDPA has no native GQA here: expand KV to the full head count.
            K = K.repeat_interleave(self.heads // self.groups, dim=-3)
            V = V.repeat_interleave(self.heads // self.groups, dim=-3)
            # scale=1.0 because Q already carries the temperature.
            Y = F.scaled_dot_product_attention(Q, K, V, is_causal=True, scale=1.0)
            Y = Y.movedim(-3, -2).flatten(-2, -1)
        elif attn_backend in ("flash2", "flash3", "flash4"):
            # Flash layouts are (batch, seq, heads, d_head); K/V may keep the
            # smaller `groups` head count (GQA handled by the kernel).
            Q = Q.movedim(-3, -2).reshape(-1, ctx, self.heads, self.d_head)
            K = K.movedim(-3, -2).reshape(-1, ctx, self.groups, self.d_head)
            V = V.movedim(-3, -2).reshape(-1, ctx, self.groups, self.d_head)

            # Flash kernels need fp16/bf16; fall back to bf16 otherwise.
            dtype = Q.dtype if Q.dtype in (torch.bfloat16, torch.float16) else torch.bfloat16
            if attn_backend == "flash2":
                import flash_attn
                Y = flash_attn.flash_attn_func(Q.to(dtype), K.to(dtype), V.to(dtype), causal=True, softmax_scale=1.0)
            elif attn_backend == "flash3":
                import importlib
                _fa3 = importlib.import_module("flash_attn_interface")
                Y = _fa3.flash_attn_func(Q.to(dtype), K.to(dtype), V.to(dtype), causal=True, softmax_scale=1.0)
            elif attn_backend == "flash4":
                import importlib
                _fa4 = importlib.import_module("flash_attn.cute")
                # The cute API returns a tuple; index 0 is the attention output.
                Y = _fa4.flash_attn_func(Q.to(dtype), K.to(dtype), V.to(dtype), causal=True, softmax_scale=1.0)[0]
            Y = Y.to(Q.dtype).flatten(-2, -1)

        return self.lo(Y)
127
+
128
+
129
+
130
class _Block(nn.Module):
    """Transformer block with post-norm residuals:
    x <- x + RMSNorm(attn(x)); x <- x + RMSNorm(mlp(x))."""

    def __init__(self, heads, d_head, ratio, quartet_2_impl):
        super().__init__()
        width = heads * d_head

        self.mhsa = MHSA(heads, d_head, ratio, quartet_2_impl)
        self.out_att_norm = nn.RMSNorm(width, elementwise_affine=True)

        self.mlp = _MLP(width, 4 * width, quartet_2_impl)
        self.out_mlp_norm = nn.RMSNorm(width, elementwise_affine=True)

    def forward(self, X, rope, attn_backend):
        # Attention sub-layer with normalized residual branch.
        after_attn = X + self.out_att_norm(self.mhsa(X, rope, attn_backend))
        # Feed-forward sub-layer, same residual pattern.
        return after_attn + self.out_mlp_norm(self.mlp(after_attn))
147
+
148
+
149
+
150
class _Transformer(nn.Module):
    """Decoder-only backbone: embedding -> blocks -> RMSNorm -> LM head."""

    def __init__(self, vocab_size, num_blocks, heads, d_head, ratio,
                 max_context, std, quartet_2_impl, weight_tying, attn_backend):
        # NOTE(review): max_context is accepted but not stored or enforced.
        super().__init__()
        self.d_head = d_head
        self.attn_backend = attn_backend
        d = heads * d_head

        self.emb = nn.Embedding(vocab_size, d)
        self.blocks = nn.Sequential(*[
            _Block(heads, d_head, ratio, quartet_2_impl) for _ in range(num_blocks)
        ])
        self.out_norm = nn.RMSNorm(d, elementwise_affine=True)
        self.linear = nn.Linear(d, vocab_size, bias=False)

        # Tie the input embedding and the output head to the same tensor.
        if weight_tying:
            self.emb.weight = self.linear.weight

        # Re-initialize after tying: linear/embedding weights ~ N(0, std),
        # RMSNorm gains to 1, and 4-D parameters (the MHSA per-head scale)
        # to sqrt(d_head).
        for name, p in self.named_parameters():
            parent_name, _, suffix = name.rpartition(".")
            parent = self.get_submodule(parent_name)
            if isinstance(parent, (nn.Linear, nn.Embedding)) and suffix == "weight":
                nn.init.normal_(p, 0, std)
            elif isinstance(parent, nn.RMSNorm) and suffix == "weight":
                nn.init.ones_(p)
            elif p.ndim == 4:
                nn.init.constant_(p, sqrt(d_head))

        # Cast the non-quantized float modules to bf16.
        # NOTE(review): truthiness check — any non-empty impl string (not
        # just "pseudoquant") triggers this; confirm intended.
        if quartet_2_impl:
            for m in self.modules():
                if isinstance(m, (nn.LayerNorm, nn.RMSNorm, nn.Embedding)):
                    m.to(torch.bfloat16)

    def forward(self, ids):
        # RoPE tables are rebuilt per call for the current sequence length.
        ctx = ids.shape[-1]
        rope = _build_rope(ctx, self.d_head, device=ids.device)

        X = self.emb(ids)
        for block in self.blocks:
            X = block(X, rope, self.attn_backend)
        X = self.out_norm(X)
        return self.linear(X)
193
+
194
+
195
+
196
class CloverLMForCausalLM(PreTrainedModel, GenerationMixin):
    """Hugging Face wrapper around the _Transformer backbone (no KV cache)."""
    config_class = CloverLMConfig
    supports_gradient_checkpointing = False
    _no_split_modules = ["_Block"]
    # The LM head shares its weight with the embedding (tied below).
    _tied_weights_keys = ["transformer.linear.weight"]
    _tp_plan = {}

    def __init__(self, config: CloverLMConfig):
        super().__init__(config)
        # Map each tied key to its source parameter; presumably required by
        # newer transformers tied-weight handling — confirm against the
        # pinned transformers version.
        self.all_tied_weights_keys = {k: "transformer.emb.weight"
                                      for k in (self._tied_weights_keys or [])}
        self.transformer = _Transformer(
            vocab_size=config.vocab_size,
            num_blocks=config.num_blocks,
            heads=config.heads,
            d_head=config.d_head,
            ratio=config.ratio,
            max_context=config.max_context,
            std=0.02,
            quartet_2_impl=config.quartet_2_impl,
            weight_tying=config.weight_tying,
            attn_backend=config.attn_backend,
        )

    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        # attention_mask is accepted for API compatibility but not used:
        # attention inside the backbone is always dense causal.
        logits = self.transformer(input_ids)

        loss = None
        if labels is not None:
            # Standard causal-LM shift: position t predicts token t+1.
            # F.cross_entropy's default ignore_index (-100) applies.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
            )
        return CausalLMOutputWithPast(loss=loss, logits=logits)

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        # No KV cache: always re-feed the full sequence each decode step.
        return {"input_ids": input_ids}

    def _supports_default_dynamic_cache(self):
        # Opt out of HF's DynamicCache machinery; the model is cache-free.
        return False
tokenization_cloverlm.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import List, Optional
3
+ import tokenmonster
4
+ from transformers import PreTrainedTokenizer
5
+
6
+
7
# Pinned TokenMonster vocabulary (32k, English+code, strict, no capcode, v1)
# hosted on the HF Hub; "%3D" is the URL-encoded "=" in "eot=14199".
TOKENMONSTER_URL = (
    "https://huggingface.co/gvlassis/tokenmonster/resolve/main/"
    "englishcode-32000-strict-nocapcode-v1-eot%3D14199.vocab"
    "?download=true"
)
12
+
13
+
14
class CloverLMTokenizer(PreTrainedTokenizer):
    """TokenMonster-backed tokenizer exposed through the slow-tokenizer API.

    "Tokens" are the stringified integer ids produced by tokenmonster
    (e.g. "123"), so token<->id conversion is trivial and all real text
    handling happens in tokenmonster's tokenize/decode.
    """
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self, vocab_url: str = TOKENMONSTER_URL,
                 eot_id: int = 14199, **kwargs):
        # Load the vocabulary before super().__init__, which may invoke
        # tokenizer methods (get_vocab/_tokenize) during setup.
        self._tm = tokenmonster.load(vocab_url)
        self._eot_id = eot_id
        self._vocab_size = 32000

        # A single <eot> token serves as EOS, PAD, and BOS.
        super().__init__(
            eos_token="<eot>",
            pad_token="<eot>",
            bos_token="<eot>",
            **kwargs,
        )
        # Pin the special-token ids explicitly to the tokenmonster id.
        self.eos_token_id = eot_id
        self.pad_token_id = eot_id
        self.bos_token_id = eot_id

    @property
    def vocab_size(self) -> int:
        # Fixed size of the pinned TokenMonster vocabulary.
        return self._vocab_size

    def get_vocab(self):
        # Placeholder surface forms; the real strings live inside tokenmonster.
        return {f"<tok_{i}>": i for i in range(self._vocab_size)}

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        # Tokens are the stringified tokenmonster ids.
        ids = self._tm.tokenize(text).tolist()
        return [str(i) for i in ids]

    def _convert_token_to_id(self, token: str) -> int:
        return int(token)

    def _convert_id_to_token(self, index: int) -> str:
        return str(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        # Decode through tokenmonster to recover the original text.
        ids = [int(t) for t in tokens]
        return self._tm.decode(ids)

    @property
    def all_special_tokens_extended(self):
        return [self.eos_token]

    @property
    def all_special_tokens(self):
        return [self.eos_token]

    @property
    def all_special_ids(self):
        return [self._eot_id]

    def save_vocabulary(self, save_directory: str,
                        filename_prefix: Optional[str] = None):
        # Nothing to write: the vocabulary is always fetched from vocab_url.
        return ()
tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "CloverLMTokenizer",
3
+ "auto_map": {
4
+ "AutoTokenizer": [
5
+ "tokenization_cloverlm.CloverLMTokenizer",
6
+ null
7
+ ]
8
+ },
9
+ "use_fast": false
10
+ }