Update modeling_challenger.py
modeling_challenger.py  (+13, -107)
@@ -2,10 +2,10 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers import PreTrainedModel
+from transformers import PreTrainedModel, GenerationMixin
+from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
 from .configuration_challenger import ChallengerConfig
 
-
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps: float = 1e-8):
         super().__init__()
@@ -19,95 +19,15 @@ class RMSNorm(nn.Module):
         output = self._norm(x.float())
         return (output * self.weight.float()).type_as(x)
 
-def _to_fp8(x, dtype=torch.float8_e4m3fn):
-    finfo = torch.finfo(dtype)
-    scale = finfo.max / x.abs().max().clamp(min=1e-12)
-    x_f8 = (x * scale).clamp(finfo.min, finfo.max).to(dtype)
-    return x_f8, scale.reciprocal().float()  # inverse for _scaled_mm
-
-class _FP8Matmul(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, x, w, out_dtype=torch.bfloat16):
-        x_f8, x_inv = _to_fp8(x)
-        w_f8, w_inv = _to_fp8(w)
-
-        y = torch._scaled_mm(  # row-major A × col-major B
-            x_f8, w_f8.t(),
-            out_dtype=out_dtype,
-            scale_a=x_inv, scale_b=w_inv,
-            use_fast_accum=True,
-        )
-        ctx.save_for_backward(x_f8, w_f8, x_inv, w_inv)
-        ctx.out_dtype = out_dtype
-        return y
-
-    @staticmethod
-    def backward(ctx, grad_out):
-        x_f8, w_f8, x_inv, w_inv = ctx.saved_tensors
-        g_f8, g_inv = _to_fp8(grad_out, dtype=torch.float8_e5m2)
-
-        # ---- dx = grad_out @ w ------------------------------------------
-        # A = g_f8                    (row-major, (N, out))
-        # B = w_f8.T.contiguous().T   (col-major, (out, in))
-        dx = torch._scaled_mm(
-            g_f8,
-            w_f8.t().contiguous().t(),
-            out_dtype=ctx.out_dtype,
-            scale_a=g_inv, scale_b=w_inv,
-            use_fast_accum=False,
-        )
-
-        # ---- dw = x.T @ grad_out ----------------------------------------
-        # A = x_f8.T.contiguous()     (row-major, (in, N))
-        # B = g_f8.T.contiguous().T   (col-major, (N, out))
-        dw = torch._scaled_mm(
-            x_f8.t().contiguous(),
-            g_f8.t().contiguous().t(),
-            out_dtype=torch.float32,
-            scale_a=x_inv, scale_b=g_inv,
-            use_fast_accum=False,
-        ).t()  # bring back to (out, in)
-
-        return dx, dw, None  # no grad for out_dtype
-
-# Convenience alias, identical signature to torch.mm
-fp8_mm = _FP8Matmul.apply
-
-# ---- drop-in Linear ----------------------------------------------------------
-class FP8Linear(torch.nn.Module):
-    """Same signature as nn.Linear but weight-stationary FP8 matmul."""
-    def __init__(self, in_features, out_features, bias=False):
-        super().__init__()
-        self.weight = torch.nn.Parameter(torch.empty(out_features, in_features))
-        torch.nn.init.trunc_normal_(self.weight, std=0.02)
-        if bias:
-            self.bias = torch.nn.Parameter(torch.zeros(out_features))
-        else:
-            self.register_parameter("bias", None)
-
-    def forward(self, x):
-        """
-        Accepts x of shape (..., in_features) - any leading dims.
-        Flattens to 2-D, does the FP8 matmul, then restores the shape.
-        """
-        orig_shape = x.shape[:-1]       # e.g. (B, T)
-        x2d = x.view(-1, x.shape[-1])   # (N, in_features)
-        y2d = fp8_mm(x2d, self.weight)  # (N, out_features)
-        if self.bias is not None:
-            y2d = y2d + self.bias
-        y = y2d.view(*orig_shape, self.weight.size(0))
-        return y
-
 class CausalSelfAttention(nn.Module):
 
     def __init__(self, config):
         super().__init__()
         assert config.n_embd % config.n_head == 0
         # key, query, value projections for all heads, but in a batch
-        self.c_attn = FP8Linear(config.n_embd, 3 * config.n_embd, bias=False)
+        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
         # output projection
-        self.c_proj = FP8Linear(config.n_embd, config.n_embd, bias=False)
-        self.c_proj.NANOGPT_SCALE_INIT = 1
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
         # regularization
         self.n_head = config.n_head
         self.n_embd = config.n_embd
@@ -132,10 +52,9 @@ class MLP(nn.Module):
 
     def __init__(self, config):
         super().__init__()
-        self.c_fc = FP8Linear(config.n_embd, 8 * config.n_embd, bias=False)
+        self.c_fc = nn.Linear(config.n_embd, 8 * config.n_embd, bias=False)
         self.gelu = nn.SiLU()
-        self.c_proj = FP8Linear(4 * config.n_embd, config.n_embd, bias=False)
-        self.c_proj.NANOGPT_SCALE_INIT = 1
+        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=False)
 
     def forward(self, x):
         x, y = self.c_fc(x).split(x.size(-1) * 4, dim=2)
@@ -166,22 +85,7 @@ class GPT(nn.Module):
             h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
             ln_f = RMSNorm(config.n_embd),
         ))
-        self.lm_head = FP8Linear(config.n_embd, config.vocab_size, bias=False)
-        self.lm_head.NANOGPT_SCALE_INIT = 1
-
-        # init params
-        self.apply(self._init_weights)
-
-    def _init_weights(self, module):
-        if isinstance(module, FP8Linear):
-            std = 0.02
-            if hasattr(module, 'NANOGPT_SCALE_INIT'):
-                std *= (2 * self.config.n_layer) ** -0.5
-            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
-            if module.bias is not None:
-                torch.nn.init.zeros_(module.bias)
-        elif isinstance(module, nn.Embedding):
-            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
 
     def forward(self, idx, targets=None):
         x = self.transformer.wte(idx)  # token embeddings of shape (B, T, n_embd)
@@ -196,8 +100,9 @@ class GPT(nn.Module):
             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
         return logits, loss
 
-class ChallengerModel(PreTrainedModel):
+class ChallengerForCausalLM(PreTrainedModel, GenerationMixin):
     config_class = ChallengerConfig
+    _keys_to_ignore_on_load_unexpected = [r"past_key_values"]
 
     def __init__(self, config):
         super().__init__(config)
@@ -205,6 +110,7 @@ class ChallengerModel(PreTrainedModel):
 
     def forward(self, input_ids, labels=None):
         logits, loss = self.model(input_ids, labels)
-
-
-
+        return CausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=logits
+        )
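
One note on the MLP hunk: c_fc now expands the hidden state to 8 * n_embd, c_proj maps 4 * n_embd back down, and the first line of forward splits the expanded activation into two 4 * n_embd halves. That is the shape pattern of a gated (SwiGLU-style) feed-forward block. The rest of forward is outside this diff, so the gating line below is an assumption, not the file's confirmed code:

import torch.nn as nn

# Sketch only: layer shapes match this commit; the final combination in forward()
# is assumed, since that part of the file is not shown in the diff.
class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 8 * config.n_embd, bias=False)
        self.gelu = nn.SiLU()  # attribute is named "gelu" but holds SiLU in this file
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=False)

    def forward(self, x):
        # split the (B, T, 8 * n_embd) activation into two (B, T, 4 * n_embd) halves
        x, y = self.c_fc(x).split(x.size(-1) * 4, dim=2)
        return self.c_proj(self.gelu(x) * y)  # assumed SwiGLU-style gating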
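Since the wrapper now subclasses PreTrainedModel and GenerationMixin and returns a CausalLMOutputWithCrossAttentions, it can be exercised like any other Hugging Face causal LM. A minimal smoke-test sketch, assuming the class builds the inner GPT as self.model in its __init__ (mostly outside this diff) and that ChallengerConfig provides usable defaults including vocab_size:

import torch
# assumed package-level imports; inside the repo the imports are relative
from configuration_challenger import ChallengerConfig
from modeling_challenger import ChallengerForCausalLM

config = ChallengerConfig()                 # assumed default hyperparameters
model = ChallengerForCausalLM(config)

input_ids = torch.randint(0, config.vocab_size, (1, 16))
out = model(input_ids, labels=input_ids)    # CausalLMOutputWithCrossAttentions
print(out.logits.shape, out.loss)

# PreTrainedModel gives the usual save/load round-trip with the custom config.
model.save_pretrained("challenger-ckpt")
reloaded = ChallengerForCausalLM.from_pretrained("challenger-ckpt")

# GenerationMixin exposes model.generate(...); whether it runs unchanged depends on
# forward() accepting the extra kwargs (attention_mask, etc.) that generate passes.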