Update modeling_challenger.py
modeling_challenger.py  +43 -2  CHANGED
@@ -87,7 +87,7 @@ class FP8Linear(torch.nn.Module):

    def forward(self, x):
        """
-        Accepts x of shape (..., in_features) –
+        Accepts x of shape (..., in_features) – any leading dims.
        Flattens to 2-D, does the FP8 matmul, then restores the shape.
        """
        orig_shape = x.shape[:-1]  # e.g. (B, T)
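The docstring change above spells out the shape contract of FP8Linear.forward: flatten any leading dims to 2-D, run the FP8 matmul, then restore the original shape. A minimal sketch of just that shape handling (fp8_style_forward is a made-up name, and a plain torch matmul stands in for the FP8 kernel, whose scaling/quantization details are not part of this hunk):

import torch

def fp8_style_forward(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    # x: (..., in_features) with any leading dims, e.g. (B, T, in_features)
    # weight: (out_features, in_features), as in a standard nn.Linear
    orig_shape = x.shape[:-1]               # e.g. (B, T)
    x_2d = x.reshape(-1, x.shape[-1])       # flatten to (B*T, in_features)
    out_2d = x_2d @ weight.t()              # stand-in for the FP8 matmul
    return out_2d.reshape(*orig_shape, -1)  # restore to (..., out_features)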
@@ -155,6 +155,47 @@ class Block(nn.Module):
    def forward(self, x):
        return x + self.attn(self.ln_1(x)) + self.mlp(self.ln_2(x))

+class GPT(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.transformer = nn.ModuleDict(dict(
+            wte = nn.Embedding(config.vocab_size, config.n_embd),
+            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+            ln_f = RMSNorm(config.n_embd),
+        ))
+        self.lm_head = FP8Linear(config.n_embd, config.vocab_size, bias=False)
+        self.lm_head.NANOGPT_SCALE_INIT = 1
+
+        # init params
+        self.apply(self._init_weights)
+
+    def _init_weights(self, module):
+        if isinstance(module, FP8Linear):
+            std = 0.02
+            if hasattr(module, 'NANOGPT_SCALE_INIT'):
+                std *= (2 * self.config.n_layer) ** -0.5
+            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+    def forward(self, idx, targets=None):
+        x = self.transformer.wte(idx)  # token embeddings of shape (B, T, n_embd)
+        # forward the blocks of the transformer
+        for block in self.transformer.h:
+            x = block(x)
+        # forward the final layernorm and the classifier
+        x = self.transformer.ln_f(x)
+        logits = self.lm_head(x).float()  # (B, T, vocab_size)
+        loss = None
+        if targets is not None:
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
+        return logits, loss
+
 class ChallengerModel(PreTrainedModel):
    config_class = ChallengerConfig

@@ -166,4 +207,4 @@ class ChallengerModel(PreTrainedModel):
        logits, loss = self.model(input_ids, labels)
        if labels is not None:
            return {"loss": loss, "logits": logits}
-        return {"logits": logits}
+        return {"logits": logits}
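For context, a hedged usage sketch of the new GPT module and the ChallengerModel wrapper touched in the last hunk. The config values below are assumptions: only vocab_size, n_embd and n_layer are referenced in this diff, and ChallengerConfig, Block, and the wrapper's forward signature may involve fields not shown here.

import torch

# Assumed config values; ChallengerConfig likely carries more fields than these.
config = ChallengerConfig(vocab_size=50304, n_embd=768, n_layer=12)

model = GPT(config)
idx = torch.randint(0, config.vocab_size, (2, 128))  # (B, T) token ids
logits, loss = model(idx, targets=idx)               # training: loss is a scalar tensor
logits, loss = model(idx)                            # inference: loss is None

# Through the HF-style wrapper (assuming forward(input_ids, labels=None)),
# the same call returns a dict, as shown in the last hunk:
wrapper = ChallengerModel(config)
out = wrapper(idx, idx)   # {"loss": ..., "logits": ...}
out = wrapper(idx)        # {"logits": ...}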