summerstars
/

EN-summer

summerstars committed on Apr 27

Commit

dde2635

verified ·

1 Parent(s): 808826f

Update modeling_minimythos_hybrid.py

Files changed (1) hide show

modeling_minimythos_hybrid.py CHANGED Viewed

@@ -78,7 +78,12 @@ class RMSNorm(nn.Module):
         self.weight = nn.Parameter(torch.ones(dim))
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.weight * x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
 class ReservoirBlock(nn.Module):
@@ -270,6 +275,9 @@ class MiniMythosHybridForCausalLM(PreTrainedModel):
             hidden_states.append(x)
         logits = self.lm_head(x)
         loss = None
         if labels is not None:

         self.weight = nn.Parameter(torch.ones(dim))
     def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Compute RMSNorm in fp32 for numerical stability, then cast back.
+        orig_dtype = x.dtype
+        x_float = x.float()
+        var = x_float.pow(2).mean(-1, keepdim=True)
+        x_norm = x_float * torch.rsqrt(var + self.eps)
+        return (self.weight.float() * x_norm).to(orig_dtype)
 class ReservoirBlock(nn.Module):
             hidden_states.append(x)
         logits = self.lm_head(x)
+        # Prevent generation from crashing if a checkpoint contains unstable values.
+        # This should not hide training issues, but it makes inference robust.
+        logits = torch.nan_to_num(logits, nan=0.0, posinf=1e4, neginf=-1e4)
         loss = None
         if labels is not None: