yagizdevre committed on
Commit
ffefe3d
·
1 Parent(s): a78f7b3
Files changed (2) hide show
  1. __pycache__/mlp.cpython-312.pyc +0 -0
  2. mlp.py +10 -15
__pycache__/mlp.cpython-312.pyc CHANGED
Binary files a/__pycache__/mlp.cpython-312.pyc and b/__pycache__/mlp.cpython-312.pyc differ
 
mlp.py CHANGED
@@ -2,26 +2,21 @@ import torch.nn as nn
2
  from torch.nn import functional as F
3
  import torch
4
class MLP(nn.Module):
    """GELU-gated MLP block (cf. https://arxiv.org/pdf/2002.05202)."""

    def __init__(self, config, dtype=None):
        """Build the gated-MLP projections.

        Args:
            config: model config providing ``n_embd``, ``mlp_scale``,
                ``bias``, ``dropout`` and ``torch_dtype``.
            dtype: optional dtype override for the projection layers;
                defaults to ``config.torch_dtype``.
        """
        super().__init__()
        # BUG FIX: the original resolved `dtype` from the parameter /
        # config.torch_dtype but then hard-coded torch.bfloat16 in every
        # nn.Linear, silently ignoring both. Use the resolved dtype.
        dtype = dtype if dtype is not None else config.torch_dtype
        self.hidden_size = config.n_embd
        self.intermediate_size = config.n_embd * config.mlp_scale
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.bias, dtype=dtype)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.bias, dtype=dtype)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.bias, dtype=dtype)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Apply down_proj(gelu(gate_proj(x)) * up_proj(x)), then dropout."""
        # Cast the input ONCE to the layer dtype. The original cast x twice
        # and re-cast every projection output with .to(dtype=...), which is
        # a no-op because the layers already produce that dtype.
        x = x.to(self.gate_proj.weight.dtype)
        gate = F.gelu(self.gate_proj(x), approximate="tanh")
        up = self.up_proj(x)
        outputs = self.down_proj(gate * up)
        return self.dropout(outputs)
 
2
  from torch.nn import functional as F
3
  import torch
4
class MLP(nn.Module):
    """GELU-gated feed-forward block (cf. https://arxiv.org/pdf/2002.05202)."""

    def __init__(self, config):
        """Create the gate/up/down projections and dropout from ``config``
        (expects ``dim``, ``mlp_scale``, ``bias`` and ``dropout``)."""
        super().__init__()
        dim = config.dim
        inner = config.dim * config.mlp_scale
        self.hidden_size = dim
        self.intermediate_size = inner
        # Layers are constructed in gate -> up -> down order.
        self.gate_proj = nn.Linear(dim, inner, bias=config.bias)
        self.up_proj = nn.Linear(dim, inner, bias=config.bias)
        self.down_proj = nn.Linear(inner, dim, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Return dropout(down_proj(gelu(gate_proj(x)) * up_proj(x)))."""
        activated = F.gelu(self.gate_proj(x), approximate="tanh")
        fused = activated * self.up_proj(x)
        return self.dropout(self.down_proj(fused))