ml-ryanlee commited on
Commit
4735b6a
·
verified ·
1 Parent(s): 0a6d9d7

Update modeling_loop_lm.py

Browse files
Files changed (1) hide show
  1. modeling_loop_lm.py +6 -12
modeling_loop_lm.py CHANGED
@@ -61,15 +61,12 @@ class Linear(nn.Module):
61
  def __init__(self, in_features, out_features, width_ratio, std_base, device=None, dtype=None):
62
  super().__init__()
63
 
64
- # initialize weights matrix
65
- weights = torch.empty(out_features, in_features, dtype=dtype, device=device)
66
 
67
  # for muP, derive initial std deviation from given base model's std_deviation and width ratio
68
  std_scaled = std_base / math.sqrt(width_ratio)
69
- weights = nn.init.trunc_normal_(weights, mean=0.0, std=std_scaled, a=-3*std_scaled, b=3*std_scaled)
70
-
71
- # assign as instance variable
72
- self.weight = nn.Parameter(weights)
73
 
74
  def forward(self, x: Tensor) -> Tensor:
75
  # Pytorch standard: on input side of expression, d_in is last dim of x so "... d_in"
@@ -81,14 +78,11 @@ class Embedding(nn.Module):
81
  def __init__(self, num_embeddings, embedding_dim, device=None, dtype=None):
82
  super().__init__()
83
 
84
- # initialize a matrix of vocab_size x embedding_dim
85
- embeddings = torch.empty(num_embeddings, embedding_dim, dtype=dtype, device=device)
86
 
87
  # normalize the embeddings to spec
88
- embeddings = nn.init.trunc_normal_(embeddings, mean=0.0, std=1.0, a=-3, b=3)
89
-
90
- # save and enroll as torch param
91
- self.weight = nn.Parameter(embeddings)
92
 
93
  def forward(self, token_ids: Tensor) -> Tensor:
94
  # for every id, we need to pull the row vector associated
 
61
def __init__(self, in_features, out_features, width_ratio, std_base, device=None, dtype=None):
    """Linear layer with muP-style width-scaled truncated-normal initialization.

    The weight is registered as a Parameter *before* initialization so its
    shape is always recorded — required for HF meta-device (lazy) loading,
    where the init step may be skipped entirely.
    """
    super().__init__()

    # Allocate storage first, then enroll it as a trainable parameter so the
    # module always exposes a correctly-shaped `weight`, even on meta device.
    storage = torch.empty(out_features, in_features, dtype=dtype, device=device)
    self.weight = nn.Parameter(storage)

    # muP: shrink the base model's std by sqrt(width_ratio) so activation
    # scale stays width-invariant; fill in place, truncating at +/-3 sigma.
    sigma = std_base / math.sqrt(width_ratio)
    nn.init.trunc_normal_(self.weight, mean=0.0, std=sigma, a=-3 * sigma, b=3 * sigma)
 
 
 
70
 
71
  def forward(self, x: Tensor) -> Tensor:
72
  # Pytorch standard: on input side of expression, d_in is last dim of x so "... d_in"
 
78
def __init__(self, num_embeddings, embedding_dim, device=None, dtype=None):
    """Embedding table initialized from a unit-variance truncated normal.

    The parameter is registered before initialization so the shape is always
    stored — required for HF meta-device (lazy) loading, where the init call
    may be a no-op.
    """
    super().__init__()

    # Enroll the vocab_size x embedding_dim table as a parameter up front.
    table = torch.empty(num_embeddings, embedding_dim, dtype=dtype, device=device)
    self.weight = nn.Parameter(table)

    # Fill in place: standard normal truncated to +/-3 standard deviations.
    nn.init.trunc_normal_(self.weight, mean=0.0, std=1.0, a=-3, b=3)
 
 
 
86
 
87
  def forward(self, token_ids: Tensor) -> Tensor:
88
  # for every id, we need to pull the row vector associated