Upload model and tokenizer
- config.json +1 -0
- modeling_gpt.py +17 -9
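GPTLanguageModel mixes in PyTorchModelHubMixin (see modeling_gpt.py below), which is what makes uploading straight from the model object possible. A minimal sketch of the push/pull round trip; the repo id is hypothetical:

    from modeling_gpt import GPTLanguageModel

    model = GPTLanguageModel()  # defaults from the new signature: vocab_size=32000, n_embd=768, ...

    # The mixin provides save_pretrained / push_to_hub / from_pretrained.
    model.push_to_hub("your-username/gpt-from-scratch")
    reloaded = GPTLanguageModel.from_pretrained("your-username/gpt-from-scratch")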
config.json CHANGED
@@ -1,4 +1,5 @@
 {
+    "model_type": "llama",
     "block_size": 1024,
     "n_embd": 768,
     "n_head": 12,
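The added model_type key is presumably there so Hub tooling and transformers' Auto classes, which dispatch on this field, can identify the checkpoint; note it is set to "llama" even though the model is a custom GPT. A quick sketch of inspecting the updated config (plain json; the local path is hypothetical):

    import json

    with open("config.json") as f:
        config = json.load(f)

    print(config["model_type"])                    # "llama"
    print(config["block_size"], config["n_embd"])  # 1024 768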
modeling_gpt.py CHANGED
@@ -69,6 +69,7 @@ class FeedFoward(nn.Module): # this is Karpathy's MLP -> feed forward
         return x
     """ a simple linear layer followed by a non-linearity """
 
+
 class Block(nn.Module):
     """ Transformer block: communication followed by computation """
 
@@ -87,22 +88,22 @@ class Block(nn.Module):
 
 class GPTLanguageModel(nn.Module, PyTorchModelHubMixin):
 
-    def __init__(self, vocab_size=
+    def __init__(self, vocab_size = 32000, block_size=1024, n_embd=768, n_head=12, n_layer=12):
         super().__init__()
         print("This is vocab size:", vocab_size)
-        self.token_embedding_table = nn.Embedding(vocab_size,
-        self.position_embedding_table = nn.Embedding(
+        self.token_embedding_table = nn.Embedding(vocab_size, N_EMBD)
+        self.position_embedding_table = nn.Embedding(BLOCK_SIZE, N_EMBD)
         self.blocks = nn.Sequential(
-            *[Block(
+            *[Block(N_EMBD, n_head=N_HEAD) for _ in range(N_LAYER)]
         )
-        self.ln_f = nn.LayerNorm(
-        self.lm_head = nn.Linear(
+        self.ln_f = nn.LayerNorm(N_EMBD)
+        self.lm_head = nn.Linear(N_EMBD, vocab_size)
 
         self.token_embedding_table.weight = self.lm_head.weight
 
         self.apply(self._init_weights)
-        self.config = {"BLOCK_SIZE":
-
+        self.config = {"BLOCK_SIZE": BLOCK_SIZE, "N_EMBD": N_EMBD, "N_HEAD":N_HEAD, "N_LAYER": N_LAYER}
+
 
     def _init_weights(self, module):
         if isinstance(module, nn.Linear):
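Two things stand out in the new __init__. First, the signature now takes block_size, n_embd, n_head, and n_layer, but the body still reads the module-level constants BLOCK_SIZE, N_EMBD, N_HEAD, N_LAYER, so the new arguments are effectively ignored. Second, the token embedding is tied to lm_head, the standard weight-tying trick; the later _init_weights pass re-initializes the shared tensor, but the tie survives because both attributes reference the same Parameter. A standalone sketch of the tying pattern (sizes hypothetical):

    import torch.nn as nn

    vocab_size, n_embd = 32000, 768
    tok_emb = nn.Embedding(vocab_size, n_embd)  # weight shape: (vocab_size, n_embd)
    lm_head = nn.Linear(n_embd, vocab_size)     # weight shape: (vocab_size, n_embd)

    # Share one parameter tensor between input embedding and output projection.
    tok_emb.weight = lm_head.weight
    assert tok_emb.weight.data_ptr() == lm_head.weight.data_ptr()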
@@ -115,6 +116,8 @@ class GPTLanguageModel(nn.Module, PyTorchModelHubMixin):
         elif isinstance(module, nn.Embedding):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
 
+
+
     def forward(self, idx, targets=None):
         B, T = idx.shape
         assert T <= BLOCK_SIZE, f"Cannot forward sequence of length {T}, block size is only {BLOCK_SIZE}"
@@ -175,11 +178,13 @@ class GPTLanguageModel(nn.Module, PyTorchModelHubMixin):
         # Load the state dict
         state_dict = torch.load(path)["model"]
 
+        # Rename the keys to match the expected ones (remove "orig_mod." prefix)
         new_state_dict = {}
         for key, value in state_dict.items():
             new_key = key.replace('_orig_mod.', '')  # Remove 'orig_mod.' prefix
             new_state_dict[new_key] = value
 
+        # Load the renamed state dict into the model
         self.load_state_dict(new_state_dict)
 
 
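The '_orig_mod.' prefix is what torch.compile puts on state-dict keys when the compiled wrapper is saved, so stripping it lets the checkpoint load into the plain module. One caveat with str.replace: it substitutes the pattern anywhere in a key, not only at the start. A hedged alternative using removeprefix (Python 3.9+):

    # Strip the torch.compile wrapper prefix only where it actually occurs: at the start of a key.
    new_state_dict = {k.removeprefix("_orig_mod."): v for k, v in state_dict.items()}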
@@ -197,4 +202,7 @@ class GPTLanguageModel(nn.Module, PyTorchModelHubMixin):
         use_fused = fused_available and device == "cuda"
         optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused = use_fused)
         return optimizer
-
+
+    def load_optimizer(self, optimizer, path):
+        optimizer.load_state_dict(torch.load(path)["optimizer"])
+        return optimizer
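The new load_optimizer mirrors the model loader and expects the same checkpoint layout, a dict with "model" and "optimizer" entries; note also that the fused AdamW path is gated on CUDA availability. A hedged round-trip sketch (path hypothetical; assumes model and optimizer already exist):

    import torch

    # Save both halves into one checkpoint, matching the layout the two loaders expect.
    torch.save({"model": model.state_dict(), "optimizer": optimizer.state_dict()}, "ckpt.pt")

    # Later: restore. load_state_dict mutates the optimizer in place; returning it makes the call chainable.
    optimizer = model.load_optimizer(optimizer, "ckpt.pt")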