AliMuhammad73 committed on
Commit d64b4c8 · verified · 1 Parent(s): 5003c37

Upload model and tokenizer

Files changed (2)
  1. config.json +1 -0
  2. modeling_gpt.py +17 -9
config.json CHANGED
@@ -1,4 +1,5 @@
 {
+  "model_type": "llama",
   "block_size": 1024,
   "n_embd": 768,
   "n_head": 12,
modeling_gpt.py CHANGED
@@ -69,6 +69,7 @@ class FeedFoward(nn.Module): #yeh MLP hai karpathy wala -> Feed forward hai se
         return x
     """ a simple linear layer followed by a non-linearity """
 
+
 class Block(nn.Module):
     """ Transformer block: communication followed by computation """
 
@@ -87,22 +88,22 @@
 
 class GPTLanguageModel(nn.Module, PyTorchModelHubMixin):
 
-    def __init__(self, vocab_size=20000, block_size=1024, n_embd=768, n_head=12, n_layer=12):
+    def __init__(self, vocab_size = 32000, block_size=1024, n_embd=768, n_head=12, n_layer=12):
         super().__init__()
         print("This is vocab size:", vocab_size)
-        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
-        self.position_embedding_table = nn.Embedding(block_size, n_embd)
+        self.token_embedding_table = nn.Embedding(vocab_size, N_EMBD)
+        self.position_embedding_table = nn.Embedding(BLOCK_SIZE, N_EMBD)
         self.blocks = nn.Sequential(
-            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)]
+            *[Block(N_EMBD, n_head=N_HEAD) for _ in range(N_LAYER)]
         )
-        self.ln_f = nn.LayerNorm(n_embd)
-        self.lm_head = nn.Linear(n_embd, vocab_size)
+        self.ln_f = nn.LayerNorm(N_EMBD)
+        self.lm_head = nn.Linear(N_EMBD, vocab_size)
 
         self.token_embedding_table.weight = self.lm_head.weight
 
         self.apply(self._init_weights)
-        self.config = {"BLOCK_SIZE": block_size, "N_EMBD": n_embd, "N_HEAD":n_head, "N_LAYER": n_layer}
-
+        self.config = {"BLOCK_SIZE": BLOCK_SIZE, "N_EMBD": N_EMBD, "N_HEAD":N_HEAD, "N_LAYER": N_LAYER}
+
 
     def _init_weights(self, module):
         if isinstance(module, nn.Linear):
@@ -115,6 +116,8 @@ class GPTLanguageModel(nn.Module, PyTorchModelHubMixin):
         elif isinstance(module, nn.Embedding):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
 
+
+
     def forward(self, idx, targets=None):
         B, T = idx.shape
         assert T <= BLOCK_SIZE, f"Cannot forward sequence of length {T}, block size is only {BLOCK_SIZE}"
@@ -175,11 +178,13 @@ class GPTLanguageModel(nn.Module, PyTorchModelHubMixin):
         # Load the state dict
         state_dict = torch.load(path)["model"]
 
+        # Rename the keys to match the expected ones (remove "orig_mod." prefix)
         new_state_dict = {}
         for key, value in state_dict.items():
             new_key = key.replace('_orig_mod.', '') # Remove 'orig_mod.' prefix
             new_state_dict[new_key] = value
 
+        # Load the renamed state dict into the model
         self.load_state_dict(new_state_dict)
 
 
@@ -197,4 +202,7 @@ class GPTLanguageModel(nn.Module, PyTorchModelHubMixin):
         use_fused = fused_available and device == "cuda"
         optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused = use_fused)
         return optimizer
-MODEL_PATH = "Naive_gpt\model_weights_llama" # Where to save weights
+
+    def load_optimizer(self, optimizer, path):
+        optimizer.load_state_dict(torch.load(path)["optimizer"])
+        return optimizer
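For context on the renamed keys and the new load_optimizer helper: the '_orig_mod.' prefix is what torch.compile adds to the state_dict keys of a compiled module, so stripping it lets a plain, uncompiled GPTLanguageModel load the checkpoint. A minimal sketch of that round trip, assuming a checkpoint dict of the form {"model": ..., "optimizer": ...} as both helpers expect; the file name is a placeholder:

    # Hedged sketch; "ckpt.pt" is a placeholder path.
    import torch
    from modeling_gpt import GPTLanguageModel

    model = GPTLanguageModel()
    compiled = torch.compile(model)          # state_dict keys become "_orig_mod.<name>"
    opt = torch.optim.AdamW(model.parameters(), lr=3e-4)
    torch.save({"model": compiled.state_dict(), "optimizer": opt.state_dict()}, "ckpt.pt")

    # Reload into a fresh, uncompiled model: strip the prefix the same way the diff's
    # renaming loop does, then restore the optimizer via the new load_optimizer method.
    restored = GPTLanguageModel()
    state = torch.load("ckpt.pt")["model"]
    restored.load_state_dict({k.replace("_orig_mod.", ""): v for k, v in state.items()})
    opt2 = torch.optim.AdamW(restored.parameters(), lr=3e-4)
    opt2 = restored.load_optimizer(opt2, "ckpt.pt")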