aframson committed
Commit f514d5b · 1 Parent(s): 40416f7
Files changed (1)
  1. model.py +56 -56
model.py CHANGED
@@ -245,62 +245,62 @@ class GPT(PreTrainedModel):
             if hasattr(block.attn, 'bias'):
                 block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]
 
-    # @classmethod
-    # def from_pretrained(cls, model_type, override_args=None):
-    #     assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
-    #     override_args = override_args or {} # default to empty dict
-    #     # only dropout can be overridden see more notes below
-    #     assert all(k == 'dropout' for k in override_args)
-    #     from transformers import GPT2LMHeadModel
-    #     print("loading weights from pretrained gpt: %s" % model_type)
-
-    #     # n_layer, n_head and n_embd are determined from model_type
-    #     config_args = {
-    #         'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
-    #         'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
-    #         'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
-    #         'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
-    #     }[model_type]
-    #     print("forcing vocab_size=50257, block_size=1024, bias=True")
-    #     config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
-    #     config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
-    #     config_args['bias'] = True # always True for GPT model checkpoints
-    #     # we can override the dropout rate, if desired
-    #     if 'dropout' in override_args:
-    #         print(f"overriding dropout rate to {override_args['dropout']}")
-    #         config_args['dropout'] = override_args['dropout']
-    #     # create a from-scratch initialized minGPT model
-    #     config = GPTConfig(**config_args)
-    #     model = GPT(config)
-    #     sd = model.state_dict()
-    #     sd_keys = sd.keys()
-    #     sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
-
-    #     # init a huggingface/transformers model
-    #     model_hf = GPT2LMHeadModel.from_pretrained(model_type)
-    #     sd_hf = model_hf.state_dict()
-
-    #     # copy while ensuring all of the parameters are aligned and match in names and shapes
-    #     sd_keys_hf = sd_hf.keys()
-    #     sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
-    #     sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
-    #     transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
-    #     # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
-    #     # this means that we have to transpose these weights when we import them
-    #     assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
-    #     for k in sd_keys_hf:
-    #         if any(k.endswith(w) for w in transposed):
-    #             # special treatment for the Conv1D weights we need to transpose
-    #             assert sd_hf[k].shape[::-1] == sd[k].shape
-    #             with torch.no_grad():
-    #                 sd[k].copy_(sd_hf[k].t())
-    #         else:
-    #             # vanilla copy over the other parameters
-    #             assert sd_hf[k].shape == sd[k].shape
-    #             with torch.no_grad():
-    #                 sd[k].copy_(sd_hf[k])
-
-    #     return model
+    @classmethod
+    def from_pretrained(cls, model_type, override_args=None):
+        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
+        override_args = override_args or {} # default to empty dict
+        # only dropout can be overridden see more notes below
+        assert all(k == 'dropout' for k in override_args)
+        from transformers import GPT2LMHeadModel
+        print("loading weights from pretrained gpt: %s" % model_type)
+
+        # n_layer, n_head and n_embd are determined from model_type
+        config_args = {
+            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
+            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
+            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
+            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
+        }[model_type]
+        print("forcing vocab_size=50257, block_size=1024, bias=True")
+        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
+        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
+        config_args['bias'] = True # always True for GPT model checkpoints
+        # we can override the dropout rate, if desired
+        if 'dropout' in override_args:
+            print(f"overriding dropout rate to {override_args['dropout']}")
+            config_args['dropout'] = override_args['dropout']
+        # create a from-scratch initialized minGPT model
+        config = GPTConfig(**config_args)
+        model = GPT(config)
+        sd = model.state_dict()
+        sd_keys = sd.keys()
+        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
+
+        # init a huggingface/transformers model
+        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
+        sd_hf = model_hf.state_dict()
+
+        # copy while ensuring all of the parameters are aligned and match in names and shapes
+        sd_keys_hf = sd_hf.keys()
+        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
+        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
+        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
+        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
+        # this means that we have to transpose these weights when we import them
+        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
+        for k in sd_keys_hf:
+            if any(k.endswith(w) for w in transposed):
+                # special treatment for the Conv1D weights we need to transpose
+                assert sd_hf[k].shape[::-1] == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k].t())
+            else:
+                # vanilla copy over the other parameters
+                assert sd_hf[k].shape == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k])
+
+        return model
 
     def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
         # start with all of the candidate parameters
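Note: the non-obvious step in the restored method is the weight-layout mismatch it works around. The Hugging Face GPT-2 checkpoint stores its attention and MLP projections in transformers' Conv1D module, whose weight has shape (in_features, out_features), while a plain nn.Linear stores (out_features, in_features); that is why the copy loop calls .t() on the keys listed in `transposed`. A minimal sketch of that equivalence (illustrative only, not part of this commit; it assumes torch and a recent transformers release that exposes transformers.pytorch_utils.Conv1D):

    # Sketch: Conv1D vs. Linear weight layout (illustrative, not part of model.py).
    import torch
    import torch.nn as nn
    from transformers.pytorch_utils import Conv1D

    torch.manual_seed(0)
    conv = Conv1D(nf=12, nx=4)            # HF layout: weight shape (nx, nf) = (4, 12)
    lin = nn.Linear(4, 12)                # torch layout: weight shape (out, in) = (12, 4)
    with torch.no_grad():
        lin.weight.copy_(conv.weight.t()) # the same transpose from_pretrained performs
        lin.bias.copy_(conv.bias)

    x = torch.randn(2, 4)
    print(torch.allclose(lin(x), conv(x), atol=1e-6))  # True: identical mapping

Once the change is in, the re-enabled method can be exercised directly. Again a sketch, assuming this repo's model.py is importable and the 'gpt2' checkpoint can be downloaded:

    from model import GPT

    model = GPT.from_pretrained('gpt2', override_args={'dropout': 0.0})  # only 'dropout' may be overridden
    model.eval()
    print(sum(p.numel() for p in model.parameters()))  # roughly 124M parameters for 'gpt2'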