aframson committed on
Commit
40416f7
·
1 Parent(s): 39c6477
Files changed (1) hide show
  1. model.py +56 -56
model.py CHANGED
@@ -245,62 +245,62 @@ class GPT(PreTrainedModel):
245
  if hasattr(block.attn, 'bias'):
246
  block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]
247
 
248
- @classmethod
249
- def from_pretrained(cls, model_type, override_args=None):
250
- assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
251
- override_args = override_args or {} # default to empty dict
252
- # only dropout can be overridden see more notes below
253
- assert all(k == 'dropout' for k in override_args)
254
- from transformers import GPT2LMHeadModel
255
- print("loading weights from pretrained gpt: %s" % model_type)
256
-
257
- # n_layer, n_head and n_embd are determined from model_type
258
- config_args = {
259
- 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
260
- 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
261
- 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
262
- 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
263
- }[model_type]
264
- print("forcing vocab_size=50257, block_size=1024, bias=True")
265
- config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
266
- config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
267
- config_args['bias'] = True # always True for GPT model checkpoints
268
- # we can override the dropout rate, if desired
269
- if 'dropout' in override_args:
270
- print(f"overriding dropout rate to {override_args['dropout']}")
271
- config_args['dropout'] = override_args['dropout']
272
- # create a from-scratch initialized minGPT model
273
- config = GPTConfig(**config_args)
274
- model = GPT(config)
275
- sd = model.state_dict()
276
- sd_keys = sd.keys()
277
- sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
278
-
279
- # init a huggingface/transformers model
280
- model_hf = GPT2LMHeadModel.from_pretrained(model_type)
281
- sd_hf = model_hf.state_dict()
282
-
283
- # copy while ensuring all of the parameters are aligned and match in names and shapes
284
- sd_keys_hf = sd_hf.keys()
285
- sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
286
- sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
287
- transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
288
- # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
289
- # this means that we have to transpose these weights when we import them
290
- assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
291
- for k in sd_keys_hf:
292
- if any(k.endswith(w) for w in transposed):
293
- # special treatment for the Conv1D weights we need to transpose
294
- assert sd_hf[k].shape[::-1] == sd[k].shape
295
- with torch.no_grad():
296
- sd[k].copy_(sd_hf[k].t())
297
- else:
298
- # vanilla copy over the other parameters
299
- assert sd_hf[k].shape == sd[k].shape
300
- with torch.no_grad():
301
- sd[k].copy_(sd_hf[k])
302
-
303
- return model
304
 
305
  def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
306
  # start with all of the candidate parameters
 
245
  if hasattr(block.attn, 'bias'):
246
  block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]
247
 
248
+ # @classmethod
249
+ # def from_pretrained(cls, model_type, override_args=None):
250
+ # assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
251
+ # override_args = override_args or {} # default to empty dict
252
+ # # only dropout can be overridden see more notes below
253
+ # assert all(k == 'dropout' for k in override_args)
254
+ # from transformers import GPT2LMHeadModel
255
+ # print("loading weights from pretrained gpt: %s" % model_type)
256
+
257
+ # # n_layer, n_head and n_embd are determined from model_type
258
+ # config_args = {
259
+ # 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
260
+ # 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
261
+ # 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
262
+ # 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
263
+ # }[model_type]
264
+ # print("forcing vocab_size=50257, block_size=1024, bias=True")
265
+ # config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
266
+ # config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
267
+ # config_args['bias'] = True # always True for GPT model checkpoints
268
+ # # we can override the dropout rate, if desired
269
+ # if 'dropout' in override_args:
270
+ # print(f"overriding dropout rate to {override_args['dropout']}")
271
+ # config_args['dropout'] = override_args['dropout']
272
+ # # create a from-scratch initialized minGPT model
273
+ # config = GPTConfig(**config_args)
274
+ # model = GPT(config)
275
+ # sd = model.state_dict()
276
+ # sd_keys = sd.keys()
277
+ # sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
278
+
279
+ # # init a huggingface/transformers model
280
+ # model_hf = GPT2LMHeadModel.from_pretrained(model_type)
281
+ # sd_hf = model_hf.state_dict()
282
+
283
+ # # copy while ensuring all of the parameters are aligned and match in names and shapes
284
+ # sd_keys_hf = sd_hf.keys()
285
+ # sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
286
+ # sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
287
+ # transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
288
+ # # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
289
+ # # this means that we have to transpose these weights when we import them
290
+ # assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
291
+ # for k in sd_keys_hf:
292
+ # if any(k.endswith(w) for w in transposed):
293
+ # # special treatment for the Conv1D weights we need to transpose
294
+ # assert sd_hf[k].shape[::-1] == sd[k].shape
295
+ # with torch.no_grad():
296
+ # sd[k].copy_(sd_hf[k].t())
297
+ # else:
298
+ # # vanilla copy over the other parameters
299
+ # assert sd_hf[k].shape == sd[k].shape
300
+ # with torch.no_grad():
301
+ # sd[k].copy_(sd_hf[k])
302
+
303
+ # return model
304
 
305
  def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
306
  # start with all of the candidate parameters