```python
# I/O
model_config = 'gpt2'            # model configuration name
log_file = 'logs/gpt2.log'       # training log file
out_dir = 'out/gpt2'             # checkpoint / output directory
eval_interval = 500              # evaluate on the val split every 500 iters
log_interval = 1                 # log training loss every iteration
eval_iters = 500                 # batches used to estimate val loss
eval_only = False                # if True, run one evaluation and exit
always_save_checkpoint = False   # only checkpoint when val loss improves
init_from = 'scratch'            # train from randomly initialized weights

# weights & biases logging
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'gpt2'

# data
dataset = 'openwebtext'
gradient_accumulation_steps = 64 # micro-batches accumulated per optimizer step
batch_size = 1                   # micro-batch size (sequences per forward pass)
block_size = 1024                # context length in tokens

# model
n_layer = 36                     # transformer layers
n_head = 16                      # attention heads per layer
n_embd = 1024                    # embedding width
vocab_size = 50304               # GPT-2's 50257 tokens, padded up to a multiple of 64
dropout = 0.0                    # no dropout for from-scratch pretraining
bias = False                     # no bias terms in LayerNorm / Linear layers

# AdamW optimizer
learning_rate = 3e-4             # peak learning rate
max_iters = 50000                # total optimizer steps
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0                  # clip gradients at this global norm

# learning-rate schedule
decay_lr = True                  # decay the learning rate over training
warmup_iters = 1000              # linear warmup steps
lr_decay_iters = 50000           # decay horizon (usually == max_iters)
min_lr = 3e-5                    # floor, ~ learning_rate / 10

# system
backend = 'nccl'                 # DDP backend
device = 'cuda'
dtype = 'float16'                # fp16 needs a grad scaler; 'bfloat16' avoids one
compile = True                   # use torch.compile on the model
```
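
With these settings, each optimizer step processes `gradient_accumulation_steps * batch_size * block_size` = 64 × 1 × 1024 = 65,536 tokens per process (multiply by the number of GPUs under DDP), so the full 50,000-iteration run sees roughly 3.3B tokens. A quick back-of-the-envelope check:

```python
# Throughput implied by this config (single process; scale by world size for DDP).
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024
max_iters = 50000

tokens_per_iter = gradient_accumulation_steps * batch_size * block_size
print(f"tokens per optimizer step: {tokens_per_iter:,}")              # 65,536
print(f"tokens over full run:      {tokens_per_iter * max_iters:,}")  # 3,276,800,000
```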
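
The schedule settings imply a linear warmup to the 3e-4 peak over the first 1,000 iterations, then a decay down to `min_lr` = 3e-5 by iteration 50,000. The config alone doesn't pin down the decay shape; below is a minimal sketch assuming cosine decay, the form used by nanoGPT-style training loops:

```python
import math

learning_rate = 3e-4
min_lr = 3e-5
warmup_iters = 1000
lr_decay_iters = 50000

def get_lr(it: int) -> float:
    """Learning rate at iteration `it`: linear warmup, then cosine decay."""
    # Linear warmup from 0 up to the peak learning rate.
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # Past the decay horizon, hold at the floor.
    if it > lr_decay_iters:
        return min_lr
    # Cosine decay from learning_rate down to min_lr in between.
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # goes 1 -> 0
    return min_lr + coeff * (learning_rate - min_lr)
```

Sanity-checking the endpoints: `get_lr(1000)` returns the 3e-4 peak and `get_lr(50000)` returns the 3e-5 floor, matching the config.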