# Default training configuration (nanoGPT-style GPT-2 124M on OpenWebText).
# These module-level names are the public interface: an external configurator
# overrides them by name, so do not rename any of them (including `compile`,
# which intentionally shadows the builtin to match the consumer's API).

# I/O
out_dir = 'out'
eval_interval = 2000
log_interval = 1
eval_iters = 200
eval_only = False  # if True: run one eval pass, then exit
always_save_checkpoint = True  # if True: save a checkpoint after each eval
init_from = 'scratch'  # 'scratch' | 'resume' | a GPT-2 variant name

# wandb logging
wandb_log = False  # disabled by default
wandb_project = 'owt'
wandb_run_name = 'gpt2'

# data
dataset = 'openwebtext'
gradient_accumulation_steps = 5 * 8  # used to simulate larger batch sizes
batch_size = 12  # micro-batch size per forward/backward pass
block_size = 1024  # context length in tokens

# model (GPT-2 124M)
n_layer = 12
n_head = 12
n_embd = 768
dropout = 0.0  # 0 for pretraining; try 0.1+ when finetuning
bias = False  # bias in LayerNorm and Linear layers

# AdamW optimizer
learning_rate = 6e-4  # max learning rate
max_iters = 600000  # total number of training iterations
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0  # clip gradients at this value; 0.0 disables

# learning-rate decay schedule (cosine with warmup)
decay_lr = True
warmup_iters = 2000
lr_decay_iters = 600000  # usually set ~= max_iters
min_lr = 6e-5  # usually learning_rate / 10

# DDP / system
backend = 'nccl'  # distributed backend: 'nccl', 'gloo', etc.
device = 'cuda'
dtype = 'bfloat16'  # 'float32' | 'bfloat16' | 'float16'
compile = True  # use torch.compile for speed (requires PyTorch 2.0)