{ "architectures": [ "GPT2ForCausalLM" ], "batch_size": 8, "checkpoint_dir": "gpt2/checkpoints", "d_model": 768, "dtype": "float32", "grad_accum_steps": 8, "grad_clip": 1.0, "learning_rate": 0.0003, "max_seq_len": 1024, "max_steps": 150000, "min_lr": 3e-05, "model_type": "custom_gpt2", "n_blocks": 12, "n_heads": 12, "resume_from": null, "save_every": 5000, "transformers_version": "4.57.1", "vocab_size": 50257, "warmup_steps": 2000, "weight_decay": 0.1 }