{
  "model_type": "gpt2",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "n_ctx": 1024,
  "block_size": 1024,
  "vocab_size": 50304,
  "n_layer": 12,
  "n_head": 12,
  "n_embd": 768,
  "val_loss_accum": 3.0538008362054825,
  "train_config": {
    "seed": 1337,
    "step": 19072,
    "total_batch_size": 524288,
    "micro_batch_size": 32,
    "sequence_length": 1024,
    "max_lr": 0.0006,
    "min_lr_ratio": 0.1,
    "warmup_steps": 715,
    "max_steps": 19073,
    "eval_steps": 250,
    "checkpoint_steps": 5000,
    "checkpoint_dir": "checkpoints",
    "log_file": "train_2025-04-06_01-53-23.log"
  },
  "dataset_config": {
    "dataset_dir": "dataset_cache",
    "dataset_name": "finewebedu",
    "micro_batch_size": 32,
    "sequence_length": 1024
  },
  "task_specific_params": {
    "eval_config": {
      "validation_steps": 20,
      "hellaswag_samples": 250
    },
    "sample_config": {
      "num_return_sequences": 5,
      "max_length": 30,
      "text": "Hello, I'm a language model,",
      "seed": 42
    }
  }
}