{
  "model": {
    "n_layer": 24,
    "n_head": 16,
    "n_embd": 1024
  },
  "block_size": 1024,
  "training": {
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 8,
    "num_train_epochs": 1,
    "learning_rate": 0.0002,
    "weight_decay": 0.01,
    "fp16": true,
    "logging_steps": 100,
    "save_steps": 1000
  }
}