{
  "model": {
    "vocab_size": 50257,
    "max_seq_len": 2048,
    "dim": 1024,
    "n_layers": 16,
    "n_heads": 16,
    "hidden_dim": 2736,
    "dropout": 0.0
  },
  "training": {
    "batch_size": 1,
    "gradient_accumulation_steps": 32,
    "max_steps": 50000,
    "warmup_steps": 2000,
    "learning_rate": 0.0003,
    "weight_decay": 0.01,
    "grad_clip": 1.0,
    "mixed_precision": "bf16",
    "gradient_checkpointing": true
  },
  "data": {
    "seq_length": 1024,
    "data_path": "data/tokens/packed_1024.txt"
  },
  "hardware": {
    "device": "cuda",
    "compile_model": false
  },
  "logging": {
    "log_interval": 10,
    "save_interval": 2000,
    "output_dir": "checkpoints"
  }
}