{
  "vocab_size": 50257,
  "d_model": 512,
  "n_layers": 8,
  "n_heads": 8,
  "d_latent": 128,
  "n_bits": 3,
  "d_rope": 16,
  "fff_depth": 2,
  "max_seq_len": 2048,
  "batch_size": 8,
  "lr": 0.0003,
  "total_steps": 7500,
  "warmup_steps": 500,
  "grad_clip": 1.0,
  "tokens_target": 5000000000,
  "log_every": 50,
  "save_every": 1000,
  "hf_repo": "alplusplus/maple-attn-test"
}