{
  "depth": 8,
  "vocab_size": 186,
  "max_seq_len": 4096,
  "mlp_type": "relu2",
  "logit_cap": 15.0,
  "use_residual_scalars": true,
  "learning_rate": 0.001,
  "warmup_frac": 0.02,
  "weight_decay": 0.1,
  "grad_clip": 1.0,
  "num_epochs": 20
}
{
  "depth": 8,
  "vocab_size": 186,
  "max_seq_len": 4096,
  "mlp_type": "relu2",
  "logit_cap": 15.0,
  "use_residual_scalars": true,
  "learning_rate": 0.001,
  "warmup_frac": 0.02,
  "weight_decay": 0.1,
  "grad_clip": 1.0,
  "num_epochs": 20
}