| { | |
| n_layers = 6, | |
| d_model = 128, | |
| d_head = 64, | |
| n_heads = 8, | |
| d_mlp = 512, | |
| d_vocab = 61, | |
| n_ctx = 59, | |
| act_fn="gelu", | |
| normalization_type="LNPre", | |
| } |
| { | |
| n_layers = 6, | |
| d_model = 128, | |
| d_head = 64, | |
| n_heads = 8, | |
| d_mlp = 512, | |
| d_vocab = 61, | |
| n_ctx = 59, | |
| act_fn="gelu", | |
| normalization_type="LNPre", | |
| } |