| vocab_size: 50304 | |
| tie_weights: True | |
| mlstm_block: | |
| mlstm: | |
| conv1d_kernel_size: 4 | |
| qkv_proj_blocksize: 4 | |
| num_heads: 4 | |
| slstm_block: | |
| slstm: | |
| backend: vanilla | |
| num_heads: 4 | |
| conv1d_kernel_size: 4 | |
| bias_init: powerlaw_blockdependent | |
| feedforward: | |
| proj_factor: 1.3 | |
| act_fn: gelu | |
| context_length: 256 | |
| num_blocks: 24 | |
| embedding_dim: 768 | |
| slstm_at: [3, 20] | |