{
  "mlstm_block": {
    "mlstm": {
      "proj_factor": 2.0,
      "round_proj_up_dim_up": true,
      "round_proj_up_to_multiple_of": 64,
      "_proj_up_dim": 768,
      "conv1d_kernel_size": 4,
      "qkv_proj_blocksize": 32,
      "num_heads": 4,
      "embedding_dim": 384,
      "bias": false,
      "dropout": 0.0,
      "context_length": 256,
      "_num_blocks": 6,
      "_inner_embedding_dim": 768
    },
    "_num_blocks": 6,
    "_block_idx": null
  },
  "slstm_block": {
    "slstm": {
      "hidden_size": 384,
      "num_heads": 4,
      "num_states": 4,
      "backend": "vanilla",
      "function": "slstm",
      "bias_init": "powerlaw_blockdependent",
      "recurrent_weight_init": "zeros",
      "_block_idx": null,
      "_num_blocks": 6,
      "num_gates": 4,
      "gradient_recurrent_clipval": null,
      "forward_clipval": null,
      "batch_size": 8,
      "input_shape": "BSGNH",
      "internal_input_shape": "SBNGH",
      "output_shape": "BNSH",
      "dtype": "bfloat16",
      "dtype_b": "float32",
      "dtype_r": "bfloat16",
      "dtype_w": "bfloat16",
      "dtype_g": "bfloat16",
      "dtype_s": "bfloat16",
      "dtype_a": "float32",
      "initial_val": 0.0,
      "enable_automatic_mixed_precision": true,
      "embedding_dim": 384,
      "conv1d_kernel_size": 4,
      "group_norm_weight": true,
      "dropout": 0.0
    },
    "feedforward": {
      "proj_factor": 1.3,
      "round_proj_up_dim_up": true,
      "round_proj_up_to_multiple_of": 64,
      "_proj_up_dim": 0,
      "act_fn": "swish",
      "embedding_dim": -1,
      "dropout": 0.0,
      "bias": false,
      "ff_type": "ffn_gated",
      "_num_blocks": 1
    },
    "_num_blocks": 6,
    "_block_idx": null
  },
  "context_length": 256,
  "num_blocks": 6,
  "embedding_dim": 384,
  "add_post_blocks_norm": true,
  "bias": false,
  "dropout": 0.0,
  "slstm_at": [],
  "_block_map": "0,0,0,0,0,0",
  "vocab_size": 49152,
  "tie_weights": false,
  "weight_decay_on_embedding": false,
  "add_embedding_dropout": false,
  "pad_token_id": 0
}