| { |
| "att_dropout": 0.0, |
| "att_experts": null, |
| "att_groups": 8, |
| "att_heads": 16, |
| "att_query_experts": null, |
| "att_query_groups": 8, |
| "att_type": "sqa", |
| "debug_interval": 10, |
| "debug_mode": false, |
| "embed_dim": 512, |
| "interlayer_att_dropout": 0.0, |
| "interlayer_att_experts": null, |
| "interlayer_att_groups": 8, |
| "interlayer_att_query_experts": null, |
| "interlayer_att_query_groups": 8, |
| "interlayer_att_type": "sqa", |
| "norm_decay": 0.9, |
| "norm_init_gate": -2.0, |
| "norm_per_dim_scale": false, |
| "norm_type": "classic-rms", |
| "num_groups": 3, |
| "num_layers": 21, |
| "residual_gate_init": 3.0, |
| "residual_gate_slot_status_type": "mean", |
| "residual_gate_type": "elementwise", |
| "residual_per_slot_gate": true, |
| "rope_base": 100000, |
| "seq_len": 8192, |
| "stm_size": 4096, |
| "use_flash_attention": true, |
| "use_gated_residual": true, |
| "use_tanh_residual_gate": false |
| } |