{
  "att_dropout": 0.0,
  "att_experts": null,
  "att_groups": 8,
  "att_heads": 16,
  "att_query_experts": null,
  "att_query_groups": 8,
  "att_type": "sqa",
  "debug_interval": 10,
  "debug_mode": false,
  "embed_dim": 512,
  "interlayer_att_dropout": 0.0,
  "interlayer_att_experts": null,
  "interlayer_att_groups": 8,
  "interlayer_att_query_experts": null,
  "interlayer_att_query_groups": 8,
  "interlayer_att_type": "sqa",
  "norm_decay": 0.9,
  "norm_init_gate": -2.0,
  "norm_per_dim_scale": false,
  "norm_type": "classic-rms",
  "num_groups": 3,
  "num_layers": 21,
  "residual_gate_init": 3.0,
  "residual_gate_slot_status_type": "mean",
  "residual_gate_type": "elementwise",
  "residual_per_slot_gate": true,
  "rope_base": 100000,
  "seq_len": 8192,
  "stm_size": 4096,
  "use_flash_attention": true,
  "use_gated_residual": true,
  "use_tanh_residual_gate": false
}