{
  "decoder_config": {
    "att_groups": 4,
    "att_heads": 16,
    "att_query_groups": 8,
    "cross_att_type": "sqa",
    "dense_layer_dim": 1536,
    "embed_dim": 512,
    "ff_activation": "silu",
    "ff_dim": 192,
    "ff_dropout": 0.0,
    "final_stateless_layers_config": [
      "moe",
      "moe"
    ],
    "head_norm_type": "rms_norm",
    "moe_bias_mode": "global",
    "moe_grouped_gemm": true,
    "moe_shared_experts_bias_mode": "global",
    "moe_top_k": 10,
    "moe_use_cutlass_grouped_gemm": true,
    "moe_use_weighted_shared_experts": false,
    "num_experts": 384,
    "num_layers": 21,
    "num_shared_experts": 2,
    "rope_base": 100000,
    "router_amp": true,
    "router_dtype": "bfloat16",
    "self_att_type": "sqa",
    "seq_len": 8192,
    "shared_expert_dim": 384,
    "stateless_layers_config": [
      "dense",
      "moe"
    ],
    "stm_size": 4096,
    "use_attention_output_bias": false,
    "use_flash_attention": true,
    "use_gated": true,
    "use_gated_attention": true,
    "use_gated_cross_attention": false,
    "use_head_norm": true,
    "use_moe": true,
    "use_vectorized_moe": true,
    "vocab_size": 65536
  },
  "encoder_config": {
    "att_groups": 8,
    "att_heads": 16,
    "att_query_groups": 8,
    "cross_att_type": "sqa",
    "embed_dim": 512,
    "ff_activation": "silu",
    "ff_dim": 1536,
    "ff_dropout": 0.0,
    "num_layers": 21,
    "rope_base": 100000,
    "self_att_type": "sqa",
    "seq_len": 8192,
    "skip_memory_cross_attention": true,
    "stm_size": 4096,
    "use_attention_output_bias": false,
    "use_flash_attention": true,
    "use_gated": true,
    "use_gated_attention": true,
    "vocab_size": 65536
  },
  "memory_attention_config": {
    "att_groups": 8,
    "att_heads": 16,
    "att_query_groups": 8,
    "att_type": "sqa",
    "embed_dim": 512,
    "interlayer_att_groups": 8,
    "interlayer_att_query_groups": 8,
    "interlayer_att_type": "sqa",
    "norm_type": "classic-rms",
    "num_groups": 3,
    "num_layers": 21,
    "residual_gate_type": "elementwise",
    "residual_per_slot_gate": true,
    "rope_base": 100000,
    "seq_len": 8192,
    "stm_size": 4096,
    "use_flash_attention": false,
    "use_gated_residual": true,
    "use_tanh_residual_gate": false
  },
  "memory_attention_variant": "grouped-self-interlayer",
  "system_prompt_title": "SYSTEM INSTRUCTIONS",
  "tokenizer": null,
  "tokenizer_config": {
    "answer_token_id": 6,
    "bos_token_id": 2,
    "eos_token_id": 3,
    "internal_token_id": 8,
    "pad_token_id": 0,
    "query_token_id": 5,
    "think_token_id": 7,
    "tool_call_token_id": 9,
    "tool_use_token_id": 10
  }
}