{
"decoder_config": {
"att_groups": 4,
"att_heads": 16,
"att_query_groups": 8,
"cross_att_type": "sqa",
"dense_layer_dim": 1536,
"embed_dim": 512,
"ff_activation": "silu",
"ff_dim": 192,
"ff_dropout": 0.0,
"final_stateless_layers_config": [
"moe",
"moe"
],
"head_norm_type": "rms_norm",
"moe_bias_mode": "global",
"moe_grouped_gemm": true,
"moe_shared_experts_bias_mode": "global",
"moe_top_k": 10,
"moe_use_cutlass_grouped_gemm": true,
"moe_use_weighted_shared_experts": false,
"num_experts": 384,
"num_layers": 21,
"num_shared_experts": 2,
"rope_base": 100000,
"router_amp": true,
"router_dtype": "bfloat16",
"self_att_type": "sqa",
"seq_len": 8192,
"shared_expert_dim": 384,
"stateless_layers_config": [
"dense",
"moe"
],
"stm_size": 4096,
"use_attention_output_bias": false,
"use_flash_attention": true,
"use_gated": true,
"use_gated_attention": true,
"use_gated_cross_attention": false,
"use_head_norm": true,
"use_moe": true,
"use_vectorized_moe": true,
"vocab_size": 65536
},
"encoder_config": {
"att_groups": 8,
"att_heads": 16,
"att_query_groups": 8,
"cross_att_type": "sqa",
"embed_dim": 512,
"ff_activation": "silu",
"ff_dim": 1536,
"ff_dropout": 0.0,
"num_layers": 21,
"rope_base": 100000,
"self_att_type": "sqa",
"seq_len": 8192,
"skip_memory_cross_attention": true,
"stm_size": 4096,
"use_attention_output_bias": false,
"use_flash_attention": true,
"use_gated": true,
"use_gated_attention": true,
"vocab_size": 65536
},
"memory_attention_config": {
"att_groups": 8,
"att_heads": 16,
"att_query_groups": 8,
"att_type": "sqa",
"embed_dim": 512,
"interlayer_att_groups": 8,
"interlayer_att_query_groups": 8,
"interlayer_att_type": "sqa",
"norm_type": "classic-rms",
"num_groups": 3,
"num_layers": 21,
"residual_gate_type": "elementwise",
"residual_per_slot_gate": true,
"rope_base": 100000,
"seq_len": 8192,
"stm_size": 4096,
"use_flash_attention": false,
"use_gated_residual": true,
"use_tanh_residual_gate": false
},
"memory_attention_variant": "grouped-self-interlayer",
"system_prompt_title": "SYSTEM INSTRUCTIONS",
"tokenizer": null,
"tokenizer_config": {
"answer_token_id": 6,
"bos_token_id": 2,
"eos_token_id": 3,
"internal_token_id": 8,
"pad_token_id": 0,
"query_token_id": 5,
"think_token_id": 7,
"tool_call_token_id": 9,
"tool_use_token_id": 10
}
}