{
"decoder_config": {
"att_groups": 4,
"att_heads": 16,
"att_query_groups": 8,
"cross_att_type": "sqa",
"dense_layer_dim": 1536,
"embed_dim": 512,
"ff_activation": "silu",
"ff_dim": 192,
"ff_dropout": 0.0,
"final_stateless_layers_config": [
"moe",
"moe"
],
"head_norm_type": "rms_norm",
"moe_bias_mode": "global",
"moe_grouped_gemm": true,
"moe_shared_experts_bias_mode": "global",
"moe_top_k": 10,
"moe_use_cutlass_grouped_gemm": true,
"moe_use_weighted_shared_experts": false,
"num_experts": 384,
"num_layers": 21,
"num_shared_experts": 2,
"rope_base": 100000,
"router_amp": true,
"router_dtype": "bfloat16",
"self_att_type": "sqa",
"seq_len": 8192,
"shared_expert_dim": 384,
"stateless_layers_config": [
"dense",
"moe"
],
"stm_size": 4096,
"use_attention_output_bias": false,
"use_flash_attention": true,
"use_gated": true,
"use_gated_attention": true,
"use_gated_cross_attention": false,
"use_head_norm": true,
"use_moe": true,
"use_vectorized_moe": true,
"vocab_size": 65536
},
"encoder_config": {
"att_groups": 8,
"att_heads": 16,
"att_query_groups": 8,
"cross_att_type": "sqa",
"embed_dim": 512,
"ff_activation": "silu",
"ff_dim": 1536,
"ff_dropout": 0.0,
"num_layers": 21,
"rope_base": 100000,
"self_att_type": "sqa",
"seq_len": 8192,
"skip_memory_cross_attention": true,
"stm_size": 4096,
"use_attention_output_bias": false,
"use_flash_attention": true,
"use_gated": true,
"use_gated_attention": true,
"vocab_size": 65536
},
"memory_attention_config": {
"att_groups": 8,
"att_heads": 16,
"att_query_groups": 8,
"att_type": "sqa",
"embed_dim": 512,
"interlayer_att_groups": 8,
"interlayer_att_query_groups": 8,
"interlayer_att_type": "sqa",
"norm_type": "classic-rms",
"num_groups": 3,
"num_layers": 21,
"residual_gate_type": "elementwise",
"residual_per_slot_gate": true,
"rope_base": 100000,
"seq_len": 8192,
"stm_size": 4096,
"use_flash_attention": false,
"use_gated_residual": true,
"use_tanh_residual_gate": false
},
"memory_attention_variant": "grouped-self-interlayer",
"system_prompt_title": "SYSTEM INSTRUCTIONS",
"tokenizer": null,
"tokenizer_config": {
"answer_token_id": 6,
"bos_token_id": 2,
"eos_token_id": 3,
"internal_token_id": 8,
"pad_token_id": 0,
"query_token_id": 5,
"think_token_id": 7,
"tool_call_token_id": 9,
"tool_use_token_id": 10
}
}