{
  "dim": 4096,
  "n_layers": 36,
  "head_dim": 128,
  "hidden_dim": 12288,
  "n_heads": 32,
  "n_kv_heads": 32,
  "rope_theta": 10000.0,
  "norm_eps": 1e-06,
  "vocab_size": 131072,
  "tied_embeddings": false,
  "max_position_embeddings": 1048576,
  "llama_4_scaling": {
    "original_max_position_embeddings": 8192,
    "beta": 0.1
  },
  "q_lora_rank": 1024,
  "qk_rope_head_dim": 64,
  "qk_nope_head_dim": 64,
  "kv_lora_rank": 256,
  "v_head_dim": 128,
  "quantization": {
    "qformat_weight": "fp8_e4m3",
    "qscheme_act": "TENSOR"
  },
  "yarn": {
    "original_max_position_embeddings": 8192,
    "factor": 128,
    "apply_scale": false,
    "beta": 32,
    "alpha": 1
  },
  "moe": {
    "expert_parallel": 1,
    "expert_model_parallel": 1,
    "route_every_n": 1,
    "first_k_dense_replace": 0,
    "num_experts": 128,
    "num_experts_per_tok": 4,
    "num_expert_groups": 1,
    "num_expert_groups_per_tok": 1,
    "routed_scale": 1.0,
    "expert_hidden_dim": 2048,
    "num_shared_experts": 1
  },
  "vision_encoder": {
    "image_token_id": 10,
    "image_break_token_id": 12,
    "image_end_token_id": 13,
    "intermediate_size": 4096,
    "num_hidden_layers": 24,
    "num_attention_heads": 16,
    "mm_projector_id": "patch_merge",
    "spatial_merge_size": 2,
    "hidden_size": 1024,
    "num_channels": 3,
    "image_size": 1540,
    "max_image_size": 1540,
    "patch_size": 14,
    "rope_theta": 10000.0,
    "add_pre_mm_projector_layer_norm": true,
    "adapter_bias": false
  }
}