| { |
| "architectures": [ |
| "StepVLForConditionalGeneration" |
| ], |
| "auto_map": { |
| "AutoConfig": "configuration_step_vl.StepRoboticsConfig", |
| "AutoModelForCausalLM": "modeling_step_vl.Step3VL10BForCausalLM" |
| }, |
| "dtype": "bfloat16", |
| "hidden_size": 4096, |
| "im_end_token": "<im_end>", |
| "im_patch_token": "<im_patch>", |
| "im_start_token": "<im_start>", |
| "image_token_id": 151679, |
| "image_token_len": 169, |
| "model_type": "step_robotics", |
| "patch_token_len": 81, |
| "projector_bias": false, |
| "quantization_config": { |
| "activation_scheme": "dynamic", |
| "modules_to_not_convert": [ |
| "vision_model.transformer.resblocks.0.attn.qkv_proj", |
| "vision_model.transformer.resblocks.0.attn.out_proj", |
| "vision_model.transformer.resblocks.0.mlp.c_fc", |
| "vision_model.transformer.resblocks.0.mlp.c_proj", |
| "vision_model.transformer.resblocks.1.attn.qkv_proj", |
| "vision_model.transformer.resblocks.1.attn.out_proj", |
| "vision_model.transformer.resblocks.1.mlp.c_fc", |
| "vision_model.transformer.resblocks.1.mlp.c_proj", |
| "vision_model.transformer.resblocks.2.attn.qkv_proj", |
| "vision_model.transformer.resblocks.2.attn.out_proj", |
| "vision_model.transformer.resblocks.2.mlp.c_fc", |
| "vision_model.transformer.resblocks.2.mlp.c_proj", |
| "vision_model.transformer.resblocks.3.attn.qkv_proj", |
| "vision_model.transformer.resblocks.3.attn.out_proj", |
| "vision_model.transformer.resblocks.3.mlp.c_fc", |
| "vision_model.transformer.resblocks.3.mlp.c_proj", |
| "vision_model.transformer.resblocks.4.attn.qkv_proj", |
| "vision_model.transformer.resblocks.4.attn.out_proj", |
| "vision_model.transformer.resblocks.4.mlp.c_fc", |
| "vision_model.transformer.resblocks.4.mlp.c_proj", |
| "vision_model.transformer.resblocks.5.attn.qkv_proj", |
| "vision_model.transformer.resblocks.5.attn.out_proj", |
| "vision_model.transformer.resblocks.5.mlp.c_fc", |
| "vision_model.transformer.resblocks.5.mlp.c_proj", |
| "vision_model.transformer.resblocks.6.attn.qkv_proj", |
| "vision_model.transformer.resblocks.6.attn.out_proj", |
| "vision_model.transformer.resblocks.6.mlp.c_fc", |
| "vision_model.transformer.resblocks.6.mlp.c_proj", |
| "vision_model.transformer.resblocks.7.attn.qkv_proj", |
| "vision_model.transformer.resblocks.7.attn.out_proj", |
| "vision_model.transformer.resblocks.7.mlp.c_fc", |
| "vision_model.transformer.resblocks.7.mlp.c_proj", |
| "vision_model.transformer.resblocks.8.attn.qkv_proj", |
| "vision_model.transformer.resblocks.8.attn.out_proj", |
| "vision_model.transformer.resblocks.8.mlp.c_fc", |
| "vision_model.transformer.resblocks.8.mlp.c_proj", |
| "vision_model.transformer.resblocks.9.attn.qkv_proj", |
| "vision_model.transformer.resblocks.9.attn.out_proj", |
| "vision_model.transformer.resblocks.9.mlp.c_fc", |
| "vision_model.transformer.resblocks.9.mlp.c_proj", |
| "vision_model.transformer.resblocks.10.attn.qkv_proj", |
| "vision_model.transformer.resblocks.10.attn.out_proj", |
| "vision_model.transformer.resblocks.10.mlp.c_fc", |
| "vision_model.transformer.resblocks.10.mlp.c_proj", |
| "vision_model.transformer.resblocks.11.attn.qkv_proj", |
| "vision_model.transformer.resblocks.11.attn.out_proj", |
| "vision_model.transformer.resblocks.11.mlp.c_fc", |
| "vision_model.transformer.resblocks.11.mlp.c_proj", |
| "vision_model.transformer.resblocks.12.attn.qkv_proj", |
| "vision_model.transformer.resblocks.12.attn.out_proj", |
| "vision_model.transformer.resblocks.12.mlp.c_fc", |
| "vision_model.transformer.resblocks.12.mlp.c_proj", |
| "vision_model.transformer.resblocks.13.attn.qkv_proj", |
| "vision_model.transformer.resblocks.13.attn.out_proj", |
| "vision_model.transformer.resblocks.13.mlp.c_fc", |
| "vision_model.transformer.resblocks.13.mlp.c_proj", |
| "vision_model.transformer.resblocks.14.attn.qkv_proj", |
| "vision_model.transformer.resblocks.14.attn.out_proj", |
| "vision_model.transformer.resblocks.14.mlp.c_fc", |
| "vision_model.transformer.resblocks.14.mlp.c_proj", |
| "vision_model.transformer.resblocks.15.attn.qkv_proj", |
| "vision_model.transformer.resblocks.15.attn.out_proj", |
| "vision_model.transformer.resblocks.15.mlp.c_fc", |
| "vision_model.transformer.resblocks.15.mlp.c_proj", |
| "vision_model.transformer.resblocks.16.attn.qkv_proj", |
| "vision_model.transformer.resblocks.16.attn.out_proj", |
| "vision_model.transformer.resblocks.16.mlp.c_fc", |
| "vision_model.transformer.resblocks.16.mlp.c_proj", |
| "vision_model.transformer.resblocks.17.attn.qkv_proj", |
| "vision_model.transformer.resblocks.17.attn.out_proj", |
| "vision_model.transformer.resblocks.17.mlp.c_fc", |
| "vision_model.transformer.resblocks.17.mlp.c_proj", |
| "vision_model.transformer.resblocks.18.attn.qkv_proj", |
| "vision_model.transformer.resblocks.18.attn.out_proj", |
| "vision_model.transformer.resblocks.18.mlp.c_fc", |
| "vision_model.transformer.resblocks.18.mlp.c_proj", |
| "vision_model.transformer.resblocks.19.attn.qkv_proj", |
| "vision_model.transformer.resblocks.19.attn.out_proj", |
| "vision_model.transformer.resblocks.19.mlp.c_fc", |
| "vision_model.transformer.resblocks.19.mlp.c_proj", |
| "vision_model.transformer.resblocks.20.attn.qkv_proj", |
| "vision_model.transformer.resblocks.20.attn.out_proj", |
| "vision_model.transformer.resblocks.20.mlp.c_fc", |
| "vision_model.transformer.resblocks.20.mlp.c_proj", |
| "vision_model.transformer.resblocks.21.attn.qkv_proj", |
| "vision_model.transformer.resblocks.21.attn.out_proj", |
| "vision_model.transformer.resblocks.21.mlp.c_fc", |
| "vision_model.transformer.resblocks.21.mlp.c_proj", |
| "vision_model.transformer.resblocks.22.attn.qkv_proj", |
| "vision_model.transformer.resblocks.22.attn.out_proj", |
| "vision_model.transformer.resblocks.22.mlp.c_fc", |
| "vision_model.transformer.resblocks.22.mlp.c_proj", |
| "vision_model.transformer.resblocks.23.attn.qkv_proj", |
| "vision_model.transformer.resblocks.23.attn.out_proj", |
| "vision_model.transformer.resblocks.23.mlp.c_fc", |
| "vision_model.transformer.resblocks.23.mlp.c_proj", |
| "vision_model.transformer.resblocks.24.attn.qkv_proj", |
| "vision_model.transformer.resblocks.24.attn.out_proj", |
| "vision_model.transformer.resblocks.24.mlp.c_fc", |
| "vision_model.transformer.resblocks.24.mlp.c_proj", |
| "vision_model.transformer.resblocks.25.attn.qkv_proj", |
| "vision_model.transformer.resblocks.25.attn.out_proj", |
| "vision_model.transformer.resblocks.25.mlp.c_fc", |
| "vision_model.transformer.resblocks.25.mlp.c_proj", |
| "vision_model.transformer.resblocks.26.attn.qkv_proj", |
| "vision_model.transformer.resblocks.26.attn.out_proj", |
| "vision_model.transformer.resblocks.26.mlp.c_fc", |
| "vision_model.transformer.resblocks.26.mlp.c_proj", |
| "vision_model.transformer.resblocks.27.attn.qkv_proj", |
| "vision_model.transformer.resblocks.27.attn.out_proj", |
| "vision_model.transformer.resblocks.27.mlp.c_fc", |
| "vision_model.transformer.resblocks.27.mlp.c_proj", |
| "vision_model.transformer.resblocks.28.attn.qkv_proj", |
| "vision_model.transformer.resblocks.28.attn.out_proj", |
| "vision_model.transformer.resblocks.28.mlp.c_fc", |
| "vision_model.transformer.resblocks.28.mlp.c_proj", |
| "vision_model.transformer.resblocks.29.attn.qkv_proj", |
| "vision_model.transformer.resblocks.29.attn.out_proj", |
| "vision_model.transformer.resblocks.29.mlp.c_fc", |
| "vision_model.transformer.resblocks.29.mlp.c_proj", |
| "vision_model.transformer.resblocks.30.attn.qkv_proj", |
| "vision_model.transformer.resblocks.30.attn.out_proj", |
| "vision_model.transformer.resblocks.30.mlp.c_fc", |
| "vision_model.transformer.resblocks.30.mlp.c_proj", |
| "vision_model.transformer.resblocks.31.attn.qkv_proj", |
| "vision_model.transformer.resblocks.31.attn.out_proj", |
| "vision_model.transformer.resblocks.31.mlp.c_fc", |
| "vision_model.transformer.resblocks.31.mlp.c_proj", |
| "vision_model.transformer.resblocks.32.attn.qkv_proj", |
| "vision_model.transformer.resblocks.32.attn.out_proj", |
| "vision_model.transformer.resblocks.32.mlp.c_fc", |
| "vision_model.transformer.resblocks.32.mlp.c_proj", |
| "vision_model.transformer.resblocks.33.attn.qkv_proj", |
| "vision_model.transformer.resblocks.33.attn.out_proj", |
| "vision_model.transformer.resblocks.33.mlp.c_fc", |
| "vision_model.transformer.resblocks.33.mlp.c_proj", |
| "vision_model.transformer.resblocks.34.attn.qkv_proj", |
| "vision_model.transformer.resblocks.34.attn.out_proj", |
| "vision_model.transformer.resblocks.34.mlp.c_fc", |
| "vision_model.transformer.resblocks.34.mlp.c_proj", |
| "vision_model.transformer.resblocks.35.attn.qkv_proj", |
| "vision_model.transformer.resblocks.35.attn.out_proj", |
| "vision_model.transformer.resblocks.35.mlp.c_fc", |
| "vision_model.transformer.resblocks.35.mlp.c_proj", |
| "vision_model.transformer.resblocks.36.attn.qkv_proj", |
| "vision_model.transformer.resblocks.36.attn.out_proj", |
| "vision_model.transformer.resblocks.36.mlp.c_fc", |
| "vision_model.transformer.resblocks.36.mlp.c_proj", |
| "vision_model.transformer.resblocks.37.attn.qkv_proj", |
| "vision_model.transformer.resblocks.37.attn.out_proj", |
| "vision_model.transformer.resblocks.37.mlp.c_fc", |
| "vision_model.transformer.resblocks.37.mlp.c_proj", |
| "vision_model.transformer.resblocks.38.attn.qkv_proj", |
| "vision_model.transformer.resblocks.38.attn.out_proj", |
| "vision_model.transformer.resblocks.38.mlp.c_fc", |
| "vision_model.transformer.resblocks.38.mlp.c_proj", |
| "vision_model.transformer.resblocks.39.attn.qkv_proj", |
| "vision_model.transformer.resblocks.39.attn.out_proj", |
| "vision_model.transformer.resblocks.39.mlp.c_fc", |
| "vision_model.transformer.resblocks.39.mlp.c_proj", |
| "vision_model.transformer.resblocks.40.attn.qkv_proj", |
| "vision_model.transformer.resblocks.40.attn.out_proj", |
| "vision_model.transformer.resblocks.40.mlp.c_fc", |
| "vision_model.transformer.resblocks.40.mlp.c_proj", |
| "vision_model.transformer.resblocks.41.attn.qkv_proj", |
| "vision_model.transformer.resblocks.41.attn.out_proj", |
| "vision_model.transformer.resblocks.41.mlp.c_fc", |
| "vision_model.transformer.resblocks.41.mlp.c_proj", |
| "vision_model.transformer.resblocks.42.attn.qkv_proj", |
| "vision_model.transformer.resblocks.42.attn.out_proj", |
| "vision_model.transformer.resblocks.42.mlp.c_fc", |
| "vision_model.transformer.resblocks.42.mlp.c_proj", |
| "vision_model.transformer.resblocks.43.attn.qkv_proj", |
| "vision_model.transformer.resblocks.43.attn.out_proj", |
| "vision_model.transformer.resblocks.43.mlp.c_fc", |
| "vision_model.transformer.resblocks.43.mlp.c_proj", |
| "vision_model.transformer.resblocks.44.attn.qkv_proj", |
| "vision_model.transformer.resblocks.44.attn.out_proj", |
| "vision_model.transformer.resblocks.44.mlp.c_fc", |
| "vision_model.transformer.resblocks.44.mlp.c_proj", |
| "vision_model.transformer.resblocks.45.attn.qkv_proj", |
| "vision_model.transformer.resblocks.45.attn.out_proj", |
| "vision_model.transformer.resblocks.45.mlp.c_fc", |
| "vision_model.transformer.resblocks.45.mlp.c_proj", |
| "vision_model.transformer.resblocks.46.attn.qkv_proj", |
| "vision_model.transformer.resblocks.46.attn.out_proj", |
| "vision_model.transformer.resblocks.46.mlp.c_fc", |
| "vision_model.transformer.resblocks.46.mlp.c_proj", |
| "vit_large_projector", |
| "lm_head" |
| ], |
| "quant_method": "fp8", |
| "weight_block_size": [ |
| 128, |
| 128 |
| ] |
| }, |
| "text_config": { |
| "architectures": [ |
| "Qwen3ForCausalLM" |
| ], |
| "attention_bias": false, |
| "attention_dropout": 0.0, |
| "bos_token_id": 151643, |
| "dtype": "bfloat16", |
| "eos_token_id": [ |
| 151643, |
| 151645, |
| 151679 |
| ], |
| "head_dim": 128, |
| "hidden_act": "silu", |
| "hidden_size": 4096, |
| "initializer_range": 0.02, |
| "intermediate_size": 12288, |
| "layer_types": [ |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention", |
| "full_attention" |
| ], |
| "max_position_embeddings": 65536, |
| "max_window_layers": 36, |
| "model_type": "qwen3", |
| "num_attention_heads": 32, |
| "num_hidden_layers": 36, |
| "num_key_value_heads": 8, |
| "rms_norm_eps": 1e-06, |
| "rope_scaling": null, |
| "rope_theta": 1000000, |
| "sliding_window": null, |
| "use_cache": true, |
| "use_sliding_window": false, |
| "vocab_size": 151936 |
| }, |
| "transformers_version": "4.57.0", |
| "understand_projector_stride": 2, |
| "use_im_start_end": true, |
| "vision_config": { |
| "heads": 16, |
| "hidden_act": "quick_gelu", |
| "image_size": 728, |
| "layer_norm_eps": 1e-05, |
| "layers": 47, |
| "ls_init_value": 0.1, |
| "mlp_ratio": 5.833333333333333, |
| "model_type": "", |
| "num_channels": 3, |
| "output_dim": null, |
| "patch_size": 14, |
| "pool_type": "none", |
| "ues_cls_token": false, |
| "use_abs_posemb": true, |
| "use_cls_token": false, |
| "use_ln_post": false, |
| "use_ln_pre": true, |
| "use_rope2d": true, |
| "width": 1536 |
| }, |
| "vision_select_layer": -1 |
| } |
|
|