Step3-VL-10B-Base / configuration_step_vl.py
luotingdan
add model deploy
42cc38c
from typing import Any, Optional, Union
from transformers.configuration_utils import PretrainedConfig
from transformers import Qwen3Config
class StepRoboticsVisionEncoderConfig(PretrainedConfig):
    """Hyper-parameter container for the Step vision encoder.

    Every named constructor argument is copied verbatim onto the instance
    under the same name; anything left in ``kwargs`` is handed to
    ``PretrainedConfig.__init__``. No validation is performed here.

    Args:
        width: Stored as ``self.width`` (presumably the encoder hidden
            size -- confirm against the modeling code).
        layers: Stored as ``self.layers`` (presumably the number of
            transformer blocks -- confirm).
        heads: Stored as ``self.heads`` (presumably attention heads per
            block -- confirm).
        num_channels: Stored as ``self.num_channels`` (presumably input
            image channels -- confirm).
        image_size: Stored as ``self.image_size``.
        mlp_ratio: Stored as ``self.mlp_ratio``. The default is the
            quotient ``8960 / 1536`` (about 5.83), i.e. 8960 relative to
            the default ``width``.
        patch_size: Stored as ``self.patch_size``.
        hidden_act: Stored as ``self.hidden_act``; name of the activation.
        layer_norm_eps: Stored as ``self.layer_norm_eps``.
        ues_cls_token: Stored as ``self.ues_cls_token``. The spelling
            (sic) is part of the public interface and must not be
            "corrected" without migrating callers and saved configs.
        use_ln_pre: Stored as ``self.use_ln_pre``.
        use_ln_post: Stored as ``self.use_ln_post``.
        use_abs_posemb: Stored as ``self.use_abs_posemb``.
        use_rope2d: Stored as ``self.use_rope2d``.
        ls_init_value: Stored as ``self.ls_init_value`` (presumably a
            layer-scale initial value -- confirm).
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    def __init__(
        self,
        width=1536,
        layers=47,
        heads=16,
        num_channels=3,
        image_size=728,
        mlp_ratio=8960 / 1536,
        patch_size=14,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        ues_cls_token=False,
        use_ln_pre=True,
        use_ln_post=False,
        use_abs_posemb=True,
        use_rope2d=True,
        ls_init_value=0.1,
        **kwargs,
    ):
        # NOTE: the assignment order below is deliberately left as it has
        # always been, so the instance ``__dict__`` keeps the same layout.

        # Overall encoder dimensions.
        self.width = width
        self.layers = layers
        self.heads = heads

        # Input image description.
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size

        # Per-block numeric settings.
        self.mlp_ratio = mlp_ratio
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

        # Boolean switches, plus ``ls_init_value`` in its historical slot.
        self.ues_cls_token = ues_cls_token
        self.use_ln_pre = use_ln_pre
        self.ls_init_value = ls_init_value
        self.use_ln_post = use_ln_post
        self.use_abs_posemb = use_abs_posemb
        self.use_rope2d = use_rope2d

        # The base class consumes whatever keyword arguments remain.
        super().__init__(**kwargs)
class StepRoboticsConfig(PretrainedConfig):
    """Top-level configuration pairing a vision encoder with a Qwen3 text model.

    Each sub-configuration argument is accepted in three forms:

    * ``None`` -- the corresponding config class is built with its defaults;
    * ``dict`` -- the mapping is expanded as keyword arguments to the class;
    * anything else -- the object is stored as given, without conversion.

    Args:
        vision_config: Settings for ``StepRoboticsVisionEncoderConfig``.
        text_config: Settings for ``Qwen3Config``.
        understand_projector_stride: Stored as
            ``self.understand_projector_stride``.
        projector_bias: Stored as ``self.projector_bias``.
        image_token_id: Stored as ``self.image_token_id`` (presumably the
            vocabulary id of the image placeholder token -- confirm).
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.

    Attributes set in addition to the arguments:
        hidden_size: Copied from the resolved text config's ``hidden_size``.
    """

    model_type = "step_robotics"
    architectures = ["StepVLForConditionalGeneration"]

    def __init__(
        self,
        vision_config: Optional[Union[dict, StepRoboticsVisionEncoderConfig]] = None,
        text_config: Optional[Union[dict, Qwen3Config]] = None,
        understand_projector_stride: int = 2,
        projector_bias: bool = False,
        image_token_id: int = 151679,
        **kwargs,
    ) -> None:
        # Resolve the vision settings into a config object.
        vision_cfg = vision_config
        if isinstance(vision_cfg, dict):
            vision_cfg = StepRoboticsVisionEncoderConfig(**vision_cfg)
        elif vision_cfg is None:
            vision_cfg = StepRoboticsVisionEncoderConfig()
        self.vision_config = vision_cfg

        # Same resolution for the language-model settings.
        text_cfg = text_config
        if isinstance(text_cfg, dict):
            text_cfg = Qwen3Config(**text_cfg)
        elif text_cfg is None:
            text_cfg = Qwen3Config()
        self.text_config = text_cfg

        self.understand_projector_stride = understand_projector_stride
        self.projector_bias = projector_bias
        # Mirror the text model's hidden size at the top level.
        self.hidden_size = text_cfg.hidden_size
        self.image_token_id = image_token_id

        # NOTE(review): a comment at this spot used to mention helping the
        # Auto classes locate the implementation on save/load, yet no
        # statement accompanied it -- confirm whether something such as an
        # ``auto_map`` entry is missing.
        super().__init__(**kwargs)