{
"__config_type__": "robo_orchard_lab.models.bip3d.structure:BIP3DConfig",
"class_type": "robo_orchard_lab.models.bip3d.structure:BIP3D",
"backbone": {
"type": "robo_orchard_lab.models.modules.swin_transformer:SwinTransformer",
"embed_dims": 96,
"depths": [
2,
2,
6,
2
],
"num_heads": [
3,
6,
12,
24
],
"window_size": 7,
"mlp_ratio": 4,
"qkv_bias": true,
"qk_scale": null,
"drop_rate": 0.0,
"attn_drop_rate": 0.0,
"out_indices": [
1,
2,
3
],
"with_cp": true,
"convert_weights": false
},
"decoder": {
"type": "robo_orchard_lab.models.sem_modules.action_decoder:SEMActionDecoder",
"img_cross_attn": {
"type": "robo_orchard_lab.models.sem_modules.layers:RotaryAttention",
"embed_dims": 256,
"num_heads": 8,
"max_position_embeddings": 32
},
"norm_layer": {
"type": "torch.nn.modules.normalization:RMSNorm",
"normalized_shape": 256
},
"ffn": {
"type": "robo_orchard_lab.models.layers.transformer_layers:FFN",
"embed_dims": 256,
"feedforward_channels": 2048,
"act_cfg": {
"type": "torch.nn.modules.activation:SiLU",
"inplace": true
}
},
"head": {
"type": "robo_orchard_lab.models.sem_modules.layers:UpsampleHead",
"upsample_sizes": [
16,
32,
64
],
"input_dim": 256,
"dims": [
128,
64,
8
],
"norm": {
"type": "torch.nn.modules.normalization:RMSNorm",
"normalized_shape": 256
},
"act": {
"type": "torch.nn.modules.activation:SiLU",
"inplace": true
},
"norm_act_idx": [
0,
1,
2
]
},
"training_noise_scheduler": {
"type": "diffusers.schedulers.scheduling_ddpm:DDPMScheduler",
"num_train_timesteps": 1000,
"beta_schedule": "squaredcos_cap_v2",
"prediction_type": "sample",
"clip_sample": false
},
"test_noise_scheduler": {
"type": "diffusers.schedulers.scheduling_dpmsolver_multistep:DPMSolverMultistepScheduler",
"num_train_timesteps": 1000,
"beta_schedule": "squaredcos_cap_v2",
"prediction_type": "sample"
},
"num_inference_timesteps": 10,
"joint_self_attn": {
"type": "robo_orchard_lab.models.sem_modules.layers:JointGraphAttention",
"embed_dims": 256,
"num_heads": 8
},
"temp_cross_attn": {
"type": "robo_orchard_lab.models.sem_modules.layers:RotaryAttention",
"embed_dims": 256,
"num_heads": 8,
"max_position_embeddings": 32
},
"text_cross_attn": {
"type": "robo_orchard_lab.models.sem_modules.layers:RotaryAttention",
"embed_dims": 256,
"num_heads": 8,
"max_position_embeddings": 256
},
"pred_steps": 64,
"timestep_norm_layer": {
"type": "robo_orchard_lab.models.sem_modules.layers:AdaRMSNorm",
"normalized_shape": 256,
"condition_dims": 256,
"zero": true
},
"operation_order": [
"t_norm",
"joint_self_attn",
"gate_msa",
"norm",
"temp_cross_attn",
"norm",
"img_cross_attn",
"norm",
null,
null,
"scale_shift",
"ffn",
"gate_mlp",
"t_norm",
"joint_self_attn",
"gate_msa",
"norm",
"temp_cross_attn",
"norm",
"img_cross_attn",
"norm",
null,
null,
"scale_shift",
"ffn",
"gate_mlp",
"t_norm",
"joint_self_attn",
"gate_msa",
"norm",
"temp_cross_attn",
"norm",
"img_cross_attn",
"norm",
null,
null,
"scale_shift",
"ffn",
"gate_mlp",
"t_norm",
"joint_self_attn",
"gate_msa",
"norm",
"temp_cross_attn",
"norm",
"img_cross_attn",
"norm",
null,
null,
"scale_shift",
"ffn",
"gate_mlp",
"t_norm",
"joint_self_attn",
"gate_msa",
"norm",
"temp_cross_attn",
"norm",
"img_cross_attn",
"norm",
null,
null,
"scale_shift",
"ffn",
"gate_mlp",
"t_norm",
"joint_self_attn",
"gate_msa",
"norm",
"temp_cross_attn",
"norm",
"img_cross_attn",
"norm",
null,
null,
"scale_shift",
"ffn",
"gate_mlp"
],
"feature_level": [
1,
2
],
"act_cfg": {
"type": "torch.nn.modules.activation:SiLU",
"inplace": true
},
"robot_encoder": {
"type": "robo_orchard_lab.models.sem_modules.robot_state_encoder:SEMRobotStateEncoder",
"embed_dims": 256,
"chunk_size": 1,
"joint_self_attn": {
"type": "robo_orchard_lab.models.sem_modules.layers:JointGraphAttention",
"embed_dims": 256,
"num_heads": 8
},
"norm_layer": {
"type": "torch.nn.modules.normalization:RMSNorm",
"normalized_shape": 256
},
"ffn": {
"type": "robo_orchard_lab.models.layers.transformer_layers:FFN",
"embed_dims": 256,
"feedforward_channels": 2048,
"act_cfg": {
"type": "torch.nn.modules.activation:SiLU",
"inplace": true
}
},
"temp_self_attn": {
"type": "robo_orchard_lab.models.sem_modules.layers:RotaryAttention",
"embed_dims": 256,
"num_heads": 8,
"max_position_embeddings": 32
},
"act_cfg": {
"type": "torch.nn.modules.activation:SiLU",
"inplace": true
},
"operation_order": [
"norm",
"joint_self_attn",
null,
null,
"norm",
"ffn",
"norm",
"joint_self_attn",
null,
null,
"norm",
"ffn",
"norm",
"joint_self_attn",
null,
null,
"norm",
"ffn",
"norm",
"joint_self_attn",
null,
null,
"norm",
"ffn",
"norm"
],
"state_dims": 8
},
"state_loss_weights": [
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
2.0,
2.0,
2.0,
0.2,
0.2,
0.2,
0.2
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
2.0,
2.0,
2.0,
0.2,
0.2,
0.2,
0.2
]
],
"fk_loss_weight": [
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
2.0,
2.0,
2.0,
0.2,
0.2,
0.2,
0.2
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
1.0,
1.0,
1.0,
0.1,
0.1,
0.1,
0.1
],
[
1.0,
2.0,
2.0,
2.0,
0.2,
0.2,
0.2,
0.2
]
],
"state_dims": 8
},
"neck": {
"type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
"in_channels": [
192,
384,
768
],
"kernel_size": 1,
"out_channels": 256,
"act_cfg": null,
"bias": true,
"norm_cfg": {
"type": "torch.nn.modules.normalization:GroupNorm",
"num_groups": 32
},
"num_outs": 3
},
"text_encoder": null,
"feature_enhancer": null,
"spatial_enhancer": {
"type": "robo_orchard_lab.models.bip3d.spatial_enhancer:DepthFusionSpatialEnhancer",
"embed_dims": 256,
"feature_3d_dim": 32,
"num_depth_layers": 2,
"min_depth": 0.01,
"max_depth": 1.2,
"num_depth": 128,
"with_feature_3d": true,
"loss_depth_weight": 1.0
},
"data_preprocessor": {
"type": "robo_orchard_lab.models.layers.data_preprocessors:BaseDataPreprocessor",
"mean": [
123.675,
116.28,
103.53
],
"std": [
58.395,
57.12,
57.375
],
"channel_flip": false,
"unsqueeze_depth_channel": true,
"batch_transforms": [
{
"type": "robo_orchard_lab.models.bip3d.spatial_enhancer:BatchDepthProbGTGenerator",
"min_depth": 0.01,
"max_depth": 1.2,
"num_depth": 128,
"origin_stride": 2,
"valid_threshold": 0.5,
"stride": [
8,
16,
32
]
}
]
},
"backbone_3d": {
"type": "robo_orchard_lab.models.modules.resnet:ResNet",
"depth": 34,
"in_channels": 1,
"base_channels": 4,
"num_stages": 4,
"out_indices": [
1,
2,
3
],
"bn_eval": true,
"with_cp": true,
"style": "pytorch"
},
"neck_3d": {
"type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
"in_channels": [
8,
16,
32
],
"kernel_size": 1,
"out_channels": 32,
"act_cfg": null,
"bias": true,
"norm_cfg": {
"type": "torch.nn.modules.normalization:GroupNorm",
"num_groups": 4
},
"num_outs": 3
},
"input_2d": "imgs",
"input_3d": "depths",
"embed_dims": 256,
"pre_spatial_enhancer": false
}