{
"__config_type__": "robo_orchard_lab.models.bip3d.structure:BIP3DConfig",
"class_type": "robo_orchard_lab.models.bip3d.structure:BIP3D",
"backbone": {
"type": "robo_orchard_lab.models.modules.swin_transformer:SwinTransformer",
"embed_dims": 96,
"depths": [
2,
2,
6,
2
],
"num_heads": [
3,
6,
12,
24
],
"window_size": 7,
"mlp_ratio": 4,
"qkv_bias": true,
"qk_scale": null,
"drop_rate": 0.0,
"attn_drop_rate": 0.0,
"out_indices": [
1,
2,
3
],
"with_cp": true,
"convert_weights": false
},
"decoder": {
"type": "robo_orchard_lab.models.holobrain.action_decoder:HoloBrainActionDecoder",
"head": {
"type": "robo_orchard_lab.models.holobrain.layers:UpsampleHead",
"upsample_sizes": [
32,
64
],
"input_dim": 256,
"dims": [
128,
64
],
"norm": {
"type": "torch.nn.modules.normalization:RMSNorm",
"normalized_shape": 256
},
"act": {
"type": "torch.nn.modules.activation:SiLU",
"inplace": true
},
"norm_act_idx": [
0,
1
],
"num_output_layers": 2,
"out_dim": 8
},
"mobile_head": null,
"transformer_cfg": {
"__config_type__": "robo_orchard_lab.models.holobrain.action_decoder:HoloBrainDecoderTransformerConfig",
"img_cross_attn": {
"type": "robo_orchard_lab.models.holobrain.layers:RotaryAttention",
"embed_dims": 256,
"num_heads": 8,
"max_position_embeddings": 32
},
"norm_layer": {
"type": "torch.nn.modules.normalization:RMSNorm",
"normalized_shape": 256
},
"ffn": {
"type": "robo_orchard_lab.models.layers.transformer_layers:FFN",
"embed_dims": 256,
"feedforward_channels": 2048,
"act_cfg": {
"type": "torch.nn.modules.activation:SiLU",
"inplace": true
}
},
"operation_order": [
"t_norm",
"temp_joint_attn",
"gate_msa",
"norm",
"img_cross_attn",
"norm",
"text_cross_attn",
"norm",
"scale_shift",
"ffn",
"gate_mlp",
"t_norm",
"temp_joint_attn",
"gate_msa",
"norm",
"img_cross_attn",
"norm",
"text_cross_attn",
"norm",
"scale_shift",
"ffn",
"gate_mlp",
"t_norm",
"temp_joint_attn",
"gate_msa",
"norm",
"img_cross_attn",
"norm",
"text_cross_attn",
"norm",
"scale_shift",
"ffn",
"gate_mlp",
"t_norm",
"temp_joint_attn",
"gate_msa",
"norm",
"img_cross_attn",
"norm",
"text_cross_attn",
"norm",
"scale_shift",
"ffn",
"gate_mlp",
"t_norm",
"temp_joint_attn",
"gate_msa",
"norm",
"img_cross_attn",
"norm",
"text_cross_attn",
"norm",
"scale_shift",
"ffn",
"gate_mlp",
"t_norm",
"temp_joint_attn",
"gate_msa",
"norm",
"img_cross_attn",
"norm",
"text_cross_attn",
"norm",
"scale_shift",
"ffn",
"gate_mlp"
],
"joint_self_attn": {
"type": "robo_orchard_lab.models.holobrain.layers:JointGraphAttention",
"embed_dims": 256,
"num_heads": 8
},
"temp_cross_attn": {
"type": "robo_orchard_lab.models.holobrain.layers:RotaryAttention",
"embed_dims": 256,
"num_heads": 8,
"max_position_embeddings": 32
},
"text_cross_attn": {
"type": "robo_orchard_lab.models.holobrain.layers:RotaryAttention",
"embed_dims": 256,
"num_heads": 8,
"max_position_embeddings": 512
},
"temp_joint_attn": {
"type": "robo_orchard_lab.models.holobrain.layers:TemporalJointGraphAttention",
"embed_dims": 256,
"num_heads": 8,
"max_position_embeddings": 32
},
"timestep_norm_layer": {
"type": "robo_orchard_lab.models.holobrain.layers:AdaRMSNorm",
"normalized_shape": 256,
"condition_dims": 256,
"zero": true
},
"pre_norm": true
},
"base_cfg": {
"__config_type__": "robo_orchard_lab.models.holobrain.action_decoder:HoloBrainDecoderBaseConfig",
"training_noise_scheduler": {
"type": "diffusers.schedulers.scheduling_ddpm:DDPMScheduler",
"num_train_timesteps": 1000,
"beta_schedule": "squaredcos_cap_v2",
"prediction_type": "sample",
"clip_sample": false
},
"test_noise_scheduler": {
"type": "diffusers.schedulers.scheduling_dpmsolver_multistep:DPMSolverMultistepScheduler",
"num_train_timesteps": 1000,
"beta_schedule": "squaredcos_cap_v2",
"prediction_type": "sample"
},
"num_inference_timesteps": 10,
"feature_level": [
1,
2
],
"state_dims": 8,
"embed_dims": 256,
"pred_steps": 64,
"act_cfg": {
"type": "torch.nn.modules.activation:SiLU",
"inplace": true
},
"num_test_traj": 1,
"chunk_size": 4,
"force_kinematics": false,
"with_mobile": false,
"mobile_traj_state_dims": 2,
"use_joint_mask": true,
"noise_type": "local_joint",
"pred_scaled_joint": false,
"prediction_type": "relative_joint_relative_pose"
},
"training_cfg": {
"__config_type__": "robo_orchard_lab.models.holobrain.action_decoder:HoloBrainTrainingConfig",
"loss": {
"type": "robo_orchard_lab.models.holobrain.loss:HoloBrainActionLoss",
"timestep_loss_weight": 1000,
"parallel_loss_weight": 0.1,
"smooth_l1_beta": 0.04,
"loss_mode": "smooth_l1"
},
"temporal_attn_drop": 0.05,
"num_parallel_training_sample": 4,
"teacher_forcing_rate": 0.02,
"teacher_forcing_mean_steps": null
},
"robot_encoder": {
"type": "robo_orchard_lab.models.holobrain.robot_state_encoder:HoloBrainRobotStateEncoder",
"transformer_cfg": {
"__config_type__": "robo_orchard_lab.models.holobrain.robot_state_encoder:HoloBrainEncoderTransformerConfig",
"joint_self_attn": {
"type": "robo_orchard_lab.models.holobrain.layers:JointGraphAttention",
"embed_dims": 256,
"num_heads": 8
},
"norm_layer": {
"type": "torch.nn.modules.normalization:RMSNorm",
"normalized_shape": 256
},
"ffn": {
"type": "robo_orchard_lab.models.layers.transformer_layers:FFN",
"embed_dims": 256,
"feedforward_channels": 2048,
"act_cfg": {
"type": "torch.nn.modules.activation:SiLU",
"inplace": true
}
},
"operation_order": [
"norm",
"joint_self_attn",
null,
null,
"norm",
"ffn",
"norm",
"joint_self_attn",
null,
null,
"norm",
"ffn",
"norm",
"joint_self_attn",
null,
null,
"norm",
"ffn",
"norm",
"joint_self_attn",
null,
null,
"norm",
"ffn",
"norm"
],
"temp_self_attn": {
"type": "robo_orchard_lab.models.holobrain.layers:RotaryAttention",
"embed_dims": 256,
"num_heads": 8,
"max_position_embeddings": 32
},
"pre_norm": true
},
"base_cfg": {
"__config_type__": "robo_orchard_lab.models.holobrain.robot_state_encoder:HoloBrainEncoderBaseConfig",
"embed_dims": 256,
"state_dims": 8,
"act_cfg": {
"type": "torch.nn.modules.activation:SiLU",
"inplace": true
},
"chunk_size": 1
}
}
},
"neck": {
"type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
"in_channels": [
192,
384,
768
],
"kernel_size": 1,
"out_channels": 256,
"act_cfg": null,
"bias": true,
"norm_cfg": {
"type": "torch.nn.modules.normalization:GroupNorm",
"num_groups": 32
},
"num_outs": 4
},
"text_encoder": {
"type": "robo_orchard_lab.models.bip3d.bert:BertModel",
"special_tokens_list": [
"[CLS]",
"[SEP]"
],
"name": "./ckpt/bert-base-uncased",
"pad_to_max": false,
"use_sub_sentence_represent": true,
"add_pooling_layer": false,
"max_tokens": 768,
"use_checkpoint": true,
"return_tokenized": true
},
"feature_enhancer": {
"type": "robo_orchard_lab.models.bip3d.feature_enhancer:TextImageDeformable2DEnhancer",
"embed_dims": 256,
"num_layers": 6,
"text_img_attn_block": {
"v_dim": 256,
"l_dim": 256,
"embed_dim": 1024,
"num_heads": 4,
"init_values": 0.0001
},
"img_attn_block": {
"self_attn_cfg": {
"embed_dims": 256,
"num_levels": 4,
"im2col_step": 1
},
"ffn_cfg": {
"embed_dims": 256,
"feedforward_channels": 2048,
"ffn_drop": 0.0
}
},
"text_attn_block": {
"self_attn_cfg": {
"num_heads": 4,
"embed_dims": 256
},
"ffn_cfg": {
"embed_dims": 256,
"feedforward_channels": 1024,
"ffn_drop": 0.0
}
},
"num_feature_levels": 4,
"positional_encoding": {
"num_feats": 128,
"normalize": true,
"offset": 0.0,
"temperature": 20
}
},
"spatial_enhancer": {
"type": "robo_orchard_lab.models.bip3d.spatial_enhancer:DepthFusionSpatialEnhancer",
"embed_dims": 256,
"feature_3d_dim": 128,
"num_depth_layers": 2,
"min_depth": 0.01,
"max_depth": 1.2,
"num_depth": 128,
"with_feature_3d": true,
"loss_depth_weight": 1.0
},
"data_preprocessor": {
"type": "robo_orchard_lab.models.layers.data_preprocessors:BaseDataPreprocessor",
"channel_flip": true,
"unsqueeze_depth_channel": true,
"mean": [
123.675,
116.28,
103.53
],
"std": [
58.395,
57.12,
57.375
],
"batch_transforms": [
{
"type": "robo_orchard_lab.models.bip3d.spatial_enhancer:BatchDepthProbGTGenerator",
"min_depth": 0.01,
"max_depth": 1.2,
"num_depth": 128,
"origin_stride": 2,
"valid_threshold": 0.5,
"stride": [
8,
16,
32,
64
]
}
]
},
"backbone_3d": {
"type": "robo_orchard_lab.models.modules.swin_transformer:SwinTransformer",
"in_channels": 1,
"embed_dims": 16,
"depths": [
2,
2,
6,
2
],
"num_heads": [
4,
8,
8,
16
],
"window_size": 7,
"mlp_ratio": 4,
"qkv_bias": true,
"qk_scale": null,
"drop_rate": 0.0,
"attn_drop_rate": 0.0,
"out_indices": [
1,
2,
3
],
"with_cp": true,
"convert_weights": false
},
"neck_3d": {
"type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
"in_channels": [
32,
64,
128
],
"kernel_size": 1,
"out_channels": 128,
"act_cfg": null,
"bias": true,
"norm_cfg": {
"type": "torch.nn.modules.normalization:GroupNorm",
"num_groups": 4
},
"num_outs": 4
},
"input_2d": "imgs",
"input_3d": "depths",
"embed_dims": 256,
"pre_spatial_enhancer": false
}