{
"__config_type__": "robo_orchard_lab.models.bip3d.structure:BIP3DConfig",
"class_type": "robo_orchard_lab.models.bip3d.structure:BIP3D",
"backbone": {
"type": "robo_orchard_lab.models.modules.swin_transformer:SwinTransformer",
"embed_dims": 96,
"depths": [
2,
2,
6,
2
],
"num_heads": [
3,
6,
12,
24
],
"window_size": 7,
"mlp_ratio": 4,
"qkv_bias": true,
"qk_scale": null,
"drop_rate": 0.0,
"attn_drop_rate": 0.0,
"out_indices": [
1,
2,
3
],
"with_cp": true,
"convert_weights": false
},
"decoder": {
"type": "robo_orchard_lab.models.holobrain.action_decoder:HoloBrainActionDecoder",
"head": {
"type": "robo_orchard_lab.models.holobrain.layers:UpsampleHead",
"upsample_sizes": [
32,
64
],
"input_dim": 256,
"dims": [
128,
64
],
"norm": {
"type": "torch.nn.modules.normalization:RMSNorm",
"normalized_shape": 256
},
"act": {
"type": "torch.nn.modules.activation:SiLU",
"inplace": true
},
"norm_act_idx": [
0,
1
],
"num_output_layers": 2,
"out_dim": 8
},
"mobile_head": null,
"transformer_cfg": {
"__config_type__": "robo_orchard_lab.models.holobrain.action_decoder:HoloBrainDecoderTransformerConfig",
"img_cross_attn": {
"type": "robo_orchard_lab.models.holobrain.layers:RotaryAttention",
"embed_dims": 256,
"num_heads": 8,
"max_position_embeddings": 32
},
"norm_layer": {
"type": "torch.nn.modules.normalization:RMSNorm",
"normalized_shape": 256
},
"ffn": {
"type": "robo_orchard_lab.models.layers.transformer_layers:FFN",
"embed_dims": 256,
"feedforward_channels": 2048,
"act_cfg": {
"type": "torch.nn.modules.activation:SiLU",
"inplace": true
}
},
"operation_order": [
"t_norm",
"temp_joint_attn",
"gate_msa",
"norm",
"img_cross_attn",
"norm",
"text_cross_attn",
"norm",
"scale_shift",
"ffn",
"gate_mlp",
"t_norm",
"temp_joint_attn",
"gate_msa",
"norm",
"img_cross_attn",
"norm",
"text_cross_attn",
"norm",
"scale_shift",
"ffn",
"gate_mlp",
"t_norm",
"temp_joint_attn",
"gate_msa",
"norm",
"img_cross_attn",
"norm",
"text_cross_attn",
"norm",
"scale_shift",
"ffn",
"gate_mlp",
"t_norm",
"temp_joint_attn",
"gate_msa",
"norm",
"img_cross_attn",
"norm",
"text_cross_attn",
"norm",
"scale_shift",
"ffn",
"gate_mlp",
"t_norm",
"temp_joint_attn",
"gate_msa",
"norm",
"img_cross_attn",
"norm",
"text_cross_attn",
"norm",
"scale_shift",
"ffn",
"gate_mlp",
"t_norm",
"temp_joint_attn",
"gate_msa",
"norm",
"img_cross_attn",
"norm",
"text_cross_attn",
"norm",
"scale_shift",
"ffn",
"gate_mlp"
],
"joint_self_attn": {
"type": "robo_orchard_lab.models.holobrain.layers:JointGraphAttention",
"embed_dims": 256,
"num_heads": 8
},
"temp_cross_attn": {
"type": "robo_orchard_lab.models.holobrain.layers:RotaryAttention",
"embed_dims": 256,
"num_heads": 8,
"max_position_embeddings": 32
},
"text_cross_attn": {
"type": "robo_orchard_lab.models.holobrain.layers:RotaryAttention",
"embed_dims": 256,
"num_heads": 8,
"max_position_embeddings": 512
},
"temp_joint_attn": {
"type": "robo_orchard_lab.models.holobrain.layers:TemporalJointGraphAttention",
"embed_dims": 256,
"num_heads": 8,
"max_position_embeddings": 32
},
"timestep_norm_layer": {
"type": "robo_orchard_lab.models.holobrain.layers:AdaRMSNorm",
"normalized_shape": 256,
"condition_dims": 256,
"zero": true
},
"pre_norm": true
},
"base_cfg": {
"__config_type__": "robo_orchard_lab.models.holobrain.action_decoder:HoloBrainDecoderBaseConfig",
"training_noise_scheduler": {
"type": "diffusers.schedulers.scheduling_ddpm:DDPMScheduler",
"num_train_timesteps": 1000,
"beta_schedule": "squaredcos_cap_v2",
"prediction_type": "sample",
"clip_sample": false
},
"test_noise_scheduler": {
"type": "diffusers.schedulers.scheduling_dpmsolver_multistep:DPMSolverMultistepScheduler",
"num_train_timesteps": 1000,
"beta_schedule": "squaredcos_cap_v2",
"prediction_type": "sample"
},
"num_inference_timesteps": 10,
"feature_level": [
1,
2
],
"state_dims": 8,
"embed_dims": 256,
"pred_steps": 64,
"act_cfg": {
"type": "torch.nn.modules.activation:SiLU",
"inplace": true
},
"num_test_traj": 1,
"chunk_size": 4,
"force_kinematics": false,
"with_mobile": false,
"mobile_traj_state_dims": 2,
"use_joint_mask": true,
"noise_type": "local_joint",
"pred_scaled_joint": false,
"prediction_type": "relative_joint_relative_pose"
},
"training_cfg": {
"__config_type__": "robo_orchard_lab.models.holobrain.action_decoder:HoloBrainTrainingConfig",
"loss": {
"type": "robo_orchard_lab.models.holobrain.loss:HoloBrainActionLoss",
"timestep_loss_weight": 1000,
"parallel_loss_weight": 0.1,
"smooth_l1_beta": 0.04,
"loss_mode": "smooth_l1"
},
"temporal_attn_drop": 0.05,
"num_parallel_training_sample": 4,
"teacher_forcing_rate": 0.02,
"teacher_forcing_mean_steps": null
},
"robot_encoder": {
"type": "robo_orchard_lab.models.holobrain.robot_state_encoder:HoloBrainRobotStateEncoder",
"transformer_cfg": {
"__config_type__": "robo_orchard_lab.models.holobrain.robot_state_encoder:HoloBrainEncoderTransformerConfig",
"joint_self_attn": {
"type": "robo_orchard_lab.models.holobrain.layers:JointGraphAttention",
"embed_dims": 256,
"num_heads": 8
},
"norm_layer": {
"type": "torch.nn.modules.normalization:RMSNorm",
"normalized_shape": 256
},
"ffn": {
"type": "robo_orchard_lab.models.layers.transformer_layers:FFN",
"embed_dims": 256,
"feedforward_channels": 2048,
"act_cfg": {
"type": "torch.nn.modules.activation:SiLU",
"inplace": true
}
},
"operation_order": [
"norm",
"joint_self_attn",
null,
null,
"norm",
"ffn",
"norm",
"joint_self_attn",
null,
null,
"norm",
"ffn",
"norm",
"joint_self_attn",
null,
null,
"norm",
"ffn",
"norm",
"joint_self_attn",
null,
null,
"norm",
"ffn",
"norm"
],
"temp_self_attn": {
"type": "robo_orchard_lab.models.holobrain.layers:RotaryAttention",
"embed_dims": 256,
"num_heads": 8,
"max_position_embeddings": 32
},
"pre_norm": true
},
"base_cfg": {
"__config_type__": "robo_orchard_lab.models.holobrain.robot_state_encoder:HoloBrainEncoderBaseConfig",
"embed_dims": 256,
"state_dims": 8,
"act_cfg": {
"type": "torch.nn.modules.activation:SiLU",
"inplace": true
},
"chunk_size": 1
}
}
},
"neck": {
"type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
"in_channels": [
192,
384,
768
],
"kernel_size": 1,
"out_channels": 256,
"act_cfg": null,
"bias": true,
"norm_cfg": {
"type": "torch.nn.modules.normalization:GroupNorm",
"num_groups": 32
},
"num_outs": 4
},
"text_encoder": {
"type": "robo_orchard_lab.models.bip3d.bert:BertModel",
"special_tokens_list": [
"[CLS]",
"[SEP]"
],
"name": "./ckpt/bert-base-uncased",
"pad_to_max": false,
"use_sub_sentence_represent": true,
"add_pooling_layer": false,
"max_tokens": 768,
"use_checkpoint": true,
"return_tokenized": true
},
"feature_enhancer": {
"type": "robo_orchard_lab.models.bip3d.feature_enhancer:TextImageDeformable2DEnhancer",
"embed_dims": 256,
"num_layers": 6,
"text_img_attn_block": {
"v_dim": 256,
"l_dim": 256,
"embed_dim": 1024,
"num_heads": 4,
"init_values": 0.0001
},
"img_attn_block": {
"self_attn_cfg": {
"embed_dims": 256,
"num_levels": 4,
"im2col_step": 1
},
"ffn_cfg": {
"embed_dims": 256,
"feedforward_channels": 2048,
"ffn_drop": 0.0
}
},
"text_attn_block": {
"self_attn_cfg": {
"num_heads": 4,
"embed_dims": 256
},
"ffn_cfg": {
"embed_dims": 256,
"feedforward_channels": 1024,
"ffn_drop": 0.0
}
},
"num_feature_levels": 4,
"positional_encoding": {
"num_feats": 128,
"normalize": true,
"offset": 0.0,
"temperature": 20
}
},
"spatial_enhancer": {
"type": "robo_orchard_lab.models.bip3d.spatial_enhancer:DepthFusionSpatialEnhancer",
"embed_dims": 256,
"feature_3d_dim": 128,
"num_depth_layers": 2,
"min_depth": 0.01,
"max_depth": 1.2,
"num_depth": 128,
"with_feature_3d": true,
"loss_depth_weight": 1.0
},
"data_preprocessor": {
"type": "robo_orchard_lab.models.layers.data_preprocessors:BaseDataPreprocessor",
"channel_flip": true,
"unsqueeze_depth_channel": true,
"mean": [
123.675,
116.28,
103.53
],
"std": [
58.395,
57.12,
57.375
],
"batch_transforms": [
{
"type": "robo_orchard_lab.models.bip3d.spatial_enhancer:BatchDepthProbGTGenerator",
"min_depth": 0.01,
"max_depth": 1.2,
"num_depth": 128,
"origin_stride": 2,
"valid_threshold": 0.5,
"stride": [
8,
16,
32,
64
]
}
]
},
"backbone_3d": {
"type": "robo_orchard_lab.models.modules.swin_transformer:SwinTransformer",
"in_channels": 1,
"embed_dims": 16,
"depths": [
2,
2,
6,
2
],
"num_heads": [
4,
8,
8,
16
],
"window_size": 7,
"mlp_ratio": 4,
"qkv_bias": true,
"qk_scale": null,
"drop_rate": 0.0,
"attn_drop_rate": 0.0,
"out_indices": [
1,
2,
3
],
"with_cp": true,
"convert_weights": false
},
"neck_3d": {
"type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
"in_channels": [
32,
64,
128
],
"kernel_size": 1,
"out_channels": 128,
"act_cfg": null,
"bias": true,
"norm_cfg": {
"type": "torch.nn.modules.normalization:GroupNorm",
"num_groups": 4
},
"num_outs": 4
},
"input_2d": "imgs",
"input_3d": "depths",
"embed_dims": 256,
"pre_spatial_enhancer": false
}