BIP3D_Tiny_Det / model.config.json
xuewu.lin
update
7d023fa
{
"__config_type__": "robo_orchard_lab.models.bip3d.structure:BIP3DConfig",
"class_type": "robo_orchard_lab.models.bip3d.structure:BIP3D",
"backbone": {
"type": "robo_orchard_lab.models.modules.swin_transformer:SwinTransformer",
"embed_dims": 96,
"depths": [
2,
2,
6,
2
],
"num_heads": [
3,
6,
12,
24
],
"window_size": 7,
"mlp_ratio": 4,
"qkv_bias": true,
"qk_scale": null,
"drop_rate": 0.0,
"attn_drop_rate": 0.0,
"out_indices": [
1,
2,
3
],
"with_cp": true,
"convert_weights": false
},
"decoder": {
"type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:BBox3DDecoder",
"look_forward_twice": true,
"instance_bank": {
"type": "robo_orchard_lab.models.bip3d.grounding_decoder.instance_bank:InstanceBank",
"num_anchor": 50,
"anchor": "./anchor_files/embodiedscan_kmeans_det_cam_log_z-0.2-3.npy",
"embed_dims": 256,
"anchor_in_camera": true
},
"anchor_encoder": {
"type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:DoF9BoxEncoder",
"embed_dims": 256,
"rot_dims": 3
},
"graph_model": {
"type": "robo_orchard_lab.models.layers.transformer_layers:MultiheadAttention",
"embed_dims": 256,
"num_heads": 8,
"batch_first": true
},
"ffn": {
"type": "robo_orchard_lab.models.layers.transformer_layers:FFN",
"embed_dims": 256,
"feedforward_channels": 2048,
"ffn_drop": 0.0
},
"norm_layer": {
"type": "torch.nn.modules.normalization:LayerNorm",
"normalized_shape": 256
},
"deformable_model": {
"type": "robo_orchard_lab.models.bip3d.grounding_decoder.deformable_aggregation:DeformableFeatureAggregation",
"embed_dims": 256,
"num_groups": 8,
"num_levels": 4,
"use_camera_embed": true,
"with_depth": true,
"min_depth": 0.25,
"max_depth": 10,
"kps_generator": {
"type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:SparseBox3DKeyPointsGenerator",
"fix_scale": [
[
0,
0,
0
],
[
0.45,
0,
0
],
[
-0.45,
0,
0
],
[
0,
0.45,
0
],
[
0,
-0.45,
0
],
[
0,
0,
0.45
],
[
0,
0,
-0.45
]
],
"num_learnable_pts": 9
},
"with_value_proj": true,
"filter_outlier": true
},
"text_cross_attn": {
"type": "robo_orchard_lab.models.layers.transformer_layers:MultiheadAttention",
"embed_dims": 256,
"num_heads": 8,
"batch_first": true
},
"refine_layer": {
"type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:GroundingRefineClsHead",
"embed_dims": 256,
"output_dim": 9,
"cls_bias": true
},
"loss_cls": {
"type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:FocalLoss",
"use_sigmoid": true,
"gamma": 2.0,
"alpha": 0.25,
"loss_weight": 1.0
},
"loss_reg": {
"type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:DoF9BoxLoss",
"loss_weight_wd": 1.0,
"loss_weight_cd": 0.8
},
"sampler": {
"type": "robo_orchard_lab.models.bip3d.grounding_decoder.target:Grounding3DTarget",
"cls_weight": 1.0,
"box_weight": 1.0,
"num_dn": 100,
"cost_weight_wd": 1.0,
"cost_weight_cd": 0.8,
"with_dn_query": true,
"num_classes": 284,
"embed_dims": 256
},
"gt_reg_key": "gt_bboxes_3d",
"gt_cls_key": "tokens_positive",
"post_processor": {
"type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:GroundingBox3DPostProcess",
"num_output": 1000
}
},
"neck": {
"type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
"in_channels": [
192,
384,
768
],
"kernel_size": 1,
"out_channels": 256,
"act_cfg": null,
"bias": true,
"norm_cfg": {
"type": "torch.nn.modules.normalization:GroupNorm",
"num_groups": 32
},
"num_outs": 4
},
"text_encoder": {
"type": "robo_orchard_lab.models.bip3d.bert:BertModel",
"special_tokens_list": [
"[CLS]",
"[SEP]"
],
"name": "./ckpt/bert-base-uncased",
"pad_to_max": false,
"use_sub_sentence_represent": true,
"add_pooling_layer": false,
"max_tokens": 768,
"use_checkpoint": true,
"return_tokenized": true
},
"feature_enhancer": {
"type": "robo_orchard_lab.models.bip3d.feature_enhancer:TextImageDeformable2DEnhancer",
"embed_dims": 256,
"num_layers": 6,
"text_img_attn_block": {
"v_dim": 256,
"l_dim": 256,
"embed_dim": 1024,
"num_heads": 4,
"init_values": 0.0001
},
"img_attn_block": {
"self_attn_cfg": {
"embed_dims": 256,
"num_levels": 4,
"im2col_step": 1
},
"ffn_cfg": {
"embed_dims": 256,
"feedforward_channels": 2048,
"ffn_drop": 0.0
}
},
"text_attn_block": {
"self_attn_cfg": {
"num_heads": 4,
"embed_dims": 256
},
"ffn_cfg": {
"embed_dims": 256,
"feedforward_channels": 1024,
"ffn_drop": 0.0
}
},
"num_feature_levels": 4,
"positional_encoding": {
"num_feats": 128,
"normalize": true,
"offset": 0.0,
"temperature": 20
}
},
"spatial_enhancer": {
"type": "robo_orchard_lab.models.bip3d.spatial_enhancer:DepthFusionSpatialEnhancer",
"embed_dims": 256,
"feature_3d_dim": 32,
"num_depth_layers": 2,
"min_depth": 0.25,
"max_depth": 10,
"num_depth": 64,
"with_feature_3d": true,
"loss_depth_weight": 1.0
},
"data_preprocessor": {
"type": "robo_orchard_lab.models.layers.data_preprocessors:BaseDataPreprocessor",
"mean": [
123.675,
116.28,
103.53
],
"std": [
58.395,
57.12,
57.375
],
"channel_flip": true,
"batch_transforms": [
{
"type": "robo_orchard_lab.models.bip3d.spatial_enhancer:BatchDepthProbGTGenerator",
"min_depth": 0.25,
"max_depth": 10,
"num_depth": 64,
"origin_stride": 4,
"valid_threshold": 0.0,
"stride": [
8,
16,
32,
64
]
},
{
"type": "robo_orchard_lab.models.layers.data_preprocessors:GridMask",
"apply_grid_mask_keys": [
"imgs",
"depths"
]
}
]
},
"backbone_3d": {
"type": "robo_orchard_lab.models.modules.resnet:ResNet",
"depth": 34,
"in_channels": 1,
"base_channels": 4,
"num_stages": 4,
"out_indices": [
1,
2,
3
],
"bn_eval": true,
"with_cp": true,
"style": "pytorch"
},
"neck_3d": {
"type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
"in_channels": [
8,
16,
32
],
"kernel_size": 1,
"out_channels": 32,
"act_cfg": null,
"bias": true,
"norm_cfg": {
"type": "torch.nn.modules.normalization:GroupNorm",
"num_groups": 4
},
"num_outs": 4
},
"input_2d": "imgs",
"input_3d": "depths",
"embed_dims": 256,
"pre_spatial_enhancer": false
}