{
  "__config_type__": "robo_orchard_lab.models.bip3d.structure:BIP3DConfig",
  "class_type": "robo_orchard_lab.models.bip3d.structure:BIP3D",
  "backbone": {
    "type": "robo_orchard_lab.models.modules.swin_transformer:SwinTransformer",
    "embed_dims": 96,
    "depths": [
      2,
      2,
      6,
      2
    ],
    "num_heads": [
      3,
      6,
      12,
      24
    ],
    "window_size": 7,
    "mlp_ratio": 4,
    "qkv_bias": true,
    "qk_scale": null,
    "drop_rate": 0.0,
    "attn_drop_rate": 0.0,
    "out_indices": [
      1,
      2,
      3
    ],
    "with_cp": true,
    "convert_weights": false
  },
  "decoder": {
    "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:BBox3DDecoder",
    "look_forward_twice": true,
    "instance_bank": {
      "type": "robo_orchard_lab.models.bip3d.grounding_decoder.instance_bank:InstanceBank",
      "num_anchor": 50,
      "anchor": "./anchor_files/embodiedscan_kmeans_det_cam_log_z-0.2-3.npy",
      "embed_dims": 256,
      "anchor_in_camera": true
    },
    "anchor_encoder": {
      "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:DoF9BoxEncoder",
      "embed_dims": 256,
      "rot_dims": 3
    },
    "graph_model": {
      "type": "robo_orchard_lab.models.layers.transformer_layers:MultiheadAttention",
      "embed_dims": 256,
      "num_heads": 8,
      "batch_first": true
    },
    "ffn": {
      "type": "robo_orchard_lab.models.layers.transformer_layers:FFN",
      "embed_dims": 256,
      "feedforward_channels": 2048,
      "ffn_drop": 0.0
    },
    "norm_layer": {
      "type": "torch.nn.modules.normalization:LayerNorm",
      "normalized_shape": 256
    },
    "deformable_model": {
      "type": "robo_orchard_lab.models.bip3d.grounding_decoder.deformable_aggregation:DeformableFeatureAggregation",
      "embed_dims": 256,
      "num_groups": 8,
      "num_levels": 4,
      "use_camera_embed": true,
      "with_depth": true,
      "min_depth": 0.25,
      "max_depth": 10,
      "kps_generator": {
        "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:SparseBox3DKeyPointsGenerator",
        "fix_scale": [
          [
            0,
            0,
            0
          ],
          [
            0.45,
            0,
            0
          ],
          [
            -0.45,
            0,
            0
          ],
          [
            0,
            0.45,
            0
          ],
          [
            0,
            -0.45,
            0
          ],
          [
            0,
            0,
            0.45
          ],
          [
            0,
            0,
            -0.45
          ]
        ],
        "num_learnable_pts": 9
      },
      "with_value_proj": true,
      "filter_outlier": true
    },
    "text_cross_attn": {
      "type": "robo_orchard_lab.models.layers.transformer_layers:MultiheadAttention",
      "embed_dims": 256,
      "num_heads": 8,
      "batch_first": true
    },
    "refine_layer": {
      "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:GroundingRefineClsHead",
      "embed_dims": 256,
      "output_dim": 9,
      "cls_bias": true
    },
    "loss_cls": {
      "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:FocalLoss",
      "use_sigmoid": true,
      "gamma": 2.0,
      "alpha": 0.25,
      "loss_weight": 1.0
    },
    "loss_reg": {
      "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:DoF9BoxLoss",
      "loss_weight_wd": 1.0,
      "loss_weight_cd": 0.8
    },
    "sampler": {
      "type": "robo_orchard_lab.models.bip3d.grounding_decoder.target:Grounding3DTarget",
      "cls_weight": 1.0,
      "box_weight": 1.0,
      "num_dn": 100,
      "cost_weight_wd": 1.0,
      "cost_weight_cd": 0.8,
      "with_dn_query": true,
      "num_classes": 284,
      "embed_dims": 256
    },
    "gt_reg_key": "gt_bboxes_3d",
    "gt_cls_key": "tokens_positive",
    "post_processor": {
      "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:GroundingBox3DPostProcess",
      "num_output": 1000
    }
  },
  "neck": {
    "type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
    "in_channels": [
      192,
      384,
      768
    ],
    "kernel_size": 1,
    "out_channels": 256,
    "act_cfg": null,
    "bias": true,
    "norm_cfg": {
      "type": "torch.nn.modules.normalization:GroupNorm",
      "num_groups": 32
    },
    "num_outs": 4
  },
  "text_encoder": {
    "type": "robo_orchard_lab.models.bip3d.bert:BertModel",
    "special_tokens_list": [
      "[CLS]",
      "[SEP]"
    ],
    "name": "./ckpt/bert-base-uncased",
    "pad_to_max": false,
    "use_sub_sentence_represent": true,
    "add_pooling_layer": false,
    "max_tokens": 768,
    "use_checkpoint": true,
    "return_tokenized": true
  },
  "feature_enhancer": {
    "type": "robo_orchard_lab.models.bip3d.feature_enhancer:TextImageDeformable2DEnhancer",
    "embed_dims": 256,
    "num_layers": 6,
    "text_img_attn_block": {
      "v_dim": 256,
      "l_dim": 256,
      "embed_dim": 1024,
      "num_heads": 4,
      "init_values": 0.0001
    },
    "img_attn_block": {
      "self_attn_cfg": {
        "embed_dims": 256,
        "num_levels": 4,
        "im2col_step": 1
      },
      "ffn_cfg": {
        "embed_dims": 256,
        "feedforward_channels": 2048,
        "ffn_drop": 0.0
      }
    },
    "text_attn_block": {
      "self_attn_cfg": {
        "num_heads": 4,
        "embed_dims": 256
      },
      "ffn_cfg": {
        "embed_dims": 256,
        "feedforward_channels": 1024,
        "ffn_drop": 0.0
      }
    },
    "num_feature_levels": 4,
    "positional_encoding": {
      "num_feats": 128,
      "normalize": true,
      "offset": 0.0,
      "temperature": 20
    }
  },
  "spatial_enhancer": {
    "type": "robo_orchard_lab.models.bip3d.spatial_enhancer:DepthFusionSpatialEnhancer",
    "embed_dims": 256,
    "feature_3d_dim": 32,
    "num_depth_layers": 2,
    "min_depth": 0.25,
    "max_depth": 10,
    "num_depth": 64,
    "with_feature_3d": true,
    "loss_depth_weight": 1.0
  },
  "data_preprocessor": {
    "type": "robo_orchard_lab.models.layers.data_preprocessors:BaseDataPreprocessor",
    "mean": [
      123.675,
      116.28,
      103.53
    ],
    "std": [
      58.395,
      57.12,
      57.375
    ],
    "channel_flip": true,
    "batch_transforms": [
      {
        "type": "robo_orchard_lab.models.bip3d.spatial_enhancer:BatchDepthProbGTGenerator",
        "min_depth": 0.25,
        "max_depth": 10,
        "num_depth": 64,
        "origin_stride": 4,
        "valid_threshold": 0.0,
        "stride": [
          8,
          16,
          32,
          64
        ]
      },
      {
        "type": "robo_orchard_lab.models.layers.data_preprocessors:GridMask",
        "apply_grid_mask_keys": [
          "imgs",
          "depths"
        ]
      }
    ]
  },
  "backbone_3d": {
    "type": "robo_orchard_lab.models.modules.resnet:ResNet",
    "depth": 34,
    "in_channels": 1,
    "base_channels": 4,
    "num_stages": 4,
    "out_indices": [
      1,
      2,
      3
    ],
    "bn_eval": true,
    "with_cp": true,
    "style": "pytorch"
  },
  "neck_3d": {
    "type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
    "in_channels": [
      8,
      16,
      32
    ],
    "kernel_size": 1,
    "out_channels": 32,
    "act_cfg": null,
    "bias": true,
    "norm_cfg": {
      "type": "torch.nn.modules.normalization:GroupNorm",
      "num_groups": 4
    },
    "num_outs": 4
  },
  "input_2d": "imgs",
  "input_3d": "depths",
  "embed_dims": 256,
  "pre_spatial_enhancer": false
}