{ "__config_type__": "robo_orchard_lab.models.bip3d.structure:BIP3DConfig", "class_type": "robo_orchard_lab.models.bip3d.structure:BIP3D", "backbone": { "type": "robo_orchard_lab.models.modules.swin_transformer:SwinTransformer", "embed_dims": 96, "depths": [ 2, 2, 6, 2 ], "num_heads": [ 3, 6, 12, 24 ], "window_size": 7, "mlp_ratio": 4, "qkv_bias": true, "qk_scale": null, "drop_rate": 0.0, "attn_drop_rate": 0.0, "out_indices": [ 1, 2, 3 ], "with_cp": true, "convert_weights": false }, "decoder": { "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:BBox3DDecoder", "look_forward_twice": true, "instance_bank": { "type": "robo_orchard_lab.models.bip3d.grounding_decoder.instance_bank:InstanceBank", "num_anchor": 50, "anchor": "./anchor_files/embodiedscan_kmeans_det_cam_log_z-0.2-3.npy", "embed_dims": 256, "anchor_in_camera": true }, "anchor_encoder": { "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:DoF9BoxEncoder", "embed_dims": 256, "rot_dims": 3 }, "graph_model": { "type": "robo_orchard_lab.models.layers.transformer_layers:MultiheadAttention", "embed_dims": 256, "num_heads": 8, "batch_first": true }, "ffn": { "type": "robo_orchard_lab.models.layers.transformer_layers:FFN", "embed_dims": 256, "feedforward_channels": 2048, "ffn_drop": 0.0 }, "norm_layer": { "type": "torch.nn.modules.normalization:LayerNorm", "normalized_shape": 256 }, "deformable_model": { "type": "robo_orchard_lab.models.bip3d.grounding_decoder.deformable_aggregation:DeformableFeatureAggregation", "embed_dims": 256, "num_groups": 8, "num_levels": 4, "use_camera_embed": true, "with_depth": true, "min_depth": 0.25, "max_depth": 10, "kps_generator": { "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:SparseBox3DKeyPointsGenerator", "fix_scale": [ [ 0, 0, 0 ], [ 0.45, 0, 0 ], [ -0.45, 0, 0 ], [ 0, 0.45, 0 ], [ 0, -0.45, 0 ], [ 0, 0, 0.45 ], [ 0, 0, -0.45 ] ], "num_learnable_pts": 9 }, "with_value_proj": true, "filter_outlier": true }, "text_cross_attn": { "type": "robo_orchard_lab.models.layers.transformer_layers:MultiheadAttention", "embed_dims": 256, "num_heads": 8, "batch_first": true }, "refine_layer": { "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:GroundingRefineClsHead", "embed_dims": 256, "output_dim": 9, "cls_bias": true }, "loss_cls": { "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:FocalLoss", "use_sigmoid": true, "gamma": 2.0, "alpha": 0.25, "loss_weight": 1.0 }, "loss_reg": { "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:DoF9BoxLoss", "loss_weight_wd": 1.0, "loss_weight_cd": 0.8 }, "sampler": { "type": "robo_orchard_lab.models.bip3d.grounding_decoder.target:Grounding3DTarget", "cls_weight": 1.0, "box_weight": 1.0, "num_dn": 100, "cost_weight_wd": 1.0, "cost_weight_cd": 0.8, "with_dn_query": true, "num_classes": 284, "embed_dims": 256 }, "gt_reg_key": "gt_bboxes_3d", "gt_cls_key": "tokens_positive", "post_processor": { "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:GroundingBox3DPostProcess", "num_output": 1000 } }, "neck": { "type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper", "in_channels": [ 192, 384, 768 ], "kernel_size": 1, "out_channels": 256, "act_cfg": null, "bias": true, "norm_cfg": { "type": "torch.nn.modules.normalization:GroupNorm", "num_groups": 32 }, "num_outs": 4 }, "text_encoder": { "type": "robo_orchard_lab.models.bip3d.bert:BertModel", "special_tokens_list": [ "[CLS]", "[SEP]" ], "name": "./ckpt/bert-base-uncased", "pad_to_max": false, "use_sub_sentence_represent": true, "add_pooling_layer": false, "max_tokens": 768, "use_checkpoint": true, "return_tokenized": true }, "feature_enhancer": { "type": "robo_orchard_lab.models.bip3d.feature_enhancer:TextImageDeformable2DEnhancer", "embed_dims": 256, "num_layers": 6, "text_img_attn_block": { "v_dim": 256, "l_dim": 256, "embed_dim": 1024, "num_heads": 4, "init_values": 0.0001 }, "img_attn_block": { "self_attn_cfg": { "embed_dims": 256, "num_levels": 4, "im2col_step": 1 }, "ffn_cfg": { "embed_dims": 256, "feedforward_channels": 2048, "ffn_drop": 0.0 } }, "text_attn_block": { "self_attn_cfg": { "num_heads": 4, "embed_dims": 256 }, "ffn_cfg": { "embed_dims": 256, "feedforward_channels": 1024, "ffn_drop": 0.0 } }, "num_feature_levels": 4, "positional_encoding": { "num_feats": 128, "normalize": true, "offset": 0.0, "temperature": 20 } }, "spatial_enhancer": { "type": "robo_orchard_lab.models.bip3d.spatial_enhancer:DepthFusionSpatialEnhancer", "embed_dims": 256, "feature_3d_dim": 32, "num_depth_layers": 2, "min_depth": 0.25, "max_depth": 10, "num_depth": 64, "with_feature_3d": true, "loss_depth_weight": 1.0 }, "data_preprocessor": { "type": "robo_orchard_lab.models.layers.data_preprocessors:BaseDataPreprocessor", "mean": [ 123.675, 116.28, 103.53 ], "std": [ 58.395, 57.12, 57.375 ], "channel_flip": true, "batch_transforms": [ { "type": "robo_orchard_lab.models.bip3d.spatial_enhancer:BatchDepthProbGTGenerator", "min_depth": 0.25, "max_depth": 10, "num_depth": 64, "origin_stride": 4, "valid_threshold": 0.0, "stride": [ 8, 16, 32, 64 ] }, { "type": "robo_orchard_lab.models.layers.data_preprocessors:GridMask", "apply_grid_mask_keys": [ "imgs", "depths" ] } ] }, "backbone_3d": { "type": "robo_orchard_lab.models.modules.resnet:ResNet", "depth": 34, "in_channels": 1, "base_channels": 4, "num_stages": 4, "out_indices": [ 1, 2, 3 ], "bn_eval": true, "with_cp": true, "style": "pytorch" }, "neck_3d": { "type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper", "in_channels": [ 8, 16, 32 ], "kernel_size": 1, "out_channels": 32, "act_cfg": null, "bias": true, "norm_cfg": { "type": "torch.nn.modules.normalization:GroupNorm", "num_groups": 4 }, "num_outs": 4 }, "input_2d": "imgs", "input_3d": "depths", "embed_dims": 256, "pre_spatial_enhancer": false }