| { | |
| "model": { | |
| "_target_": "sam2.modeling.sam2_base.SAM2Plus", | |
| "image_encoder": { | |
| "_target_": "sam2.modeling.backbones.image_encoder.ImageEncoder", | |
| "scalp": 1, | |
| "trunk": { | |
| "_target_": "sam2.modeling.backbones.hieradet.Hiera", | |
| "embed_dim": 112, | |
| "num_heads": 2 | |
| }, | |
| "neck": { | |
| "_target_": "sam2.modeling.backbones.image_encoder.FpnNeck", | |
| "position_encoding": { | |
| "_target_": "sam2.modeling.position_encoding.PositionEmbeddingSine", | |
| "num_pos_feats": 256, | |
| "normalize": true, | |
| "scale": null, | |
| "temperature": 10000 | |
| }, | |
| "d_model": 256, | |
| "backbone_channel_list": [ | |
| 896, | |
| 448, | |
| 224, | |
| 112 | |
| ], | |
| "fpn_top_down_levels": [ | |
| 2, | |
| 3 | |
| ], | |
| "fpn_interp_model": "nearest" | |
| } | |
| }, | |
| "memory_attention": { | |
| "_target_": "sam2.modeling.memory_attention.MemoryAttention", | |
| "d_model": 256, | |
| "pos_enc_at_input": true, | |
| "layer": { | |
| "_target_": "sam2.modeling.memory_attention.MemoryAttentionLayer", | |
| "activation": "relu", | |
| "dim_feedforward": 2048, | |
| "dropout": 0.1, | |
| "pos_enc_at_attn": false, | |
| "self_attention": { | |
| "_target_": "sam2.modeling.sam.transformer.RoPEAttention", | |
| "rope_theta": 10000.0, | |
| "feat_sizes": [ | |
| 64, | |
| 64 | |
| ], | |
| "embedding_dim": 256, | |
| "num_heads": 1, | |
| "downsample_rate": 1, | |
| "dropout": 0.1 | |
| }, | |
| "d_model": 256, | |
| "pos_enc_at_cross_attn_keys": true, | |
| "pos_enc_at_cross_attn_queries": false, | |
| "cross_attention": { | |
| "_target_": "sam2.modeling.sam.transformer.RoPEAttention", | |
| "rope_theta": 10000.0, | |
| "feat_sizes": [ | |
| 64, | |
| 64 | |
| ], | |
| "rope_k_repeat": true, | |
| "embedding_dim": 256, | |
| "num_heads": 1, | |
| "downsample_rate": 1, | |
| "dropout": 0.1, | |
| "kv_in_dim": 64 | |
| } | |
| }, | |
| "num_layers": 4 | |
| }, | |
| "memory_encoder": { | |
| "_target_": "sam2.modeling.memory_encoder.MemoryEncoder", | |
| "out_dim": 64, | |
| "position_encoding": { | |
| "_target_": "sam2.modeling.position_encoding.PositionEmbeddingSine", | |
| "num_pos_feats": 64, | |
| "normalize": true, | |
| "scale": null, | |
| "temperature": 10000 | |
| }, | |
| "mask_downsampler": { | |
| "_target_": "sam2.modeling.memory_encoder.MaskDownSampler", | |
| "kernel_size": 3, | |
| "stride": 2, | |
| "padding": 1 | |
| }, | |
| "fuser": { | |
| "_target_": "sam2.modeling.memory_encoder.Fuser", | |
| "layer": { | |
| "_target_": "sam2.modeling.memory_encoder.CXBlock", | |
| "dim": 256, | |
| "kernel_size": 7, | |
| "padding": 3, | |
| "layer_scale_init_value": 1e-6, | |
| "use_dwconv": true | |
| }, | |
| "num_layers": 2 | |
| } | |
| }, | |
| "num_maskmem": 7, | |
| "image_size": 1024, | |
| "sigmoid_scale_for_mem_enc": 20.0, | |
| "sigmoid_bias_for_mem_enc": -10.0, | |
| "use_mask_input_as_output_without_sam": true, | |
| "directly_add_no_mem_embed": true, | |
| "no_obj_embed_spatial": true, | |
| "use_high_res_features_in_sam": true, | |
| "separate_image_encoder": false, | |
| "separate_memory_attention": true, | |
| "separate_memory_encoder": true, | |
| "unified_decoder_box_head_freeze_bn": true, | |
| "unified_decoder_box_head_inner_dim": 256, | |
| "unified_decoder_box_head_pred_masks": true, | |
| "multimask_output_in_sam": true, | |
| "iou_prediction_use_sigmoid": true, | |
| "use_obj_ptrs_in_encoder": true, | |
| "add_tpos_enc_to_obj_ptrs": true, | |
| "proj_tpos_enc_in_obj_ptrs": true, | |
| "use_signed_tpos_enc_to_obj_ptrs": true, | |
| "only_obj_ptrs_in_the_past_for_eval": true, | |
| "pred_obj_scores": true, | |
| "pred_obj_scores_mlp": true, | |
| "fixed_no_obj_ptr": true, | |
| "multimask_output_for_tracking": true, | |
| "use_multimask_token_for_obj_ptr": true, | |
| "multimask_min_pt_num": 0, | |
| "multimask_max_pt_num": 1, | |
| "use_mlp_for_obj_ptr_proj": true, | |
| "compile_image_encoder": false | |
| } | |
| } |