{
  "model": {
    "_target_": "sam2.modeling.sam2_base.SAM2Plus",
    "image_encoder": {
      "_target_": "sam2.modeling.backbones.image_encoder.ImageEncoder",
      "scalp": 1,
      "trunk": {
        "_target_": "sam2.modeling.backbones.hieradet.Hiera",
        "embed_dim": 112,
        "num_heads": 2
      },
      "neck": {
        "_target_": "sam2.modeling.backbones.image_encoder.FpnNeck",
        "position_encoding": {
          "_target_": "sam2.modeling.position_encoding.PositionEmbeddingSine",
          "num_pos_feats": 256,
          "normalize": true,
          "scale": null,
          "temperature": 10000
        },
        "d_model": 256,
        "backbone_channel_list": [
          896,
          448,
          224,
          112
        ],
        "fpn_top_down_levels": [
          2,
          3
        ],
        "fpn_interp_model": "nearest"
      }
    },
    "memory_attention": {
      "_target_": "sam2.modeling.memory_attention.MemoryAttention",
      "d_model": 256,
      "pos_enc_at_input": true,
      "layer": {
        "_target_": "sam2.modeling.memory_attention.MemoryAttentionLayer",
        "activation": "relu",
        "dim_feedforward": 2048,
        "dropout": 0.1,
        "pos_enc_at_attn": false,
        "self_attention": {
          "_target_": "sam2.modeling.sam.transformer.RoPEAttention",
          "rope_theta": 10000.0,
          "feat_sizes": [
            64,
            64
          ],
          "embedding_dim": 256,
          "num_heads": 1,
          "downsample_rate": 1,
          "dropout": 0.1
        },
        "d_model": 256,
        "pos_enc_at_cross_attn_keys": true,
        "pos_enc_at_cross_attn_queries": false,
        "cross_attention": {
          "_target_": "sam2.modeling.sam.transformer.RoPEAttention",
          "rope_theta": 10000.0,
          "feat_sizes": [
            64,
            64
          ],
          "rope_k_repeat": true,
          "embedding_dim": 256,
          "num_heads": 1,
          "downsample_rate": 1,
          "dropout": 0.1,
          "kv_in_dim": 64
        }
      },
      "num_layers": 4
    },
    "memory_encoder": {
      "_target_": "sam2.modeling.memory_encoder.MemoryEncoder",
      "out_dim": 64,
      "position_encoding": {
        "_target_": "sam2.modeling.position_encoding.PositionEmbeddingSine",
        "num_pos_feats": 64,
        "normalize": true,
        "scale": null,
        "temperature": 10000
      },
      "mask_downsampler": {
        "_target_": "sam2.modeling.memory_encoder.MaskDownSampler",
        "kernel_size": 3,
        "stride": 2,
        "padding": 1
      },
      "fuser": {
        "_target_": "sam2.modeling.memory_encoder.Fuser",
        "layer": {
          "_target_": "sam2.modeling.memory_encoder.CXBlock",
          "dim": 256,
          "kernel_size": 7,
          "padding": 3,
          "layer_scale_init_value": 1e-6,
          "use_dwconv": true
        },
        "num_layers": 2
      }
    },
    "num_maskmem": 7,
    "image_size": 1024,
    "sigmoid_scale_for_mem_enc": 20.0,
    "sigmoid_bias_for_mem_enc": -10.0,
    "use_mask_input_as_output_without_sam": true,
    "directly_add_no_mem_embed": true,
    "no_obj_embed_spatial": true,
    "use_high_res_features_in_sam": true,
    "separate_image_encoder": false,
    "separate_memory_attention": true,
    "separate_memory_encoder": true,
    "unified_decoder_box_head_freeze_bn": true,
    "unified_decoder_box_head_inner_dim": 256,
    "unified_decoder_box_head_pred_masks": true,
    "multimask_output_in_sam": true,
    "iou_prediction_use_sigmoid": true,
    "use_obj_ptrs_in_encoder": true,
    "add_tpos_enc_to_obj_ptrs": true,
    "proj_tpos_enc_in_obj_ptrs": true,
    "use_signed_tpos_enc_to_obj_ptrs": true,
    "only_obj_ptrs_in_the_past_for_eval": true,
    "pred_obj_scores": true,
    "pred_obj_scores_mlp": true,
    "fixed_no_obj_ptr": true,
    "multimask_output_for_tracking": true,
    "use_multimask_token_for_obj_ptr": true,
    "multimask_min_pt_num": 0,
    "multimask_max_pt_num": 1,
    "use_mlp_for_obj_ptr_proj": true,
    "compile_image_encoder": false
  }
}