Transformers
SAM2-Plus / config.json
jiamingZ's picture
Upload config.json
c108889 verified
{
"model": {
"_target_": "sam2.modeling.sam2_base.SAM2Plus",
"image_encoder": {
"_target_": "sam2.modeling.backbones.image_encoder.ImageEncoder",
"scalp": 1,
"trunk": {
"_target_": "sam2.modeling.backbones.hieradet.Hiera",
"embed_dim": 112,
"num_heads": 2
},
"neck": {
"_target_": "sam2.modeling.backbones.image_encoder.FpnNeck",
"position_encoding": {
"_target_": "sam2.modeling.position_encoding.PositionEmbeddingSine",
"num_pos_feats": 256,
"normalize": true,
"scale": null,
"temperature": 10000
},
"d_model": 256,
"backbone_channel_list": [
896,
448,
224,
112
],
"fpn_top_down_levels": [
2,
3
],
"fpn_interp_model": "nearest"
}
},
"memory_attention": {
"_target_": "sam2.modeling.memory_attention.MemoryAttention",
"d_model": 256,
"pos_enc_at_input": true,
"layer": {
"_target_": "sam2.modeling.memory_attention.MemoryAttentionLayer",
"activation": "relu",
"dim_feedforward": 2048,
"dropout": 0.1,
"pos_enc_at_attn": false,
"self_attention": {
"_target_": "sam2.modeling.sam.transformer.RoPEAttention",
"rope_theta": 10000.0,
"feat_sizes": [
64,
64
],
"embedding_dim": 256,
"num_heads": 1,
"downsample_rate": 1,
"dropout": 0.1
},
"d_model": 256,
"pos_enc_at_cross_attn_keys": true,
"pos_enc_at_cross_attn_queries": false,
"cross_attention": {
"_target_": "sam2.modeling.sam.transformer.RoPEAttention",
"rope_theta": 10000.0,
"feat_sizes": [
64,
64
],
"rope_k_repeat": true,
"embedding_dim": 256,
"num_heads": 1,
"downsample_rate": 1,
"dropout": 0.1,
"kv_in_dim": 64
}
},
"num_layers": 4
},
"memory_encoder": {
"_target_": "sam2.modeling.memory_encoder.MemoryEncoder",
"out_dim": 64,
"position_encoding": {
"_target_": "sam2.modeling.position_encoding.PositionEmbeddingSine",
"num_pos_feats": 64,
"normalize": true,
"scale": null,
"temperature": 10000
},
"mask_downsampler": {
"_target_": "sam2.modeling.memory_encoder.MaskDownSampler",
"kernel_size": 3,
"stride": 2,
"padding": 1
},
"fuser": {
"_target_": "sam2.modeling.memory_encoder.Fuser",
"layer": {
"_target_": "sam2.modeling.memory_encoder.CXBlock",
"dim": 256,
"kernel_size": 7,
"padding": 3,
"layer_scale_init_value": 1e-6,
"use_dwconv": true
},
"num_layers": 2
}
},
"num_maskmem": 7,
"image_size": 1024,
"sigmoid_scale_for_mem_enc": 20.0,
"sigmoid_bias_for_mem_enc": -10.0,
"use_mask_input_as_output_without_sam": true,
"directly_add_no_mem_embed": true,
"no_obj_embed_spatial": true,
"use_high_res_features_in_sam": true,
"separate_image_encoder": false,
"separate_memory_attention": true,
"separate_memory_encoder": true,
"unified_decoder_box_head_freeze_bn": true,
"unified_decoder_box_head_inner_dim": 256,
"unified_decoder_box_head_pred_masks": true,
"multimask_output_in_sam": true,
"iou_prediction_use_sigmoid": true,
"use_obj_ptrs_in_encoder": true,
"add_tpos_enc_to_obj_ptrs": true,
"proj_tpos_enc_in_obj_ptrs": true,
"use_signed_tpos_enc_to_obj_ptrs": true,
"only_obj_ptrs_in_the_past_for_eval": true,
"pred_obj_scores": true,
"pred_obj_scores_mlp": true,
"fixed_no_obj_ptr": true,
"multimask_output_for_tracking": true,
"use_multimask_token_for_obj_ptr": true,
"multimask_min_pt_num": 0,
"multimask_max_pt_num": 1,
"use_mlp_for_obj_ptr_proj": true,
"compile_image_encoder": false
}
}