# SAM 2 model configuration (Hiera trunk, embed_dim 96), Hydra-style YAML.
# Each `_target_` names the class to instantiate; its sibling keys are the
# arguments passed to that class.
#
# NOTE(review): the nesting below was restored from a copy whose indentation
# had been flattened; confirm each key's parent against the constructor of the
# corresponding `_target_` class.
# NOTE(review): if this file is composed from inside a Hydra config group, it
# may also need a "@package _global_" header comment as its very first line;
# confirm against the code that loads it.

model:
  _target_: sam2.modeling.sam2_base.SAM2Base

  # Image encoder: Hiera trunk followed by an FPN neck.
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 96
      num_heads: 1
      stages: [1, 2, 7, 2]
      global_att_blocks: [5, 7, 9]
      window_pos_embed_bkg_spatial_size: [7, 7]
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      # Presumably the trunk's per-stage channel widths, deepest stage first
      # (embed_dim 96 doubling per stage) - TODO confirm.
      backbone_channel_list: [768, 384, 192, 96]
      fpn_top_down_levels: [2, 3]
      fpn_interp_model: nearest

  # Memory attention: a stack of `num_layers` layers, each configured with a
  # RoPE self-attention and a RoPE cross-attention module.
  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        rope_k_repeat: true
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        # Same value as memory_encoder.out_dim below; presumably the two must
        # be kept in sync - TODO confirm.
        kv_in_dim: 64
    num_layers: 4

  # Memory encoder: mask downsampler, fuser (stack of CXBlock layers) and its
  # own sine position encoding.
  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        # Written with an explicit decimal point so that YAML 1.1 loaders
        # resolve it as a float rather than a string.
        layer_scale_init_value: 1.0e-6
        use_dwconv: true
      num_layers: 2

  # Scalar options passed directly to the top-level model.
  num_maskmem: 7
  image_size: 1024

  # Memory-encoder sigmoid scale/bias and mask-input passthrough.
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true

  directly_add_no_mem_embed: true

  use_high_res_features_in_sam: true

  multimask_output_in_sam: true

  iou_prediction_use_sigmoid: true

  # Object-pointer options.
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: false
  only_obj_ptrs_in_the_past_for_eval: true

  # Object-score prediction options.
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true

  # Multimask options used during tracking.
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true

  # NOTE(review): disabled in the original; confirm that this trunk supports
  # compilation before enabling it.
  compile_image_encoder: false