| |
|
|
| |
| model: |
| _target_: sam2.modeling.sam2_base.SAM2Base |
| image_encoder: |
| _target_: sam2.modeling.backbones.image_encoder.ImageEncoder |
| scalp: 1 |
| trunk: |
| _target_: sam2.modeling.backbones.hieradet.Hiera |
| embed_dim: 144 |
| num_heads: 2 |
| stages: [2, 6, 36, 4] |
| global_att_blocks: [23, 33, 43] |
| window_pos_embed_bkg_spatial_size: [7, 7] |
| window_spec: [8, 4, 16, 8] |
| neck: |
| _target_: sam2.modeling.backbones.image_encoder.FpnNeck |
| position_encoding: |
| _target_: sam2.modeling.position_encoding.PositionEmbeddingSine |
| num_pos_feats: 256 |
| normalize: true |
| scale: null |
| temperature: 10000 |
| d_model: 256 |
| backbone_channel_list: [1152, 576, 288, 144] |
| fpn_top_down_levels: [2, 3] |
| fpn_interp_model: nearest |
|
|
| memory_attention: |
| _target_: sam2.modeling.memory_attention.MemoryAttention |
| d_model: 256 |
| pos_enc_at_input: true |
| layer: |
| _target_: sam2.modeling.memory_attention.MemoryAttentionLayer |
| activation: relu |
| dim_feedforward: 2048 |
| dropout: 0.1 |
| pos_enc_at_attn: false |
| self_attention: |
| _target_: sam2.modeling.sam.transformer.RoPEAttention |
| rope_theta: 10000.0 |
| feat_sizes: [32, 32] |
| embedding_dim: 256 |
| num_heads: 1 |
| downsample_rate: 1 |
| dropout: 0.1 |
| d_model: 256 |
| pos_enc_at_cross_attn_keys: true |
| pos_enc_at_cross_attn_queries: false |
| cross_attention: |
| _target_: sam2.modeling.sam.transformer.RoPEAttention |
| rope_theta: 10000.0 |
| feat_sizes: [32, 32] |
| rope_k_repeat: True |
| embedding_dim: 256 |
| num_heads: 1 |
| downsample_rate: 1 |
| dropout: 0.1 |
| kv_in_dim: 64 |
| num_layers: 4 |
|
|
| memory_encoder: |
| _target_: sam2.modeling.memory_encoder.MemoryEncoder |
| out_dim: 64 |
| position_encoding: |
| _target_: sam2.modeling.position_encoding.PositionEmbeddingSine |
| num_pos_feats: 64 |
| normalize: true |
| scale: null |
| temperature: 10000 |
| mask_downsampler: |
| _target_: sam2.modeling.memory_encoder.MaskDownSampler |
| kernel_size: 3 |
| stride: 2 |
| padding: 1 |
| fuser: |
| _target_: sam2.modeling.memory_encoder.Fuser |
| layer: |
| _target_: sam2.modeling.memory_encoder.CXBlock |
| dim: 256 |
| kernel_size: 7 |
| padding: 3 |
| layer_scale_init_value: 1e-6 |
| use_dwconv: True |
| num_layers: 2 |
|
|
| num_maskmem: 7 |
| image_size: 1024 |
| |
| sigmoid_scale_for_mem_enc: 20.0 |
| sigmoid_bias_for_mem_enc: -10.0 |
| use_mask_input_as_output_without_sam: true |
| |
| directly_add_no_mem_embed: true |
| |
| use_high_res_features_in_sam: true |
| |
| multimask_output_in_sam: true |
| |
| iou_prediction_use_sigmoid: True |
| |
| use_obj_ptrs_in_encoder: true |
| add_tpos_enc_to_obj_ptrs: false |
| only_obj_ptrs_in_the_past_for_eval: true |
| |
| pred_obj_scores: true |
| pred_obj_scores_mlp: true |
| fixed_no_obj_ptr: true |
| |
| multimask_output_for_tracking: true |
| use_multimask_token_for_obj_ptr: true |
| multimask_min_pt_num: 0 |
| multimask_max_pt_num: 1 |
| use_mlp_for_obj_ptr_proj: true |
| |
| compile_image_encoder: False |
|
|