Transformers
File size: 5,092 Bytes
c108889
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
{
    "model": {
        "_target_": "sam2.modeling.sam2_base.SAM2Plus",
        "image_encoder": {
            "_target_": "sam2.modeling.backbones.image_encoder.ImageEncoder",
            "scalp": 1,
            "trunk": {
                "_target_": "sam2.modeling.backbones.hieradet.Hiera",
                "embed_dim": 112,
                "num_heads": 2
            },
            "neck": {
                "_target_": "sam2.modeling.backbones.image_encoder.FpnNeck",
                "position_encoding": {
                    "_target_": "sam2.modeling.position_encoding.PositionEmbeddingSine",
                    "num_pos_feats": 256,
                    "normalize": true,
                    "scale": null,
                    "temperature": 10000
                },
                "d_model": 256,
                "backbone_channel_list": [
                    896,
                    448,
                    224,
                    112
                ],
                "fpn_top_down_levels": [
                    2,
                    3
                ],
                "fpn_interp_model": "nearest"
            }
        },
        "memory_attention": {
            "_target_": "sam2.modeling.memory_attention.MemoryAttention",
            "d_model": 256,
            "pos_enc_at_input": true,
            "layer": {
                "_target_": "sam2.modeling.memory_attention.MemoryAttentionLayer",
                "activation": "relu",
                "dim_feedforward": 2048,
                "dropout": 0.1,
                "pos_enc_at_attn": false,
                "self_attention": {
                    "_target_": "sam2.modeling.sam.transformer.RoPEAttention",
                    "rope_theta": 10000.0,
                    "feat_sizes": [
                        64,
                        64
                    ],
                    "embedding_dim": 256,
                    "num_heads": 1,
                    "downsample_rate": 1,
                    "dropout": 0.1
                },
                "d_model": 256,
                "pos_enc_at_cross_attn_keys": true,
                "pos_enc_at_cross_attn_queries": false,
                "cross_attention": {
                    "_target_": "sam2.modeling.sam.transformer.RoPEAttention",
                    "rope_theta": 10000.0,
                    "feat_sizes": [
                        64,
                        64
                    ],
                    "rope_k_repeat": true,
                    "embedding_dim": 256,
                    "num_heads": 1,
                    "downsample_rate": 1,
                    "dropout": 0.1,
                    "kv_in_dim": 64
                }
            },
            "num_layers": 4
        },
        "memory_encoder": {
            "_target_": "sam2.modeling.memory_encoder.MemoryEncoder",
            "out_dim": 64,
            "position_encoding": {
                "_target_": "sam2.modeling.position_encoding.PositionEmbeddingSine",
                "num_pos_feats": 64,
                "normalize": true,
                "scale": null,
                "temperature": 10000
            },
            "mask_downsampler": {
                "_target_": "sam2.modeling.memory_encoder.MaskDownSampler",
                "kernel_size": 3,
                "stride": 2,
                "padding": 1
            },
            "fuser": {
                "_target_": "sam2.modeling.memory_encoder.Fuser",
                "layer": {
                    "_target_": "sam2.modeling.memory_encoder.CXBlock",
                    "dim": 256,
                    "kernel_size": 7,
                    "padding": 3,
                    "layer_scale_init_value": 1e-6,
                    "use_dwconv": true
                },
                "num_layers": 2
            }
        },
        "num_maskmem": 7,
        "image_size": 1024,
        "sigmoid_scale_for_mem_enc": 20.0,
        "sigmoid_bias_for_mem_enc": -10.0,
        "use_mask_input_as_output_without_sam": true,
        "directly_add_no_mem_embed": true,
        "no_obj_embed_spatial": true,
        "use_high_res_features_in_sam": true,
        "separate_image_encoder": false,
        "separate_memory_attention": true,
        "separate_memory_encoder": true,
        "unified_decoder_box_head_freeze_bn": true,
        "unified_decoder_box_head_inner_dim": 256,
        "unified_decoder_box_head_pred_masks": true,
        "multimask_output_in_sam": true,
        "iou_prediction_use_sigmoid": true,
        "use_obj_ptrs_in_encoder": true,
        "add_tpos_enc_to_obj_ptrs": true,
        "proj_tpos_enc_in_obj_ptrs": true,
        "use_signed_tpos_enc_to_obj_ptrs": true,
        "only_obj_ptrs_in_the_past_for_eval": true,
        "pred_obj_scores": true,
        "pred_obj_scores_mlp": true,
        "fixed_no_obj_ptr": true,
        "multimask_output_for_tracking": true,
        "use_multimask_token_for_obj_ptr": true,
        "multimask_min_pt_num": 0,
        "multimask_max_pt_num": 1,
        "use_mlp_for_obj_ptr_proj": true,
        "compile_image_encoder": false
    }
}