danelcsb committed on
Commit
6bd14d2
·
verified ·
1 Parent(s): 7628207

Upload model

Browse files
Files changed (2) hide show
  1. config.json +84 -94
  2. model.safetensors +2 -2
config.json CHANGED
@@ -1,89 +1,15 @@
1
  {
2
- "add_all_frames_to_correct_as_cond": false,
3
  "architectures": [
4
  "Sam2Model"
5
  ],
6
- "backbone_stride": 16,
7
  "binarize_mask_from_pts_for_mem_enc": true,
8
  "enable_occlusion_spatial_embedding": true,
9
  "enable_temporal_pos_encoding_for_object_pointers": true,
10
  "fill_hole_area": 8,
11
- "image_encoder_config": {
12
- "backbone_channel_list": [
13
- 768,
14
- 384,
15
- 192,
16
- 96
17
- ],
18
- "backbone_feature_sizes": [
19
- [
20
- 256,
21
- 256
22
- ],
23
- [
24
- 128,
25
- 128
26
- ],
27
- [
28
- 64,
29
- 64
30
- ]
31
- ],
32
- "dim_mul": 2.0,
33
- "drop_path_rate": 0.0,
34
- "fpn_hidden_size": 256,
35
- "fpn_interpolation_mode": "nearest",
36
- "fpn_kernel_size": 1,
37
- "fpn_padding": 0,
38
- "fpn_stride": 1,
39
- "fpn_top_down_levels": [
40
- 2,
41
- 3
42
- ],
43
- "fuse_type": "sum",
44
- "global_attention_blocks": [
45
- 5,
46
- 7,
47
- 9
48
- ],
49
- "head_mul": 2.0,
50
- "hidden_act": "gelu",
51
- "hidden_size": 96,
52
- "image_size": 1024,
53
- "layer_norm_eps": 1e-06,
54
- "model_type": "",
55
- "num_channels": 3,
56
- "num_feature_levels": 3,
57
- "num_heads": 1,
58
- "patch_kernel_size": 7,
59
- "patch_padding": 3,
60
- "patch_stride": 4,
61
- "q_pool": 3,
62
- "q_stride": [
63
- 2,
64
- 2
65
- ],
66
- "stages": [
67
- 1,
68
- 2,
69
- 7,
70
- 2
71
- ],
72
- "window_positional_embedding_background_size": [
73
- 7,
74
- 7
75
- ],
76
- "window_spec": [
77
- 8,
78
- 4,
79
- 14,
80
- 7
81
- ]
82
- },
83
  "image_size": 1024,
84
  "initializer_range": 0.02,
85
- "iou_prediction_use_sigmoid": true,
86
  "mask_decoder_config": {
 
87
  "dynamic_multimask_stability_delta": 0.05,
88
  "dynamic_multimask_stability_thresh": 0.98,
89
  "dynamic_multimask_via_stability": true,
@@ -92,37 +18,31 @@
92
  "hidden_size": 256,
93
  "iou_head_depth": 3,
94
  "iou_head_hidden_dim": 256,
95
- "iou_prediction_use_sigmoid": true,
96
  "model_type": "",
 
 
97
  "num_multimask_outputs": 3,
98
- "two_way_transformer_activation": "relu",
99
- "two_way_transformer_attention_downsample_rate": 2,
100
- "two_way_transformer_depth": 2,
101
- "two_way_transformer_embedding_dim": 256,
102
- "two_way_transformer_mlp_dim": 2048,
103
- "two_way_transformer_num_heads": 8,
104
- "use_multimask_token_for_object_pointer": true
105
  },
106
- "max_cond_frames_in_attn": -1,
107
  "max_object_pointers_in_encoder": 16,
108
  "memory_attention_config": {
109
  "apply_pe_at_cross_attn_keys": true,
110
  "apply_pe_at_cross_attn_queries": false,
111
  "apply_pe_at_self_attn": false,
 
112
  "dim_feedforward": 2048,
113
  "dropout": 0.1,
114
  "hidden_act": "relu",
115
  "hidden_size": 256,
116
  "model_type": "",
 
117
  "num_layers": 4,
118
- "rope_downsample_rate": 1,
119
  "rope_dropout": 0.1,
120
- "rope_embedding_dim": 256,
121
  "rope_feat_sizes": [
122
- 32,
123
- 32
124
  ],
125
- "rope_num_heads": 1,
126
  "rope_theta": 10000
127
  },
128
  "memory_encoder_config": {
@@ -143,7 +63,6 @@
143
  "model_type": "",
144
  "output_channels": 64
145
  },
146
- "memory_temporal_stride_for_eval": 1,
147
  "model_type": "sam2",
148
  "multimask_max_pt_num": 1,
149
  "multimask_min_pt_num": 0,
@@ -157,6 +76,7 @@
157
  "prompt_encoder_config": {
158
  "hidden_act": "gelu",
159
  "hidden_size": 256,
 
160
  "image_size": 1024,
161
  "layer_norm_eps": 1e-06,
162
  "mask_input_channels": 16,
@@ -168,8 +88,78 @@
168
  "sigmoid_bias_for_mem_enc": -10.0,
169
  "sigmoid_scale_for_mem_enc": 20.0,
170
  "torch_dtype": "float32",
171
- "transformers_version": "4.53.0.dev0",
172
- "use_mask_input_as_output_without_sam": true,
173
- "use_multimask_token_for_object_pointer": true,
174
- "use_object_pointers_in_encoder": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  }
 
1
  {
 
2
  "architectures": [
3
  "Sam2Model"
4
  ],
 
5
  "binarize_mask_from_pts_for_mem_enc": true,
6
  "enable_occlusion_spatial_embedding": true,
7
  "enable_temporal_pos_encoding_for_object_pointers": true,
8
  "fill_hole_area": 8,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  "image_size": 1024,
10
  "initializer_range": 0.02,
 
11
  "mask_decoder_config": {
12
+ "attention_downsample_rate": 2,
13
  "dynamic_multimask_stability_delta": 0.05,
14
  "dynamic_multimask_stability_thresh": 0.98,
15
  "dynamic_multimask_via_stability": true,
 
18
  "hidden_size": 256,
19
  "iou_head_depth": 3,
20
  "iou_head_hidden_dim": 256,
21
+ "mlp_dim": 2048,
22
  "model_type": "",
23
+ "num_attention_heads": 8,
24
+ "num_hidden_layers": 2,
25
  "num_multimask_outputs": 3,
26
+ "two_way_transformer_activation": "relu"
 
 
 
 
 
 
27
  },
 
28
  "max_object_pointers_in_encoder": 16,
29
  "memory_attention_config": {
30
  "apply_pe_at_cross_attn_keys": true,
31
  "apply_pe_at_cross_attn_queries": false,
32
  "apply_pe_at_self_attn": false,
33
+ "attention_downsample_rate": 1,
34
  "dim_feedforward": 2048,
35
  "dropout": 0.1,
36
  "hidden_act": "relu",
37
  "hidden_size": 256,
38
  "model_type": "",
39
+ "num_attention_heads": 1,
40
  "num_layers": 4,
 
41
  "rope_dropout": 0.1,
 
42
  "rope_feat_sizes": [
43
+ 64,
44
+ 64
45
  ],
 
46
  "rope_theta": 10000
47
  },
48
  "memory_encoder_config": {
 
63
  "model_type": "",
64
  "output_channels": 64
65
  },
 
66
  "model_type": "sam2",
67
  "multimask_max_pt_num": 1,
68
  "multimask_min_pt_num": 0,
 
76
  "prompt_encoder_config": {
77
  "hidden_act": "gelu",
78
  "hidden_size": 256,
79
+ "image_embedding_size": 64,
80
  "image_size": 1024,
81
  "layer_norm_eps": 1e-06,
82
  "mask_input_channels": 16,
 
88
  "sigmoid_bias_for_mem_enc": -10.0,
89
  "sigmoid_scale_for_mem_enc": 20.0,
90
  "torch_dtype": "float32",
91
+ "transformers_version": "4.54.0.dev0",
92
+ "vision_config": {
93
+ "backbone_channel_list": [
94
+ 768,
95
+ 384,
96
+ 192,
97
+ 96
98
+ ],
99
+ "backbone_feature_sizes": [
100
+ [
101
+ 256,
102
+ 256
103
+ ],
104
+ [
105
+ 128,
106
+ 128
107
+ ],
108
+ [
109
+ 64,
110
+ 64
111
+ ]
112
+ ],
113
+ "dim_mul": 2.0,
114
+ "drop_path_rate": 0.0,
115
+ "fpn_hidden_size": 256,
116
+ "fpn_interpolation_mode": "nearest",
117
+ "fpn_kernel_size": 1,
118
+ "fpn_padding": 0,
119
+ "fpn_stride": 1,
120
+ "fpn_top_down_levels": [
121
+ 2,
122
+ 3
123
+ ],
124
+ "fuse_type": "sum",
125
+ "global_attention_blocks": [
126
+ 5,
127
+ 7,
128
+ 9
129
+ ],
130
+ "head_mul": 2.0,
131
+ "hidden_act": "gelu",
132
+ "hidden_size": 96,
133
+ "image_size": 1024,
134
+ "initializer_range": 0.02,
135
+ "layer_norm_eps": 1e-06,
136
+ "model_type": "sam2_vision_model",
137
+ "num_attention_heads": 1,
138
+ "num_channels": 3,
139
+ "num_feature_levels": 3,
140
+ "patch_kernel_size": 7,
141
+ "patch_padding": 3,
142
+ "patch_stride": 4,
143
+ "q_pool": 3,
144
+ "q_stride": [
145
+ 2,
146
+ 2
147
+ ],
148
+ "stages": [
149
+ 1,
150
+ 2,
151
+ 7,
152
+ 2
153
+ ],
154
+ "window_positional_embedding_background_size": [
155
+ 7,
156
+ 7
157
+ ],
158
+ "window_spec": [
159
+ 8,
160
+ 4,
161
+ 14,
162
+ 7
163
+ ]
164
+ }
165
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:030a362dae8f614f7aa8afc76097ecacd3a22b00baf76f8f1d92819a5eedf3f0
3
- size 155906184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff4f3e742dfcaff5b6073362fdd9b74a48cb673b3cb6637d9d9d5cd2b3f5bfef
3
+ size 155906944