danelcsb committed on
Commit
a1fce3a
·
verified ·
1 Parent(s): 31b13a3

Upload model

Browse files
Files changed (1) hide show
  1. config.json +123 -6
config.json CHANGED
@@ -24,21 +24,130 @@
24
  "directly_add_no_memory_embedding": true,
25
  "fixed_no_object_pointer": true,
26
  "image_encoder_config": {
27
- "model_type": ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  },
29
  "image_size": 1024,
30
  "initializer_range": 0.02,
31
  "iou_prediction_use_sigmoid": true,
32
  "mask_decoder_config": {
33
- "model_type": ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  },
35
  "max_cond_frames_in_attn": -1,
36
  "max_object_pointers_in_encoder": 16,
37
  "memory_attention_config": {
38
- "model_type": ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  },
40
  "memory_encoder_config": {
41
- "model_type": ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  },
43
  "memory_temporal_stride_for_eval": 1,
44
  "model_type": "sam2",
@@ -54,14 +163,22 @@
54
  "pred_obj_scores_mlp": true,
55
  "proj_tpos_enc_in_object_pointers": true,
56
  "prompt_encoder_config": {
57
- "model_type": ""
 
 
 
 
 
 
 
 
58
  },
59
  "sam_mask_decoder_extra_args": null,
60
  "sigmoid_bias_for_mem_enc": -10,
61
  "sigmoid_scale_for_mem_enc": 20,
62
  "soft_no_object_pointer": false,
63
  "torch_dtype": "float32",
64
- "transformers_version": "4.47.0.dev0",
65
  "use_mask_input_as_output_without_sam": true,
66
  "use_mlp_for_object_pointer_proj": true,
67
  "use_multimask_token_for_object_pointer": true,
 
24
  "directly_add_no_memory_embedding": true,
25
  "fixed_no_object_pointer": true,
26
  "image_encoder_config": {
27
+ "backbone_channel_list": [
28
+ 768,
29
+ 384,
30
+ 192,
31
+ 96
32
+ ],
33
+ "dim_mul": 2.0,
34
+ "drop_path_rate": 0.0,
35
+ "fpn_hidden_size": 256,
36
+ "fpn_interpolation_mode": "nearest",
37
+ "fpn_kernel_size": 1,
38
+ "fpn_padding": 0,
39
+ "fpn_stride": 1,
40
+ "fpn_top_down_levels": [
41
+ 2,
42
+ 3
43
+ ],
44
+ "fuse_type": "sum",
45
+ "global_attention_blocks": [
46
+ 5,
47
+ 7,
48
+ 9
49
+ ],
50
+ "head_mul": 2.0,
51
+ "hidden_act": "gelu",
52
+ "hidden_size": 96,
53
+ "image_size": 1024,
54
+ "layer_norm_eps": 1e-06,
55
+ "model_type": "",
56
+ "num_channels": 3,
57
+ "num_heads": 1,
58
+ "patch_kernel_size": 7,
59
+ "patch_padding": 3,
60
+ "patch_stride": 4,
61
+ "q_pool": 3,
62
+ "q_stride": [
63
+ 2,
64
+ 2
65
+ ],
66
+ "stages": [
67
+ 1,
68
+ 2,
69
+ 7,
70
+ 2
71
+ ],
72
+ "window_positional_embedding_background_size": [
73
+ 7,
74
+ 7
75
+ ],
76
+ "window_spec": [
77
+ 8,
78
+ 4,
79
+ 14,
80
+ 7
81
+ ]
82
  },
83
  "image_size": 1024,
84
  "initializer_range": 0.02,
85
  "iou_prediction_use_sigmoid": true,
86
  "mask_decoder_config": {
87
+ "dynamic_multimask_stability_delta": 0.05,
88
+ "dynamic_multimask_stability_thresh": 0.98,
89
+ "dynamic_multimask_via_stability": true,
90
+ "feed_forward_hidden_act": "relu",
91
+ "hidden_act": "gelu",
92
+ "hidden_size": 256,
93
+ "iou_head_depth": 3,
94
+ "iou_head_hidden_dim": 256,
95
+ "iou_prediction_use_sigmoid": true,
96
+ "model_type": "",
97
+ "num_multimask_outputs": 3,
98
+ "pred_obj_scores": true,
99
+ "pred_obj_scores_mlp": true,
100
+ "two_way_transformer_activation": "relu",
101
+ "two_way_transformer_attention_downsample_rate": 2,
102
+ "two_way_transformer_depth": 2,
103
+ "two_way_transformer_embedding_dim": 256,
104
+ "two_way_transformer_mlp_dim": 2048,
105
+ "two_way_transformer_num_heads": 8,
106
+ "use_high_resolution_features": true,
107
+ "use_multimask_token_for_object_pointer": true
108
  },
109
  "max_cond_frames_in_attn": -1,
110
  "max_object_pointers_in_encoder": 16,
111
  "memory_attention_config": {
112
+ "apply_pe_at_cross_attn_keys": true,
113
+ "apply_pe_at_cross_attn_queries": false,
114
+ "apply_pe_at_input": true,
115
+ "apply_pe_at_self_attn": false,
116
+ "batch_first": true,
117
+ "dim_feedforward": 2048,
118
+ "dropout": 0.1,
119
+ "hidden_act": "relu",
120
+ "hidden_size": 256,
121
+ "model_type": "",
122
+ "num_layers": 4,
123
+ "rope_downsample_rate": 1,
124
+ "rope_dropout": 0.1,
125
+ "rope_embedding_dim": 256,
126
+ "rope_feat_sizes": [
127
+ 32,
128
+ 32
129
+ ],
130
+ "rope_num_heads": 1,
131
+ "rope_theta": 10000
132
  },
133
  "memory_encoder_config": {
134
+ "hidden_size": 256,
135
+ "mask_downsampler_embed_dim": 256,
136
+ "mask_downsampler_hidden_act": "gelu",
137
+ "mask_downsampler_kernel_size": 3,
138
+ "mask_downsampler_padding": 1,
139
+ "mask_downsampler_stride": 2,
140
+ "mask_downsampler_total_stride": 16,
141
+ "memory_fuser_embed_dim": 256,
142
+ "memory_fuser_hidden_act": "gelu",
143
+ "memory_fuser_input_projection": false,
144
+ "memory_fuser_kernel_size": 7,
145
+ "memory_fuser_layer_scale_init_value": 1e-06,
146
+ "memory_fuser_num_layers": 2,
147
+ "memory_fuser_padding": 3,
148
+ "memory_fuser_use_depthwise_conv": true,
149
+ "model_type": "",
150
+ "output_channels": 64
151
  },
152
  "memory_temporal_stride_for_eval": 1,
153
  "model_type": "sam2",
 
163
  "pred_obj_scores_mlp": true,
164
  "proj_tpos_enc_in_object_pointers": true,
165
  "prompt_encoder_config": {
166
+ "hidden_act": "gelu",
167
+ "hidden_size": 256,
168
+ "image_size": 1024,
169
+ "layer_norm_eps": 1e-06,
170
+ "mask_input_channels": 16,
171
+ "model_type": "",
172
+ "num_point_embeddings": 4,
173
+ "patch_size": 16,
174
+ "scale": 1
175
  },
176
  "sam_mask_decoder_extra_args": null,
177
  "sigmoid_bias_for_mem_enc": -10,
178
  "sigmoid_scale_for_mem_enc": 20,
179
  "soft_no_object_pointer": false,
180
  "torch_dtype": "float32",
181
+ "transformers_version": "4.50.0.dev0",
182
  "use_mask_input_as_output_without_sam": true,
183
  "use_mlp_for_object_pointer_proj": true,
184
  "use_multimask_token_for_object_pointer": true,