maelic committed on
Commit
c5b5895
·
verified ·
1 Parent(s): 4f10971

Upload yolo12s/config.yml with huggingface_hub

Browse files
Files changed (1) hide show
  1. yolo12s/config.yml +336 -0
yolo12s/config.yml ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ seed: 42
2
+ metric_to_track: mR
3
+ dtype: float32
4
+ output_dir: ./checkpoints/PSG/react++_yolo12s
5
+ glove_dir: ./datasets/
6
+ verbose: INFO
7
+ paths_catalog: ''
8
+ paths_data: ''
9
+ input:
10
+ img_size: [640, 640] # [W, H]
11
+
12
+ pixel_mean:
13
+ - 102.9801
14
+ - 115.9465
15
+ - 122.7717
16
+ pixel_std:
17
+ - 1.0
18
+ - 1.0
19
+ - 1.0
20
+ to_bgr255: true
21
+ flip_prob_train: 0.5
22
+ padding: true
23
+ brightness: 0.15
24
+ contrast: 0.15
25
+ saturation: 0.1
26
+ hue: 0.0
27
+ vertical_flip_prob_train: 0.0
28
+ datasets:
29
+ name: "PSG"
30
+ type: "coco"
31
+ data_dir: "datasets/PSG/coco_format"
32
+ dataloader:
33
+ num_workers: 8
34
+ size_divisibility: 32
35
+ aspect_ratio_grouping: true
36
+ model:
37
+ flip_aug: false
38
+ rpn_only: false
39
+ mask_on: false
40
+ attribute_on: false
41
+ relation_on: true
42
+ device: cuda
43
+ meta_architecture: GeneralizedYOLO
44
+ cls_agnostic_bbox_reg: false
45
+ weight: ''
46
+ pretrained_detector_ckpt: ./checkpoints/BACKBONES/yolo12s_psg.pt
47
+ text_embedding: glove.6B
48
+ box_head: false
49
+ backbone:
50
+ type: yolo
51
+ extra_config: ''
52
+ freeze_conv_body_at: 2
53
+ nms_thresh: 0.001
54
+ freeze: true
55
+ freeze_at: 10
56
+ fpn:
57
+ use_gn: false
58
+ use_relu: false
59
+ group_norm:
60
+ dim_per_gp: -1
61
+ num_groups: 32
62
+ epsilon: 1.0e-05
63
+ yolo:
64
+ weights: ''
65
+ size: yolo12s
66
+ img_size: 640
67
+ out_channels:
68
+ - 128
69
+ - 256
70
+ - 512
71
+ rpn:
72
+ use_fpn: false
73
+ rpn_mid_channel: 512
74
+ anchor_sizes:
75
+ - 32
76
+ - 64
77
+ - 128
78
+ - 256
79
+ - 512
80
+ anchor_stride:
81
+ - 16
82
+ aspect_ratios:
83
+ - 0.5
84
+ - 1.0
85
+ - 2.0
86
+ straddle_thresh: 0
87
+ fg_iou_threshold: 0.7
88
+ bg_iou_threshold: 0.3
89
+ batch_size_per_image: 256
90
+ positive_fraction: 0.5
91
+ pre_nms_top_n_train: 12000
92
+ pre_nms_top_n_test: 6000
93
+ post_nms_top_n_train: 2000
94
+ post_nms_top_n_test: 1000
95
+ min_size: 0
96
+ fpn_post_nms_top_n_train: 2000
97
+ fpn_post_nms_top_n_test: 2000
98
+ fpn_post_nms_per_batch: true
99
+ rpn_head: SingleConvRPNHead
100
+ roi_heads:
101
+ fg_iou_threshold: 0.35
102
+ bg_iou_threshold: 0.3
103
+ bbox_reg_weights:
104
+ - 10.0
105
+ - 10.0
106
+ - 5.0
107
+ - 5.0
108
+ batch_size_per_image: 256
109
+ positive_fraction: 0.25
110
+ score_thresh: 0.01
111
+ nms: 0.5
112
+ post_nms_per_cls_topn: 300
113
+ nms_filter_duplicates: false
114
+ detections_per_img: 100
115
+ roi_box_head:
116
+ feature_extractor: DAMPBoxFeatureExtractor
117
+ predictor: FastRCNNPredictor
118
+ pooler_resolution: 14
119
+ pooler_sampling_ratio: 0
120
+ pooler_scales:
121
+ - 0.0625
122
+ mlp_head_dim: 256
123
+ use_gn: false
124
+ dilation: 1
125
+ conv_head_dim: 256
126
+ num_stacked_convs: 4
127
+ num_classes: 134
128
+ patch_size: 32
129
+ feat_idx_multiscale: true
130
+ feat_idx_neighbors: 1
131
+ roi_attribute_head:
132
+ feature_extractor: FPN2MLPFeatureExtractor
133
+ predictor: FPNPredictor
134
+ share_box_feature_extractor: true
135
+ use_binary_loss: true
136
+ attribute_loss_weight: 0.1
137
+ num_attributes: 201
138
+ max_attributes: 10
139
+ attribute_bgfg_sample: true
140
+ attribute_bgfg_ratio: 3
141
+ pos_weight: 5.0
142
+ roi_mask_head:
143
+ feature_extractor: ResNet50Conv5ROIFeatureExtractor
144
+ predictor: MaskRCNNC4Predictor
145
+ pooler_resolution: 14
146
+ pooler_sampling_ratio: 0
147
+ pooler_scales:
148
+ - 0.0625
149
+ mlp_head_dim: 1024
150
+ conv_layers:
151
+ - 256
152
+ - 256
153
+ - 256
154
+ - 256
155
+ resolution: 14
156
+ share_box_feature_extractor: true
157
+ postprocess_masks: false
158
+ postprocess_masks_threshold: 0.5
159
+ dilation: 1
160
+ use_gn: false
161
+ roi_relation_head:
162
+ predictor: REACTPlusPlusPredictor
163
+ feature_extractor: P5SceneContextExtractor
164
+ use_union_features: true
165
+ use_spatial_features: true
166
+ use_union_features_inference: true
167
+ union_dropout: 0.0
168
+ max_pairs_inference: 0
169
+ textual_features_only: false
170
+ visual_features_only: false
171
+ logit_adjustment: false
172
+ logit_adjustment_tau: 0.3
173
+ pooling_all_levels: true
174
+ batch_size_per_image: 512
175
+ positive_fraction: 0.35
176
+ use_gt_box: false
177
+ use_gt_object_label: false
178
+ embed_dim: 200
179
+ context_dropout_rate: 0.2
180
+ context_hidden_dim: 512
181
+ context_pooling_dim: 4096
182
+ context_obj_layer: 1
183
+ context_rel_layer: 1
184
+ mlp_head_dim: 512
185
+ loss:
186
+ loss_type: BalancedLogitAdjustedLoss
187
+ beta: 0.999
188
+ gamma: 0.0
189
+ alpha: 0.15
190
+ fg_boost: 2.0
191
+ fg_weight: 1.0
192
+ label_smoothing_epsilon: 0.01
193
+ logit_adjustment_tau: 0.5
194
+ bg_discount: 0.3
195
+ ccl_weight: 0.1
196
+ decisive_margin: 2.0
197
+ poly_epsilon: 0.0
198
+ label_smoothing: 0.1
199
+ sampler_aux_loss_weight: 0.1
200
+ attn_entropy_weight: 0.01
201
+ offset_reg_weight: 0.005
202
+ containment_loss_weight: 0.02
203
+ num_classes: 57
204
+ decoder_depth: 1
205
+ transformer_depth: 1
206
+ num_rel_layers: 2
207
+ use_scene_context: true
208
+ use_geo_bias: true
209
+ use_cls_emb: true
210
+ use_geo_enc: true
211
+ max_pairs_per_img: 512
212
+ num_queries: 64
213
+ use_cross_attention: true
214
+ attn_type: standard
215
+ geometric_loss_weight: 0.0
216
+ num_sample_points: 6
217
+ num_sample_heads: 6
218
+ feature_strategy: multi_scale
219
+ use_rmsnorm: true
220
+ use_swiglu: true
221
+ clip_rel_path: ''
222
+ react_loss_weights:
223
+ l21_loss: 1.0
224
+ dist_loss2: 0.1
225
+ loss_dis: 0.5
226
+ transformer:
227
+ dropout_rate: 0.1
228
+ obj_layer: 4
229
+ rel_layer: 2
230
+ num_head: 8
231
+ inner_dim: 2048
232
+ key_dim: 64
233
+ val_dim: 64
234
+ squat_module:
235
+ pre_norm: false
236
+ num_decoder: 3
237
+ rho: 0.35
238
+ beta: 0.7
239
+ pretrain_mask: false
240
+ pretrain_mask_epoch: 1
241
+ causal:
242
+ effect_analysis: false
243
+ fusion_type: sum
244
+ context_layer: motifs
245
+ separate_spatial: false
246
+ effect_type: none
247
+ spatial_for_vision: false
248
+ label_smoothing_loss: false
249
+ use_frequency_bias: false
250
+ require_box_overlap: false
251
+ num_sample_per_gt_rel: 8
252
+ add_gtbox_to_proposal_in_train: false
253
+ classifier: linear
254
+ predict_use_vision: false
255
+ use_bg_discounting: false
256
+ bg_discounting_threshold: 0.1
257
+ resnets:
258
+ num_groups: 1
259
+ width_per_group: 64
260
+ stride_in_1x1: true
261
+ trans_func: BottleneckWithFixedBatchNorm
262
+ stem_func: StemWithFixedBatchNorm
263
+ res5_dilation: 1
264
+ backbone_out_channels: 1024
265
+ res2_out_channels: 256
266
+ stem_out_channels: 64
267
+ solver:
268
+ max_iter: 0
269
+ max_epoch: 10
270
+ base_lr: 0.0001
271
+ bias_lr_factor: 1
272
+ momentum: 0.9
273
+ weight_decay: 0.05
274
+ weight_decay_bias: 0.0
275
+ clip_norm: 5.0
276
+ gamma: 0.5
277
+ steps:
278
+ - 41000
279
+ - 50000
280
+ warmup_factor: 0.1
281
+ warmup_epochs: 1
282
+ warmup_method: linear
283
+ checkpoint_period: 250
284
+ grad_norm_clip: 1.0
285
+ print_grad_freq: 250
286
+ to_val: true
287
+ pre_val: true
288
+ val_period: 250
289
+ update_schedule_during_load: false
290
+ ims_per_batch: 8
291
+ optimizer: ADAMW
292
+ slow_ratio: 10.0
293
+ deform_offset_slow_ratio: 1.0
294
+ muon_scaling: 0.2
295
+ adamw_scaling: 0.8
296
+ schedule:
297
+ type: WarmupCosineAnnealingIterLR
298
+ patience: 2
299
+ threshold: 0.0001
300
+ cooldown: 1
301
+ factor: 0.5
302
+ max_decay_step: 7
303
+ eta_min: 5.0e-07
304
+ plateau_epochs: 5
305
+ accum_steps: 4
306
+ test:
307
+ expected_results: []
308
+ expected_results_sigma_tol: 4
309
+ ims_per_batch: 1
310
+ detections_per_img: 100
311
+ informative: false
312
+ bbox_aug:
313
+ enabled: false
314
+ h_flip: false
315
+ scales: []
316
+ max_size: 4000
317
+ scale_h_flip: false
318
+ save_proposals: false
319
+ relation:
320
+ multiple_preds: false
321
+ iou_threshold: 0.5
322
+ require_overlap: false
323
+ later_nms_prediction_thres: 0.5
324
+ sync_gather: true
325
+ allow_load_from_cache: false
326
+ top_k: 100
327
+ custum_eval: false
328
+ custum_path: ''
329
+ global_setting:
330
+ basic_encoder: Cross-Attention
331
+ gcl_setting:
332
+ group_split_mode: divide4
333
+ knowledge_transfer_mode: KL_logit_TopDown
334
+ no_relation_restrain: false
335
+ zero_label_padding_mode: false
336
+ knowledge_loss_coefficient: 1.0